All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: refine kiq read register
@ 2020-04-17  6:53 Yintian Tao
  2020-04-17  7:01 ` Liu, Monk
  2020-04-17 15:39 ` Felix Kuehling
  0 siblings, 2 replies; 25+ messages in thread
From: Yintian Tao @ 2020-04-17  6:53 UTC (permalink / raw)
  To: monk.liu; +Cc: amd-gfx, Yintian Tao

According to the current kiq read register method,
there will be race condition when using KIQ to read
register if multiple clients want to read at same time
just like the expample below:
1. client-A start to read REG-0 throguh KIQ
2. client-A poll the seqno-0
3. client-B start to read REG-1 through KIQ
4. client-B poll the seqno-1
5. the kiq complete these two read operation
6. client-A to read the register at the wb buffer and
   get REG-1 value

Therefore, directly make kiq write the register value at
the ring buffer then there will be no race condition for
the wb buffer.

v2: supply the read_clock and move the reg_val_offs back

Signed-off-by: Yintian Tao <yttao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
 6 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index ea576b4260a4..4e1c0239e561 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 
 	spin_lock_init(&kiq->ring_lock);
 
-	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
-	if (r)
-		return r;
-
 	ring->adev = NULL;
 	ring->ring_obj = NULL;
 	ring->use_doorbell = true;
@@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 
 void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
 {
-	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
 	amdgpu_ring_fini(ring);
 }
 
@@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 	uint32_t seq;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
+	uint64_t reg_val_offs = 0;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
 	amdgpu_ring_alloc(ring, 32);
-	amdgpu_ring_emit_rreg(ring, reg);
+	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
+	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&kiq->ring_lock, flags);
@@ -707,7 +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return adev->wb.wb[kiq->reg_val_offs];
+	return ring->ring[reg_val_offs];
 
 failed_kiq_read:
 	pr_err("failed to read reg:%x\n", reg);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 634746829024..ee698f0246d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -103,7 +103,6 @@ struct amdgpu_kiq {
 	struct amdgpu_ring	ring;
 	struct amdgpu_irq_src	irq;
 	const struct kiq_pm4_funcs *pmf;
-	uint32_t			reg_val_offs;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index f61664ee4940..a3d88f2aa9f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
 	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
-	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
+	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
+			  uint64_t reg_val_offs);
 	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
 	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
 			      uint32_t val, uint32_t mask);
@@ -265,7 +266,7 @@ struct amdgpu_ring {
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
-#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
+#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), (o))
 #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
 #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
 #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 0a03e2ad5d95..7c9a5e440509 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
 	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
 }
 
-static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				     uint64_t reg_val_offs)
 {
-	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
-
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
 				(5 << 8) |	/* dst: memory */
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 }
 
 static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index fc6c2f2bc76c..8e7eee7838e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
 }
 
-static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint64_t reg_val_offs)
 {
-	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
-
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
 				(5 << 8) |	/* dst: memory */
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 }
 
 static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 84fcf842316d..ff279b1f5c24 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	uint32_t seq;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
+	uint64_t reg_val_offs = 0;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
 	amdgpu_ring_alloc(ring, 32);
+	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 9 |	/* src: register*/
 				(5 << 8) |	/* dst: memory */
@@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&kiq->ring_lock, flags);
@@ -4088,8 +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
-		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
+	return (uint64_t)ring->ring[reg_val_offs] |
+		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
 
 failed_kiq_read:
 	pr_err("failed to read gpu clock\n");
@@ -5482,21 +5484,19 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
 }
 
-static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint64_t reg_val_offs)
 {
-	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
-
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
 				(5 << 8) |	/* dst: memory */
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 }
 
 static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-17  6:53 [PATCH] drm/amdgpu: refine kiq read register Yintian Tao
@ 2020-04-17  7:01 ` Liu, Monk
  2020-04-17  8:58   ` Christian König
  2020-04-17 15:39 ` Felix Kuehling
  1 sibling, 1 reply; 25+ messages in thread
From: Liu, Monk @ 2020-04-17  7:01 UTC (permalink / raw)
  To: Tao, Yintian, Kuehling, Felix, Deucher, Alexander, Koenig,
	Christian, Zhang, Hawking
  Cc: Tao, Yintian, amd-gfx

The change Looks good with me, you can put my RB to your patch .

Since this patch impact on general logic (not SRIOV only) I would like you wait a little longer for @Kuehling, Felix and @Deucher, Alexander and @Koenig, Christian  @Zhang, Hawking 

If any of them gave you a RB I think we can go this way 

_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Yintian Tao <yttao@amd.com> 
Sent: Friday, April 17, 2020 2:53 PM
To: Liu, Monk <Monk.Liu@amd.com>
Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
Subject: [PATCH] drm/amdgpu: refine kiq read register

According to the current kiq read register method, there will be race condition when using KIQ to read register if multiple clients want to read at same time just like the expample below:
1. client-A start to read REG-0 throguh KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
   get REG-1 value

Therefore, directly make kiq write the register value at the ring buffer then there will be no race condition for the wb buffer.

v2: supply the read_clock and move the reg_val_offs back

Signed-off-by: Yintian Tao <yttao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
 6 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index ea576b4260a4..4e1c0239e561 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 
 	spin_lock_init(&kiq->ring_lock);
 
-	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
-	if (r)
-		return r;
-
 	ring->adev = NULL;
 	ring->ring_obj = NULL;
 	ring->use_doorbell = true;
@@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 
 void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
-	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
 	amdgpu_ring_fini(ring);
 }
 
@@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 	uint32_t seq;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
+	uint64_t reg_val_offs = 0;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
 	amdgpu_ring_alloc(ring, 32);
-	amdgpu_ring_emit_rreg(ring, reg);
+	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
+	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return adev->wb.wb[kiq->reg_val_offs];
+	return ring->ring[reg_val_offs];
 
 failed_kiq_read:
 	pr_err("failed to read reg:%x\n", reg); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 634746829024..ee698f0246d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -103,7 +103,6 @@ struct amdgpu_kiq {
 	struct amdgpu_ring	ring;
 	struct amdgpu_irq_src	irq;
 	const struct kiq_pm4_funcs *pmf;
-	uint32_t			reg_val_offs;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index f61664ee4940..a3d88f2aa9f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
 	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
-	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
+	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
+			  uint64_t reg_val_offs);
 	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
 	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
 			      uint32_t val, uint32_t mask);
@@ -265,7 +266,7 @@ struct amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
+#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), 
+(o))
 #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 0a03e2ad5d95..7c9a5e440509 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
 	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
 
-static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				     uint64_t reg_val_offs)
 {
-	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
-
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
 				(5 << 8) |	/* dst: memory */
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 }
 
 static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index fc6c2f2bc76c..8e7eee7838e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
 
-static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint64_t reg_val_offs)
 {
-	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
-
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
 				(5 << 8) |	/* dst: memory */
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 }
 
 static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 84fcf842316d..ff279b1f5c24 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	uint32_t seq;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
+	uint64_t reg_val_offs = 0;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
 	amdgpu_ring_alloc(ring, 32);
+	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 9 |	/* src: register*/
 				(5 << 8) |	/* dst: memory */
@@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8 +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
-		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
+	return (uint64_t)ring->ring[reg_val_offs] |
+		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
 
 failed_kiq_read:
 	pr_err("failed to read gpu clock\n");
@@ -5482,21 +5484,19 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
 
-static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint64_t reg_val_offs)
 {
-	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
-
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
 				(5 << 8) |	/* dst: memory */
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 }
 
 static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-17  7:01 ` Liu, Monk
@ 2020-04-17  8:58   ` Christian König
  2020-04-17  9:06     ` Liu, Monk
  0 siblings, 1 reply; 25+ messages in thread
From: Christian König @ 2020-04-17  8:58 UTC (permalink / raw)
  To: Liu, Monk, Tao, Yintian, Kuehling, Felix, Deucher, Alexander,
	Zhang, Hawking
  Cc: amd-gfx

Looks like a rather important bug fix to me, but I'm not sure if writing 
the value into the ring buffer is a good idea.

See we wanted to map the ring buffers read only and USWC for some time. 
That would result in either not working driver or rather crappy performance.

Can't we just call amdgpu_device_wb_get() in amdgpu_device_wb_get() 
instead and allocate the wb address dynamically?

Regards,
Christian.

Am 17.04.20 um 09:01 schrieb Liu, Monk:
> The change Looks good with me, you can put my RB to your patch .
>
> Since this patch impact on general logic (not SRIOV only) I would like you wait a little longer for @Kuehling, Felix and @Deucher, Alexander and @Koenig, Christian  @Zhang, Hawking
>
> If any of them gave you a RB I think we can go this way
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Yintian Tao <yttao@amd.com>
> Sent: Friday, April 17, 2020 2:53 PM
> To: Liu, Monk <Monk.Liu@amd.com>
> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
> Subject: [PATCH] drm/amdgpu: refine kiq read register
>
> According to the current kiq read register method, there will be race condition when using KIQ to read register if multiple clients want to read at same time just like the expample below:
> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
>     get REG-1 value
>
> Therefore, directly make kiq write the register value at the ring buffer then there will be no race condition for the wb buffer.
>
> v2: supply the read_clock and move the reg_val_offs back
>
> Signed-off-by: Yintian Tao <yttao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>   6 files changed, 33 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ea576b4260a4..4e1c0239e561 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   
>   	spin_lock_init(&kiq->ring_lock);
>   
> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
> -	if (r)
> -		return r;
> -
>   	ring->adev = NULL;
>   	ring->ring_obj = NULL;
>   	ring->use_doorbell = true;
> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   
>   void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>   	amdgpu_ring_fini(ring);
>   }
>   
> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   	uint32_t seq;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
> +	uint64_t reg_val_offs = 0;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
>   	amdgpu_ring_alloc(ring, 32);
> -	amdgpu_ring_emit_rreg(ring, reg);
> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return adev->wb.wb[kiq->reg_val_offs];
> +	return ring->ring[reg_val_offs];
>   
>   failed_kiq_read:
>   	pr_err("failed to read reg:%x\n", reg); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 634746829024..ee698f0246d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>   	struct amdgpu_ring	ring;
>   	struct amdgpu_irq_src	irq;
>   	const struct kiq_pm4_funcs *pmf;
> -	uint32_t			reg_val_offs;
>   };
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index f61664ee4940..a3d88f2aa9f4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>   	void (*end_use)(struct amdgpu_ring *ring);
>   	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>   	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
> +			  uint64_t reg_val_offs);
>   	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>   	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>   			      uint32_t val, uint32_t mask);
> @@ -265,7 +266,7 @@ struct amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>   #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d),
> +(o))
>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 0a03e2ad5d95..7c9a5e440509 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>   	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>   
> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				     uint64_t reg_val_offs)
>   {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>   				(5 << 8) |	/* dst: memory */
>   				(1 << 20));	/* write confirm */
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>   }
>   
>   static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index fc6c2f2bc76c..8e7eee7838e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>   
> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint64_t reg_val_offs)
>   {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>   				(5 << 8) |	/* dst: memory */
>   				(1 << 20));	/* write confirm */
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>   }
>   
>   static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 84fcf842316d..ff279b1f5c24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	uint32_t seq;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
> +	uint64_t reg_val_offs = 0;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
>   	amdgpu_ring_alloc(ring, 32);
> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 9 |	/* src: register*/
>   				(5 << 8) |	/* dst: memory */
> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   				(1 << 20));	/* write confirm */
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8 +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
> +	return (uint64_t)ring->ring[reg_val_offs] |
> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>   
>   failed_kiq_read:
>   	pr_err("failed to read gpu clock\n");
> @@ -5482,21 +5484,19 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>   
> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint64_t reg_val_offs)
>   {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>   				(5 << 8) |	/* dst: memory */
>   				(1 << 20));	/* write confirm */
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>   }
>   
>   static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
> --
> 2.17.1
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-17  8:58   ` Christian König
@ 2020-04-17  9:06     ` Liu, Monk
  2020-04-17  9:13       ` Christian König
  0 siblings, 1 reply; 25+ messages in thread
From: Liu, Monk @ 2020-04-17  9:06 UTC (permalink / raw)
  To: Koenig, Christian, Tao, Yintian, Kuehling, Felix, Deucher,
	Alexander, Zhang, Hawking
  Cc: amd-gfx

Christian

>>
See we wanted to map the ring buffers read only and USWC for some time. 
That would result in either not working driver or rather crappy performance.
<<

For KIQ the ring buffer wouldn't be read only ... should be cacheable type 

Dynamic alloc each time doing KIQ reg read is a overkill to me, leverage ring buffer is a high efficient way. 

Besides looks now the KIQ register reading is really massive, check this code:

4949 static void gfx_v9_0_update_spm_vmid(struct amdgpu_device *adev, unsigned vmid)
4950 {
4951     u32 data;
4952
4953     data = RREG32_SOC15(GC, 0, mmRLC_SPM_MC_CNTL);
4954
4955     data &= ~RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK;
4956     data |= (vmid & RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK) << RLC_SPM_MC_CNTL__RLC_SPM_VMID__SHIFT;
4957
4958     WREG32_SOC15(GC, 0, mmRLC_SPM_MC_CNTL, data);
4959 }

Now  we do KIQ read and write *every time* we do amdgpu_vm_flush  (omg... what's this  ??)



_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Friday, April 17, 2020 4:59 PM
To: Liu, Monk <Monk.Liu@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

Looks like a rather important bug fix to me, but I'm not sure if writing the value into the ring buffer is a good idea.

See we wanted to map the ring buffers read only and USWC for some time. 
That would result in either not working driver or rather crappy performance.

Can't we just call amdgpu_device_wb_get() in amdgpu_device_wb_get() instead and allocate the wb address dynamically?

Regards,
Christian.

Am 17.04.20 um 09:01 schrieb Liu, Monk:
> The change Looks good with me, you can put my RB to your patch .
>
> Since this patch impact on general logic (not SRIOV only) I would like 
> you wait a little longer for @Kuehling, Felix and @Deucher, Alexander 
> and @Koenig, Christian  @Zhang, Hawking
>
> If any of them gave you a RB I think we can go this way
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Yintian Tao <yttao@amd.com>
> Sent: Friday, April 17, 2020 2:53 PM
> To: Liu, Monk <Monk.Liu@amd.com>
> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
> Subject: [PATCH] drm/amdgpu: refine kiq read register
>
> According to the current kiq read register method, there will be race condition when using KIQ to read register if multiple clients want to read at same time just like the expample below:
> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
>     get REG-1 value
>
> Therefore, directly make kiq write the register value at the ring buffer then there will be no race condition for the wb buffer.
>
> v2: supply the read_clock and move the reg_val_offs back
>
> Signed-off-by: Yintian Tao <yttao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>   6 files changed, 33 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ea576b4260a4..4e1c0239e561 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device 
> *adev,
>   
>   	spin_lock_init(&kiq->ring_lock);
>   
> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
> -	if (r)
> -		return r;
> -
>   	ring->adev = NULL;
>   	ring->ring_obj = NULL;
>   	ring->use_doorbell = true;
> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device 
> *adev,
>   
>   void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>   	amdgpu_ring_fini(ring);
>   }
>   
> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   	uint32_t seq;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
> +	uint64_t reg_val_offs = 0;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
>   	amdgpu_ring_alloc(ring, 32);
> -	amdgpu_ring_emit_rreg(ring, reg);
> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return adev->wb.wb[kiq->reg_val_offs];
> +	return ring->ring[reg_val_offs];
>   
>   failed_kiq_read:
>   	pr_err("failed to read reg:%x\n", reg); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 634746829024..ee698f0246d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>   	struct amdgpu_ring	ring;
>   	struct amdgpu_irq_src	irq;
>   	const struct kiq_pm4_funcs *pmf;
> -	uint32_t			reg_val_offs;
>   };
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index f61664ee4940..a3d88f2aa9f4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>   	void (*end_use)(struct amdgpu_ring *ring);
>   	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>   	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
> +			  uint64_t reg_val_offs);
>   	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>   	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>   			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct 
> amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>   #define amdgpu_ring_emit_cntxcntl(r, d) 
> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r, 
> d) (r)->funcs->emit_rreg((r), (d))
> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
> +(d),
> +(o))
>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), 
> (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) 
> (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define 
> amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff 
> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 0a03e2ad5d95..7c9a5e440509 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>   	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>   
> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				     uint64_t reg_val_offs)
>   {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>   				(5 << 8) |	/* dst: memory */
>   				(1 << 20));	/* write confirm */
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>   }
>   
>   static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index fc6c2f2bc76c..8e7eee7838e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>   
> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint64_t reg_val_offs)
>   {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>   				(5 << 8) |	/* dst: memory */
>   				(1 << 20));	/* write confirm */
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>   }
>   
>   static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 84fcf842316d..ff279b1f5c24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	uint32_t seq;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
> +	uint64_t reg_val_offs = 0;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
>   	amdgpu_ring_alloc(ring, 32);
> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 9 |	/* src: register*/
>   				(5 << 8) |	/* dst: memory */
> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   				(1 << 20));	/* write confirm */
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8 +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
> +	return (uint64_t)ring->ring[reg_val_offs] |
> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>   
>   failed_kiq_read:
>   	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@ 
> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>   
> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint64_t reg_val_offs)
>   {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>   				(5 << 8) |	/* dst: memory */
>   				(1 << 20));	/* write confirm */
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>   }
>   
>   static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg,
> --
> 2.17.1
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-17  9:06     ` Liu, Monk
@ 2020-04-17  9:13       ` Christian König
  2020-04-17  9:39         ` Liu, Monk
  0 siblings, 1 reply; 25+ messages in thread
From: Christian König @ 2020-04-17  9:13 UTC (permalink / raw)
  To: Liu, Monk, Tao, Yintian, Kuehling, Felix, Deucher, Alexander,
	Zhang, Hawking
  Cc: amd-gfx

> Dynamic alloc each time doing KIQ reg read is a overkill to me
Yeah, that is a rather good argument.

> Now  we do KIQ read and write *every time* we do amdgpu_vm_flush  (omg... what's this  ??)

That is updating the VMID used for the SPM trace. And yes this 
read/modify/write is most likely not a good idea, we should rather just 
write the value we want to have or don't use the KIQ here.

Most likely the later because IIRC this is a per VF register.

Christian.

Am 17.04.20 um 11:06 schrieb Liu, Monk:
> Christian
>
> See we wanted to map the ring buffers read only and USWC for some time.
> That would result in either not working driver or rather crappy performance.
> <<
>
> For KIQ the ring buffer wouldn't be read only ... should be cacheable type
>
> Dynamic alloc each time doing KIQ reg read is a overkill to me, leverage ring buffer is a high efficient way.
>
> Besides looks now the KIQ register reading is really massive, check this code:
>
> 4949 static void gfx_v9_0_update_spm_vmid(struct amdgpu_device *adev, unsigned vmid)
> 4950 {
> 4951     u32 data;
> 4952
> 4953     data = RREG32_SOC15(GC, 0, mmRLC_SPM_MC_CNTL);
> 4954
> 4955     data &= ~RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK;
> 4956     data |= (vmid & RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK) << RLC_SPM_MC_CNTL__RLC_SPM_VMID__SHIFT;
> 4957
> 4958     WREG32_SOC15(GC, 0, mmRLC_SPM_MC_CNTL, data);
> 4959 }
>
> Now  we do KIQ read and write *every time* we do amdgpu_vm_flush  (omg... what's this  ??)
>
>
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, April 17, 2020 4:59 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> Looks like a rather important bug fix to me, but I'm not sure if writing the value into the ring buffer is a good idea.
>
> See we wanted to map the ring buffers read only and USWC for some time.
> That would result in either not working driver or rather crappy performance.
>
> Can't we just call amdgpu_device_wb_get() in amdgpu_device_wb_get() instead and allocate the wb address dynamically?
>
> Regards,
> Christian.
>
> Am 17.04.20 um 09:01 schrieb Liu, Monk:
>> The change Looks good with me, you can put my RB to your patch .
>>
>> Since this patch impact on general logic (not SRIOV only) I would like
>> you wait a little longer for @Kuehling, Felix and @Deucher, Alexander
>> and @Koenig, Christian  @Zhang, Hawking
>>
>> If any of them gave you a RB I think we can go this way
>>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Yintian Tao <yttao@amd.com>
>> Sent: Friday, April 17, 2020 2:53 PM
>> To: Liu, Monk <Monk.Liu@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
>> Subject: [PATCH] drm/amdgpu: refine kiq read register
>>
>> According to the current kiq read register method, there will be race condition when using KIQ to read register if multiple clients want to read at same time just like the expample below:
>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
>>      get REG-1 value
>>
>> Therefore, directly make kiq write the register value at the ring buffer then there will be no race condition for the wb buffer.
>>
>> v2: supply the read_clock and move the reg_val_offs back
>>
>> Signed-off-by: Yintian Tao <yttao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>    6 files changed, 33 insertions(+), 40 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index ea576b4260a4..4e1c0239e561 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device
>> *adev,
>>    
>>    	spin_lock_init(&kiq->ring_lock);
>>    
>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>> -	if (r)
>> -		return r;
>> -
>>    	ring->adev = NULL;
>>    	ring->ring_obj = NULL;
>>    	ring->use_doorbell = true;
>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device
>> *adev,
>>    
>>    void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>    	amdgpu_ring_fini(ring);
>>    }
>>    
>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    	uint32_t seq;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>> +	uint64_t reg_val_offs = 0;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>    	amdgpu_ring_alloc(ring, 32);
>> -	amdgpu_ring_emit_rreg(ring, reg);
>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return adev->wb.wb[kiq->reg_val_offs];
>> +	return ring->ring[reg_val_offs];
>>    
>>    failed_kiq_read:
>>    	pr_err("failed to read reg:%x\n", reg); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index 634746829024..ee698f0246d8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>    	struct amdgpu_ring	ring;
>>    	struct amdgpu_irq_src	irq;
>>    	const struct kiq_pm4_funcs *pmf;
>> -	uint32_t			reg_val_offs;
>>    };
>>    
>>    /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index f61664ee4940..a3d88f2aa9f4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>    	void (*end_use)(struct amdgpu_ring *ring);
>>    	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>    	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>> +			  uint64_t reg_val_offs);
>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>    			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct
>> amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>    #define amdgpu_ring_emit_cntxcntl(r, d)
>> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r,
>> d) (r)->funcs->emit_rreg((r), (d))
>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r),
>> +(d),
>> +(o))
>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r),
>> (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m)
>> (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define
>> amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m)
>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff
>> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 0a03e2ad5d95..7c9a5e440509 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>    	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>>    
>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				     uint64_t reg_val_offs)
>>    {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index fc6c2f2bc76c..8e7eee7838e0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>>    
>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint64_t reg_val_offs)
>>    {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 84fcf842316d..ff279b1f5c24 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	uint32_t seq;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>> +	uint64_t reg_val_offs = 0;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>    	amdgpu_ring_alloc(ring, 32);
>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8 +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>> +	return (uint64_t)ring->ring[reg_val_offs] |
>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>    
>>    failed_kiq_read:
>>    	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@
>> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>>    
>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint64_t reg_val_offs)
>>    {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg,
>> --
>> 2.17.1
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-17  9:13       ` Christian König
@ 2020-04-17  9:39         ` Liu, Monk
  2020-04-17 11:39           ` Tao, Yintian
  0 siblings, 1 reply; 25+ messages in thread
From: Liu, Monk @ 2020-04-17  9:39 UTC (permalink / raw)
  To: Koenig, Christian, Tao, Yintian, Kuehling, Felix, Deucher,
	Alexander, Zhang, Hawking, Ming, Davis, Jiang, Jerry (SW)
  Cc: amd-gfx

Hi Christian

mmRLC_SPM_MC_CNTL

this register is a RLC register, with my understanding it is PF&VF share register, and I did experiment proved it:
1) write abc to it in PF
2) read it from VF, it shows abc
3) write ff to it in VF, read it, it is still abc

So this register with current policy (L1) is a VF read, PF write register, and this register is physically shared among PF/VF 

We should not even try to write it in VF side, no matter CPU or KIQ (KIQ write within VF role will also be blocked by the L1 policy)

From what I can see so far: we need to drop this feature for SRIOV, or we need to change Policy 

+@Ming, Davis and @Jiang, Jerry (SW) for awareness 

DRM-NEXT kernel branch has a new feature to massively use KIQ to read/write this register " mmRLC_SPM_MC_CNTL" which is a PF w/r bug VF R only register.
We need to figure out what should we do on it 

I will talk to UMD guys later (they initiated this feature in our kernel driver )
_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Friday, April 17, 2020 5:14 PM
To: Liu, Monk <Monk.Liu@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

> Dynamic alloc each time doing KIQ reg read is a overkill to me
Yeah, that is a rather good argument.

> Now  we do KIQ read and write *every time* we do amdgpu_vm_flush  
> (omg... what's this  ??)

That is updating the VMID used for the SPM trace. And yes this read/modify/write is most likely not a good idea, we should rather just write the value we want to have or don't use the KIQ here.

Most likely the later because IIRC this is a per VF register.

Christian.

Am 17.04.20 um 11:06 schrieb Liu, Monk:
> Christian
>
> See we wanted to map the ring buffers read only and USWC for some time.
> That would result in either not working driver or rather crappy performance.
> <<
>
> For KIQ the ring buffer wouldn't be read only ... should be cacheable 
> type
>
> Dynamic alloc each time doing KIQ reg read is a overkill to me, leverage ring buffer is a high efficient way.
>
> Besides looks now the KIQ register reading is really massive, check this code:
>
> 4949 static void gfx_v9_0_update_spm_vmid(struct amdgpu_device *adev, 
> unsigned vmid)
> 4950 {
> 4951     u32 data;
> 4952
> 4953     data = RREG32_SOC15(GC, 0, mmRLC_SPM_MC_CNTL);
> 4954
> 4955     data &= ~RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK;
> 4956     data |= (vmid & RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK) << RLC_SPM_MC_CNTL__RLC_SPM_VMID__SHIFT;
> 4957
> 4958     WREG32_SOC15(GC, 0, mmRLC_SPM_MC_CNTL, data);
> 4959 }
>
> Now  we do KIQ read and write *every time* we do amdgpu_vm_flush  
> (omg... what's this  ??)
>
>
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, April 17, 2020 4:59 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>; 
> Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> Looks like a rather important bug fix to me, but I'm not sure if writing the value into the ring buffer is a good idea.
>
> See we wanted to map the ring buffers read only and USWC for some time.
> That would result in either not working driver or rather crappy performance.
>
> Can't we just call amdgpu_device_wb_get() in amdgpu_device_wb_get() instead and allocate the wb address dynamically?
>
> Regards,
> Christian.
>
> Am 17.04.20 um 09:01 schrieb Liu, Monk:
>> The change Looks good with me, you can put my RB to your patch .
>>
>> Since this patch impact on general logic (not SRIOV only) I would 
>> like you wait a little longer for @Kuehling, Felix and @Deucher, 
>> Alexander and @Koenig, Christian  @Zhang, Hawking
>>
>> If any of them gave you a RB I think we can go this way
>>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Yintian Tao <yttao@amd.com>
>> Sent: Friday, April 17, 2020 2:53 PM
>> To: Liu, Monk <Monk.Liu@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
>> Subject: [PATCH] drm/amdgpu: refine kiq read register
>>
>> According to the current kiq read register method, there will be race condition when using KIQ to read register if multiple clients want to read at same time just like the expample below:
>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
>>      get REG-1 value
>>
>> Therefore, directly make kiq write the register value at the ring buffer then there will be no race condition for the wb buffer.
>>
>> v2: supply the read_clock and move the reg_val_offs back
>>
>> Signed-off-by: Yintian Tao <yttao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>    6 files changed, 33 insertions(+), 40 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index ea576b4260a4..4e1c0239e561 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>> amdgpu_device *adev,
>>    
>>    	spin_lock_init(&kiq->ring_lock);
>>    
>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>> -	if (r)
>> -		return r;
>> -
>>    	ring->adev = NULL;
>>    	ring->ring_obj = NULL;
>>    	ring->use_doorbell = true;
>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device 
>> *adev,
>>    
>>    void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>    	amdgpu_ring_fini(ring);
>>    }
>>    
>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    	uint32_t seq;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>> +	uint64_t reg_val_offs = 0;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>    	amdgpu_ring_alloc(ring, 32);
>> -	amdgpu_ring_emit_rreg(ring, reg);
>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return adev->wb.wb[kiq->reg_val_offs];
>> +	return ring->ring[reg_val_offs];
>>    
>>    failed_kiq_read:
>>    	pr_err("failed to read reg:%x\n", reg); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index 634746829024..ee698f0246d8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>    	struct amdgpu_ring	ring;
>>    	struct amdgpu_irq_src	irq;
>>    	const struct kiq_pm4_funcs *pmf;
>> -	uint32_t			reg_val_offs;
>>    };
>>    
>>    /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index f61664ee4940..a3d88f2aa9f4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>    	void (*end_use)(struct amdgpu_ring *ring);
>>    	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>    	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>> +			  uint64_t reg_val_offs);
>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>    			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct 
>> amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>    #define amdgpu_ring_emit_cntxcntl(r, d) 
>> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r,
>> d) (r)->funcs->emit_rreg((r), (d))
>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
>> +(d),
>> +(o))
>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), 
>> (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) 
>> (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define 
>> amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff 
>> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 0a03e2ad5d95..7c9a5e440509 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>    	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>>    
>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				     uint64_t reg_val_offs)
>>    {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index fc6c2f2bc76c..8e7eee7838e0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>>    
>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint64_t reg_val_offs)
>>    {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 84fcf842316d..ff279b1f5c24 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	uint32_t seq;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>> +	uint64_t reg_val_offs = 0;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>    	amdgpu_ring_alloc(ring, 32);
>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8 +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>> +	return (uint64_t)ring->ring[reg_val_offs] |
>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>    
>>    failed_kiq_read:
>>    	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@ 
>> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>>    
>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint64_t reg_val_offs)
>>    {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg,
>> --
>> 2.17.1
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-17  9:39         ` Liu, Monk
@ 2020-04-17 11:39           ` Tao, Yintian
  0 siblings, 0 replies; 25+ messages in thread
From: Tao, Yintian @ 2020-04-17 11:39 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Kuehling, Felix, Deucher,
	Alexander, Zhang, Hawking, Ming, Davis, Jiang, Jerry (SW)
  Cc: amd-gfx

Hi  Christian 


Can you help give more details about how this spm trace works
After review the gfx_v9_0_update_spm_vmid function, I think it is some confused.


For example:
It is assumed that there are two gfx job which can be submitted to gfx ring. 
When second gfx job is submitted, the vmid of first gfx job write to mmRLC_SPM_MC_CNTL may be overwritten by the second gfx job vmid.
I am not sure whether it will raise problem.


Best Regards
Yintian Tao

-----Original Message-----
From: Liu, Monk <Monk.Liu@amd.com> 
Sent: 2020年4月17日 17:40
To: Koenig, Christian <Christian.Koenig@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Ming, Davis <Davis.Ming@amd.com>; Jiang, Jerry (SW) <Jerry.Jiang@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amdgpu: refine kiq read register

Hi Christian

mmRLC_SPM_MC_CNTL

this register is a RLC register, with my understanding it is PF&VF share register, and I did experiment proved it:
1) write abc to it in PF
2) read it from VF, it shows abc
3) write ff to it in VF, read it, it is still abc

So this register with current policy (L1) is a VF read, PF write register, and this register is physically shared among PF/VF 

We should not even try to write it in VF side, no matter CPU or KIQ (KIQ write within VF role will also be blocked by the L1 policy)

From what I can see so far: we need to drop this feature for SRIOV, or we need to change Policy 

+@Ming, Davis and @Jiang, Jerry (SW) for awareness

DRM-NEXT kernel branch has a new feature to massively use KIQ to read/write this register " mmRLC_SPM_MC_CNTL" which is a PF w/r bug VF R only register.
We need to figure out what should we do on it 

I will talk to UMD guys later (they initiated this feature in our kernel driver ) _____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Friday, April 17, 2020 5:14 PM
To: Liu, Monk <Monk.Liu@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

> Dynamic alloc each time doing KIQ reg read is a overkill to me
Yeah, that is a rather good argument.

> Now  we do KIQ read and write *every time* we do amdgpu_vm_flush 
> (omg... what's this  ??)

That is updating the VMID used for the SPM trace. And yes this read/modify/write is most likely not a good idea, we should rather just write the value we want to have or don't use the KIQ here.

Most likely the later because IIRC this is a per VF register.

Christian.

Am 17.04.20 um 11:06 schrieb Liu, Monk:
> Christian
>
> See we wanted to map the ring buffers read only and USWC for some time.
> That would result in either not working driver or rather crappy performance.
> <<
>
> For KIQ the ring buffer wouldn't be read only ... should be cacheable 
> type
>
> Dynamic alloc each time doing KIQ reg read is a overkill to me, leverage ring buffer is a high efficient way.
>
> Besides looks now the KIQ register reading is really massive, check this code:
>
> 4949 static void gfx_v9_0_update_spm_vmid(struct amdgpu_device *adev, 
> unsigned vmid)
> 4950 {
> 4951     u32 data;
> 4952
> 4953     data = RREG32_SOC15(GC, 0, mmRLC_SPM_MC_CNTL);
> 4954
> 4955     data &= ~RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK;
> 4956     data |= (vmid & RLC_SPM_MC_CNTL__RLC_SPM_VMID_MASK) << RLC_SPM_MC_CNTL__RLC_SPM_VMID__SHIFT;
> 4957
> 4958     WREG32_SOC15(GC, 0, mmRLC_SPM_MC_CNTL, data);
> 4959 }
>
> Now  we do KIQ read and write *every time* we do amdgpu_vm_flush 
> (omg... what's this  ??)
>
>
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, April 17, 2020 4:59 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>; 
> Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> Looks like a rather important bug fix to me, but I'm not sure if writing the value into the ring buffer is a good idea.
>
> See we wanted to map the ring buffers read only and USWC for some time.
> That would result in either not working driver or rather crappy performance.
>
> Can't we just call amdgpu_device_wb_get() in amdgpu_device_wb_get() instead and allocate the wb address dynamically?
>
> Regards,
> Christian.
>
> Am 17.04.20 um 09:01 schrieb Liu, Monk:
>> The change Looks good with me, you can put my RB to your patch .
>>
>> Since this patch impact on general logic (not SRIOV only) I would 
>> like you wait a little longer for @Kuehling, Felix and @Deucher, 
>> Alexander and @Koenig, Christian  @Zhang, Hawking
>>
>> If any of them gave you a RB I think we can go this way
>>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Yintian Tao <yttao@amd.com>
>> Sent: Friday, April 17, 2020 2:53 PM
>> To: Liu, Monk <Monk.Liu@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
>> Subject: [PATCH] drm/amdgpu: refine kiq read register
>>
>> According to the current kiq read register method, there will be race condition when using KIQ to read register if multiple clients want to read at same time just like the expample below:
>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
>>      get REG-1 value
>>
>> Therefore, directly make kiq write the register value at the ring buffer then there will be no race condition for the wb buffer.
>>
>> v2: supply the read_clock and move the reg_val_offs back
>>
>> Signed-off-by: Yintian Tao <yttao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>    6 files changed, 33 insertions(+), 40 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index ea576b4260a4..4e1c0239e561 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>> amdgpu_device *adev,
>>    
>>    	spin_lock_init(&kiq->ring_lock);
>>    
>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>> -	if (r)
>> -		return r;
>> -
>>    	ring->adev = NULL;
>>    	ring->ring_obj = NULL;
>>    	ring->use_doorbell = true;
>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device 
>> *adev,
>>    
>>    void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>    	amdgpu_ring_fini(ring);
>>    }
>>    
>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    	uint32_t seq;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>> +	uint64_t reg_val_offs = 0;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>    	amdgpu_ring_alloc(ring, 32);
>> -	amdgpu_ring_emit_rreg(ring, reg);
>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return adev->wb.wb[kiq->reg_val_offs];
>> +	return ring->ring[reg_val_offs];
>>    
>>    failed_kiq_read:
>>    	pr_err("failed to read reg:%x\n", reg); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index 634746829024..ee698f0246d8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>    	struct amdgpu_ring	ring;
>>    	struct amdgpu_irq_src	irq;
>>    	const struct kiq_pm4_funcs *pmf;
>> -	uint32_t			reg_val_offs;
>>    };
>>    
>>    /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index f61664ee4940..a3d88f2aa9f4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>    	void (*end_use)(struct amdgpu_ring *ring);
>>    	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>    	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>> +			  uint64_t reg_val_offs);
>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>    			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct 
>> amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>    #define amdgpu_ring_emit_cntxcntl(r, d) 
>> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r,
>> d) (r)->funcs->emit_rreg((r), (d))
>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
>> +(d),
>> +(o))
>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), 
>> (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) 
>> (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define 
>> amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff 
>> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 0a03e2ad5d95..7c9a5e440509 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>    	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>>    
>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				     uint64_t reg_val_offs)
>>    {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index fc6c2f2bc76c..8e7eee7838e0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>>    
>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint64_t reg_val_offs)
>>    {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 84fcf842316d..ff279b1f5c24 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	uint32_t seq;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>> +	uint64_t reg_val_offs = 0;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>    	amdgpu_ring_alloc(ring, 32);
>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8 +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>> +	return (uint64_t)ring->ring[reg_val_offs] |
>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>    
>>    failed_kiq_read:
>>    	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@ 
>> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>>    
>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint64_t reg_val_offs)
>>    {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>    				(5 << 8) |	/* dst: memory */
>>    				(1 << 20));	/* write confirm */
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg,
>> --
>> 2.17.1
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-17  6:53 [PATCH] drm/amdgpu: refine kiq read register Yintian Tao
  2020-04-17  7:01 ` Liu, Monk
@ 2020-04-17 15:39 ` Felix Kuehling
  2020-04-19 17:03   ` Christian König
  2020-04-20  4:16   ` Tao, Yintian
  1 sibling, 2 replies; 25+ messages in thread
From: Felix Kuehling @ 2020-04-17 15:39 UTC (permalink / raw)
  To: Yintian Tao, monk.liu; +Cc: amd-gfx

Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
> According to the current kiq read register method,
> there will be race condition when using KIQ to read
> register if multiple clients want to read at same time
> just like the expample below:
> 1. client-A start to read REG-0 throguh KIQ
> 2. client-A poll the seqno-0
> 3. client-B start to read REG-1 through KIQ
> 4. client-B poll the seqno-1
> 5. the kiq complete these two read operation
> 6. client-A to read the register at the wb buffer and
>    get REG-1 value
>
> Therefore, directly make kiq write the register value at
> the ring buffer then there will be no race condition for
> the wb buffer.
>
> v2: supply the read_clock and move the reg_val_offs back
>
> Signed-off-by: Yintian Tao <yttao@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>  6 files changed, 33 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ea576b4260a4..4e1c0239e561 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>  
>  	spin_lock_init(&kiq->ring_lock);
>  
> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
> -	if (r)
> -		return r;
> -
>  	ring->adev = NULL;
>  	ring->ring_obj = NULL;
>  	ring->use_doorbell = true;
> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>  
>  void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>  {
> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>  	amdgpu_ring_fini(ring);
>  }
>  
> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>  	uint32_t seq;
>  	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>  	struct amdgpu_ring *ring = &kiq->ring;
> +	uint64_t reg_val_offs = 0;
>  
>  	BUG_ON(!ring->funcs->emit_rreg);
>  
>  	spin_lock_irqsave(&kiq->ring_lock, flags);
>  	amdgpu_ring_alloc(ring, 32);
> -	amdgpu_ring_emit_rreg(ring, reg);
> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;

I think that should be (ring->wptr + 30) & ring->buf_mask. Otherwise the
reg_val_offset can be past the end of the ring.

But that still leaves a problem if another command is submitted to the
KIQ before you read the returned reg_val from the ring. Your reg_val can
be overwritten by the new command and you get the wrong result. Or the
command can be overwritten with the reg_val, which will most likely hang
the CP.

You could allocate space on the KIQ ring with a NOP command to prevent
that space from being overwritten by other commands.

Regards,
  Felix


> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>  	amdgpu_fence_emit_polling(ring, &seq);
>  	amdgpu_ring_commit(ring);
>  	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> @@ -707,7 +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>  	if (cnt > MAX_KIQ_REG_TRY)
>  		goto failed_kiq_read;
>  
> -	return adev->wb.wb[kiq->reg_val_offs];
> +	return ring->ring[reg_val_offs];
>  
>  failed_kiq_read:
>  	pr_err("failed to read reg:%x\n", reg);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 634746829024..ee698f0246d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>  	struct amdgpu_ring	ring;
>  	struct amdgpu_irq_src	irq;
>  	const struct kiq_pm4_funcs *pmf;
> -	uint32_t			reg_val_offs;
>  };
>  
>  /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index f61664ee4940..a3d88f2aa9f4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>  	void (*end_use)(struct amdgpu_ring *ring);
>  	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>  	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
> +			  uint64_t reg_val_offs);
>  	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>  	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>  			      uint32_t val, uint32_t mask);
> @@ -265,7 +266,7 @@ struct amdgpu_ring {
>  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>  #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
> -#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), (o))
>  #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 0a03e2ad5d95..7c9a5e440509 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>  	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>  }
>  
> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				     uint64_t reg_val_offs)
>  {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>  	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>  	amdgpu_ring_write(ring, 0 |	/* src: register*/
>  				(5 << 8) |	/* dst: memory */
>  				(1 << 20));	/* write confirm */
>  	amdgpu_ring_write(ring, reg);
>  	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>  }
>  
>  static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index fc6c2f2bc76c..8e7eee7838e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>  		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>  }
>  
> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint64_t reg_val_offs)
>  {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>  	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>  	amdgpu_ring_write(ring, 0 |	/* src: register*/
>  				(5 << 8) |	/* dst: memory */
>  				(1 << 20));	/* write confirm */
>  	amdgpu_ring_write(ring, reg);
>  	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>  }
>  
>  static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 84fcf842316d..ff279b1f5c24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>  	uint32_t seq;
>  	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>  	struct amdgpu_ring *ring = &kiq->ring;
> +	uint64_t reg_val_offs = 0;
>  
>  	BUG_ON(!ring->funcs->emit_rreg);
>  
>  	spin_lock_irqsave(&kiq->ring_lock, flags);
>  	amdgpu_ring_alloc(ring, 32);
> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>  	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>  	amdgpu_ring_write(ring, 9 |	/* src: register*/
>  				(5 << 8) |	/* dst: memory */
> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>  				(1 << 20));	/* write confirm */
>  	amdgpu_ring_write(ring, 0);
>  	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>  	amdgpu_fence_emit_polling(ring, &seq);
>  	amdgpu_ring_commit(ring);
>  	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> @@ -4088,8 +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>  	if (cnt > MAX_KIQ_REG_TRY)
>  		goto failed_kiq_read;
>  
> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
> +	return (uint64_t)ring->ring[reg_val_offs] |
> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>  
>  failed_kiq_read:
>  	pr_err("failed to read gpu clock\n");
> @@ -5482,21 +5484,19 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>  		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>  }
>  
> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint64_t reg_val_offs)
>  {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>  	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>  	amdgpu_ring_write(ring, 0 |	/* src: register*/
>  				(5 << 8) |	/* dst: memory */
>  				(1 << 20));	/* write confirm */
>  	amdgpu_ring_write(ring, reg);
>  	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>  }
>  
>  static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-17 15:39 ` Felix Kuehling
@ 2020-04-19 17:03   ` Christian König
  2020-04-20  6:20     ` Liu, Monk
  2020-04-20  4:16   ` Tao, Yintian
  1 sibling, 1 reply; 25+ messages in thread
From: Christian König @ 2020-04-19 17:03 UTC (permalink / raw)
  To: Felix Kuehling, Yintian Tao, monk.liu; +Cc: amd-gfx

Am 17.04.20 um 17:39 schrieb Felix Kuehling:
> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>> According to the current kiq read register method,
>> there will be race condition when using KIQ to read
>> register if multiple clients want to read at same time
>> just like the expample below:
>> 1. client-A start to read REG-0 throguh KIQ
>> 2. client-A poll the seqno-0
>> 3. client-B start to read REG-1 through KIQ
>> 4. client-B poll the seqno-1
>> 5. the kiq complete these two read operation
>> 6. client-A to read the register at the wb buffer and
>>     get REG-1 value
>>
>> Therefore, directly make kiq write the register value at
>> the ring buffer then there will be no race condition for
>> the wb buffer.
>>
>> v2: supply the read_clock and move the reg_val_offs back
>>
>> Signed-off-by: Yintian Tao <yttao@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>   6 files changed, 33 insertions(+), 40 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index ea576b4260a4..4e1c0239e561 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>   
>>   	spin_lock_init(&kiq->ring_lock);
>>   
>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>> -	if (r)
>> -		return r;
>> -
>>   	ring->adev = NULL;
>>   	ring->ring_obj = NULL;
>>   	ring->use_doorbell = true;
>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>   
>>   void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>   {
>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>   	amdgpu_ring_fini(ring);
>>   }
>>   
>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>   	uint32_t seq;
>>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>   	struct amdgpu_ring *ring = &kiq->ring;
>> +	uint64_t reg_val_offs = 0;
>>   
>>   	BUG_ON(!ring->funcs->emit_rreg);
>>   
>>   	spin_lock_irqsave(&kiq->ring_lock, flags);
>>   	amdgpu_ring_alloc(ring, 32);
>> -	amdgpu_ring_emit_rreg(ring, reg);
>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
> I think that should be (ring->wptr + 30) & ring->buf_mask. Otherwise the
> reg_val_offset can be past the end of the ring.
>
> But that still leaves a problem if another command is submitted to the
> KIQ before you read the returned reg_val from the ring. Your reg_val can
> be overwritten by the new command and you get the wrong result. Or the
> command can be overwritten with the reg_val, which will most likely hang
> the CP.
>
> You could allocate space on the KIQ ring with a NOP command to prevent
> that space from being overwritten by other commands.

Well I was under the assumption that this is actually what is done here. 
If that is not the case the patch is a rather clear NAK.

Regards,
Christian.

>
> Regards,
>    Felix
>
>
>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>   	amdgpu_fence_emit_polling(ring, &seq);
>>   	amdgpu_ring_commit(ring);
>>   	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> @@ -707,7 +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>   	if (cnt > MAX_KIQ_REG_TRY)
>>   		goto failed_kiq_read;
>>   
>> -	return adev->wb.wb[kiq->reg_val_offs];
>> +	return ring->ring[reg_val_offs];
>>   
>>   failed_kiq_read:
>>   	pr_err("failed to read reg:%x\n", reg);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index 634746829024..ee698f0246d8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>   	struct amdgpu_ring	ring;
>>   	struct amdgpu_irq_src	irq;
>>   	const struct kiq_pm4_funcs *pmf;
>> -	uint32_t			reg_val_offs;
>>   };
>>   
>>   /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index f61664ee4940..a3d88f2aa9f4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>   	void (*end_use)(struct amdgpu_ring *ring);
>>   	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>   	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>> +			  uint64_t reg_val_offs);
>>   	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>   	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>   			      uint32_t val, uint32_t mask);
>> @@ -265,7 +266,7 @@ struct amdgpu_ring {
>>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>   #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>   #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
>> -#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), (o))
>>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>   #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>   #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 0a03e2ad5d95..7c9a5e440509 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>   	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>   }
>>   
>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				     uint64_t reg_val_offs)
>>   {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>   				(5 << 8) |	/* dst: memory */
>>   				(1 << 20));	/* write confirm */
>>   	amdgpu_ring_write(ring, reg);
>>   	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>   }
>>   
>>   static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index fc6c2f2bc76c..8e7eee7838e0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>   		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>   }
>>   
>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint64_t reg_val_offs)
>>   {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>   				(5 << 8) |	/* dst: memory */
>>   				(1 << 20));	/* write confirm */
>>   	amdgpu_ring_write(ring, reg);
>>   	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>   }
>>   
>>   static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 84fcf842316d..ff279b1f5c24 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>   	uint32_t seq;
>>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>   	struct amdgpu_ring *ring = &kiq->ring;
>> +	uint64_t reg_val_offs = 0;
>>   
>>   	BUG_ON(!ring->funcs->emit_rreg);
>>   
>>   	spin_lock_irqsave(&kiq->ring_lock, flags);
>>   	amdgpu_ring_alloc(ring, 32);
>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>   	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>   				(5 << 8) |	/* dst: memory */
>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>   				(1 << 20));	/* write confirm */
>>   	amdgpu_ring_write(ring, 0);
>>   	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>   	amdgpu_fence_emit_polling(ring, &seq);
>>   	amdgpu_ring_commit(ring);
>>   	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> @@ -4088,8 +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>   	if (cnt > MAX_KIQ_REG_TRY)
>>   		goto failed_kiq_read;
>>   
>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>> +	return (uint64_t)ring->ring[reg_val_offs] |
>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>   
>>   failed_kiq_read:
>>   	pr_err("failed to read gpu clock\n");
>> @@ -5482,21 +5484,19 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>   		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>   }
>>   
>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint64_t reg_val_offs)
>>   {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>   				(5 << 8) |	/* dst: memory */
>>   				(1 << 20));	/* write confirm */
>>   	amdgpu_ring_write(ring, reg);
>>   	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>   }
>>   
>>   static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-17 15:39 ` Felix Kuehling
  2020-04-19 17:03   ` Christian König
@ 2020-04-20  4:16   ` Tao, Yintian
  1 sibling, 0 replies; 25+ messages in thread
From: Tao, Yintian @ 2020-04-20  4:16 UTC (permalink / raw)
  To: Kuehling, Felix, Liu, Monk; +Cc: amd-gfx

Hi Felix

Many thanks for your review. I have modified it according to your comments and suggestion.

Best Regards
Yintian Tao

-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling@amd.com> 
Sent: 2020年4月17日 23:39
To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
> According to the current kiq read register method, there will be race 
> condition when using KIQ to read register if multiple clients want to 
> read at same time just like the expample below:
> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the 
> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll 
> the seqno-1 5. the kiq complete these two read operation 6. client-A 
> to read the register at the wb buffer and
>    get REG-1 value
>
> Therefore, directly make kiq write the register value at the ring 
> buffer then there will be no race condition for the wb buffer.
>
> v2: supply the read_clock and move the reg_val_offs back
>
> Signed-off-by: Yintian Tao <yttao@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------  
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -  
> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>  6 files changed, 33 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ea576b4260a4..4e1c0239e561 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device 
> *adev,
>  
>  	spin_lock_init(&kiq->ring_lock);
>  
> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
> -	if (r)
> -		return r;
> -
>  	ring->adev = NULL;
>  	ring->ring_obj = NULL;
>  	ring->use_doorbell = true;
> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device 
> *adev,
>  
>  void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>  	amdgpu_ring_fini(ring);
>  }
>  
> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>  	uint32_t seq;
>  	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>  	struct amdgpu_ring *ring = &kiq->ring;
> +	uint64_t reg_val_offs = 0;
>  
>  	BUG_ON(!ring->funcs->emit_rreg);
>  
>  	spin_lock_irqsave(&kiq->ring_lock, flags);
>  	amdgpu_ring_alloc(ring, 32);
> -	amdgpu_ring_emit_rreg(ring, reg);
> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;

I think that should be (ring->wptr + 30) & ring->buf_mask. Otherwise the reg_val_offset can be past the end of the ring.

But that still leaves a problem if another command is submitted to the KIQ before you read the returned reg_val from the ring. Your reg_val can be overwritten by the new command and you get the wrong result. Or the command can be overwritten with the reg_val, which will most likely hang the CP.

You could allocate space on the KIQ ring with a NOP command to prevent that space from being overwritten by other commands.

Regards,
  Felix


> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>  	amdgpu_fence_emit_polling(ring, &seq);
>  	amdgpu_ring_commit(ring);
>  	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7 @@ 
> uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>  	if (cnt > MAX_KIQ_REG_TRY)
>  		goto failed_kiq_read;
>  
> -	return adev->wb.wb[kiq->reg_val_offs];
> +	return ring->ring[reg_val_offs];
>  
>  failed_kiq_read:
>  	pr_err("failed to read reg:%x\n", reg); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 634746829024..ee698f0246d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>  	struct amdgpu_ring	ring;
>  	struct amdgpu_irq_src	irq;
>  	const struct kiq_pm4_funcs *pmf;
> -	uint32_t			reg_val_offs;
>  };
>  
>  /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index f61664ee4940..a3d88f2aa9f4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>  	void (*end_use)(struct amdgpu_ring *ring);
>  	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>  	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
> +			  uint64_t reg_val_offs);
>  	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>  	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>  			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct 
> amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) 
> (r)->funcs->emit_hdp_flush((r))  #define 
> amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>  #define amdgpu_ring_emit_cntxcntl(r, d) 
> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r, 
> d) (r)->funcs->emit_rreg((r), (d))
> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
> +(d), (o))
>  #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), 
> (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) 
> (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define 
> amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff 
> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 0a03e2ad5d95..7c9a5e440509 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>  	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>  
> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				     uint64_t reg_val_offs)
>  {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>  	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>  	amdgpu_ring_write(ring, 0 |	/* src: register*/
>  				(5 << 8) |	/* dst: memory */
>  				(1 << 20));	/* write confirm */
>  	amdgpu_ring_write(ring, reg);
>  	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>  }
>  
>  static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index fc6c2f2bc76c..8e7eee7838e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>  		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>  
> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint64_t reg_val_offs)
>  {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>  	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>  	amdgpu_ring_write(ring, 0 |	/* src: register*/
>  				(5 << 8) |	/* dst: memory */
>  				(1 << 20));	/* write confirm */
>  	amdgpu_ring_write(ring, reg);
>  	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>  }
>  
>  static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 84fcf842316d..ff279b1f5c24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>  	uint32_t seq;
>  	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>  	struct amdgpu_ring *ring = &kiq->ring;
> +	uint64_t reg_val_offs = 0;
>  
>  	BUG_ON(!ring->funcs->emit_rreg);
>  
>  	spin_lock_irqsave(&kiq->ring_lock, flags);
>  	amdgpu_ring_alloc(ring, 32);
> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>  	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>  	amdgpu_ring_write(ring, 9 |	/* src: register*/
>  				(5 << 8) |	/* dst: memory */
> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>  				(1 << 20));	/* write confirm */
>  	amdgpu_ring_write(ring, 0);
>  	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>  	amdgpu_fence_emit_polling(ring, &seq);
>  	amdgpu_ring_commit(ring);
>  	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8 +4090,8 
> @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>  	if (cnt > MAX_KIQ_REG_TRY)
>  		goto failed_kiq_read;
>  
> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
> +	return (uint64_t)ring->ring[reg_val_offs] |
> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>  
>  failed_kiq_read:
>  	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@ static 
> void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>  		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>  
> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint64_t reg_val_offs)
>  {
> -	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -
>  	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>  	amdgpu_ring_write(ring, 0 |	/* src: register*/
>  				(5 << 8) |	/* dst: memory */
>  				(1 << 20));	/* write confirm */
>  	amdgpu_ring_write(ring, reg);
>  	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
> +					      reg_val_offs * 4));
>  }
>  
>  static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-19 17:03   ` Christian König
@ 2020-04-20  6:20     ` Liu, Monk
  2020-04-20  7:19       ` Christian König
  0 siblings, 1 reply; 25+ messages in thread
From: Liu, Monk @ 2020-04-20  6:20 UTC (permalink / raw)
  To: Koenig, Christian, Kuehling, Felix, Tao, Yintian; +Cc: amd-gfx

Christian

>>> Well I was under the assumption that this is actually what is done here. 
If that is not the case the patch is a rather clear NAK.
<<<

There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...

The first problem is :
During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
This is the issue yintian's patch to address, by put the readback value not in a shared WB but in a chunk DW of command submit

The second problem is:
Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ is busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the
Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.

The optional way is:
1) We use GPU scheduler to manage KIQ activity, and all jobs are submitted  to KIQ through a IB, thus no overwritten will happen
2) we still skip gpu scheduler but always use IB to put jobs on KIQ, thus each JOB will occupy the fixed space/DW of RB, so we can avoid overwrite unfinished command

We can discuss the second problem later

Can we first get the first problem done ? thanks 


_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Monday, April 20, 2020 1:03 AM
To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

Am 17.04.20 um 17:39 schrieb Felix Kuehling:
> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>> According to the current kiq read register method, there will be race 
>> condition when using KIQ to read register if multiple clients want to 
>> read at same time just like the expample below:
>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the 
>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll 
>> the seqno-1 5. the kiq complete these two read operation 6. client-A 
>> to read the register at the wb buffer and
>>     get REG-1 value
>>
>> Therefore, directly make kiq write the register value at the ring 
>> buffer then there will be no race condition for the wb buffer.
>>
>> v2: supply the read_clock and move the reg_val_offs back
>>
>> Signed-off-by: Yintian Tao <yttao@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>   6 files changed, 33 insertions(+), 40 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index ea576b4260a4..4e1c0239e561 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>> amdgpu_device *adev,
>>   
>>   	spin_lock_init(&kiq->ring_lock);
>>   
>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>> -	if (r)
>> -		return r;
>> -
>>   	ring->adev = NULL;
>>   	ring->ring_obj = NULL;
>>   	ring->use_doorbell = true;
>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device 
>> *adev,
>>   
>>   void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>   {
>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>   	amdgpu_ring_fini(ring);
>>   }
>>   
>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>   	uint32_t seq;
>>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>   	struct amdgpu_ring *ring = &kiq->ring;
>> +	uint64_t reg_val_offs = 0;
>>   
>>   	BUG_ON(!ring->funcs->emit_rreg);
>>   
>>   	spin_lock_irqsave(&kiq->ring_lock, flags);
>>   	amdgpu_ring_alloc(ring, 32);
>> -	amdgpu_ring_emit_rreg(ring, reg);
>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
> I think that should be (ring->wptr + 30) & ring->buf_mask. Otherwise 
> the reg_val_offset can be past the end of the ring.
>
> But that still leaves a problem if another command is submitted to the 
> KIQ before you read the returned reg_val from the ring. Your reg_val 
> can be overwritten by the new command and you get the wrong result. Or 
> the command can be overwritten with the reg_val, which will most 
> likely hang the CP.
>
> You could allocate space on the KIQ ring with a NOP command to prevent 
> that space from being overwritten by other commands.

Well I was under the assumption that this is actually what is done here. 
If that is not the case the patch is a rather clear NAK.

Regards,
Christian.

>
> Regards,
>    Felix
>
>
>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>   	amdgpu_fence_emit_polling(ring, &seq);
>>   	amdgpu_ring_commit(ring);
>>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7 
>> @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>   	if (cnt > MAX_KIQ_REG_TRY)
>>   		goto failed_kiq_read;
>>   
>> -	return adev->wb.wb[kiq->reg_val_offs];
>> +	return ring->ring[reg_val_offs];
>>   
>>   failed_kiq_read:
>>   	pr_err("failed to read reg:%x\n", reg); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index 634746829024..ee698f0246d8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>   	struct amdgpu_ring	ring;
>>   	struct amdgpu_irq_src	irq;
>>   	const struct kiq_pm4_funcs *pmf;
>> -	uint32_t			reg_val_offs;
>>   };
>>   
>>   /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index f61664ee4940..a3d88f2aa9f4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>   	void (*end_use)(struct amdgpu_ring *ring);
>>   	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>   	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>> +			  uint64_t reg_val_offs);
>>   	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>   	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>   			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct 
>> amdgpu_ring {
>>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>   #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>   #define amdgpu_ring_emit_cntxcntl(r, d) 
>> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r, 
>> d) (r)->funcs->emit_rreg((r), (d))
>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
>> +(d), (o))
>>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>   #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>   #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff 
>> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 0a03e2ad5d95..7c9a5e440509 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>   	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>   }
>>   
>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				     uint64_t reg_val_offs)
>>   {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>   				(5 << 8) |	/* dst: memory */
>>   				(1 << 20));	/* write confirm */
>>   	amdgpu_ring_write(ring, reg);
>>   	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>   }
>>   
>>   static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index fc6c2f2bc76c..8e7eee7838e0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>   		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>   }
>>   
>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint64_t reg_val_offs)
>>   {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>   				(5 << 8) |	/* dst: memory */
>>   				(1 << 20));	/* write confirm */
>>   	amdgpu_ring_write(ring, reg);
>>   	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>   }
>>   
>>   static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 84fcf842316d..ff279b1f5c24 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>   	uint32_t seq;
>>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>   	struct amdgpu_ring *ring = &kiq->ring;
>> +	uint64_t reg_val_offs = 0;
>>   
>>   	BUG_ON(!ring->funcs->emit_rreg);
>>   
>>   	spin_lock_irqsave(&kiq->ring_lock, flags);
>>   	amdgpu_ring_alloc(ring, 32);
>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>   	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>   				(5 << 8) |	/* dst: memory */
>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>   				(1 << 20));	/* write confirm */
>>   	amdgpu_ring_write(ring, 0);
>>   	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>   	amdgpu_fence_emit_polling(ring, &seq);
>>   	amdgpu_ring_commit(ring);
>>   	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8 +4090,8 
>> @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>   	if (cnt > MAX_KIQ_REG_TRY)
>>   		goto failed_kiq_read;
>>   
>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>> +	return (uint64_t)ring->ring[reg_val_offs] |
>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>   
>>   failed_kiq_read:
>>   	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@ 
>> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>   		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>   }
>>   
>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint64_t reg_val_offs)
>>   {
>> -	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -
>>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>   				(5 << 8) |	/* dst: memory */
>>   				(1 << 20));	/* write confirm */
>>   	amdgpu_ring_write(ring, reg);
>>   	amdgpu_ring_write(ring, 0);
>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>> +					      reg_val_offs * 4));
>>   }
>>   
>>   static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg,
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cmo
> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe4884e60
> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfUFXGJ
> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  6:20     ` Liu, Monk
@ 2020-04-20  7:19       ` Christian König
  2020-04-20  7:37         ` Liu, Monk
  2020-04-20  7:42         ` Tao, Yintian
  0 siblings, 2 replies; 25+ messages in thread
From: Christian König @ 2020-04-20  7:19 UTC (permalink / raw)
  To: Liu, Monk, Kuehling, Felix, Tao, Yintian; +Cc: amd-gfx

Hi Monk,

> Can we first get the first problem done ?

Please absolutely not! See the problem introduced here is quite worse 
than the actual fix.

Previously we ended up with an invalid value in a concurrent register 
read, now the KIQs overwrites its own commands and most likely causes a 
hang or the hardware to execute something random.

Instead of this crude hack please let us just allocate a fixed number of 
write back slots and use them round robin. Then we can make sure that we 
don't have more than a fixed number of reads in flight at the same time 
as well using the fence values.

This should fix both problems at the same time and not introduce another 
potential problematic hack.

If this patch is already committed please revert it immediately.

Regards,
Christian.

Am 20.04.20 um 08:20 schrieb Liu, Monk:
> Christian
>
>>>> Well I was under the assumption that this is actually what is done here.
> If that is not the case the patch is a rather clear NAK.
> <<<
>
> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>
> The first problem is :
> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
> This is the issue yintian's patch to address, by put the readback value not in a shared WB but in a chunk DW of command submit
>
> The second problem is:
> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ is busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the
> Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>
> The optional way is:
> 1) We use GPU scheduler to manage KIQ activity, and all jobs are submitted  to KIQ through a IB, thus no overwritten will happen
> 2) we still skip gpu scheduler but always use IB to put jobs on KIQ, thus each JOB will occupy the fixed space/DW of RB, so we can avoid overwrite unfinished command
>
> We can discuss the second problem later
>
> Can we first get the first problem done ? thanks
>
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, April 20, 2020 1:03 AM
> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>> According to the current kiq read register method, there will be race
>>> condition when using KIQ to read register if multiple clients want to
>>> read at same time just like the expample below:
>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll
>>> the seqno-1 5. the kiq complete these two read operation 6. client-A
>>> to read the register at the wb buffer and
>>>      get REG-1 value
>>>
>>> Therefore, directly make kiq write the register value at the ring
>>> buffer then there will be no race condition for the wb buffer.
>>>
>>> v2: supply the read_clock and move the reg_val_offs back
>>>
>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>    6 files changed, 33 insertions(+), 40 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> index ea576b4260a4..4e1c0239e561 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct
>>> amdgpu_device *adev,
>>>    
>>>    	spin_lock_init(&kiq->ring_lock);
>>>    
>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>> -	if (r)
>>> -		return r;
>>> -
>>>    	ring->adev = NULL;
>>>    	ring->ring_obj = NULL;
>>>    	ring->use_doorbell = true;
>>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device
>>> *adev,
>>>    
>>>    void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>    {
>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>    	amdgpu_ring_fini(ring);
>>>    }
>>>    
>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>    	uint32_t seq;
>>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>    	struct amdgpu_ring *ring = &kiq->ring;
>>> +	uint64_t reg_val_offs = 0;
>>>    
>>>    	BUG_ON(!ring->funcs->emit_rreg);
>>>    
>>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>    	amdgpu_ring_alloc(ring, 32);
>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>> I think that should be (ring->wptr + 30) & ring->buf_mask. Otherwise
>> the reg_val_offset can be past the end of the ring.
>>
>> But that still leaves a problem if another command is submitted to the
>> KIQ before you read the returned reg_val from the ring. Your reg_val
>> can be overwritten by the new command and you get the wrong result. Or
>> the command can be overwritten with the reg_val, which will most
>> likely hang the CP.
>>
>> You could allocate space on the KIQ ring with a NOP command to prevent
>> that space from being overwritten by other commands.
> Well I was under the assumption that this is actually what is done here.
> If that is not the case the patch is a rather clear NAK.
>
> Regards,
> Christian.
>
>> Regards,
>>     Felix
>>
>>
>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>    	amdgpu_fence_emit_polling(ring, &seq);
>>>    	amdgpu_ring_commit(ring);
>>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7
>>> @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>    	if (cnt > MAX_KIQ_REG_TRY)
>>>    		goto failed_kiq_read;
>>>    
>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>> +	return ring->ring[reg_val_offs];
>>>    
>>>    failed_kiq_read:
>>>    	pr_err("failed to read reg:%x\n", reg); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> index 634746829024..ee698f0246d8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>    	struct amdgpu_ring	ring;
>>>    	struct amdgpu_irq_src	irq;
>>>    	const struct kiq_pm4_funcs *pmf;
>>> -	uint32_t			reg_val_offs;
>>>    };
>>>    
>>>    /*
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index f61664ee4940..a3d88f2aa9f4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>    	void (*end_use)(struct amdgpu_ring *ring);
>>>    	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>    	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>> +			  uint64_t reg_val_offs);
>>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>    			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct
>>> amdgpu_ring {
>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>    #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>    #define amdgpu_ring_emit_cntxcntl(r, d)
>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r,
>>> d) (r)->funcs->emit_rreg((r), (d))
>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r),
>>> +(d), (o))
>>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>    #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>    #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m)
>>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff
>>> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>    	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>    }
>>>    
>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>> uint32_t reg)
>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				     uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>    		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>    }
>>>    
>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>> uint32_t reg)
>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 84fcf842316d..ff279b1f5c24 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>    	uint32_t seq;
>>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>    	struct amdgpu_ring *ring = &kiq->ring;
>>> +	uint64_t reg_val_offs = 0;
>>>    
>>>    	BUG_ON(!ring->funcs->emit_rreg);
>>>    
>>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>    	amdgpu_ring_alloc(ring, 32);
>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, 0);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    	amdgpu_fence_emit_polling(ring, &seq);
>>>    	amdgpu_ring_commit(ring);
>>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8 +4090,8
>>> @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>    	if (cnt > MAX_KIQ_REG_TRY)
>>>    		goto failed_kiq_read;
>>>    
>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>    
>>>    failed_kiq_read:
>>>    	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@
>>> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>    		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>    }
>>>    
>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>> uint32_t reg)
>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>> uint32_t reg,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cmo
>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe4884e60
>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfUFXGJ
>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  7:19       ` Christian König
@ 2020-04-20  7:37         ` Liu, Monk
  2020-04-20  7:39           ` Liu, Monk
  2020-04-20  7:42         ` Tao, Yintian
  1 sibling, 1 reply; 25+ messages in thread
From: Liu, Monk @ 2020-04-20  7:37 UTC (permalink / raw)
  To: Koenig, Christian, Kuehling, Felix, Tao, Yintian; +Cc: amd-gfx

>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.

Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?



_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Monday, April 20, 2020 3:19 PM
To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

Hi Monk,

> Can we first get the first problem done ?

Please absolutely not! See the problem introduced here is quite worse than the actual fix.

Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.

Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.

This should fix both problems at the same time and not introduce another potential problematic hack.

If this patch is already committed please revert it immediately.

Regards,
Christian.

Am 20.04.20 um 08:20 schrieb Liu, Monk:
> Christian
>
>>>> Well I was under the assumption that this is actually what is done here.
> If that is not the case the patch is a rather clear NAK.
> <<<
>
> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>
> The first problem is :
> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
> This is the issue yintian's patch to address, by put the readback 
> value not in a shared WB but in a chunk DW of command submit
>
> The second problem is:
> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ is 
> busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>
> The optional way is:
> 1) We use GPU scheduler to manage KIQ activity, and all jobs are 
> submitted  to KIQ through a IB, thus no overwritten will happen
> 2) we still skip gpu scheduler but always use IB to put jobs on KIQ, 
> thus each JOB will occupy the fixed space/DW of RB, so we can avoid 
> overwrite unfinished command
>
> We can discuss the second problem later
>
> Can we first get the first problem done ? thanks
>
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, April 20, 2020 1:03 AM
> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian 
> <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>> According to the current kiq read register method, there will be 
>>> race condition when using KIQ to read register if multiple clients 
>>> want to read at same time just like the expample below:
>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll 
>>> the seqno-1 5. the kiq complete these two read operation 6. client-A 
>>> to read the register at the wb buffer and
>>>      get REG-1 value
>>>
>>> Therefore, directly make kiq write the register value at the ring 
>>> buffer then there will be no race condition for the wb buffer.
>>>
>>> v2: supply the read_clock and move the reg_val_offs back
>>>
>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>    6 files changed, 33 insertions(+), 40 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> index ea576b4260a4..4e1c0239e561 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>> amdgpu_device *adev,
>>>    
>>>    	spin_lock_init(&kiq->ring_lock);
>>>    
>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>> -	if (r)
>>> -		return r;
>>> -
>>>    	ring->adev = NULL;
>>>    	ring->ring_obj = NULL;
>>>    	ring->use_doorbell = true;
>>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>> amdgpu_device *adev,
>>>    
>>>    void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>    {
>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>    	amdgpu_ring_fini(ring);
>>>    }
>>>    
>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>    	uint32_t seq;
>>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>    	struct amdgpu_ring *ring = &kiq->ring;
>>> +	uint64_t reg_val_offs = 0;
>>>    
>>>    	BUG_ON(!ring->funcs->emit_rreg);
>>>    
>>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>    	amdgpu_ring_alloc(ring, 32);
>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>> I think that should be (ring->wptr + 30) & ring->buf_mask. Otherwise 
>> the reg_val_offset can be past the end of the ring.
>>
>> But that still leaves a problem if another command is submitted to 
>> the KIQ before you read the returned reg_val from the ring. Your 
>> reg_val can be overwritten by the new command and you get the wrong 
>> result. Or the command can be overwritten with the reg_val, which 
>> will most likely hang the CP.
>>
>> You could allocate space on the KIQ ring with a NOP command to 
>> prevent that space from being overwritten by other commands.
> Well I was under the assumption that this is actually what is done here.
> If that is not the case the patch is a rather clear NAK.
>
> Regards,
> Christian.
>
>> Regards,
>>     Felix
>>
>>
>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>    	amdgpu_fence_emit_polling(ring, &seq);
>>>    	amdgpu_ring_commit(ring);
>>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7 
>>> @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>    	if (cnt > MAX_KIQ_REG_TRY)
>>>    		goto failed_kiq_read;
>>>    
>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>> +	return ring->ring[reg_val_offs];
>>>    
>>>    failed_kiq_read:
>>>    	pr_err("failed to read reg:%x\n", reg); diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> index 634746829024..ee698f0246d8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>    	struct amdgpu_ring	ring;
>>>    	struct amdgpu_irq_src	irq;
>>>    	const struct kiq_pm4_funcs *pmf;
>>> -	uint32_t			reg_val_offs;
>>>    };
>>>    
>>>    /*
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index f61664ee4940..a3d88f2aa9f4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>    	void (*end_use)(struct amdgpu_ring *ring);
>>>    	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>    	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>> +			  uint64_t reg_val_offs);
>>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>    			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct 
>>> amdgpu_ring {
>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>    #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>    #define amdgpu_ring_emit_cntxcntl(r, d) 
>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define 
>>> amdgpu_ring_emit_rreg(r,
>>> d) (r)->funcs->emit_rreg((r), (d))
>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
>>> +(d), (o))
>>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>    #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>    #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
>>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff 
>>> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>    	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>    }
>>>    
>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				     uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>    		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>    }
>>>    
>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 84fcf842316d..ff279b1f5c24 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>    	uint32_t seq;
>>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>    	struct amdgpu_ring *ring = &kiq->ring;
>>> +	uint64_t reg_val_offs = 0;
>>>    
>>>    	BUG_ON(!ring->funcs->emit_rreg);
>>>    
>>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>    	amdgpu_ring_alloc(ring, 32);
>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, 0);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    	amdgpu_fence_emit_polling(ring, &seq);
>>>    	amdgpu_ring_commit(ring);
>>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8 
>>> +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>    	if (cnt > MAX_KIQ_REG_TRY)
>>>    		goto failed_kiq_read;
>>>    
>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>    
>>>    failed_kiq_read:
>>>    	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@ 
>>> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>    		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>    }
>>>    
>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>> t 
>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cm
>> o
>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe4884e6
>> 0 
>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfUFXG
>> J
>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  7:37         ` Liu, Monk
@ 2020-04-20  7:39           ` Liu, Monk
  2020-04-20  8:16             ` Christian König
  0 siblings, 1 reply; 25+ messages in thread
From: Liu, Monk @ 2020-04-20  7:39 UTC (permalink / raw)
  To: Koenig, Christian, Kuehling, Felix, Tao, Yintian; +Cc: amd-gfx

>>> Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin

It looks doable but really ugly compared with current patch ... and more over there we are going to fix the second problem eventually

What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Liu, Monk 
Sent: Monday, April 20, 2020 3:37 PM
To: Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amdgpu: refine kiq read register

>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.

Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?



_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com>
Sent: Monday, April 20, 2020 3:19 PM
To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

Hi Monk,

> Can we first get the first problem done ?

Please absolutely not! See the problem introduced here is quite worse than the actual fix.

Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.

Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.

This should fix both problems at the same time and not introduce another potential problematic hack.

If this patch is already committed please revert it immediately.

Regards,
Christian.

Am 20.04.20 um 08:20 schrieb Liu, Monk:
> Christian
>
>>>> Well I was under the assumption that this is actually what is done here.
> If that is not the case the patch is a rather clear NAK.
> <<<
>
> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>
> The first problem is :
> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
> This is the issue yintian's patch to address, by put the readback 
> value not in a shared WB but in a chunk DW of command submit
>
> The second problem is:
> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ is 
> busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>
> The optional way is:
> 1) We use GPU scheduler to manage KIQ activity, and all jobs are 
> submitted  to KIQ through a IB, thus no overwritten will happen
> 2) we still skip gpu scheduler but always use IB to put jobs on KIQ, 
> thus each JOB will occupy the fixed space/DW of RB, so we can avoid 
> overwrite unfinished command
>
> We can discuss the second problem later
>
> Can we first get the first problem done ? thanks
>
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, April 20, 2020 1:03 AM
> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian 
> <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>> According to the current kiq read register method, there will be 
>>> race condition when using KIQ to read register if multiple clients 
>>> want to read at same time just like the expample below:
>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll 
>>> the seqno-1 5. the kiq complete these two read operation 6. client-A 
>>> to read the register at the wb buffer and
>>>      get REG-1 value
>>>
>>> Therefore, directly make kiq write the register value at the ring 
>>> buffer then there will be no race condition for the wb buffer.
>>>
>>> v2: supply the read_clock and move the reg_val_offs back
>>>
>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>    6 files changed, 33 insertions(+), 40 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> index ea576b4260a4..4e1c0239e561 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>> amdgpu_device *adev,
>>>    
>>>    	spin_lock_init(&kiq->ring_lock);
>>>    
>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>> -	if (r)
>>> -		return r;
>>> -
>>>    	ring->adev = NULL;
>>>    	ring->ring_obj = NULL;
>>>    	ring->use_doorbell = true;
>>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>> amdgpu_device *adev,
>>>    
>>>    void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>    {
>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>    	amdgpu_ring_fini(ring);
>>>    }
>>>    
>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>    	uint32_t seq;
>>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>    	struct amdgpu_ring *ring = &kiq->ring;
>>> +	uint64_t reg_val_offs = 0;
>>>    
>>>    	BUG_ON(!ring->funcs->emit_rreg);
>>>    
>>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>    	amdgpu_ring_alloc(ring, 32);
>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>> I think that should be (ring->wptr + 30) & ring->buf_mask. Otherwise 
>> the reg_val_offset can be past the end of the ring.
>>
>> But that still leaves a problem if another command is submitted to 
>> the KIQ before you read the returned reg_val from the ring. Your 
>> reg_val can be overwritten by the new command and you get the wrong 
>> result. Or the command can be overwritten with the reg_val, which 
>> will most likely hang the CP.
>>
>> You could allocate space on the KIQ ring with a NOP command to 
>> prevent that space from being overwritten by other commands.
> Well I was under the assumption that this is actually what is done here.
> If that is not the case the patch is a rather clear NAK.
>
> Regards,
> Christian.
>
>> Regards,
>>     Felix
>>
>>
>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>    	amdgpu_fence_emit_polling(ring, &seq);
>>>    	amdgpu_ring_commit(ring);
>>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7 
>>> @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>    	if (cnt > MAX_KIQ_REG_TRY)
>>>    		goto failed_kiq_read;
>>>    
>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>> +	return ring->ring[reg_val_offs];
>>>    
>>>    failed_kiq_read:
>>>    	pr_err("failed to read reg:%x\n", reg); diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> index 634746829024..ee698f0246d8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>    	struct amdgpu_ring	ring;
>>>    	struct amdgpu_irq_src	irq;
>>>    	const struct kiq_pm4_funcs *pmf;
>>> -	uint32_t			reg_val_offs;
>>>    };
>>>    
>>>    /*
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index f61664ee4940..a3d88f2aa9f4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>    	void (*end_use)(struct amdgpu_ring *ring);
>>>    	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>    	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>> +			  uint64_t reg_val_offs);
>>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>    			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct 
>>> amdgpu_ring {
>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>    #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>    #define amdgpu_ring_emit_cntxcntl(r, d) 
>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define 
>>> amdgpu_ring_emit_rreg(r,
>>> d) (r)->funcs->emit_rreg((r), (d))
>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
>>> +(d), (o))
>>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>    #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>    #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
>>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff 
>>> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>    	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>    }
>>>    
>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				     uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>    		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>    }
>>>    
>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 84fcf842316d..ff279b1f5c24 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>    	uint32_t seq;
>>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>    	struct amdgpu_ring *ring = &kiq->ring;
>>> +	uint64_t reg_val_offs = 0;
>>>    
>>>    	BUG_ON(!ring->funcs->emit_rreg);
>>>    
>>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>    	amdgpu_ring_alloc(ring, 32);
>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, 0);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    	amdgpu_fence_emit_polling(ring, &seq);
>>>    	amdgpu_ring_commit(ring);
>>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8
>>> +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct 
>>> +amdgpu_device *adev)
>>>    	if (cnt > MAX_KIQ_REG_TRY)
>>>    		goto failed_kiq_read;
>>>    
>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>    
>>>    failed_kiq_read:
>>>    	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@ 
>>> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>    		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>    }
>>>    
>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>> t
>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cm
>> o
>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe4884e6
>> 0
>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfUFXG
>> J
>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  7:19       ` Christian König
  2020-04-20  7:37         ` Liu, Monk
@ 2020-04-20  7:42         ` Tao, Yintian
  2020-04-20 16:42           ` Felix Kuehling
  1 sibling, 1 reply; 25+ messages in thread
From: Tao, Yintian @ 2020-04-20  7:42 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Kuehling, Felix; +Cc: amd-gfx

Hi  Christian


This patch has not been merged because it is still under discussion among you, Monk and Felix.

Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.
[yttao]: Yes, the fixed number of write back slots can also fix the problem-1 which Monk described but it still can't fix the problem-2. But it seems the fixed number solution can fix one potential bug raised by msleep() when kiq read register.
		Because currently there is no protect mechanism for KIQ ring submission. Now, there are 5 submitter which can infinitely write kiq ring buffer without any limitation.
1. kiq read/write register
2. amdgpu_vm_flush
3. invalidate tlb
4. kfd hiq_mqd_load


Hi  Felix

I have one question about function kgd_gfx_v9_hiq_mqd_load(). I see it directly write contents into kiq ring and not wait for the fence. Do you know how KFD know the hiq_mqd_load complete? Thanks in advance.



Best Regards
Yintian Tao
-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: 2020年4月20日 15:19
To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

Hi Monk,

> Can we first get the first problem done ?

Please absolutely not! See the problem introduced here is quite worse than the actual fix.

Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.

Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.

This should fix both problems at the same time and not introduce another potential problematic hack.

If this patch is already committed please revert it immediately.

Regards,
Christian.

Am 20.04.20 um 08:20 schrieb Liu, Monk:
> Christian
>
>>>> Well I was under the assumption that this is actually what is done here.
> If that is not the case the patch is a rather clear NAK.
> <<<
>
> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>
> The first problem is :
> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
> This is the issue yintian's patch to address, by put the readback 
> value not in a shared WB but in a chunk DW of command submit
>
> The second problem is:
> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ is 
> busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>
> The optional way is:
> 1) We use GPU scheduler to manage KIQ activity, and all jobs are 
> submitted  to KIQ through a IB, thus no overwritten will happen
> 2) we still skip gpu scheduler but always use IB to put jobs on KIQ, 
> thus each JOB will occupy the fixed space/DW of RB, so we can avoid 
> overwrite unfinished command
>
> We can discuss the second problem later
>
> Can we first get the first problem done ? thanks
>
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, April 20, 2020 1:03 AM
> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian 
> <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>> According to the current kiq read register method, there will be 
>>> race condition when using KIQ to read register if multiple clients 
>>> want to read at same time just like the expample below:
>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll 
>>> the seqno-1 5. the kiq complete these two read operation 6. client-A 
>>> to read the register at the wb buffer and
>>>      get REG-1 value
>>>
>>> Therefore, directly make kiq write the register value at the ring 
>>> buffer then there will be no race condition for the wb buffer.
>>>
>>> v2: supply the read_clock and move the reg_val_offs back
>>>
>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>    6 files changed, 33 insertions(+), 40 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> index ea576b4260a4..4e1c0239e561 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>> amdgpu_device *adev,
>>>    
>>>    	spin_lock_init(&kiq->ring_lock);
>>>    
>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>> -	if (r)
>>> -		return r;
>>> -
>>>    	ring->adev = NULL;
>>>    	ring->ring_obj = NULL;
>>>    	ring->use_doorbell = true;
>>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>> amdgpu_device *adev,
>>>    
>>>    void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>    {
>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>    	amdgpu_ring_fini(ring);
>>>    }
>>>    
>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>    	uint32_t seq;
>>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>    	struct amdgpu_ring *ring = &kiq->ring;
>>> +	uint64_t reg_val_offs = 0;
>>>    
>>>    	BUG_ON(!ring->funcs->emit_rreg);
>>>    
>>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>    	amdgpu_ring_alloc(ring, 32);
>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>> I think that should be (ring->wptr + 30) & ring->buf_mask. Otherwise 
>> the reg_val_offset can be past the end of the ring.
>>
>> But that still leaves a problem if another command is submitted to 
>> the KIQ before you read the returned reg_val from the ring. Your 
>> reg_val can be overwritten by the new command and you get the wrong 
>> result. Or the command can be overwritten with the reg_val, which 
>> will most likely hang the CP.
>>
>> You could allocate space on the KIQ ring with a NOP command to 
>> prevent that space from being overwritten by other commands.
> Well I was under the assumption that this is actually what is done here.
> If that is not the case the patch is a rather clear NAK.
>
> Regards,
> Christian.
>
>> Regards,
>>     Felix
>>
>>
>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>    	amdgpu_fence_emit_polling(ring, &seq);
>>>    	amdgpu_ring_commit(ring);
>>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7 
>>> @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>    	if (cnt > MAX_KIQ_REG_TRY)
>>>    		goto failed_kiq_read;
>>>    
>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>> +	return ring->ring[reg_val_offs];
>>>    
>>>    failed_kiq_read:
>>>    	pr_err("failed to read reg:%x\n", reg); diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> index 634746829024..ee698f0246d8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>    	struct amdgpu_ring	ring;
>>>    	struct amdgpu_irq_src	irq;
>>>    	const struct kiq_pm4_funcs *pmf;
>>> -	uint32_t			reg_val_offs;
>>>    };
>>>    
>>>    /*
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index f61664ee4940..a3d88f2aa9f4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>    	void (*end_use)(struct amdgpu_ring *ring);
>>>    	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>    	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>> +			  uint64_t reg_val_offs);
>>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>    			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct 
>>> amdgpu_ring {
>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>    #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>    #define amdgpu_ring_emit_cntxcntl(r, d) 
>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define 
>>> amdgpu_ring_emit_rreg(r,
>>> d) (r)->funcs->emit_rreg((r), (d))
>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
>>> +(d), (o))
>>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>    #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>    #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
>>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff 
>>> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>    	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>    }
>>>    
>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				     uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>    		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>    }
>>>    
>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 84fcf842316d..ff279b1f5c24 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>    	uint32_t seq;
>>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>    	struct amdgpu_ring *ring = &kiq->ring;
>>> +	uint64_t reg_val_offs = 0;
>>>    
>>>    	BUG_ON(!ring->funcs->emit_rreg);
>>>    
>>>    	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>    	amdgpu_ring_alloc(ring, 32);
>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, 0);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    	amdgpu_fence_emit_polling(ring, &seq);
>>>    	amdgpu_ring_commit(ring);
>>>    	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8 
>>> +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>    	if (cnt > MAX_KIQ_REG_TRY)
>>>    		goto failed_kiq_read;
>>>    
>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>    
>>>    failed_kiq_read:
>>>    	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@ 
>>> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>    		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>    }
>>>    
>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint64_t reg_val_offs)
>>>    {
>>> -	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -
>>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>    				(5 << 8) |	/* dst: memory */
>>>    				(1 << 20));	/* write confirm */
>>>    	amdgpu_ring_write(ring, reg);
>>>    	amdgpu_ring_write(ring, 0);
>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>> +					      reg_val_offs * 4));
>>>    }
>>>    
>>>    static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>> t 
>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cm
>> o
>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe4884e6
>> 0 
>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfUFXG
>> J
>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  7:39           ` Liu, Monk
@ 2020-04-20  8:16             ` Christian König
  2020-04-20  8:20               ` Liu, Monk
  0 siblings, 1 reply; 25+ messages in thread
From: Christian König @ 2020-04-20  8:16 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Kuehling, Felix, Tao, Yintian; +Cc: amd-gfx

> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?

Yintian patch made the situation absolutely worse. Instead of a whole 
ring buffer wrap around to overwrite things (1024 dw at least) you now 
just need to use up 30 dw to trigger undefined behavior and most likely 
a lockup.

And as Felix pointed out the patch even writes over the end of the ring 
buffer and can cause random corruption to whatever there is.

> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?

Yes, please do so. But please make also sure that the original patch is 
reverted before this starts to cause fallout from testers.

Regards,
Christian.

Am 20.04.20 um 09:39 schrieb Liu, Monk:
>>>> Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin
> It looks doable but really ugly compared with current patch ... and more over there we are going to fix the second problem eventually
>
> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Liu, Monk
> Sent: Monday, April 20, 2020 3:37 PM
> To: Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/amdgpu: refine kiq read register
>
>>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>
>
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Monday, April 20, 2020 3:19 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> Hi Monk,
>
>> Can we first get the first problem done ?
> Please absolutely not! See the problem introduced here is quite worse than the actual fix.
>
> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>
> Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.
>
> This should fix both problems at the same time and not introduce another potential problematic hack.
>
> If this patch is already committed please revert it immediately.
>
> Regards,
> Christian.
>
> Am 20.04.20 um 08:20 schrieb Liu, Monk:
>> Christian
>>
>>>>> Well I was under the assumption that this is actually what is done here.
>> If that is not the case the patch is a rather clear NAK.
>> <<<
>>
>> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>>
>> The first problem is :
>> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
>> This is the issue yintian's patch to address, by put the readback
>> value not in a shared WB but in a chunk DW of command submit
>>
>> The second problem is:
>> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ is
>> busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>>
>> The optional way is:
>> 1) We use GPU scheduler to manage KIQ activity, and all jobs are
>> submitted  to KIQ through a IB, thus no overwritten will happen
>> 2) we still skip gpu scheduler but always use IB to put jobs on KIQ,
>> thus each JOB will occupy the fixed space/DW of RB, so we can avoid
>> overwrite unfinished command
>>
>> We can discuss the second problem later
>>
>> Can we first get the first problem done ? thanks
>>
>>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Monday, April 20, 2020 1:03 AM
>> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian
>> <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>
>> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>>> According to the current kiq read register method, there will be
>>>> race condition when using KIQ to read register if multiple clients
>>>> want to read at same time just like the expample below:
>>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll
>>>> the seqno-1 5. the kiq complete these two read operation 6. client-A
>>>> to read the register at the wb buffer and
>>>>       get REG-1 value
>>>>
>>>> Therefore, directly make kiq write the register value at the ring
>>>> buffer then there will be no race condition for the wb buffer.
>>>>
>>>> v2: supply the read_clock and move the reg_val_offs back
>>>>
>>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>>> ---
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>>     6 files changed, 33 insertions(+), 40 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> index ea576b4260a4..4e1c0239e561 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct
>>>> amdgpu_device *adev,
>>>>     
>>>>     	spin_lock_init(&kiq->ring_lock);
>>>>     
>>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>>> -	if (r)
>>>> -		return r;
>>>> -
>>>>     	ring->adev = NULL;
>>>>     	ring->ring_obj = NULL;
>>>>     	ring->use_doorbell = true;
>>>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct
>>>> amdgpu_device *adev,
>>>>     
>>>>     void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>>     {
>>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>>     	amdgpu_ring_fini(ring);
>>>>     }
>>>>     
>>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>     	uint32_t seq;
>>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>     	struct amdgpu_ring *ring = &kiq->ring;
>>>> +	uint64_t reg_val_offs = 0;
>>>>     
>>>>     	BUG_ON(!ring->funcs->emit_rreg);
>>>>     
>>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>     	amdgpu_ring_alloc(ring, 32);
>>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>> I think that should be (ring->wptr + 30) & ring->buf_mask. Otherwise
>>> the reg_val_offset can be past the end of the ring.
>>>
>>> But that still leaves a problem if another command is submitted to
>>> the KIQ before you read the returned reg_val from the ring. Your
>>> reg_val can be overwritten by the new command and you get the wrong
>>> result. Or the command can be overwritten with the reg_val, which
>>> will most likely hang the CP.
>>>
>>> You could allocate space on the KIQ ring with a NOP command to
>>> prevent that space from being overwritten by other commands.
>> Well I was under the assumption that this is actually what is done here.
>> If that is not the case the patch is a rather clear NAK.
>>
>> Regards,
>> Christian.
>>
>>> Regards,
>>>      Felix
>>>
>>>
>>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>>     	amdgpu_ring_commit(ring);
>>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 +704,7
>>>> @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>>     		goto failed_kiq_read;
>>>>     
>>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>>> +	return ring->ring[reg_val_offs];
>>>>     
>>>>     failed_kiq_read:
>>>>     	pr_err("failed to read reg:%x\n", reg); diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> index 634746829024..ee698f0246d8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>>     	struct amdgpu_ring	ring;
>>>>     	struct amdgpu_irq_src	irq;
>>>>     	const struct kiq_pm4_funcs *pmf;
>>>> -	uint32_t			reg_val_offs;
>>>>     };
>>>>     
>>>>     /*
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index f61664ee4940..a3d88f2aa9f4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>>     	void (*end_use)(struct amdgpu_ring *ring);
>>>>     	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>>     	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>>> +			  uint64_t reg_val_offs);
>>>>     	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>>     	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>>     			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct
>>>> amdgpu_ring {
>>>>     #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>>     #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>>     #define amdgpu_ring_emit_cntxcntl(r, d)
>>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define
>>>> amdgpu_ring_emit_rreg(r,
>>>> d) (r)->funcs->emit_rreg((r), (d))
>>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r),
>>>> +(d), (o))
>>>>     #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>>     #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>>     #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m)
>>>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff
>>>> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>>     	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>>     }
>>>>     
>>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>> uint32_t reg)
>>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				     uint64_t reg_val_offs)
>>>>     {
>>>> -	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>> -
>>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>     				(5 << 8) |	/* dst: memory */
>>>>     				(1 << 20));	/* write confirm */
>>>>     	amdgpu_ring_write(ring, reg);
>>>>     	amdgpu_ring_write(ring, 0);
>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>>     }
>>>>     
>>>>     static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>     		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>>     }
>>>>     
>>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>> uint32_t reg)
>>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				    uint64_t reg_val_offs)
>>>>     {
>>>> -	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>> -
>>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>     				(5 << 8) |	/* dst: memory */
>>>>     				(1 << 20));	/* write confirm */
>>>>     	amdgpu_ring_write(ring, reg);
>>>>     	amdgpu_ring_write(ring, 0);
>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>>     }
>>>>     
>>>>     static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 84fcf842316d..ff279b1f5c24 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>     	uint32_t seq;
>>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>     	struct amdgpu_ring *ring = &kiq->ring;
>>>> +	uint64_t reg_val_offs = 0;
>>>>     
>>>>     	BUG_ON(!ring->funcs->emit_rreg);
>>>>     
>>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>     	amdgpu_ring_alloc(ring, 32);
>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>     	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>>     				(5 << 8) |	/* dst: memory */
>>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>     				(1 << 20));	/* write confirm */
>>>>     	amdgpu_ring_write(ring, 0);
>>>>     	amdgpu_ring_write(ring, 0);
>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>>     	amdgpu_ring_commit(ring);
>>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8
>>>> +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct
>>>> +amdgpu_device *adev)
>>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>>     		goto failed_kiq_read;
>>>>     
>>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>>     
>>>>     failed_kiq_read:
>>>>     	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@
>>>> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>     		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>>     }
>>>>     
>>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>> uint32_t reg)
>>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				    uint64_t reg_val_offs)
>>>>     {
>>>> -	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>> -
>>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>     				(5 << 8) |	/* dst: memory */
>>>>     				(1 << 20));	/* write confirm */
>>>>     	amdgpu_ring_write(ring, reg);
>>>>     	amdgpu_ring_write(ring, 0);
>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>>     }
>>>>     
>>>>     static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>>> uint32_t reg,
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>>> t
>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cm
>>> o
>>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe4884e6
>>> 0
>>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfUFXG
>>> J
>>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  8:16             ` Christian König
@ 2020-04-20  8:20               ` Liu, Monk
  2020-04-20  8:25                 ` Christian König
  0 siblings, 1 reply; 25+ messages in thread
From: Liu, Monk @ 2020-04-20  8:20 UTC (permalink / raw)
  To: Koenig, Christian, Kuehling, Felix, Tao, Yintian; +Cc: amd-gfx

Sure, that's fine

Do you have particular suggestion for problem 2 ?  how we avoid  commands being overwritten before it's finished 

_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Monday, April 20, 2020 4:17 PM
To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?

Yintian patch made the situation absolutely worse. Instead of a whole ring buffer wrap around to overwrite things (1024 dw at least) you now just need to use up 30 dw to trigger undefined behavior and most likely a lockup.

And as Felix pointed out the patch even writes over the end of the ring buffer and can cause random corruption to whatever there is.

> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?

Yes, please do so. But please make also sure that the original patch is reverted before this starts to cause fallout from testers.

Regards,
Christian.

Am 20.04.20 um 09:39 schrieb Liu, Monk:
>>>> Instead of this crude hack please let us just allocate a fixed 
>>>> number of write back slots and use them round robin
> It looks doable but really ugly compared with current patch ... and 
> more over there we are going to fix the second problem eventually
>
> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Liu, Monk
> Sent: Monday, April 20, 2020 3:37 PM
> To: Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix 
> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/amdgpu: refine kiq read register
>
>>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>
>
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Monday, April 20, 2020 3:19 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix 
> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> Hi Monk,
>
>> Can we first get the first problem done ?
> Please absolutely not! See the problem introduced here is quite worse than the actual fix.
>
> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>
> Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.
>
> This should fix both problems at the same time and not introduce another potential problematic hack.
>
> If this patch is already committed please revert it immediately.
>
> Regards,
> Christian.
>
> Am 20.04.20 um 08:20 schrieb Liu, Monk:
>> Christian
>>
>>>>> Well I was under the assumption that this is actually what is done here.
>> If that is not the case the patch is a rather clear NAK.
>> <<<
>>
>> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>>
>> The first problem is :
>> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
>> This is the issue yintian's patch to address, by put the readback 
>> value not in a shared WB but in a chunk DW of command submit
>>
>> The second problem is:
>> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ 
>> is busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>>
>> The optional way is:
>> 1) We use GPU scheduler to manage KIQ activity, and all jobs are 
>> submitted  to KIQ through a IB, thus no overwritten will happen
>> 2) we still skip gpu scheduler but always use IB to put jobs on KIQ, 
>> thus each JOB will occupy the fixed space/DW of RB, so we can avoid 
>> overwrite unfinished command
>>
>> We can discuss the second problem later
>>
>> Can we first get the first problem done ? thanks
>>
>>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Monday, April 20, 2020 1:03 AM
>> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian 
>> <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>
>> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>>> According to the current kiq read register method, there will be 
>>>> race condition when using KIQ to read register if multiple clients 
>>>> want to read at same time just like the expample below:
>>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B 
>>>> poll the seqno-1 5. the kiq complete these two read operation 6. 
>>>> client-A to read the register at the wb buffer and
>>>>       get REG-1 value
>>>>
>>>> Therefore, directly make kiq write the register value at the ring 
>>>> buffer then there will be no race condition for the wb buffer.
>>>>
>>>> v2: supply the read_clock and move the reg_val_offs back
>>>>
>>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>>> ---
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>>     6 files changed, 33 insertions(+), 40 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> index ea576b4260a4..4e1c0239e561 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>>> amdgpu_device *adev,
>>>>     
>>>>     	spin_lock_init(&kiq->ring_lock);
>>>>     
>>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>>> -	if (r)
>>>> -		return r;
>>>> -
>>>>     	ring->adev = NULL;
>>>>     	ring->ring_obj = NULL;
>>>>     	ring->use_doorbell = true;
>>>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>>> amdgpu_device *adev,
>>>>     
>>>>     void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>>     {
>>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>>     	amdgpu_ring_fini(ring);
>>>>     }
>>>>     
>>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>     	uint32_t seq;
>>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>     	struct amdgpu_ring *ring = &kiq->ring;
>>>> +	uint64_t reg_val_offs = 0;
>>>>     
>>>>     	BUG_ON(!ring->funcs->emit_rreg);
>>>>     
>>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>     	amdgpu_ring_alloc(ring, 32);
>>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>> I think that should be (ring->wptr + 30) & ring->buf_mask. Otherwise 
>>> the reg_val_offset can be past the end of the ring.
>>>
>>> But that still leaves a problem if another command is submitted to 
>>> the KIQ before you read the returned reg_val from the ring. Your 
>>> reg_val can be overwritten by the new command and you get the wrong 
>>> result. Or the command can be overwritten with the reg_val, which 
>>> will most likely hang the CP.
>>>
>>> You could allocate space on the KIQ ring with a NOP command to 
>>> prevent that space from being overwritten by other commands.
>> Well I was under the assumption that this is actually what is done here.
>> If that is not the case the patch is a rather clear NAK.
>>
>> Regards,
>> Christian.
>>
>>> Regards,
>>>      Felix
>>>
>>>
>>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>>     	amdgpu_ring_commit(ring);
>>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7 
>>>> +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>>     		goto failed_kiq_read;
>>>>     
>>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>>> +	return ring->ring[reg_val_offs];
>>>>     
>>>>     failed_kiq_read:
>>>>     	pr_err("failed to read reg:%x\n", reg); diff --git 
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> index 634746829024..ee698f0246d8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>>     	struct amdgpu_ring	ring;
>>>>     	struct amdgpu_irq_src	irq;
>>>>     	const struct kiq_pm4_funcs *pmf;
>>>> -	uint32_t			reg_val_offs;
>>>>     };
>>>>     
>>>>     /*
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index f61664ee4940..a3d88f2aa9f4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>>     	void (*end_use)(struct amdgpu_ring *ring);
>>>>     	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>>     	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>>> +			  uint64_t reg_val_offs);
>>>>     	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>>     	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>>     			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ 
>>>> struct amdgpu_ring {
>>>>     #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>>     #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>>     #define amdgpu_ring_emit_cntxcntl(r, d) 
>>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define 
>>>> amdgpu_ring_emit_rreg(r,
>>>> d) (r)->funcs->emit_rreg((r), (d))
>>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
>>>> +(d), (o))
>>>>     #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>>     #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>>     #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
>>>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff 
>>>> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>>     	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>>     }
>>>>     
>>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg)
>>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				     uint64_t reg_val_offs)
>>>>     {
>>>> -	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>> -
>>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>     				(5 << 8) |	/* dst: memory */
>>>>     				(1 << 20));	/* write confirm */
>>>>     	amdgpu_ring_write(ring, reg);
>>>>     	amdgpu_ring_write(ring, 0);
>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>>     }
>>>>     
>>>>     static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>     		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>>     }
>>>>     
>>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg)
>>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				    uint64_t reg_val_offs)
>>>>     {
>>>> -	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>> -
>>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>     				(5 << 8) |	/* dst: memory */
>>>>     				(1 << 20));	/* write confirm */
>>>>     	amdgpu_ring_write(ring, reg);
>>>>     	amdgpu_ring_write(ring, 0);
>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>>     }
>>>>     
>>>>     static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 84fcf842316d..ff279b1f5c24 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>     	uint32_t seq;
>>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>     	struct amdgpu_ring *ring = &kiq->ring;
>>>> +	uint64_t reg_val_offs = 0;
>>>>     
>>>>     	BUG_ON(!ring->funcs->emit_rreg);
>>>>     
>>>>     	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>     	amdgpu_ring_alloc(ring, 32);
>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>     	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>>     				(5 << 8) |	/* dst: memory */
>>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>     				(1 << 20));	/* write confirm */
>>>>     	amdgpu_ring_write(ring, 0);
>>>>     	amdgpu_ring_write(ring, 0);
>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>>     	amdgpu_ring_commit(ring);
>>>>     	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8
>>>> +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct 
>>>> +amdgpu_device *adev)
>>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>>     		goto failed_kiq_read;
>>>>     
>>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>>     
>>>>     failed_kiq_read:
>>>>     	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@ 
>>>> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>     		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>>     }
>>>>     
>>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg)
>>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				    uint64_t reg_val_offs)
>>>>     {
>>>> -	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>> -
>>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>     				(5 << 8) |	/* dst: memory */
>>>>     				(1 << 20));	/* write confirm */
>>>>     	amdgpu_ring_write(ring, reg);
>>>>     	amdgpu_ring_write(ring, 0);
>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>> +					      reg_val_offs * 4));
>>>>     }
>>>>     
>>>>     static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg,
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fli
>>> s
>>> t
>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7C
>>> m
>>> o
>>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe4884e
>>> 6
>>> 0
>>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfUFX
>>> G
>>> J
>>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CMo
> nk.Liu%40amd.com%7C3497588e2bd540d132ec08d7e5032cb7%7C3dd8961fe4884e60
> 8e11a82d994e183d%7C0%7C0%7C637229674118038126&amp;sdata=O4z6sIwD%2FLH3
> RSk3bcBRi3RHYLhTeV0An59xBJwqqEQ%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  8:20               ` Liu, Monk
@ 2020-04-20  8:25                 ` Christian König
  2020-04-20  8:36                   ` Liu, Monk
  0 siblings, 1 reply; 25+ messages in thread
From: Christian König @ 2020-04-20  8:25 UTC (permalink / raw)
  To: Liu, Monk, Kuehling, Felix, Tao, Yintian; +Cc: amd-gfx

That is actually only a problem because the KIQ uses polling waits.

See amdgpu_fence_emit() waits for the oldest possible fence to be 
signaled before emitting a new one.

I suggest that we do the same in amdgpu_fence_emit_polling(). A one 
liner like the following should be enough:

amdgpu_fence_wait_polling(ring, seq - ring->fence_drv.num_fences_mask, 
timeout);

Regards,
Christian.

Am 20.04.20 um 10:20 schrieb Liu, Monk:
> Sure, that's fine
>
> Do you have particular suggestion for problem 2 ?  how we avoid  commands being overwritten before it's finished
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, April 20, 2020 4:17 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
> Yintian patch made the situation absolutely worse. Instead of a whole ring buffer wrap around to overwrite things (1024 dw at least) you now just need to use up 30 dw to trigger undefined behavior and most likely a lockup.
>
> And as Felix pointed out the patch even writes over the end of the ring buffer and can cause random corruption to whatever there is.
>
>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
> Yes, please do so. But please make also sure that the original patch is reverted before this starts to cause fallout from testers.
>
> Regards,
> Christian.
>
> Am 20.04.20 um 09:39 schrieb Liu, Monk:
>>>>> Instead of this crude hack please let us just allocate a fixed
>>>>> number of write back slots and use them round robin
>> It looks doable but really ugly compared with current patch ... and
>> more over there we are going to fix the second problem eventually
>>
>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Liu, Monk
>> Sent: Monday, April 20, 2020 3:37 PM
>> To: Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix
>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: RE: [PATCH] drm/amdgpu: refine kiq read register
>>
>>>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>>
>>
>>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Monday, April 20, 2020 3:19 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix
>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>
>> Hi Monk,
>>
>>> Can we first get the first problem done ?
>> Please absolutely not! See the problem introduced here is quite worse than the actual fix.
>>
>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>>
>> Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.
>>
>> This should fix both problems at the same time and not introduce another potential problematic hack.
>>
>> If this patch is already committed please revert it immediately.
>>
>> Regards,
>> Christian.
>>
>> Am 20.04.20 um 08:20 schrieb Liu, Monk:
>>> Christian
>>>
>>>>>> Well I was under the assumption that this is actually what is done here.
>>> If that is not the case the patch is a rather clear NAK.
>>> <<<
>>>
>>> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>>>
>>> The first problem is :
>>> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
>>> This is the issue yintian's patch to address, by put the readback
>>> value not in a shared WB but in a chunk DW of command submit
>>>
>>> The second problem is:
>>> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ
>>> is busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>>>
>>> The optional way is:
>>> 1) We use GPU scheduler to manage KIQ activity, and all jobs are
>>> submitted  to KIQ through a IB, thus no overwritten will happen
>>> 2) we still skip gpu scheduler but always use IB to put jobs on KIQ,
>>> thus each JOB will occupy the fixed space/DW of RB, so we can avoid
>>> overwrite unfinished command
>>>
>>> We can discuss the second problem later
>>>
>>> Can we first get the first problem done ? thanks
>>>
>>>
>>> _____________________________________
>>> Monk Liu|GPU Virtualization Team |AMD
>>>
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Monday, April 20, 2020 1:03 AM
>>> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian
>>> <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>>
>>> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>>>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>>>> According to the current kiq read register method, there will be
>>>>> race condition when using KIQ to read register if multiple clients
>>>>> want to read at same time just like the expample below:
>>>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B
>>>>> poll the seqno-1 5. the kiq complete these two read operation 6.
>>>>> client-A to read the register at the wb buffer and
>>>>>        get REG-1 value
>>>>>
>>>>> Therefore, directly make kiq write the register value at the ring
>>>>> buffer then there will be no race condition for the wb buffer.
>>>>>
>>>>> v2: supply the read_clock and move the reg_val_offs back
>>>>>
>>>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>>>> ---
>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>>>      6 files changed, 33 insertions(+), 40 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>> index ea576b4260a4..4e1c0239e561 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct
>>>>> amdgpu_device *adev,
>>>>>      
>>>>>      	spin_lock_init(&kiq->ring_lock);
>>>>>      
>>>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>>>> -	if (r)
>>>>> -		return r;
>>>>> -
>>>>>      	ring->adev = NULL;
>>>>>      	ring->ring_obj = NULL;
>>>>>      	ring->use_doorbell = true;
>>>>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct
>>>>> amdgpu_device *adev,
>>>>>      
>>>>>      void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>>>      {
>>>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>>>      	amdgpu_ring_fini(ring);
>>>>>      }
>>>>>      
>>>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>>      	uint32_t seq;
>>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>      	struct amdgpu_ring *ring = &kiq->ring;
>>>>> +	uint64_t reg_val_offs = 0;
>>>>>      
>>>>>      	BUG_ON(!ring->funcs->emit_rreg);
>>>>>      
>>>>>      	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>      	amdgpu_ring_alloc(ring, 32);
>>>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>> I think that should be (ring->wptr + 30) & ring->buf_mask. Otherwise
>>>> the reg_val_offset can be past the end of the ring.
>>>>
>>>> But that still leaves a problem if another command is submitted to
>>>> the KIQ before you read the returned reg_val from the ring. Your
>>>> reg_val can be overwritten by the new command and you get the wrong
>>>> result. Or the command can be overwritten with the reg_val, which
>>>> will most likely hang the CP.
>>>>
>>>> You could allocate space on the KIQ ring with a NOP command to
>>>> prevent that space from being overwritten by other commands.
>>> Well I was under the assumption that this is actually what is done here.
>>> If that is not the case the patch is a rather clear NAK.
>>>
>>> Regards,
>>> Christian.
>>>
>>>> Regards,
>>>>       Felix
>>>>
>>>>
>>>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>>      	amdgpu_ring_commit(ring);
>>>>>      	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7
>>>>> +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>>      		goto failed_kiq_read;
>>>>>      
>>>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>>>> +	return ring->ring[reg_val_offs];
>>>>>      
>>>>>      failed_kiq_read:
>>>>>      	pr_err("failed to read reg:%x\n", reg); diff --git
>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>> index 634746829024..ee698f0246d8 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>>>      	struct amdgpu_ring	ring;
>>>>>      	struct amdgpu_irq_src	irq;
>>>>>      	const struct kiq_pm4_funcs *pmf;
>>>>> -	uint32_t			reg_val_offs;
>>>>>      };
>>>>>      
>>>>>      /*
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> index f61664ee4940..a3d88f2aa9f4 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>>>      	void (*end_use)(struct amdgpu_ring *ring);
>>>>>      	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>>>      	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>>>> +			  uint64_t reg_val_offs);
>>>>>      	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>>>      	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>>>      			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@
>>>>> struct amdgpu_ring {
>>>>>      #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>>>      #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>>>      #define amdgpu_ring_emit_cntxcntl(r, d)
>>>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define
>>>>> amdgpu_ring_emit_rreg(r,
>>>>> d) (r)->funcs->emit_rreg((r), (d))
>>>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r),
>>>>> +(d), (o))
>>>>>      #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>>>      #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>>>      #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m)
>>>>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff
>>>>> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>>>      	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>>>      }
>>>>>      
>>>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>>> uint32_t reg)
>>>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>> +				     uint64_t reg_val_offs)
>>>>>      {
>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>> -
>>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>      				(5 << 8) |	/* dst: memory */
>>>>>      				(1 << 20));	/* write confirm */
>>>>>      	amdgpu_ring_write(ring, reg);
>>>>>      	amdgpu_ring_write(ring, 0);
>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>>      }
>>>>>      
>>>>>      static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>      		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>>>      }
>>>>>      
>>>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>>> uint32_t reg)
>>>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>> +				    uint64_t reg_val_offs)
>>>>>      {
>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>> -
>>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>      				(5 << 8) |	/* dst: memory */
>>>>>      				(1 << 20));	/* write confirm */
>>>>>      	amdgpu_ring_write(ring, reg);
>>>>>      	amdgpu_ring_write(ring, 0);
>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>>      }
>>>>>      
>>>>>      static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> index 84fcf842316d..ff279b1f5c24 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>      	uint32_t seq;
>>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>      	struct amdgpu_ring *ring = &kiq->ring;
>>>>> +	uint64_t reg_val_offs = 0;
>>>>>      
>>>>>      	BUG_ON(!ring->funcs->emit_rreg);
>>>>>      
>>>>>      	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>      	amdgpu_ring_alloc(ring, 32);
>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>      	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>>>      				(5 << 8) |	/* dst: memory */
>>>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>      				(1 << 20));	/* write confirm */
>>>>>      	amdgpu_ring_write(ring, 0);
>>>>>      	amdgpu_ring_write(ring, 0);
>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>>      	amdgpu_ring_commit(ring);
>>>>>      	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8
>>>>> +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct
>>>>> +amdgpu_device *adev)
>>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>>      		goto failed_kiq_read;
>>>>>      
>>>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>>>      
>>>>>      failed_kiq_read:
>>>>>      	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 @@
>>>>> static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>      		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>>>      }
>>>>>      
>>>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>>> uint32_t reg)
>>>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>> +				    uint64_t reg_val_offs)
>>>>>      {
>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>> -
>>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>      				(5 << 8) |	/* dst: memory */
>>>>>      				(1 << 20));	/* write confirm */
>>>>>      	amdgpu_ring_write(ring, reg);
>>>>>      	amdgpu_ring_write(ring, 0);
>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>>      }
>>>>>      
>>>>>      static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>>>> uint32_t reg,
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fli
>>>> s
>>>> t
>>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7C
>>>> m
>>>> o
>>>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe4884e
>>>> 6
>>>> 0
>>>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfUFX
>>>> G
>>>> J
>>>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CMo
>> nk.Liu%40amd.com%7C3497588e2bd540d132ec08d7e5032cb7%7C3dd8961fe4884e60
>> 8e11a82d994e183d%7C0%7C0%7C637229674118038126&amp;sdata=O4z6sIwD%2FLH3
>> RSk3bcBRi3RHYLhTeV0An59xBJwqqEQ%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  8:25                 ` Christian König
@ 2020-04-20  8:36                   ` Liu, Monk
  2020-04-20  8:47                     ` Christian König
  0 siblings, 1 reply; 25+ messages in thread
From: Liu, Monk @ 2020-04-20  8:36 UTC (permalink / raw)
  To: Koenig, Christian, Kuehling, Felix, Tao, Yintian; +Cc: amd-gfx

No, that's not true

The problem is not we submit commands on KIQ ring before the emiited ones are signaled , instead it is because we don't have a unified command submit size on KIQ

e.g.:

---JOB1---JOB2---JOB3---JOB4--->>
If job1 signaled and job 2/3/4 are running, by (your) design driver is allowed to put job5 on the position of job1. But there is chance that job5 take 100DW while job1 only occupy 50dw, thus the job2 will be overwritten by job5\

By gpu scheduler we always have fixed size of commands so jo5 will not overwrite job 2.

_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Monday, April 20, 2020 4:26 PM
To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

That is actually only a problem because the KIQ uses polling waits.

See amdgpu_fence_emit() waits for the oldest possible fence to be signaled before emitting a new one.

I suggest that we do the same in amdgpu_fence_emit_polling(). A one liner like the following should be enough:

amdgpu_fence_wait_polling(ring, seq - ring->fence_drv.num_fences_mask, timeout);

Regards,
Christian.

Am 20.04.20 um 10:20 schrieb Liu, Monk:
> Sure, that's fine
>
> Do you have particular suggestion for problem 2 ?  how we avoid  
> commands being overwritten before it's finished
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, April 20, 2020 4:17 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian 
> <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; 
> Tao, Yintian <Yintian.Tao@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
> Yintian patch made the situation absolutely worse. Instead of a whole ring buffer wrap around to overwrite things (1024 dw at least) you now just need to use up 30 dw to trigger undefined behavior and most likely a lockup.
>
> And as Felix pointed out the patch even writes over the end of the ring buffer and can cause random corruption to whatever there is.
>
>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
> Yes, please do so. But please make also sure that the original patch is reverted before this starts to cause fallout from testers.
>
> Regards,
> Christian.
>
> Am 20.04.20 um 09:39 schrieb Liu, Monk:
>>>>> Instead of this crude hack please let us just allocate a fixed 
>>>>> number of write back slots and use them round robin
>> It looks doable but really ugly compared with current patch ... and 
>> more over there we are going to fix the second problem eventually
>>
>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Liu, Monk
>> Sent: Monday, April 20, 2020 3:37 PM
>> To: Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix 
>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: RE: [PATCH] drm/amdgpu: refine kiq read register
>>
>>>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>>
>>
>>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Monday, April 20, 2020 3:19 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix 
>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>
>> Hi Monk,
>>
>>> Can we first get the first problem done ?
>> Please absolutely not! See the problem introduced here is quite worse than the actual fix.
>>
>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>>
>> Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.
>>
>> This should fix both problems at the same time and not introduce another potential problematic hack.
>>
>> If this patch is already committed please revert it immediately.
>>
>> Regards,
>> Christian.
>>
>> Am 20.04.20 um 08:20 schrieb Liu, Monk:
>>> Christian
>>>
>>>>>> Well I was under the assumption that this is actually what is done here.
>>> If that is not the case the patch is a rather clear NAK.
>>> <<<
>>>
>>> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>>>
>>> The first problem is :
>>> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
>>> This is the issue yintian's patch to address, by put the readback 
>>> value not in a shared WB but in a chunk DW of command submit
>>>
>>> The second problem is:
>>> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ 
>>> is busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>>>
>>> The optional way is:
>>> 1) We use GPU scheduler to manage KIQ activity, and all jobs are 
>>> submitted  to KIQ through a IB, thus no overwritten will happen
>>> 2) we still skip gpu scheduler but always use IB to put jobs on KIQ, 
>>> thus each JOB will occupy the fixed space/DW of RB, so we can avoid 
>>> overwrite unfinished command
>>>
>>> We can discuss the second problem later
>>>
>>> Can we first get the first problem done ? thanks
>>>
>>>
>>> _____________________________________
>>> Monk Liu|GPU Virtualization Team |AMD
>>>
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Monday, April 20, 2020 1:03 AM
>>> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian 
>>> <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>>
>>> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>>>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>>>> According to the current kiq read register method, there will be 
>>>>> race condition when using KIQ to read register if multiple clients 
>>>>> want to read at same time just like the expample below:
>>>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B 
>>>>> poll the seqno-1 5. the kiq complete these two read operation 6.
>>>>> client-A to read the register at the wb buffer and
>>>>>        get REG-1 value
>>>>>
>>>>> Therefore, directly make kiq write the register value at the ring 
>>>>> buffer then there will be no race condition for the wb buffer.
>>>>>
>>>>> v2: supply the read_clock and move the reg_val_offs back
>>>>>
>>>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>>>> ---
>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>>>      6 files changed, 33 insertions(+), 40 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>> index ea576b4260a4..4e1c0239e561 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>>>> amdgpu_device *adev,
>>>>>      
>>>>>      	spin_lock_init(&kiq->ring_lock);
>>>>>      
>>>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>>>> -	if (r)
>>>>> -		return r;
>>>>> -
>>>>>      	ring->adev = NULL;
>>>>>      	ring->ring_obj = NULL;
>>>>>      	ring->use_doorbell = true;
>>>>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>>>> amdgpu_device *adev,
>>>>>      
>>>>>      void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>>>      {
>>>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>>>      	amdgpu_ring_fini(ring);
>>>>>      }
>>>>>      
>>>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>>      	uint32_t seq;
>>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>      	struct amdgpu_ring *ring = &kiq->ring;
>>>>> +	uint64_t reg_val_offs = 0;
>>>>>      
>>>>>      	BUG_ON(!ring->funcs->emit_rreg);
>>>>>      
>>>>>      	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>      	amdgpu_ring_alloc(ring, 32);
>>>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>> I think that should be (ring->wptr + 30) & ring->buf_mask. 
>>>> Otherwise the reg_val_offset can be past the end of the ring.
>>>>
>>>> But that still leaves a problem if another command is submitted to 
>>>> the KIQ before you read the returned reg_val from the ring. Your 
>>>> reg_val can be overwritten by the new command and you get the wrong 
>>>> result. Or the command can be overwritten with the reg_val, which 
>>>> will most likely hang the CP.
>>>>
>>>> You could allocate space on the KIQ ring with a NOP command to 
>>>> prevent that space from being overwritten by other commands.
>>> Well I was under the assumption that this is actually what is done here.
>>> If that is not the case the patch is a rather clear NAK.
>>>
>>> Regards,
>>> Christian.
>>>
>>>> Regards,
>>>>       Felix
>>>>
>>>>
>>>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>>      	amdgpu_ring_commit(ring);
>>>>>      	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7
>>>>> +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, 
>>>>> +uint32_t reg)
>>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>>      		goto failed_kiq_read;
>>>>>      
>>>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>>>> +	return ring->ring[reg_val_offs];
>>>>>      
>>>>>      failed_kiq_read:
>>>>>      	pr_err("failed to read reg:%x\n", reg); diff --git 
>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>> index 634746829024..ee698f0246d8 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>>>      	struct amdgpu_ring	ring;
>>>>>      	struct amdgpu_irq_src	irq;
>>>>>      	const struct kiq_pm4_funcs *pmf;
>>>>> -	uint32_t			reg_val_offs;
>>>>>      };
>>>>>      
>>>>>      /*
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> index f61664ee4940..a3d88f2aa9f4 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>>>      	void (*end_use)(struct amdgpu_ring *ring);
>>>>>      	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>>>      	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>>>> +			  uint64_t reg_val_offs);
>>>>>      	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>>>      	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>>>      			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ 
>>>>> struct amdgpu_ring {
>>>>>      #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>>>      #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>>>      #define amdgpu_ring_emit_cntxcntl(r, d) 
>>>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define 
>>>>> amdgpu_ring_emit_rreg(r,
>>>>> d) (r)->funcs->emit_rreg((r), (d))
>>>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
>>>>> +(d), (o))
>>>>>      #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>>>      #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>>>      #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
>>>>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) 
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>>>      	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>>>      }
>>>>>      
>>>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>>> uint32_t reg)
>>>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>> +				     uint64_t reg_val_offs)
>>>>>      {
>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>> -
>>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>      				(5 << 8) |	/* dst: memory */
>>>>>      				(1 << 20));	/* write confirm */
>>>>>      	amdgpu_ring_write(ring, reg);
>>>>>      	amdgpu_ring_write(ring, 0);
>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>>      }
>>>>>      
>>>>>      static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring 
>>>>> *ring, uint32_t reg, diff --git 
>>>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>      		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>>>      }
>>>>>      
>>>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>>> uint32_t reg)
>>>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>> +				    uint64_t reg_val_offs)
>>>>>      {
>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>> -
>>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>      				(5 << 8) |	/* dst: memory */
>>>>>      				(1 << 20));	/* write confirm */
>>>>>      	amdgpu_ring_write(ring, reg);
>>>>>      	amdgpu_ring_write(ring, 0);
>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>>      }
>>>>>      
>>>>>      static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> index 84fcf842316d..ff279b1f5c24 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>      	uint32_t seq;
>>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>      	struct amdgpu_ring *ring = &kiq->ring;
>>>>> +	uint64_t reg_val_offs = 0;
>>>>>      
>>>>>      	BUG_ON(!ring->funcs->emit_rreg);
>>>>>      
>>>>>      	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>      	amdgpu_ring_alloc(ring, 32);
>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>      	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>>>      				(5 << 8) |	/* dst: memory */
>>>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>      				(1 << 20));	/* write confirm */
>>>>>      	amdgpu_ring_write(ring, 0);
>>>>>      	amdgpu_ring_write(ring, 0);
>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>>      	amdgpu_ring_commit(ring);
>>>>>      	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8
>>>>> +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct 
>>>>> +amdgpu_device *adev)
>>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>>      		goto failed_kiq_read;
>>>>>      
>>>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>>>      
>>>>>      failed_kiq_read:
>>>>>      	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 
>>>>> @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>      		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>>>      }
>>>>>      
>>>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>>> uint32_t reg)
>>>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>> +				    uint64_t reg_val_offs)
>>>>>      {
>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>> -
>>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>      				(5 << 8) |	/* dst: memory */
>>>>>      				(1 << 20));	/* write confirm */
>>>>>      	amdgpu_ring_write(ring, reg);
>>>>>      	amdgpu_ring_write(ring, 0);
>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>> -				kiq->reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>> +					      reg_val_offs * 4));
>>>>>      }
>>>>>      
>>>>>      static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>>>> uint32_t reg,
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fl
>>>> i
>>>> s
>>>> t
>>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7
>>>> C
>>>> m
>>>> o
>>>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe4884
>>>> e
>>>> 6
>>>> 0
>>>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfUF
>>>> X
>>>> G
>>>> J
>>>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>> t 
>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CM
>> o
>> nk.Liu%40amd.com%7C3497588e2bd540d132ec08d7e5032cb7%7C3dd8961fe4884e6
>> 0
>> 8e11a82d994e183d%7C0%7C0%7C637229674118038126&amp;sdata=O4z6sIwD%2FLH
>> 3
>> RSk3bcBRi3RHYLhTeV0An59xBJwqqEQ%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  8:36                   ` Liu, Monk
@ 2020-04-20  8:47                     ` Christian König
  2020-04-20  9:28                       ` Liu, Monk
  0 siblings, 1 reply; 25+ messages in thread
From: Christian König @ 2020-04-20  8:47 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Kuehling, Felix, Tao, Yintian; +Cc: amd-gfx

Hi Monk,

yeah, that is certainly problematic. But we have some maximum size for 
the KIQ submission, don't we?

The only alternative would be to double check the rptr before we 
allocate space on the ring buffer, but this means quite some additional 
overhead.

Regards,
Christian.

Am 20.04.20 um 10:36 schrieb Liu, Monk:
> No, that's not true
>
> The problem is not we submit commands on KIQ ring before the emiited ones are signaled , instead it is because we don't have a unified command submit size on KIQ
>
> e.g.:
>
> ---JOB1---JOB2---JOB3---JOB4--->>
> If job1 signaled and job 2/3/4 are running, by (your) design driver is allowed to put job5 on the position of job1. But there is chance that job5 take 100DW while job1 only occupy 50dw, thus the job2 will be overwritten by job5\
>
> By gpu scheduler we always have fixed size of commands so jo5 will not overwrite job 2.
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Monday, April 20, 2020 4:26 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> That is actually only a problem because the KIQ uses polling waits.
>
> See amdgpu_fence_emit() waits for the oldest possible fence to be signaled before emitting a new one.
>
> I suggest that we do the same in amdgpu_fence_emit_polling(). A one liner like the following should be enough:
>
> amdgpu_fence_wait_polling(ring, seq - ring->fence_drv.num_fences_mask, timeout);
>
> Regards,
> Christian.
>
> Am 20.04.20 um 10:20 schrieb Liu, Monk:
>> Sure, that's fine
>>
>> Do you have particular suggestion for problem 2 ?  how we avoid
>> commands being overwritten before it's finished
>>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Monday, April 20, 2020 4:17 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>;
>> Tao, Yintian <Yintian.Tao@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>
>>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>> Yintian patch made the situation absolutely worse. Instead of a whole ring buffer wrap around to overwrite things (1024 dw at least) you now just need to use up 30 dw to trigger undefined behavior and most likely a lockup.
>>
>> And as Felix pointed out the patch even writes over the end of the ring buffer and can cause random corruption to whatever there is.
>>
>>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
>> Yes, please do so. But please make also sure that the original patch is reverted before this starts to cause fallout from testers.
>>
>> Regards,
>> Christian.
>>
>> Am 20.04.20 um 09:39 schrieb Liu, Monk:
>>>>>> Instead of this crude hack please let us just allocate a fixed
>>>>>> number of write back slots and use them round robin
>>> It looks doable but really ugly compared with current patch ... and
>>> more over there we are going to fix the second problem eventually
>>>
>>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
>>> _____________________________________
>>> Monk Liu|GPU Virtualization Team |AMD
>>>
>>>
>>> -----Original Message-----
>>> From: Liu, Monk
>>> Sent: Monday, April 20, 2020 3:37 PM
>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix
>>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH] drm/amdgpu: refine kiq read register
>>>
>>>>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>>>
>>>
>>>
>>> _____________________________________
>>> Monk Liu|GPU Virtualization Team |AMD
>>>
>>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Monday, April 20, 2020 3:19 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix
>>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>>
>>> Hi Monk,
>>>
>>>> Can we first get the first problem done ?
>>> Please absolutely not! See the problem introduced here is quite worse than the actual fix.
>>>
>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>>>
>>> Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.
>>>
>>> This should fix both problems at the same time and not introduce another potential problematic hack.
>>>
>>> If this patch is already committed please revert it immediately.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 20.04.20 um 08:20 schrieb Liu, Monk:
>>>> Christian
>>>>
>>>>>>> Well I was under the assumption that this is actually what is done here.
>>>> If that is not the case the patch is a rather clear NAK.
>>>> <<<
>>>>
>>>> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>>>>
>>>> The first problem is :
>>>> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
>>>> This is the issue yintian's patch to address, by put the readback
>>>> value not in a shared WB but in a chunk DW of command submit
>>>>
>>>> The second problem is:
>>>> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ
>>>> is busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>>>>
>>>> The optional way is:
>>>> 1) We use GPU scheduler to manage KIQ activity, and all jobs are
>>>> submitted  to KIQ through a IB, thus no overwritten will happen
>>>> 2) we still skip gpu scheduler but always use IB to put jobs on KIQ,
>>>> thus each JOB will occupy the fixed space/DW of RB, so we can avoid
>>>> overwrite unfinished command
>>>>
>>>> We can discuss the second problem later
>>>>
>>>> Can we first get the first problem done ? thanks
>>>>
>>>>
>>>> _____________________________________
>>>> Monk Liu|GPU Virtualization Team |AMD
>>>>
>>>>
>>>> -----Original Message-----
>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>> Sent: Monday, April 20, 2020 1:03 AM
>>>> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian
>>>> <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
>>>> Cc: amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>>>
>>>> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>>>>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>>>>> According to the current kiq read register method, there will be
>>>>>> race condition when using KIQ to read register if multiple clients
>>>>>> want to read at same time just like the expample below:
>>>>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>>>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B
>>>>>> poll the seqno-1 5. the kiq complete these two read operation 6.
>>>>>> client-A to read the register at the wb buffer and
>>>>>>         get REG-1 value
>>>>>>
>>>>>> Therefore, directly make kiq write the register value at the ring
>>>>>> buffer then there will be no race condition for the wb buffer.
>>>>>>
>>>>>> v2: supply the read_clock and move the reg_val_offs back
>>>>>>
>>>>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>>>>> ---
>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>>>>       drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>>>>       drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>>>>       drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>>>>       6 files changed, 33 insertions(+), 40 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> index ea576b4260a4..4e1c0239e561 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct
>>>>>> amdgpu_device *adev,
>>>>>>       
>>>>>>       	spin_lock_init(&kiq->ring_lock);
>>>>>>       
>>>>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>>>>> -	if (r)
>>>>>> -		return r;
>>>>>> -
>>>>>>       	ring->adev = NULL;
>>>>>>       	ring->ring_obj = NULL;
>>>>>>       	ring->use_doorbell = true;
>>>>>> @@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct
>>>>>> amdgpu_device *adev,
>>>>>>       
>>>>>>       void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>>>>       {
>>>>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>>>>       	amdgpu_ring_fini(ring);
>>>>>>       }
>>>>>>       
>>>>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>>>       	uint32_t seq;
>>>>>>       	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>       	struct amdgpu_ring *ring = &kiq->ring;
>>>>>> +	uint64_t reg_val_offs = 0;
>>>>>>       
>>>>>>       	BUG_ON(!ring->funcs->emit_rreg);
>>>>>>       
>>>>>>       	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>>       	amdgpu_ring_alloc(ring, 32);
>>>>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>> I think that should be (ring->wptr + 30) & ring->buf_mask.
>>>>> Otherwise the reg_val_offset can be past the end of the ring.
>>>>>
>>>>> But that still leaves a problem if another command is submitted to
>>>>> the KIQ before you read the returned reg_val from the ring. Your
>>>>> reg_val can be overwritten by the new command and you get the wrong
>>>>> result. Or the command can be overwritten with the reg_val, which
>>>>> will most likely hang the CP.
>>>>>
>>>>> You could allocate space on the KIQ ring with a NOP command to
>>>>> prevent that space from being overwritten by other commands.
>>>> Well I was under the assumption that this is actually what is done here.
>>>> If that is not the case the patch is a rather clear NAK.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Regards,
>>>>>        Felix
>>>>>
>>>>>
>>>>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>>>>       	amdgpu_fence_emit_polling(ring, &seq);
>>>>>>       	amdgpu_ring_commit(ring);
>>>>>>       	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7
>>>>>> +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev,
>>>>>> +uint32_t reg)
>>>>>>       	if (cnt > MAX_KIQ_REG_TRY)
>>>>>>       		goto failed_kiq_read;
>>>>>>       
>>>>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>>>>> +	return ring->ring[reg_val_offs];
>>>>>>       
>>>>>>       failed_kiq_read:
>>>>>>       	pr_err("failed to read reg:%x\n", reg); diff --git
>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> index 634746829024..ee698f0246d8 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>>>>       	struct amdgpu_ring	ring;
>>>>>>       	struct amdgpu_irq_src	irq;
>>>>>>       	const struct kiq_pm4_funcs *pmf;
>>>>>> -	uint32_t			reg_val_offs;
>>>>>>       };
>>>>>>       
>>>>>>       /*
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> index f61664ee4940..a3d88f2aa9f4 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>>>>       	void (*end_use)(struct amdgpu_ring *ring);
>>>>>>       	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>>>>       	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +			  uint64_t reg_val_offs);
>>>>>>       	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>>>>       	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>>>>       			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@
>>>>>> struct amdgpu_ring {
>>>>>>       #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>>>>       #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>>>>       #define amdgpu_ring_emit_cntxcntl(r, d)
>>>>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define
>>>>>> amdgpu_ring_emit_rreg(r,
>>>>>> d) (r)->funcs->emit_rreg((r), (d))
>>>>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r),
>>>>>> +(d), (o))
>>>>>>       #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>>>>       #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>>>>       #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m)
>>>>>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>>>>       	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>>>>       }
>>>>>>       
>>>>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>>>> uint32_t reg)
>>>>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +				     uint64_t reg_val_offs)
>>>>>>       {
>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>> -
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, reg);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       }
>>>>>>       
>>>>>>       static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring
>>>>>> *ring, uint32_t reg, diff --git
>>>>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>>       		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>>>>       }
>>>>>>       
>>>>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>>>> uint32_t reg)
>>>>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +				    uint64_t reg_val_offs)
>>>>>>       {
>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>> -
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, reg);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       }
>>>>>>       
>>>>>>       static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> index 84fcf842316d..ff279b1f5c24 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>>       	uint32_t seq;
>>>>>>       	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>       	struct amdgpu_ring *ring = &kiq->ring;
>>>>>> +	uint64_t reg_val_offs = 0;
>>>>>>       
>>>>>>       	BUG_ON(!ring->funcs->emit_rreg);
>>>>>>       
>>>>>>       	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>>       	amdgpu_ring_alloc(ring, 32);
>>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       	amdgpu_fence_emit_polling(ring, &seq);
>>>>>>       	amdgpu_ring_commit(ring);
>>>>>>       	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8
>>>>>> +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct
>>>>>> +amdgpu_device *adev)
>>>>>>       	if (cnt > MAX_KIQ_REG_TRY)
>>>>>>       		goto failed_kiq_read;
>>>>>>       
>>>>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>>>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>>>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>>>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>>>>       
>>>>>>       failed_kiq_read:
>>>>>>       	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19
>>>>>> @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>>       		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>>>>       }
>>>>>>       
>>>>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>>>> uint32_t reg)
>>>>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +				    uint64_t reg_val_offs)
>>>>>>       {
>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>> -
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, reg);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       }
>>>>>>       
>>>>>>       static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>>>>> uint32_t reg,
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fl
>>>>> i
>>>>> s
>>>>> t
>>>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7
>>>>> C
>>>>> m
>>>>> o
>>>>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe4884
>>>>> e
>>>>> 6
>>>>> 0
>>>>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfUF
>>>>> X
>>>>> G
>>>>> J
>>>>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>>> t
>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CM
>>> o
>>> nk.Liu%40amd.com%7C3497588e2bd540d132ec08d7e5032cb7%7C3dd8961fe4884e6
>>> 0
>>> 8e11a82d994e183d%7C0%7C0%7C637229674118038126&amp;sdata=O4z6sIwD%2FLH
>>> 3
>>> RSk3bcBRi3RHYLhTeV0An59xBJwqqEQ%3D&amp;reserved=0
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  8:47                     ` Christian König
@ 2020-04-20  9:28                       ` Liu, Monk
  2020-04-20 11:21                         ` Christian König
  2020-04-20 15:18                         ` Liu, Shaoyun
  0 siblings, 2 replies; 25+ messages in thread
From: Liu, Monk @ 2020-04-20  9:28 UTC (permalink / raw)
  To: Koenig, Christian, Kuehling, Felix, Tao, Yintian; +Cc: amd-gfx

So why don't we  let gpu scheduler to manage KIQ, and all client of KIQ need to submit jobs within an IB ?

_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Monday, April 20, 2020 4:48 PM
To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

Hi Monk,

yeah, that is certainly problematic. But we have some maximum size for the KIQ submission, don't we?

The only alternative would be to double check the rptr before we allocate space on the ring buffer, but this means quite some additional overhead.

Regards,
Christian.

Am 20.04.20 um 10:36 schrieb Liu, Monk:
> No, that's not true
>
> The problem is not we submit commands on KIQ ring before the emiited 
> ones are signaled , instead it is because we don't have a unified 
> command submit size on KIQ
>
> e.g.:
>
> ---JOB1---JOB2---JOB3---JOB4--->>
> If job1 signaled and job 2/3/4 are running, by (your) design driver is 
> allowed to put job5 on the position of job1. But there is chance that 
> job5 take 100DW while job1 only occupy 50dw, thus the job2 will be 
> overwritten by job5\
>
> By gpu scheduler we always have fixed size of commands so jo5 will not overwrite job 2.
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Monday, April 20, 2020 4:26 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix 
> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> That is actually only a problem because the KIQ uses polling waits.
>
> See amdgpu_fence_emit() waits for the oldest possible fence to be signaled before emitting a new one.
>
> I suggest that we do the same in amdgpu_fence_emit_polling(). A one liner like the following should be enough:
>
> amdgpu_fence_wait_polling(ring, seq - ring->fence_drv.num_fences_mask, 
> timeout);
>
> Regards,
> Christian.
>
> Am 20.04.20 um 10:20 schrieb Liu, Monk:
>> Sure, that's fine
>>
>> Do you have particular suggestion for problem 2 ?  how we avoid 
>> commands being overwritten before it's finished
>>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Monday, April 20, 2020 4:17 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian 
>> <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; 
>> Tao, Yintian <Yintian.Tao@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>
>>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>> Yintian patch made the situation absolutely worse. Instead of a whole ring buffer wrap around to overwrite things (1024 dw at least) you now just need to use up 30 dw to trigger undefined behavior and most likely a lockup.
>>
>> And as Felix pointed out the patch even writes over the end of the ring buffer and can cause random corruption to whatever there is.
>>
>>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
>> Yes, please do so. But please make also sure that the original patch is reverted before this starts to cause fallout from testers.
>>
>> Regards,
>> Christian.
>>
>> Am 20.04.20 um 09:39 schrieb Liu, Monk:
>>>>>> Instead of this crude hack please let us just allocate a fixed 
>>>>>> number of write back slots and use them round robin
>>> It looks doable but really ugly compared with current patch ... and 
>>> more over there we are going to fix the second problem eventually
>>>
>>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
>>> _____________________________________
>>> Monk Liu|GPU Virtualization Team |AMD
>>>
>>>
>>> -----Original Message-----
>>> From: Liu, Monk
>>> Sent: Monday, April 20, 2020 3:37 PM
>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix 
>>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH] drm/amdgpu: refine kiq read register
>>>
>>>>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>>>
>>>
>>>
>>> _____________________________________
>>> Monk Liu|GPU Virtualization Team |AMD
>>>
>>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Monday, April 20, 2020 3:19 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix 
>>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>>
>>> Hi Monk,
>>>
>>>> Can we first get the first problem done ?
>>> Please absolutely not! See the problem introduced here is quite worse than the actual fix.
>>>
>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>>>
>>> Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.
>>>
>>> This should fix both problems at the same time and not introduce another potential problematic hack.
>>>
>>> If this patch is already committed please revert it immediately.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 20.04.20 um 08:20 schrieb Liu, Monk:
>>>> Christian
>>>>
>>>>>>> Well I was under the assumption that this is actually what is done here.
>>>> If that is not the case the patch is a rather clear NAK.
>>>> <<<
>>>>
>>>> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>>>>
>>>> The first problem is :
>>>> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
>>>> This is the issue yintian's patch to address, by put the readback 
>>>> value not in a shared WB but in a chunk DW of command submit
>>>>
>>>> The second problem is:
>>>> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ 
>>>> is busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>>>>
>>>> The optional way is:
>>>> 1) We use GPU scheduler to manage KIQ activity, and all jobs are 
>>>> submitted  to KIQ through a IB, thus no overwritten will happen
>>>> 2) we still skip gpu scheduler but always use IB to put jobs on 
>>>> KIQ, thus each JOB will occupy the fixed space/DW of RB, so we can 
>>>> avoid overwrite unfinished command
>>>>
>>>> We can discuss the second problem later
>>>>
>>>> Can we first get the first problem done ? thanks
>>>>
>>>>
>>>> _____________________________________
>>>> Monk Liu|GPU Virtualization Team |AMD
>>>>
>>>>
>>>> -----Original Message-----
>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>> Sent: Monday, April 20, 2020 1:03 AM
>>>> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian 
>>>> <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
>>>> Cc: amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>>>
>>>> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>>>>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>>>>> According to the current kiq read register method, there will be 
>>>>>> race condition when using KIQ to read register if multiple 
>>>>>> clients want to read at same time just like the expample below:
>>>>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>>>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B 
>>>>>> poll the seqno-1 5. the kiq complete these two read operation 6.
>>>>>> client-A to read the register at the wb buffer and
>>>>>>         get REG-1 value
>>>>>>
>>>>>> Therefore, directly make kiq write the register value at the ring 
>>>>>> buffer then there will be no race condition for the wb buffer.
>>>>>>
>>>>>> v2: supply the read_clock and move the reg_val_offs back
>>>>>>
>>>>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>>>>> ---
>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>>>>       drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>>>>       drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>>>>       drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>>>>       6 files changed, 33 insertions(+), 40 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> index ea576b4260a4..4e1c0239e561 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>>>>> amdgpu_device *adev,
>>>>>>       
>>>>>>       	spin_lock_init(&kiq->ring_lock);
>>>>>>       
>>>>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>>>>> -	if (r)
>>>>>> -		return r;
>>>>>> -
>>>>>>       	ring->adev = NULL;
>>>>>>       	ring->ring_obj = NULL;
>>>>>>       	ring->use_doorbell = true; @@ -331,7 +327,6 @@ int 
>>>>>> amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>>>>       
>>>>>>       void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>>>>       {
>>>>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>>>>       	amdgpu_ring_fini(ring);
>>>>>>       }
>>>>>>       
>>>>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>>>       	uint32_t seq;
>>>>>>       	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>       	struct amdgpu_ring *ring = &kiq->ring;
>>>>>> +	uint64_t reg_val_offs = 0;
>>>>>>       
>>>>>>       	BUG_ON(!ring->funcs->emit_rreg);
>>>>>>       
>>>>>>       	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>>       	amdgpu_ring_alloc(ring, 32);
>>>>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>> I think that should be (ring->wptr + 30) & ring->buf_mask.
>>>>> Otherwise the reg_val_offset can be past the end of the ring.
>>>>>
>>>>> But that still leaves a problem if another command is submitted to 
>>>>> the KIQ before you read the returned reg_val from the ring. Your 
>>>>> reg_val can be overwritten by the new command and you get the 
>>>>> wrong result. Or the command can be overwritten with the reg_val, 
>>>>> which will most likely hang the CP.
>>>>>
>>>>> You could allocate space on the KIQ ring with a NOP command to 
>>>>> prevent that space from being overwritten by other commands.
>>>> Well I was under the assumption that this is actually what is done here.
>>>> If that is not the case the patch is a rather clear NAK.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Regards,
>>>>>        Felix
>>>>>
>>>>>
>>>>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>>>>       	amdgpu_fence_emit_polling(ring, &seq);
>>>>>>       	amdgpu_ring_commit(ring);
>>>>>>       	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7
>>>>>> +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, 
>>>>>> +uint32_t reg)
>>>>>>       	if (cnt > MAX_KIQ_REG_TRY)
>>>>>>       		goto failed_kiq_read;
>>>>>>       
>>>>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>>>>> +	return ring->ring[reg_val_offs];
>>>>>>       
>>>>>>       failed_kiq_read:
>>>>>>       	pr_err("failed to read reg:%x\n", reg); diff --git 
>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> index 634746829024..ee698f0246d8 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>>>>       	struct amdgpu_ring	ring;
>>>>>>       	struct amdgpu_irq_src	irq;
>>>>>>       	const struct kiq_pm4_funcs *pmf;
>>>>>> -	uint32_t			reg_val_offs;
>>>>>>       };
>>>>>>       
>>>>>>       /*
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> index f61664ee4940..a3d88f2aa9f4 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>>>>       	void (*end_use)(struct amdgpu_ring *ring);
>>>>>>       	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>>>>       	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +			  uint64_t reg_val_offs);
>>>>>>       	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>>>>       	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>>>>       			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ 
>>>>>> struct amdgpu_ring {
>>>>>>       #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>>>>       #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>>>>       #define amdgpu_ring_emit_cntxcntl(r, d) 
>>>>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define 
>>>>>> amdgpu_ring_emit_rreg(r,
>>>>>> d) (r)->funcs->emit_rreg((r), (d))
>>>>>> +#define amdgpu_ring_emit_rreg(r, d, o) 
>>>>>> +(r)->funcs->emit_rreg((r), (d), (o))
>>>>>>       #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>>>>       #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>>>>       #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, 
>>>>>> m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) 
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>>>>       	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>>>>       }
>>>>>>       
>>>>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>>>> uint32_t reg)
>>>>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +				     uint64_t reg_val_offs)
>>>>>>       {
>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>> -
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, reg);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       }
>>>>>>       
>>>>>>       static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring 
>>>>>> *ring, uint32_t reg, diff --git 
>>>>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>>       		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>>>>       }
>>>>>>       
>>>>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>>>> uint32_t reg)
>>>>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +				    uint64_t reg_val_offs)
>>>>>>       {
>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>> -
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, reg);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       }
>>>>>>       
>>>>>>       static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring 
>>>>>> *ring, uint32_t reg, diff --git 
>>>>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> index 84fcf842316d..ff279b1f5c24 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>>       	uint32_t seq;
>>>>>>       	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>       	struct amdgpu_ring *ring = &kiq->ring;
>>>>>> +	uint64_t reg_val_offs = 0;
>>>>>>       
>>>>>>       	BUG_ON(!ring->funcs->emit_rreg);
>>>>>>       
>>>>>>       	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>>       	amdgpu_ring_alloc(ring, 32);
>>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       	amdgpu_fence_emit_polling(ring, &seq);
>>>>>>       	amdgpu_ring_commit(ring);
>>>>>>       	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8
>>>>>> +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct 
>>>>>> +amdgpu_device *adev)
>>>>>>       	if (cnt > MAX_KIQ_REG_TRY)
>>>>>>       		goto failed_kiq_read;
>>>>>>       
>>>>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>>>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>>>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>>>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>>>>       
>>>>>>       failed_kiq_read:
>>>>>>       	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 
>>>>>> @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>>       		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>>>>       }
>>>>>>       
>>>>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>>>> uint32_t reg)
>>>>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +				    uint64_t reg_val_offs)
>>>>>>       {
>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>> -
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, reg);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       }
>>>>>>       
>>>>>>       static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring 
>>>>>> *ring, uint32_t reg,
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2F
>>>>> l
>>>>> i
>>>>> s
>>>>> t
>>>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%
>>>>> 7
>>>>> C
>>>>> m
>>>>> o
>>>>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe488
>>>>> 4
>>>>> e
>>>>> 6
>>>>> 0
>>>>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfU
>>>>> F
>>>>> X
>>>>> G
>>>>> J
>>>>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fli
>>> s
>>> t
>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7C
>>> M
>>> o
>>> nk.Liu%40amd.com%7C3497588e2bd540d132ec08d7e5032cb7%7C3dd8961fe4884e
>>> 6
>>> 0
>>> 8e11a82d994e183d%7C0%7C0%7C637229674118038126&amp;sdata=O4z6sIwD%2FL
>>> H
>>> 3
>>> RSk3bcBRi3RHYLhTeV0An59xBJwqqEQ%3D&amp;reserved=0
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CMo
> nk.Liu%40amd.com%7Cc53a79bc90a943f3245008d7e507897e%7C3dd8961fe4884e60
> 8e11a82d994e183d%7C0%7C0%7C637229693602064841&amp;sdata=4Hqe7ucIn6%2Fu
> eQNLR9EYmx%2BRzw8TZbtlL53qsjNSGFE%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  9:28                       ` Liu, Monk
@ 2020-04-20 11:21                         ` Christian König
  2020-04-20 15:18                         ` Liu, Shaoyun
  1 sibling, 0 replies; 25+ messages in thread
From: Christian König @ 2020-04-20 11:21 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Kuehling, Felix, Tao, Yintian; +Cc: amd-gfx

That's what we did initially, but we need to use the KIQ in atomic 
context and that doesn't work with the scheduler.

Additional to that the scheduler has the same problem as the direct 
submit, you need to know how what the maximum dw is a job could take.

Christian.

Am 20.04.20 um 11:28 schrieb Liu, Monk:
> So why don't we  let gpu scheduler to manage KIQ, and all client of KIQ need to submit jobs within an IB ?
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Monday, April 20, 2020 4:48 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> Hi Monk,
>
> yeah, that is certainly problematic. But we have some maximum size for the KIQ submission, don't we?
>
> The only alternative would be to double check the rptr before we allocate space on the ring buffer, but this means quite some additional overhead.
>
> Regards,
> Christian.
>
> Am 20.04.20 um 10:36 schrieb Liu, Monk:
>> No, that's not true
>>
>> The problem is not we submit commands on KIQ ring before the emiited
>> ones are signaled , instead it is because we don't have a unified
>> command submit size on KIQ
>>
>> e.g.:
>>
>> ---JOB1---JOB2---JOB3---JOB4--->>
>> If job1 signaled and job 2/3/4 are running, by (your) design driver is
>> allowed to put job5 on the position of job1. But there is chance that
>> job5 take 100DW while job1 only occupy 50dw, thus the job2 will be
>> overwritten by job5\
>>
>> By gpu scheduler we always have fixed size of commands so jo5 will not overwrite job 2.
>>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Monday, April 20, 2020 4:26 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix
>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>
>> That is actually only a problem because the KIQ uses polling waits.
>>
>> See amdgpu_fence_emit() waits for the oldest possible fence to be signaled before emitting a new one.
>>
>> I suggest that we do the same in amdgpu_fence_emit_polling(). A one liner like the following should be enough:
>>
>> amdgpu_fence_wait_polling(ring, seq - ring->fence_drv.num_fences_mask,
>> timeout);
>>
>> Regards,
>> Christian.
>>
>> Am 20.04.20 um 10:20 schrieb Liu, Monk:
>>> Sure, that's fine
>>>
>>> Do you have particular suggestion for problem 2 ?  how we avoid
>>> commands being overwritten before it's finished
>>>
>>> _____________________________________
>>> Monk Liu|GPU Virtualization Team |AMD
>>>
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Monday, April 20, 2020 4:17 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian
>>> <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>;
>>> Tao, Yintian <Yintian.Tao@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>>
>>>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>>> Yintian patch made the situation absolutely worse. Instead of a whole ring buffer wrap around to overwrite things (1024 dw at least) you now just need to use up 30 dw to trigger undefined behavior and most likely a lockup.
>>>
>>> And as Felix pointed out the patch even writes over the end of the ring buffer and can cause random corruption to whatever there is.
>>>
>>>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
>>> Yes, please do so. But please make also sure that the original patch is reverted before this starts to cause fallout from testers.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 20.04.20 um 09:39 schrieb Liu, Monk:
>>>>>>> Instead of this crude hack please let us just allocate a fixed
>>>>>>> number of write back slots and use them round robin
>>>> It looks doable but really ugly compared with current patch ... and
>>>> more over there we are going to fix the second problem eventually
>>>>
>>>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
>>>> _____________________________________
>>>> Monk Liu|GPU Virtualization Team |AMD
>>>>
>>>>
>>>> -----Original Message-----
>>>> From: Liu, Monk
>>>> Sent: Monday, April 20, 2020 3:37 PM
>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix
>>>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>>>> Cc: amd-gfx@lists.freedesktop.org
>>>> Subject: RE: [PATCH] drm/amdgpu: refine kiq read register
>>>>
>>>>>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>>>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>>>>
>>>>
>>>>
>>>> _____________________________________
>>>> Monk Liu|GPU Virtualization Team |AMD
>>>>
>>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Monday, April 20, 2020 3:19 PM
>>>> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix
>>>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>>>> Cc: amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>>>
>>>> Hi Monk,
>>>>
>>>>> Can we first get the first problem done ?
>>>> Please absolutely not! See the problem introduced here is quite worse than the actual fix.
>>>>
>>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>>>>
>>>> Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.
>>>>
>>>> This should fix both problems at the same time and not introduce another potential problematic hack.
>>>>
>>>> If this patch is already committed please revert it immediately.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 20.04.20 um 08:20 schrieb Liu, Monk:
>>>>> Christian
>>>>>
>>>>>>>> Well I was under the assumption that this is actually what is done here.
>>>>> If that is not the case the patch is a rather clear NAK.
>>>>> <<<
>>>>>
>>>>> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>>>>>
>>>>> The first problem is :
>>>>> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
>>>>> This is the issue yintian's patch to address, by put the readback
>>>>> value not in a shared WB but in a chunk DW of command submit
>>>>>
>>>>> The second problem is:
>>>>> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ
>>>>> is busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>>>>>
>>>>> The optional way is:
>>>>> 1) We use GPU scheduler to manage KIQ activity, and all jobs are
>>>>> submitted  to KIQ through a IB, thus no overwritten will happen
>>>>> 2) we still skip gpu scheduler but always use IB to put jobs on
>>>>> KIQ, thus each JOB will occupy the fixed space/DW of RB, so we can
>>>>> avoid overwrite unfinished command
>>>>>
>>>>> We can discuss the second problem later
>>>>>
>>>>> Can we first get the first problem done ? thanks
>>>>>
>>>>>
>>>>> _____________________________________
>>>>> Monk Liu|GPU Virtualization Team |AMD
>>>>>
>>>>>
>>>>> -----Original Message-----
>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>> Sent: Monday, April 20, 2020 1:03 AM
>>>>> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian
>>>>> <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
>>>>> Cc: amd-gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>>>>
>>>>> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>>>>>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>>>>>> According to the current kiq read register method, there will be
>>>>>>> race condition when using KIQ to read register if multiple
>>>>>>> clients want to read at same time just like the expample below:
>>>>>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>>>>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B
>>>>>>> poll the seqno-1 5. the kiq complete these two read operation 6.
>>>>>>> client-A to read the register at the wb buffer and
>>>>>>>          get REG-1 value
>>>>>>>
>>>>>>> Therefore, directly make kiq write the register value at the ring
>>>>>>> buffer then there will be no race condition for the wb buffer.
>>>>>>>
>>>>>>> v2: supply the read_clock and move the reg_val_offs back
>>>>>>>
>>>>>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>>>>>> ---
>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>>>>>        drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>>>>>        drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>>>>>        drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>>>>>        6 files changed, 33 insertions(+), 40 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>>> index ea576b4260a4..4e1c0239e561 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct
>>>>>>> amdgpu_device *adev,
>>>>>>>        
>>>>>>>        	spin_lock_init(&kiq->ring_lock);
>>>>>>>        
>>>>>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>>>>>> -	if (r)
>>>>>>> -		return r;
>>>>>>> -
>>>>>>>        	ring->adev = NULL;
>>>>>>>        	ring->ring_obj = NULL;
>>>>>>>        	ring->use_doorbell = true; @@ -331,7 +327,6 @@ int
>>>>>>> amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>>>>>        
>>>>>>>        void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>>>>>        {
>>>>>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>>>>>        	amdgpu_ring_fini(ring);
>>>>>>>        }
>>>>>>>        
>>>>>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>>>>        	uint32_t seq;
>>>>>>>        	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>>        	struct amdgpu_ring *ring = &kiq->ring;
>>>>>>> +	uint64_t reg_val_offs = 0;
>>>>>>>        
>>>>>>>        	BUG_ON(!ring->funcs->emit_rreg);
>>>>>>>        
>>>>>>>        	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>>>        	amdgpu_ring_alloc(ring, 32);
>>>>>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>>> I think that should be (ring->wptr + 30) & ring->buf_mask.
>>>>>> Otherwise the reg_val_offset can be past the end of the ring.
>>>>>>
>>>>>> But that still leaves a problem if another command is submitted to
>>>>>> the KIQ before you read the returned reg_val from the ring. Your
>>>>>> reg_val can be overwritten by the new command and you get the
>>>>>> wrong result. Or the command can be overwritten with the reg_val,
>>>>>> which will most likely hang the CP.
>>>>>>
>>>>>> You could allocate space on the KIQ ring with a NOP command to
>>>>>> prevent that space from being overwritten by other commands.
>>>>> Well I was under the assumption that this is actually what is done here.
>>>>> If that is not the case the patch is a rather clear NAK.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> Regards,
>>>>>>         Felix
>>>>>>
>>>>>>
>>>>>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>>>>>        	amdgpu_fence_emit_polling(ring, &seq);
>>>>>>>        	amdgpu_ring_commit(ring);
>>>>>>>        	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7
>>>>>>> +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev,
>>>>>>> +uint32_t reg)
>>>>>>>        	if (cnt > MAX_KIQ_REG_TRY)
>>>>>>>        		goto failed_kiq_read;
>>>>>>>        
>>>>>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>>>>>> +	return ring->ring[reg_val_offs];
>>>>>>>        
>>>>>>>        failed_kiq_read:
>>>>>>>        	pr_err("failed to read reg:%x\n", reg); diff --git
>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>>> index 634746829024..ee698f0246d8 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>>>>>        	struct amdgpu_ring	ring;
>>>>>>>        	struct amdgpu_irq_src	irq;
>>>>>>>        	const struct kiq_pm4_funcs *pmf;
>>>>>>> -	uint32_t			reg_val_offs;
>>>>>>>        };
>>>>>>>        
>>>>>>>        /*
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>>> index f61664ee4940..a3d88f2aa9f4 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>>>>>        	void (*end_use)(struct amdgpu_ring *ring);
>>>>>>>        	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>>>>>        	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>>>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>>>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>>>>>> +			  uint64_t reg_val_offs);
>>>>>>>        	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>>>>>        	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>>>>>        			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@
>>>>>>> struct amdgpu_ring {
>>>>>>>        #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>>>>>        #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>>>>>        #define amdgpu_ring_emit_cntxcntl(r, d)
>>>>>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define
>>>>>>> amdgpu_ring_emit_rreg(r,
>>>>>>> d) (r)->funcs->emit_rreg((r), (d))
>>>>>>> +#define amdgpu_ring_emit_rreg(r, d, o)
>>>>>>> +(r)->funcs->emit_rreg((r), (d), (o))
>>>>>>>        #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>>>>>        #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>>>>>        #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v,
>>>>>>> m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>>>>>        	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>>>>>        }
>>>>>>>        
>>>>>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>>>>> uint32_t reg)
>>>>>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>>> +				     uint64_t reg_val_offs)
>>>>>>>        {
>>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>> -
>>>>>>>        	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>>        	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>>        				(5 << 8) |	/* dst: memory */
>>>>>>>        				(1 << 20));	/* write confirm */
>>>>>>>        	amdgpu_ring_write(ring, reg);
>>>>>>>        	amdgpu_ring_write(ring, 0);
>>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>>> -				kiq->reg_val_offs * 4));
>>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>>> -				kiq->reg_val_offs * 4));
>>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>>> +					      reg_val_offs * 4));
>>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>>> +					      reg_val_offs * 4));
>>>>>>>        }
>>>>>>>        
>>>>>>>        static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring
>>>>>>> *ring, uint32_t reg, diff --git
>>>>>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>>>        		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>>>>>        }
>>>>>>>        
>>>>>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>>>>> uint32_t reg)
>>>>>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>>> +				    uint64_t reg_val_offs)
>>>>>>>        {
>>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>> -
>>>>>>>        	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>>        	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>>        				(5 << 8) |	/* dst: memory */
>>>>>>>        				(1 << 20));	/* write confirm */
>>>>>>>        	amdgpu_ring_write(ring, reg);
>>>>>>>        	amdgpu_ring_write(ring, 0);
>>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>>> -				kiq->reg_val_offs * 4));
>>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>>> -				kiq->reg_val_offs * 4));
>>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>>> +					      reg_val_offs * 4));
>>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>>> +					      reg_val_offs * 4));
>>>>>>>        }
>>>>>>>        
>>>>>>>        static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring
>>>>>>> *ring, uint32_t reg, diff --git
>>>>>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>>> index 84fcf842316d..ff279b1f5c24 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>>>        	uint32_t seq;
>>>>>>>        	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>>        	struct amdgpu_ring *ring = &kiq->ring;
>>>>>>> +	uint64_t reg_val_offs = 0;
>>>>>>>        
>>>>>>>        	BUG_ON(!ring->funcs->emit_rreg);
>>>>>>>        
>>>>>>>        	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>>>        	amdgpu_ring_alloc(ring, 32);
>>>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>>>>        	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>>        	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>>>>>        				(5 << 8) |	/* dst: memory */
>>>>>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>>>        				(1 << 20));	/* write confirm */
>>>>>>>        	amdgpu_ring_write(ring, 0);
>>>>>>>        	amdgpu_ring_write(ring, 0);
>>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>>> -				kiq->reg_val_offs * 4));
>>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>>> -				kiq->reg_val_offs * 4));
>>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>>> +					      reg_val_offs * 4));
>>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>>> +					      reg_val_offs * 4));
>>>>>>>        	amdgpu_fence_emit_polling(ring, &seq);
>>>>>>>        	amdgpu_ring_commit(ring);
>>>>>>>        	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8
>>>>>>> +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct
>>>>>>> +amdgpu_device *adev)
>>>>>>>        	if (cnt > MAX_KIQ_REG_TRY)
>>>>>>>        		goto failed_kiq_read;
>>>>>>>        
>>>>>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>>>>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>>>>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>>>>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>>>>>        
>>>>>>>        failed_kiq_read:
>>>>>>>        	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19
>>>>>>> @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>>>        		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>>>>>        }
>>>>>>>        
>>>>>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>>>>> uint32_t reg)
>>>>>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>>> +				    uint64_t reg_val_offs)
>>>>>>>        {
>>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>> -
>>>>>>>        	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>>        	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>>        				(5 << 8) |	/* dst: memory */
>>>>>>>        				(1 << 20));	/* write confirm */
>>>>>>>        	amdgpu_ring_write(ring, reg);
>>>>>>>        	amdgpu_ring_write(ring, 0);
>>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>>> -				kiq->reg_val_offs * 4));
>>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>>> -				kiq->reg_val_offs * 4));
>>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>>> +					      reg_val_offs * 4));
>>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>>> +					      reg_val_offs * 4));
>>>>>>>        }
>>>>>>>        
>>>>>>>        static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring
>>>>>>> *ring, uint32_t reg,
>>>>>> _______________________________________________
>>>>>> amd-gfx mailing list
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2F
>>>>>> l
>>>>>> i
>>>>>> s
>>>>>> t
>>>>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%
>>>>>> 7
>>>>>> C
>>>>>> m
>>>>>> o
>>>>>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe488
>>>>>> 4
>>>>>> e
>>>>>> 6
>>>>>> 0
>>>>>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfU
>>>>>> F
>>>>>> X
>>>>>> G
>>>>>> J
>>>>>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fli
>>>> s
>>>> t
>>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7C
>>>> M
>>>> o
>>>> nk.Liu%40amd.com%7C3497588e2bd540d132ec08d7e5032cb7%7C3dd8961fe4884e
>>>> 6
>>>> 0
>>>> 8e11a82d994e183d%7C0%7C0%7C637229674118038126&amp;sdata=O4z6sIwD%2FL
>>>> H
>>>> 3
>>>> RSk3bcBRi3RHYLhTeV0An59xBJwqqEQ%3D&amp;reserved=0
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CMo
>> nk.Liu%40amd.com%7Cc53a79bc90a943f3245008d7e507897e%7C3dd8961fe4884e60
>> 8e11a82d994e183d%7C0%7C0%7C637229693602064841&amp;sdata=4Hqe7ucIn6%2Fu
>> eQNLR9EYmx%2BRzw8TZbtlL53qsjNSGFE%3D&amp;reserved=0
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  9:28                       ` Liu, Monk
  2020-04-20 11:21                         ` Christian König
@ 2020-04-20 15:18                         ` Liu, Shaoyun
  1 sibling, 0 replies; 25+ messages in thread
From: Liu, Shaoyun @ 2020-04-20 15:18 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Kuehling, Felix, Tao,  Yintian; +Cc: amd-gfx

[AMD Official Use Only - Internal Distribution Only]

I didn't calculate the exact required DW used for on e kiq read ( currently  use 32 DW) ,  That's assume it's right , Can we just alloc 2 more  here ? Since each kiq_read  will do the allocation , it won't  conflict with other read operation.

Shaoyun.liu
	

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Liu, Monk
Sent: Monday, April 20, 2020 5:28 AM
To: Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amdgpu: refine kiq read register

So why don't we  let gpu scheduler to manage KIQ, and all client of KIQ need to submit jobs within an IB ?

_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Monday, April 20, 2020 4:48 PM
To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq read register

Hi Monk,

yeah, that is certainly problematic. But we have some maximum size for the KIQ submission, don't we?

The only alternative would be to double check the rptr before we allocate space on the ring buffer, but this means quite some additional overhead.

Regards,
Christian.

Am 20.04.20 um 10:36 schrieb Liu, Monk:
> No, that's not true
>
> The problem is not we submit commands on KIQ ring before the emiited 
> ones are signaled , instead it is because we don't have a unified 
> command submit size on KIQ
>
> e.g.:
>
> ---JOB1---JOB2---JOB3---JOB4--->>
> If job1 signaled and job 2/3/4 are running, by (your) design driver is 
> allowed to put job5 on the position of job1. But there is chance that
> job5 take 100DW while job1 only occupy 50dw, thus the job2 will be 
> overwritten by job5\
>
> By gpu scheduler we always have fixed size of commands so jo5 will not overwrite job 2.
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Monday, April 20, 2020 4:26 PM
> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix 
> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>
> That is actually only a problem because the KIQ uses polling waits.
>
> See amdgpu_fence_emit() waits for the oldest possible fence to be signaled before emitting a new one.
>
> I suggest that we do the same in amdgpu_fence_emit_polling(). A one liner like the following should be enough:
>
> amdgpu_fence_wait_polling(ring, seq - ring->fence_drv.num_fences_mask, 
> timeout);
>
> Regards,
> Christian.
>
> Am 20.04.20 um 10:20 schrieb Liu, Monk:
>> Sure, that's fine
>>
>> Do you have particular suggestion for problem 2 ?  how we avoid 
>> commands being overwritten before it's finished
>>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Monday, April 20, 2020 4:17 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Koenig, Christian 
>> <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; 
>> Tao, Yintian <Yintian.Tao@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>
>>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>> Yintian patch made the situation absolutely worse. Instead of a whole ring buffer wrap around to overwrite things (1024 dw at least) you now just need to use up 30 dw to trigger undefined behavior and most likely a lockup.
>>
>> And as Felix pointed out the patch even writes over the end of the ring buffer and can cause random corruption to whatever there is.
>>
>>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
>> Yes, please do so. But please make also sure that the original patch is reverted before this starts to cause fallout from testers.
>>
>> Regards,
>> Christian.
>>
>> Am 20.04.20 um 09:39 schrieb Liu, Monk:
>>>>>> Instead of this crude hack please let us just allocate a fixed 
>>>>>> number of write back slots and use them round robin
>>> It looks doable but really ugly compared with current patch ... and 
>>> more over there we are going to fix the second problem eventually
>>>
>>> What about let Yintian to provide  one patch to address all those two problem ? so way what you worried about won't happen ?
>>> _____________________________________
>>> Monk Liu|GPU Virtualization Team |AMD
>>>
>>>
>>> -----Original Message-----
>>> From: Liu, Monk
>>> Sent: Monday, April 20, 2020 3:37 PM
>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix 
>>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH] drm/amdgpu: refine kiq read register
>>>
>>>>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>>> Yintian's patch has nothing to do with the result you mentioned .... the command being overwritten by new initiated commands is a inherent bug, why you put those two stuff together ?
>>>
>>>
>>>
>>> _____________________________________
>>> Monk Liu|GPU Virtualization Team |AMD
>>>
>>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Monday, April 20, 2020 3:19 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix 
>>> <Felix.Kuehling@amd.com>; Tao, Yintian <Yintian.Tao@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>>
>>> Hi Monk,
>>>
>>>> Can we first get the first problem done ?
>>> Please absolutely not! See the problem introduced here is quite worse than the actual fix.
>>>
>>> Previously we ended up with an invalid value in a concurrent register read, now the KIQs overwrites its own commands and most likely causes a hang or the hardware to execute something random.
>>>
>>> Instead of this crude hack please let us just allocate a fixed number of write back slots and use them round robin. Then we can make sure that we don't have more than a fixed number of reads in flight at the same time as well using the fence values.
>>>
>>> This should fix both problems at the same time and not introduce another potential problematic hack.
>>>
>>> If this patch is already committed please revert it immediately.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 20.04.20 um 08:20 schrieb Liu, Monk:
>>>> Christian
>>>>
>>>>>>> Well I was under the assumption that this is actually what is done here.
>>>> If that is not the case the patch is a rather clear NAK.
>>>> <<<
>>>>
>>>> There are two kinds of problems in the current KIQ reading reg, Yintian tend to fix one of them but not all ...
>>>>
>>>> The first problem is :
>>>> During the sleep of the first KIQ reading, another KIQ reading initiated an the read back register value flushed the first readback value, thus the first reading will get the wrong result.
>>>> This is the issue yintian's patch to address, by put the readback 
>>>> value not in a shared WB but in a chunk DW of command submit
>>>>
>>>> The second problem is:
>>>> Since we don't utilize GPU scheduler for KIQ submit thus if the KIQ 
>>>> is busy with some commands then those unfinished commands maybe overwritten by a new command submit, and that's not the Problem yintian's patch tend to address. Felex pointed it out which is fine and we can use another patch to address it, I'm also planning and scoping it.
>>>>
>>>> The optional way is:
>>>> 1) We use GPU scheduler to manage KIQ activity, and all jobs are 
>>>> submitted  to KIQ through a IB, thus no overwritten will happen
>>>> 2) we still skip gpu scheduler but always use IB to put jobs on 
>>>> KIQ, thus each JOB will occupy the fixed space/DW of RB, so we can 
>>>> avoid overwrite unfinished command
>>>>
>>>> We can discuss the second problem later
>>>>
>>>> Can we first get the first problem done ? thanks
>>>>
>>>>
>>>> _____________________________________
>>>> Monk Liu|GPU Virtualization Team |AMD
>>>>
>>>>
>>>> -----Original Message-----
>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>> Sent: Monday, April 20, 2020 1:03 AM
>>>> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Tao, Yintian 
>>>> <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>
>>>> Cc: amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq read register
>>>>
>>>> Am 17.04.20 um 17:39 schrieb Felix Kuehling:
>>>>> Am 2020-04-17 um 2:53 a.m. schrieb Yintian Tao:
>>>>>> According to the current kiq read register method, there will be 
>>>>>> race condition when using KIQ to read register if multiple 
>>>>>> clients want to read at same time just like the expample below:
>>>>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>>>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B 
>>>>>> poll the seqno-1 5. the kiq complete these two read operation 6.
>>>>>> client-A to read the register at the wb buffer and
>>>>>>         get REG-1 value
>>>>>>
>>>>>> Therefore, directly make kiq write the register value at the ring 
>>>>>> buffer then there will be no race condition for the wb buffer.
>>>>>>
>>>>>> v2: supply the read_clock and move the reg_val_offs back
>>>>>>
>>>>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>>>>> ---
>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++------
>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
>>>>>>       drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 14 +++++-------
>>>>>>       drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 14 +++++-------
>>>>>>       drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 28 ++++++++++++------------
>>>>>>       6 files changed, 33 insertions(+), 40 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> index ea576b4260a4..4e1c0239e561 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> @@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct 
>>>>>> amdgpu_device *adev,
>>>>>>       
>>>>>>       	spin_lock_init(&kiq->ring_lock);
>>>>>>       
>>>>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>>>>> -	if (r)
>>>>>> -		return r;
>>>>>> -
>>>>>>       	ring->adev = NULL;
>>>>>>       	ring->ring_obj = NULL;
>>>>>>       	ring->use_doorbell = true; @@ -331,7 +327,6 @@ int 
>>>>>> amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>>>>       
>>>>>>       void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>>>>>       {
>>>>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>>>>       	amdgpu_ring_fini(ring);
>>>>>>       }
>>>>>>       
>>>>>> @@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>>>       	uint32_t seq;
>>>>>>       	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>       	struct amdgpu_ring *ring = &kiq->ring;
>>>>>> +	uint64_t reg_val_offs = 0;
>>>>>>       
>>>>>>       	BUG_ON(!ring->funcs->emit_rreg);
>>>>>>       
>>>>>>       	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>>       	amdgpu_ring_alloc(ring, 32);
>>>>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>> I think that should be (ring->wptr + 30) & ring->buf_mask.
>>>>> Otherwise the reg_val_offset can be past the end of the ring.
>>>>>
>>>>> But that still leaves a problem if another command is submitted to 
>>>>> the KIQ before you read the returned reg_val from the ring. Your 
>>>>> reg_val can be overwritten by the new command and you get the 
>>>>> wrong result. Or the command can be overwritten with the reg_val, 
>>>>> which will most likely hang the CP.
>>>>>
>>>>> You could allocate space on the KIQ ring with a NOP command to 
>>>>> prevent that space from being overwritten by other commands.
>>>> Well I was under the assumption that this is actually what is done here.
>>>> If that is not the case the patch is a rather clear NAK.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Regards,
>>>>>        Felix
>>>>>
>>>>>
>>>>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>>>>       	amdgpu_fence_emit_polling(ring, &seq);
>>>>>>       	amdgpu_ring_commit(ring);
>>>>>>       	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -707,7
>>>>>> +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, 
>>>>>> +uint32_t reg)
>>>>>>       	if (cnt > MAX_KIQ_REG_TRY)
>>>>>>       		goto failed_kiq_read;
>>>>>>       
>>>>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>>>>> +	return ring->ring[reg_val_offs];
>>>>>>       
>>>>>>       failed_kiq_read:
>>>>>>       	pr_err("failed to read reg:%x\n", reg); diff --git 
>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> index 634746829024..ee698f0246d8 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>>>> @@ -103,7 +103,6 @@ struct amdgpu_kiq {
>>>>>>       	struct amdgpu_ring	ring;
>>>>>>       	struct amdgpu_irq_src	irq;
>>>>>>       	const struct kiq_pm4_funcs *pmf;
>>>>>> -	uint32_t			reg_val_offs;
>>>>>>       };
>>>>>>       
>>>>>>       /*
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> index f61664ee4940..a3d88f2aa9f4 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>>>>       	void (*end_use)(struct amdgpu_ring *ring);
>>>>>>       	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>>>>       	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +			  uint64_t reg_val_offs);
>>>>>>       	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>>>>       	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>>>>       			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ 
>>>>>> struct amdgpu_ring {
>>>>>>       #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>>>>       #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>>>>       #define amdgpu_ring_emit_cntxcntl(r, d) 
>>>>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define 
>>>>>> amdgpu_ring_emit_rreg(r,
>>>>>> d) (r)->funcs->emit_rreg((r), (d))
>>>>>> +#define amdgpu_ring_emit_rreg(r, d, o) 
>>>>>> +(r)->funcs->emit_rreg((r), (d), (o))
>>>>>>       #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>>>>       #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>>>>       #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v,
>>>>>> m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) 
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> index 0a03e2ad5d95..7c9a5e440509 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>>>> @@ -7594,21 +7594,19 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>>>>       	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>>>>>       }
>>>>>>       
>>>>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>>>> uint32_t reg)
>>>>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +				     uint64_t reg_val_offs)
>>>>>>       {
>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>> -
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, reg);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       }
>>>>>>       
>>>>>>       static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring 
>>>>>> *ring, uint32_t reg, diff --git 
>>>>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> index fc6c2f2bc76c..8e7eee7838e0 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>>>> @@ -6383,21 +6383,19 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>>       		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>>>>>       }
>>>>>>       
>>>>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>>>> uint32_t reg)
>>>>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +				    uint64_t reg_val_offs)
>>>>>>       {
>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>> -
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, reg);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       }
>>>>>>       
>>>>>>       static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring 
>>>>>> *ring, uint32_t reg, diff --git 
>>>>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> index 84fcf842316d..ff279b1f5c24 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>>>> @@ -4046,11 +4046,13 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>>       	uint32_t seq;
>>>>>>       	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>       	struct amdgpu_ring *ring = &kiq->ring;
>>>>>> +	uint64_t reg_val_offs = 0;
>>>>>>       
>>>>>>       	BUG_ON(!ring->funcs->emit_rreg);
>>>>>>       
>>>>>>       	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>>       	amdgpu_ring_alloc(ring, 32);
>>>>>> +	reg_val_offs = (ring->wptr & ring->buf_mask) + 30;
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>> @@ -4058,10 +4060,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       	amdgpu_fence_emit_polling(ring, &seq);
>>>>>>       	amdgpu_ring_commit(ring);
>>>>>>       	spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -4088,8
>>>>>> +4090,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct 
>>>>>> +amdgpu_device *adev)
>>>>>>       	if (cnt > MAX_KIQ_REG_TRY)
>>>>>>       		goto failed_kiq_read;
>>>>>>       
>>>>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>>>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>>>>> +	return (uint64_t)ring->ring[reg_val_offs] |
>>>>>> +		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
>>>>>>       
>>>>>>       failed_kiq_read:
>>>>>>       	pr_err("failed to read gpu clock\n"); @@ -5482,21 +5484,19 
>>>>>> @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>>>       		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>>>>>       }
>>>>>>       
>>>>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>>>> uint32_t reg)
>>>>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>>>> +				    uint64_t reg_val_offs)
>>>>>>       {
>>>>>> -	struct amdgpu_device *adev = ring->adev;
>>>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>> -
>>>>>>       	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>>>       	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>>>>       				(5 << 8) |	/* dst: memory */
>>>>>>       				(1 << 20));	/* write confirm */
>>>>>>       	amdgpu_ring_write(ring, reg);
>>>>>>       	amdgpu_ring_write(ring, 0);
>>>>>> -	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> -	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>>>> -				kiq->reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>> +	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
>>>>>> +					      reg_val_offs * 4));
>>>>>>       }
>>>>>>       
>>>>>>       static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring 
>>>>>> *ring, uint32_t reg,
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2F
>>>>> l
>>>>> i
>>>>> s
>>>>> t
>>>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%
>>>>> 7
>>>>> C
>>>>> m
>>>>> o
>>>>> nk.liu%40amd.com%7Cadc62a682782487cc88f08d7e4838ff3%7C3dd8961fe488
>>>>> 4
>>>>> e
>>>>> 6
>>>>> 0
>>>>> 8e11a82d994e183d%7C0%7C0%7C637229126019563305&amp;sdata=fK5riNpxfU
>>>>> F
>>>>> X
>>>>> G
>>>>> J
>>>>> YjFzfkMETFJX6s6rntpu7CdjRhFDU%3D&amp;reserved=0
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fli
>>> s
>>> t
>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7C
>>> M
>>> o
>>> nk.Liu%40amd.com%7C3497588e2bd540d132ec08d7e5032cb7%7C3dd8961fe4884e
>>> 6
>>> 0
>>> 8e11a82d994e183d%7C0%7C0%7C637229674118038126&amp;sdata=O4z6sIwD%2FL
>>> H
>>> 3
>>> RSk3bcBRi3RHYLhTeV0An59xBJwqqEQ%3D&amp;reserved=0
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CMo
> nk.Liu%40amd.com%7Cc53a79bc90a943f3245008d7e507897e%7C3dd8961fe4884e60
> 8e11a82d994e183d%7C0%7C0%7C637229693602064841&amp;sdata=4Hqe7ucIn6%2Fu
> eQNLR9EYmx%2BRzw8TZbtlL53qsjNSGFE%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CShaoyun.Liu%40amd.com%7C86fbb57f600c4820f36a08d7e50d266d%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637229717757099134&amp;sdata=FZ7P5J6NyB8%2BQxRfsF3SABp5Qi6%2F5d9lIEg98S3fhjM%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq read register
  2020-04-20  7:42         ` Tao, Yintian
@ 2020-04-20 16:42           ` Felix Kuehling
  0 siblings, 0 replies; 25+ messages in thread
From: Felix Kuehling @ 2020-04-20 16:42 UTC (permalink / raw)
  To: Tao, Yintian, Koenig, Christian, Liu, Monk; +Cc: amd-gfx

Am 2020-04-20 um 3:42 a.m. schrieb Tao, Yintian:
> Hi  Felix
>
> I have one question about function kgd_gfx_v9_hiq_mqd_load(). I see it directly write contents into kiq ring and not wait for the fence. Do you know how KFD know the hiq_mqd_load complete? Thanks in advance.

That's probably a bug. I think we can get away with it, because the HIQ
is only used once the first user mode
KFD process is started.

Worst case, KFD will submit something to the HIQ before it's mapped, and
the doorbell update will be missed. So KFD will think the HIQ is hanging
and trigger a GPU reset.

Regards,
  Felix


>
>
>
> Best Regards
> Yintian Tao
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH] drm/amdgpu: refine kiq read register
@ 2020-04-17  6:36 Yintian Tao
  0 siblings, 0 replies; 25+ messages in thread
From: Yintian Tao @ 2020-04-17  6:36 UTC (permalink / raw)
  To: monk.liu; +Cc: amd-gfx, Yintian Tao

According to the current kiq read register method,
there will be race condition when using KIQ to read
register if multiple clients want to read at same time
just like the expample below:
1. client-A start to read REG-0 throguh KIQ
2. client-A poll the seqno-0
3. client-B start to read REG-1 through KIQ
4. client-B poll the seqno-1
5. the kiq complete these two read operation
6. client-A to read the register at the wb buffer and
   get REG-1 value

Therefore, directly make kiq write the register value at
the ring buffer then there will be no race condition for
the wb buffer.

Signed-off-by: Yintian Tao <yttao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 11 ++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 +++--
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 11 ++++++-----
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 12 ++++++------
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 25 ++++++++++++------------
 6 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index ea576b4260a4..1253dd1ba42c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -304,10 +304,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 
 	spin_lock_init(&kiq->ring_lock);
 
-	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
-	if (r)
-		return r;
-
 	ring->adev = NULL;
 	ring->ring_obj = NULL;
 	ring->use_doorbell = true;
@@ -331,7 +327,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 
 void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
 {
-	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
 	amdgpu_ring_fini(ring);
 }
 
@@ -675,12 +670,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 	uint32_t seq;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
+	uint64_t reg_val_offs = 0;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
 	amdgpu_ring_alloc(ring, 32);
-	amdgpu_ring_emit_rreg(ring, reg);
+	reg_val_offs = (ring->wptr & ring->buf_mask) + 16;
+	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&kiq->ring_lock, flags);
@@ -707,7 +704,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return adev->wb.wb[kiq->reg_val_offs];
+	return ring->ring[reg_val_offs];
 
 failed_kiq_read:
 	pr_err("failed to read reg:%x\n", reg);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 634746829024..ee698f0246d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -103,7 +103,6 @@ struct amdgpu_kiq {
 	struct amdgpu_ring	ring;
 	struct amdgpu_irq_src	irq;
 	const struct kiq_pm4_funcs *pmf;
-	uint32_t			reg_val_offs;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index f61664ee4940..a3d88f2aa9f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
 	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
-	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
+	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
+			  uint64_t reg_val_offs);
 	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
 	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
 			      uint32_t val, uint32_t mask);
@@ -265,7 +266,7 @@ struct amdgpu_ring {
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
-#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
+#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), (o))
 #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
 #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
 #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 0a03e2ad5d95..5873e56341f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -7594,7 +7594,8 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
 	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
 }
 
-static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				     uint64_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
@@ -7605,10 +7606,10 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 }
 
 static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index fc6c2f2bc76c..29f45495213f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
 }
 
-static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint64_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
@@ -6394,10 +6394,10 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 }
 
 static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 84fcf842316d..02d6ba0f2963 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4046,6 +4046,7 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	uint32_t seq;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
+	uint64_t reg_val_offs = 0;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
@@ -4058,10 +4059,10 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&kiq->ring_lock, flags);
@@ -4088,8 +4089,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
-		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
+	return (uint64_t)ring->ring[reg_val_offs] |
+		(uint64_t)ring->ring[reg_val_offs + 1 ] << 32ULL;
 
 failed_kiq_read:
 	pr_err("failed to read gpu clock\n");
@@ -5482,10 +5483,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
 }
 
-static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint64_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
@@ -5493,10 +5494,10 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 				(1 << 20));	/* write confirm */
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
-	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+	amdgpu_ring_write(ring, lower_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
+	amdgpu_ring_write(ring, upper_32_bits(ring->gpu_addr +
+					      reg_val_offs * 4));
 }
 
 static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2020-04-20 16:42 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-17  6:53 [PATCH] drm/amdgpu: refine kiq read register Yintian Tao
2020-04-17  7:01 ` Liu, Monk
2020-04-17  8:58   ` Christian König
2020-04-17  9:06     ` Liu, Monk
2020-04-17  9:13       ` Christian König
2020-04-17  9:39         ` Liu, Monk
2020-04-17 11:39           ` Tao, Yintian
2020-04-17 15:39 ` Felix Kuehling
2020-04-19 17:03   ` Christian König
2020-04-20  6:20     ` Liu, Monk
2020-04-20  7:19       ` Christian König
2020-04-20  7:37         ` Liu, Monk
2020-04-20  7:39           ` Liu, Monk
2020-04-20  8:16             ` Christian König
2020-04-20  8:20               ` Liu, Monk
2020-04-20  8:25                 ` Christian König
2020-04-20  8:36                   ` Liu, Monk
2020-04-20  8:47                     ` Christian König
2020-04-20  9:28                       ` Liu, Monk
2020-04-20 11:21                         ` Christian König
2020-04-20 15:18                         ` Liu, Shaoyun
2020-04-20  7:42         ` Tao, Yintian
2020-04-20 16:42           ` Felix Kuehling
2020-04-20  4:16   ` Tao, Yintian
  -- strict thread matches above, loose matches on Subject: below --
2020-04-17  6:36 Yintian Tao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.