All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: refine kiq access register
@ 2020-04-22  4:42 Yintian Tao
  2020-04-22  7:11 ` Tao, Yintian
  0 siblings, 1 reply; 17+ messages in thread
From: Yintian Tao @ 2020-04-22  4:42 UTC (permalink / raw)
  To: christian.koenig, monk.liu; +Cc: amd-gfx, Yintian Tao

With the current KIQ register access method, there is
a race condition when multiple clients use KIQ to read
registers at the same time, just like the example below:
1. client-A starts to read REG-0 through KIQ
2. client-A polls the seqno-0
3. client-B starts to read REG-1 through KIQ
4. client-B polls the seqno-1
5. the KIQ completes these two read operations
6. client-A reads the register at the wb buffer and
   gets the REG-1 value

In addition, if multiple clients frequently write
registers through KIQ, the KIQ ring buffer may be
overwritten.

Therefore, allocate a fixed number of wb slots for rreg
use and limit the number of submissions according to the
KIQ ring_size in order to prevent the ring buffer from
being overwritten.

Signed-off-by: Yintian Tao <yttao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   7 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  12 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  12 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 129 ++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  13 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |   8 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   8 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  34 +++--
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  12 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  12 +-
 12 files changed, 211 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4e1d4cfe7a9f..4530e0de4257 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
 /*
  * Writeback
  */
-#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
+#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
 
 struct amdgpu_wb {
 	struct amdgpu_bo	*wb_obj;
@@ -1028,6 +1028,11 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
 
 int emu_soc_asic_init(struct amdgpu_device *adev);
 
+int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read);
+void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq);
+
+void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs);
+void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
 /*
  * Registers read & write functions.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..034c9f416499 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -309,6 +309,7 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 			    uint32_t doorbell_off)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
 	struct v10_compute_mqd *m;
 	uint32_t mec, pipe;
@@ -324,13 +325,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
 		 mec, pipe, queue_id);
 
-	spin_lock(&adev->gfx.kiq.ring_lock);
+	r = amdgpu_gfx_kiq_lock(kiq, false);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto out_unlock;
+	}
+
 	r = amdgpu_ring_alloc(kiq_ring, 7);
 	if (r) {
 		pr_err("Failed to alloc KIQ (%d).\n", r);
 		goto out_unlock;
 	}
 
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
 	amdgpu_ring_write(kiq_ring,
 			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
@@ -350,8 +357,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
 	amdgpu_ring_commit(kiq_ring);
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 out_unlock:
-	spin_unlock(&adev->gfx.kiq.ring_lock);
+	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
 	release_queue(kgd);
 
 	return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index df841c2ac5e7..f243d9990ced 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -307,6 +307,7 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 			    uint32_t doorbell_off)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
 	struct v9_mqd *m;
 	uint32_t mec, pipe;
@@ -322,13 +323,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
 		 mec, pipe, queue_id);
 
-	spin_lock(&adev->gfx.kiq.ring_lock);
+	r = amdgpu_gfx_kiq_lock(kiq, false);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto out_unlock;
+	}
+
 	r = amdgpu_ring_alloc(kiq_ring, 7);
 	if (r) {
 		pr_err("Failed to alloc KIQ (%d).\n", r);
 		goto out_unlock;
 	}
 
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
 	amdgpu_ring_write(kiq_ring,
 			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
@@ -348,8 +355,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
 	amdgpu_ring_commit(kiq_ring);
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 out_unlock:
-	spin_unlock(&adev->gfx.kiq.ring_lock);
+	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
 	release_queue(kgd);
 
 	return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index ea576b4260a4..c6b1c77b5eac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -295,23 +295,99 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
 	return -EINVAL;
 }
 
+void amdgpu_gfx_kiq_reg_offs_get(struct amdgpu_kiq *kiq,
+				 uint32_t *offset)
+{
+	uint32_t tmp = 0;
+
+	tmp = find_first_zero_bit(kiq->used,
+				      kiq->num_slot);
+	if (tmp < kiq->num_slot) {
+		__set_bit(tmp, kiq->used);
+		*offset = kiq->reg_val_offs[tmp];
+	}
+}
+
+void amdgpu_gfx_kiq_reg_offs_free(struct amdgpu_kiq *kiq,
+				 uint32_t offset)
+{
+	uint32_t tmp = 0;
+
+	tmp = (offset - kiq->reg_val_offs[0]) >> 3;
+	if (tmp < kiq->num_slot)
+		__clear_bit(tmp, kiq->used);
+}
+
+int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read)
+{
+	int retry = MAX_KIQ_REG_TRY;
+
+	BUG_ON(in_interrupt());
+	while (retry-- > 0) {
+		spin_lock(&kiq->ring_lock);
+		if ((atomic64_read(&kiq->submit_avail) > 0) &&
+		    (read ? find_first_zero_bit(kiq->used, kiq->num_slot) <
+		     kiq->num_slot : 1)) {
+				break;
+		} else {
+			spin_unlock(&kiq->ring_lock);
+			msleep(2);
+			continue;
+		}
+
+	}
+
+	if (retry > 0)
+		return 0;
+	else
+		return -ETIME;
+}
+
+void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq)
+{
+	spin_unlock(&kiq->ring_lock);
+}
+
+void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs)
+{
+	atomic64_dec(&kiq->submit_avail);
+	if (offs)
+		amdgpu_gfx_kiq_reg_offs_get(kiq, offs);
+
+}
+
+void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs)
+{
+	atomic64_inc(&kiq->submit_avail);
+	if (offs)
+		amdgpu_gfx_kiq_reg_offs_free(kiq, *offs);
+}
+
 int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 			     struct amdgpu_ring *ring,
 			     struct amdgpu_irq_src *irq)
 {
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
-	int r = 0;
+	int r = 0, i, j;
 
 	spin_lock_init(&kiq->ring_lock);
 
-	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
-	if (r)
-		return r;
+	for (i = 0; i < MAX_KIQ_RREG_NUM; i++) {
+		r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs[i]);
+		if (r) {
+			for (j = 0; j < i; j++)
+				amdgpu_device_wb_free(adev, kiq->reg_val_offs[j]);
+			return r;
+		}
+	}
 
 	ring->adev = NULL;
 	ring->ring_obj = NULL;
 	ring->use_doorbell = true;
 	ring->doorbell_index = adev->doorbell_index.kiq;
+	kiq->num_slot = MAX_KIQ_RREG_NUM;
+	memset(&kiq->used, 0, sizeof(kiq->used));
+
 
 	r = amdgpu_gfx_kiq_acquire(adev, ring);
 	if (r)
@@ -325,13 +401,20 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 			     AMDGPU_RING_PRIO_DEFAULT);
 	if (r)
 		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
+	else
+		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
+			     (ring->funcs->align_mask + 1));
 
 	return r;
 }
 
 void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
 {
-	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
+	struct amdgpu_kiq *kiq = &ring->adev->gfx.kiq;
+	int i = 0;
+
+	for (i = 0; i < MAX_KIQ_RREG_NUM; i++)
+		amdgpu_device_wb_free(ring->adev, kiq->reg_val_offs[i]);
 	amdgpu_ring_fini(ring);
 }
 
@@ -671,19 +754,24 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
 uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 {
 	signed long r, cnt = 0;
-	unsigned long flags;
-	uint32_t seq;
+	uint32_t seq, reg_val_offs = 0, value = 0;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, true);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto kiq_read_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
 	amdgpu_ring_alloc(ring, 32);
-	amdgpu_ring_emit_rreg(ring, reg);
+	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -707,9 +795,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return adev->wb.wb[kiq->reg_val_offs];
+	mb();
+	value = adev->wb.wb[reg_val_offs];
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+	return value;
 
 failed_kiq_read:
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+kiq_read_exit:
 	pr_err("failed to read reg:%x\n", reg);
 	return ~0;
 }
@@ -717,19 +810,24 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 {
 	signed long r, cnt = 0;
-	unsigned long flags;
 	uint32_t seq;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
 
 	BUG_ON(!ring->funcs->emit_wreg);
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, false);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto kiq_write_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_emit_wreg(ring, reg, v);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -754,8 +852,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_write;
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 	return;
 
 failed_kiq_write:
+	amdgpu_gfx_kiq_restore(kiq, NULL);
+kiq_write_exit:
 	pr_err("failed to write reg:%x\n", reg);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 634746829024..bb05594c27e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
 	int invalidate_tlbs_size;
 };
 
+#define MAX_KIQ_RREG_NUM 64
 struct amdgpu_kiq {
 	u64			eop_gpu_addr;
 	struct amdgpu_bo	*eop_obj;
@@ -103,7 +104,10 @@ struct amdgpu_kiq {
 	struct amdgpu_ring	ring;
 	struct amdgpu_irq_src	irq;
 	const struct kiq_pm4_funcs *pmf;
-	uint32_t			reg_val_offs;
+	uint32_t		reg_val_offs[MAX_KIQ_RREG_NUM];
+	atomic64_t		submit_avail;
+	uint32_t		num_slot;
+	unsigned long		used[DIV_ROUND_UP(MAX_KIQ_RREG_NUM, BITS_PER_LONG)];
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index f61664ee4940..b0ea4f67a6e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
 	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
-	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
+	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
+			  uint32_t reg_val_offs);
 	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
 	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
 			      uint32_t val, uint32_t mask);
@@ -265,7 +266,7 @@ struct amdgpu_ring {
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
-#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
+#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), (o))
 #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
 #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
 #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
@@ -308,6 +309,7 @@ static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)
 {
 	if (ring->count_dw <= 0)
 		DRM_ERROR("amdgpu: writing more dwords to the ring than expected!\n");
+
 	ring->ring[ring->wptr++ & ring->buf_mask] = v;
 	ring->wptr &= ring->ptr_mask;
 	ring->count_dw--;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 8c10084f44ef..dd54ea802a9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	unsigned long flags;
 	uint32_t seq;
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, false);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto failed_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
 					    ref, mask);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq;
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 	return;
 
 failed_kiq:
+	amdgpu_gfx_kiq_restore(kiq, NULL);
+failed_exit:
 	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 0a03e2ad5d95..7853dbc3c8bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
 	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
 }
 
-static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				     uint32_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
@@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 }
 
 static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index fc6c2f2bc76c..bdbd92d4fe45 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
 }
 
-static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint32_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
@@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 }
 
 static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 84fcf842316d..f7dd1036986e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4042,14 +4042,20 @@ static int gfx_v9_0_soft_reset(void *handle)
 static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 {
 	signed long r, cnt = 0;
-	unsigned long flags;
-	uint32_t seq;
+	uint32_t seq, reg_val_offs;
+	uint64_t value = 0;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, true);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto failed_kiq_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 9 |	/* src: register*/
@@ -4059,12 +4065,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -4088,10 +4094,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
-		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
+	value = (uint64_t)adev->wb.wb[reg_val_offs] |
+		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
 
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+	return value;
 failed_kiq_read:
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+failed_kiq_exit:
 	pr_err("failed to read gpu clock\n");
 	return ~0;
 }
@@ -5482,10 +5492,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
 }
 
-static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint32_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
@@ -5494,9 +5504,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 }
 
 static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 30b75d79efdb..6b4dc0ed4fff 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -422,20 +422,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
-		spin_lock(&adev->gfx.kiq.ring_lock);
+		r = amdgpu_gfx_kiq_lock(kiq, false);
+		if (r) {
+			pr_err("failed to lock kiq\n");
+			return -ETIME;
+		}
+
 		/* 2 dwords flush + 8 dwords fence */
 		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
+		amdgpu_gfx_kiq_consume(kiq, NULL);
 		kiq->pmf->kiq_invalidate_tlbs(ring,
 					pasid, flush_type, all_hub);
 		amdgpu_fence_emit_polling(ring, &seq);
 		amdgpu_ring_commit(ring);
-		spin_unlock(&adev->gfx.kiq.ring_lock);
+		amdgpu_gfx_kiq_unlock(kiq);
 		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
+			amdgpu_gfx_kiq_restore(kiq, false);
 			return -ETIME;
 		}
 
+		amdgpu_gfx_kiq_restore(kiq, NULL);
 		return 0;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index fecdbc471983..6238702c80b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -613,7 +613,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 		if (vega20_xgmi_wa)
 			ndw += kiq->pmf->invalidate_tlbs_size;
 
-		spin_lock(&adev->gfx.kiq.ring_lock);
+		r = amdgpu_gfx_kiq_lock(kiq, false);
+		if (r) {
+			pr_err("failed to lock kiq\n");
+			return -ETIME;
+		}
+
+		amdgpu_gfx_kiq_consume(kiq, NULL);
 		/* 2 dwords flush + 8 dwords fence */
 		amdgpu_ring_alloc(ring, ndw);
 		if (vega20_xgmi_wa)
@@ -623,13 +629,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 					pasid, flush_type, all_hub);
 		amdgpu_fence_emit_polling(ring, &seq);
 		amdgpu_ring_commit(ring);
-		spin_unlock(&adev->gfx.kiq.ring_lock);
+		amdgpu_gfx_kiq_unlock(kiq);
 		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
+			amdgpu_gfx_kiq_restore(kiq, NULL);
 			return -ETIME;
 		}
 
+		amdgpu_gfx_kiq_restore(kiq, NULL);
 		return 0;
 	}
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22  4:42 [PATCH] drm/amdgpu: refine kiq access register Yintian Tao
@ 2020-04-22  7:11 ` Tao, Yintian
  2020-04-22  7:23   ` Christian König
  0 siblings, 1 reply; 17+ messages in thread
From: Tao, Yintian @ 2020-04-22  7:11 UTC (permalink / raw)
  To: Tao, Yintian, Koenig, Christian, Liu, Monk, Liu, Shaoyun,
	Kuehling, Felix
  Cc: amd-gfx

Add Felix and Shaoyun

-----Original Message-----
From: Yintian Tao <yttao@amd.com> 
Sent: 2020年4月22日 12:42
To: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>
Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
Subject: [PATCH] drm/amdgpu: refine kiq access register

According to the current kiq access register method, there will be race condition when using KIQ to read register if multiple clients want to read at same time just like the expample below:
1. client-A start to read REG-0 throguh KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
   get REG-1 value

And if there are multiple clients to frequently write registers through KIQ which may raise the KIQ ring buffer overwritten problem.

Therefore, allocate fixed number wb slot for rreg use and limit the submit number which depends on the kiq ring_size in order to prevent the overwritten problem.

Signed-off-by: Yintian Tao <yttao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   7 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  12 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  12 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 129 ++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  13 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |   8 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   8 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  34 +++--
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  12 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  12 +-
 12 files changed, 211 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4e1d4cfe7a9f..4530e0de4257 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
 /*
  * Writeback
  */
-#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
+#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
 
 struct amdgpu_wb {
 	struct amdgpu_bo	*wb_obj;
@@ -1028,6 +1028,11 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
 
 int emu_soc_asic_init(struct amdgpu_device *adev);
 
+int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read); void 
+amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq);
+
+void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs); 
+void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
 /*
  * Registers read & write functions.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..034c9f416499 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -309,6 +309,7 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 			    uint32_t doorbell_off)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
 	struct v10_compute_mqd *m;
 	uint32_t mec, pipe;
@@ -324,13 +325,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
 		 mec, pipe, queue_id);
 
-	spin_lock(&adev->gfx.kiq.ring_lock);
+	r = amdgpu_gfx_kiq_lock(kiq, false);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto out_unlock;
+	}
+
 	r = amdgpu_ring_alloc(kiq_ring, 7);
 	if (r) {
 		pr_err("Failed to alloc KIQ (%d).\n", r);
 		goto out_unlock;
 	}
 
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
 	amdgpu_ring_write(kiq_ring,
 			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8 +357,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
 	amdgpu_ring_commit(kiq_ring);
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 out_unlock:
-	spin_unlock(&adev->gfx.kiq.ring_lock);
+	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
 	release_queue(kgd);
 
 	return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index df841c2ac5e7..f243d9990ced 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -307,6 +307,7 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 			    uint32_t doorbell_off)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
 	struct v9_mqd *m;
 	uint32_t mec, pipe;
@@ -322,13 +323,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
 		 mec, pipe, queue_id);
 
-	spin_lock(&adev->gfx.kiq.ring_lock);
+	r = amdgpu_gfx_kiq_lock(kiq, false);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto out_unlock;
+	}
+
 	r = amdgpu_ring_alloc(kiq_ring, 7);
 	if (r) {
 		pr_err("Failed to alloc KIQ (%d).\n", r);
 		goto out_unlock;
 	}
 
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
 	amdgpu_ring_write(kiq_ring,
 			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8 +355,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
 	amdgpu_ring_commit(kiq_ring);
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 out_unlock:
-	spin_unlock(&adev->gfx.kiq.ring_lock);
+	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
 	release_queue(kgd);
 
 	return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index ea576b4260a4..c6b1c77b5eac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -295,23 +295,99 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
 	return -EINVAL;
 }
 
+void amdgpu_gfx_kiq_reg_offs_get(struct amdgpu_kiq *kiq,
+				 uint32_t *offset)
+{
+	uint32_t tmp = 0;
+
+	tmp = find_first_zero_bit(kiq->used,
+				      kiq->num_slot);
+	if (tmp < kiq->num_slot) {
+		__set_bit(tmp, kiq->used);
+		*offset = kiq->reg_val_offs[tmp];
+	}
+}
+
+void amdgpu_gfx_kiq_reg_offs_free(struct amdgpu_kiq *kiq,
+				 uint32_t offset)
+{
+	uint32_t tmp = 0;
+
+	tmp = (offset - kiq->reg_val_offs[0]) >> 3;
+	if (tmp < kiq->num_slot)
+		__clear_bit(tmp, kiq->used);
+}
+
+int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read) {
+	int retry = MAX_KIQ_REG_TRY;
+
+	BUG_ON(in_interrupt());
+	while (retry-- > 0) {
+		spin_lock(&kiq->ring_lock);
+		if ((atomic64_read(&kiq->submit_avail) > 0) &&
+		    (read ? find_first_zero_bit(kiq->used, kiq->num_slot) <
+		     kiq->num_slot : 1)) {
+				break;
+		} else {
+			spin_unlock(&kiq->ring_lock);
+			msleep(2);
+			continue;
+		}
+
+	}
+
+	if (retry > 0)
+		return 0;
+	else
+		return -ETIME;
+}
+
+void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq) {
+	spin_unlock(&kiq->ring_lock);
+}
+
+void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
+	atomic64_dec(&kiq->submit_avail);
+	if (offs)
+		amdgpu_gfx_kiq_reg_offs_get(kiq, offs);
+
+}
+
+void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
+	atomic64_inc(&kiq->submit_avail);
+	if (offs)
+		amdgpu_gfx_kiq_reg_offs_free(kiq, *offs); }
+
 int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 			     struct amdgpu_ring *ring,
 			     struct amdgpu_irq_src *irq)
 {
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
-	int r = 0;
+	int r = 0, i, j;
 
 	spin_lock_init(&kiq->ring_lock);
 
-	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
-	if (r)
-		return r;
+	for (i = 0; i < MAX_KIQ_RREG_NUM; i++) {
+		r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs[i]);
+		if (r) {
+			for (j = 0; j < i; j++)
+				amdgpu_device_wb_free(adev, kiq->reg_val_offs[j]);
+			return r;
+		}
+	}
 
 	ring->adev = NULL;
 	ring->ring_obj = NULL;
 	ring->use_doorbell = true;
 	ring->doorbell_index = adev->doorbell_index.kiq;
+	kiq->num_slot = MAX_KIQ_RREG_NUM;
+	memset(&kiq->used, 0, sizeof(kiq->used));
+
 
 	r = amdgpu_gfx_kiq_acquire(adev, ring);
 	if (r)
@@ -325,13 +401,20 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 			     AMDGPU_RING_PRIO_DEFAULT);
 	if (r)
 		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
+	else
+		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
+			     (ring->funcs->align_mask + 1));
 
 	return r;
 }
 
 void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
-	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
+	struct amdgpu_kiq *kiq = &ring->adev->gfx.kiq;
+	int i = 0;
+
+	for (i = 0; i < MAX_KIQ_RREG_NUM; i++)
+		amdgpu_device_wb_free(ring->adev, kiq->reg_val_offs[i]);
 	amdgpu_ring_fini(ring);
 }
 
@@ -671,19 +754,24 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,  uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  {
 	signed long r, cnt = 0;
-	unsigned long flags;
-	uint32_t seq;
+	uint32_t seq, reg_val_offs = 0, value = 0;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, true);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto kiq_read_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
 	amdgpu_ring_alloc(ring, 32);
-	amdgpu_ring_emit_rreg(ring, reg);
+	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -707,9 +795,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return adev->wb.wb[kiq->reg_val_offs];
+	mb();
+	value = adev->wb.wb[reg_val_offs];
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+	return value;
 
 failed_kiq_read:
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+kiq_read_exit:
 	pr_err("failed to read reg:%x\n", reg);
 	return ~0;
 }
@@ -717,19 +810,24 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)  {
 	signed long r, cnt = 0;
-	unsigned long flags;
 	uint32_t seq;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
 
 	BUG_ON(!ring->funcs->emit_wreg);
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, false);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto kiq_write_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_emit_wreg(ring, reg, v);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -754,8 +852,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_write;
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 	return;
 
 failed_kiq_write:
+	amdgpu_gfx_kiq_restore(kiq, NULL);
+kiq_write_exit:
 	pr_err("failed to write reg:%x\n", reg);  } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 634746829024..bb05594c27e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
 	int invalidate_tlbs_size;
 };
 
+#define MAX_KIQ_RREG_NUM 64
 struct amdgpu_kiq {
 	u64			eop_gpu_addr;
 	struct amdgpu_bo	*eop_obj;
@@ -103,7 +104,10 @@ struct amdgpu_kiq {
 	struct amdgpu_ring	ring;
 	struct amdgpu_irq_src	irq;
 	const struct kiq_pm4_funcs *pmf;
-	uint32_t			reg_val_offs;
+	uint32_t		reg_val_offs[MAX_KIQ_RREG_NUM];
+	atomic64_t		submit_avail;
+	uint32_t		num_slot;
+	unsigned long		used[DIV_ROUND_UP(MAX_KIQ_RREG_NUM, BITS_PER_LONG)];
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index f61664ee4940..b0ea4f67a6e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
 	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
-	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
+	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
+			  uint32_t reg_val_offs);
 	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
 	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
 			      uint32_t val, uint32_t mask);
@@ -265,7 +266,7 @@ struct amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
+#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), 
+(o))
 #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) @@ -308,6 +309,7 @@ static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)  {
 	if (ring->count_dw <= 0)
 		DRM_ERROR("amdgpu: writing more dwords to the ring than expected!\n");
+
 	ring->ring[ring->wptr++ & ring->buf_mask] = v;
 	ring->wptr &= ring->ptr_mask;
 	ring->count_dw--;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 8c10084f44ef..dd54ea802a9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	unsigned long flags;
 	uint32_t seq;
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, false);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto failed_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
 					    ref, mask);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq;
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 	return;
 
 failed_kiq:
+	amdgpu_gfx_kiq_restore(kiq, NULL);
+failed_exit:
 	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);  }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 0a03e2ad5d95..7853dbc3c8bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
 	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
 
-static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				     uint32_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
@@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 }
 
 static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index fc6c2f2bc76c..bdbd92d4fe45 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
 
-static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint32_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
@@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 }
 
 static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 84fcf842316d..f7dd1036986e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4042,14 +4042,20 @@ static int gfx_v9_0_soft_reset(void *handle)  static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)  {
 	signed long r, cnt = 0;
-	unsigned long flags;
-	uint32_t seq;
+	uint32_t seq, reg_val_offs;
+	uint64_t value = 0;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, true);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto failed_kiq_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 9 |	/* src: register*/
@@ -4059,12 +4065,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -4088,10 +4094,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
-		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
+	value = (uint64_t)adev->wb.wb[reg_val_offs] |
+		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
 
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+	return value;
 failed_kiq_read:
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+failed_kiq_exit:
 	pr_err("failed to read gpu clock\n");
 	return ~0;
 }
@@ -5482,10 +5492,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
 
-static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint32_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
@@ -5494,9 +5504,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 }
 
 static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 30b75d79efdb..6b4dc0ed4fff 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -422,20 +422,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
-		spin_lock(&adev->gfx.kiq.ring_lock);
+		r = amdgpu_gfx_kiq_lock(kiq, false);
+		if (r) {
+			pr_err("failed to lock kiq\n");
+			return -ETIME;
+		}
+
 		/* 2 dwords flush + 8 dwords fence */
 		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
+		amdgpu_gfx_kiq_consume(kiq, NULL);
 		kiq->pmf->kiq_invalidate_tlbs(ring,
 					pasid, flush_type, all_hub);
 		amdgpu_fence_emit_polling(ring, &seq);
 		amdgpu_ring_commit(ring);
-		spin_unlock(&adev->gfx.kiq.ring_lock);
+		amdgpu_gfx_kiq_unlock(kiq);
 		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
+			amdgpu_gfx_kiq_restore(kiq, false);
 			return -ETIME;
 		}
 
+		amdgpu_gfx_kiq_restore(kiq, NULL);
 		return 0;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index fecdbc471983..6238702c80b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -613,7 +613,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 		if (vega20_xgmi_wa)
 			ndw += kiq->pmf->invalidate_tlbs_size;
 
-		spin_lock(&adev->gfx.kiq.ring_lock);
+		r = amdgpu_gfx_kiq_lock(kiq, false);
+		if (r) {
+			pr_err("failed to lock kiq\n");
+			return -ETIME;
+		}
+
+		amdgpu_gfx_kiq_consume(kiq, NULL);
 		/* 2 dwords flush + 8 dwords fence */
 		amdgpu_ring_alloc(ring, ndw);
 		if (vega20_xgmi_wa)
@@ -623,13 +629,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 					pasid, flush_type, all_hub);
 		amdgpu_fence_emit_polling(ring, &seq);
 		amdgpu_ring_commit(ring);
-		spin_unlock(&adev->gfx.kiq.ring_lock);
+		amdgpu_gfx_kiq_unlock(kiq);
 		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
+			amdgpu_gfx_kiq_restore(kiq, NULL);
 			return -ETIME;
 		}
 
+		amdgpu_gfx_kiq_restore(kiq, NULL);
 		return 0;
 	}
 
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22  7:11 ` Tao, Yintian
@ 2020-04-22  7:23   ` Christian König
  2020-04-22  7:35     ` Tao, Yintian
  0 siblings, 1 reply; 17+ messages in thread
From: Christian König @ 2020-04-22  7:23 UTC (permalink / raw)
  To: Tao, Yintian, Liu, Monk, Liu, Shaoyun, Kuehling, Felix; +Cc: amd-gfx

> BUG_ON(in_interrupt());
That won't work like this. The KIQ is also used in interrupt context in 
the driver, that's why we used spin_lock_irqsave().

And I would either say that we should use the trick with the NOP to 
reserve space on the ring buffer or call amdgpu_device_wb_get() for each 
read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should 
work equally well.

You also don't need to worry to much about overflowing the wb area. 
Since we run in an atomic context we can have at most the number of CPU 
in the system + interrupt context here.

Regards,
Christian.

Am 22.04.20 um 09:11 schrieb Tao, Yintian:
> Add Felix and Shaoyun
>
> -----Original Message-----
> From: Yintian Tao <yttao@amd.com>
> Sent: 2020年4月22日 12:42
> To: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>
> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
> Subject: [PATCH] drm/amdgpu: refine kiq access register
>
> According to the current kiq access register method, there will be a race condition when using KIQ to read a register if multiple clients want to read at the same time, just like the example below:
> 1. client-A starts to read REG-0 through KIQ 2. client-A polls the seqno-0 3. client-B starts to read REG-1 through KIQ 4. client-B polls the seqno-1 5. the kiq completes these two read operations 6. client-A reads the register at the wb buffer and
>     get REG-1 value
>
> And if there are multiple clients frequently writing registers through KIQ, the KIQ ring buffer may be overwritten before the writes are consumed.
>
> Therefore, allocate a fixed number of wb slots for rreg use and limit the number of submissions (which depends on the kiq ring_size) in order to prevent the overwrite problem.
>
> Signed-off-by: Yintian Tao <yttao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   7 +-
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  12 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  12 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 129 ++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   6 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   6 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  13 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |   8 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   8 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  34 +++--
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  12 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  12 +-
>   12 files changed, 211 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 4e1d4cfe7a9f..4530e0de4257 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>   /*
>    * Writeback
>    */
> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>   
>   struct amdgpu_wb {
>   	struct amdgpu_bo	*wb_obj;
> @@ -1028,6 +1028,11 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>   
>   int emu_soc_asic_init(struct amdgpu_device *adev);
>   
> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read); void
> +amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq);
> +
> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs);
> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
>   /*
>    * Registers read & write functions.
>    */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 691c89705bcd..034c9f416499 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -309,6 +309,7 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   			    uint32_t doorbell_off)
>   {
>   	struct amdgpu_device *adev = get_amdgpu_device(kgd);
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>   	struct v10_compute_mqd *m;
>   	uint32_t mec, pipe;
> @@ -324,13 +325,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>   		 mec, pipe, queue_id);
>   
> -	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_lock(kiq, false);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r);
>   		goto out_unlock;
>   	}
>   
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>   	amdgpu_ring_write(kiq_ring,
>   			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8 +357,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>   	amdgpu_ring_commit(kiq_ring);
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   out_unlock:
> -	spin_unlock(&adev->gfx.kiq.ring_lock);
> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>   	release_queue(kgd);
>   
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index df841c2ac5e7..f243d9990ced 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -307,6 +307,7 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   			    uint32_t doorbell_off)
>   {
>   	struct amdgpu_device *adev = get_amdgpu_device(kgd);
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>   	struct v9_mqd *m;
>   	uint32_t mec, pipe;
> @@ -322,13 +323,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>   		 mec, pipe, queue_id);
>   
> -	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_lock(kiq, false);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r);
>   		goto out_unlock;
>   	}
>   
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>   	amdgpu_ring_write(kiq_ring,
>   			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8 +355,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>   	amdgpu_ring_commit(kiq_ring);
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   out_unlock:
> -	spin_unlock(&adev->gfx.kiq.ring_lock);
> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>   	release_queue(kgd);
>   
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ea576b4260a4..c6b1c77b5eac 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -295,23 +295,99 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>   	return -EINVAL;
>   }
>   
> +void amdgpu_gfx_kiq_reg_offs_get(struct amdgpu_kiq *kiq,
> +				 uint32_t *offset)
> +{
> +	uint32_t tmp = 0;
> +
> +	tmp = find_first_zero_bit(kiq->used,
> +				      kiq->num_slot);
> +	if (tmp < kiq->num_slot) {
> +		__set_bit(tmp, kiq->used);
> +		*offset = kiq->reg_val_offs[tmp];
> +	}
> +}
> +
> +void amdgpu_gfx_kiq_reg_offs_free(struct amdgpu_kiq *kiq,
> +				 uint32_t offset)
> +{
> +	uint32_t tmp = 0;
> +
> +	tmp = (offset - kiq->reg_val_offs[0]) >> 3;
> +	if (tmp < kiq->num_slot)
> +		__clear_bit(tmp, kiq->used);
> +}
> +
> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read) {
> +	int retry = MAX_KIQ_REG_TRY;
> +
> +	BUG_ON(in_interrupt());
> +	while (retry-- > 0) {
> +		spin_lock(&kiq->ring_lock);
> +		if ((atomic64_read(&kiq->submit_avail) > 0) &&
> +		    (read ? find_first_zero_bit(kiq->used, kiq->num_slot) <
> +		     kiq->num_slot : 1)) {
> +				break;
> +		} else {
> +			spin_unlock(&kiq->ring_lock);
> +			msleep(2);
> +			continue;
> +		}
> +
> +	}
> +
> +	if (retry > 0)
> +		return 0;
> +	else
> +		return -ETIME;
> +}
> +
> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq) {
> +	spin_unlock(&kiq->ring_lock);
> +}
> +
> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
> +	atomic64_dec(&kiq->submit_avail);
> +	if (offs)
> +		amdgpu_gfx_kiq_reg_offs_get(kiq, offs);
> +
> +}
> +
> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
> +	atomic64_inc(&kiq->submit_avail);
> +	if (offs)
> +		amdgpu_gfx_kiq_reg_offs_free(kiq, *offs); }
> +
>   int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   			     struct amdgpu_ring *ring,
>   			     struct amdgpu_irq_src *irq)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -	int r = 0;
> +	int r = 0, i, j;
>   
>   	spin_lock_init(&kiq->ring_lock);
>   
> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
> -	if (r)
> -		return r;
> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++) {
> +		r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs[i]);
> +		if (r) {
> +			for (j = 0; j < i; j++)
> +				amdgpu_device_wb_free(adev, kiq->reg_val_offs[j]);
> +			return r;
> +		}
> +	}
>   
>   	ring->adev = NULL;
>   	ring->ring_obj = NULL;
>   	ring->use_doorbell = true;
>   	ring->doorbell_index = adev->doorbell_index.kiq;
> +	kiq->num_slot = MAX_KIQ_RREG_NUM;
> +	memset(&kiq->used, 0, sizeof(kiq->used));
> +
>   
>   	r = amdgpu_gfx_kiq_acquire(adev, ring);
>   	if (r)
> @@ -325,13 +401,20 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   			     AMDGPU_RING_PRIO_DEFAULT);
>   	if (r)
>   		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
> +	else
> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
> +			     (ring->funcs->align_mask + 1));
>   
>   	return r;
>   }
>   
>   void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
> +	struct amdgpu_kiq *kiq = &ring->adev->gfx.kiq;
> +	int i = 0;
> +
> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++)
> +		amdgpu_device_wb_free(ring->adev, kiq->reg_val_offs[i]);
>   	amdgpu_ring_fini(ring);
>   }
>   
> @@ -671,19 +754,24 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,  uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
> -	uint32_t seq;
> +	uint32_t seq, reg_val_offs = 0, value = 0;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, true);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto kiq_read_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>   	amdgpu_ring_alloc(ring, 32);
> -	amdgpu_ring_emit_rreg(ring, reg);
> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -707,9 +795,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return adev->wb.wb[kiq->reg_val_offs];
> +	mb();
> +	value = adev->wb.wb[reg_val_offs];
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +	return value;
>   
>   failed_kiq_read:
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +kiq_read_exit:
>   	pr_err("failed to read reg:%x\n", reg);
>   	return ~0;
>   }
> @@ -717,19 +810,24 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)  {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
>   	uint32_t seq;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_wreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, false);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto kiq_write_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_wreg(ring, reg, v);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -754,8 +852,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_write;
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   	return;
>   
>   failed_kiq_write:
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
> +kiq_write_exit:
>   	pr_err("failed to write reg:%x\n", reg);  } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 634746829024..bb05594c27e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>   	int invalidate_tlbs_size;
>   };
>   
> +#define MAX_KIQ_RREG_NUM 64
>   struct amdgpu_kiq {
>   	u64			eop_gpu_addr;
>   	struct amdgpu_bo	*eop_obj;
> @@ -103,7 +104,10 @@ struct amdgpu_kiq {
>   	struct amdgpu_ring	ring;
>   	struct amdgpu_irq_src	irq;
>   	const struct kiq_pm4_funcs *pmf;
> -	uint32_t			reg_val_offs;
> +	uint32_t		reg_val_offs[MAX_KIQ_RREG_NUM];
> +	atomic64_t		submit_avail;
> +	uint32_t		num_slot;
> +	unsigned long		used[DIV_ROUND_UP(MAX_KIQ_RREG_NUM, BITS_PER_LONG)];
>   };
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index f61664ee4940..b0ea4f67a6e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>   	void (*end_use)(struct amdgpu_ring *ring);
>   	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>   	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
> +			  uint32_t reg_val_offs);
>   	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>   	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>   			      uint32_t val, uint32_t mask);
> @@ -265,7 +266,7 @@ struct amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>   #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d),
> +(o))
>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) @@ -308,6 +309,7 @@ static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)  {
>   	if (ring->count_dw <= 0)
>   		DRM_ERROR("amdgpu: writing more dwords to the ring than expected!\n");
> +
>   	ring->ring[ring->wptr++ & ring->buf_mask] = v;
>   	ring->wptr &= ring->ptr_mask;
>   	ring->count_dw--;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 8c10084f44ef..dd54ea802a9a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	unsigned long flags;
>   	uint32_t seq;
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, false);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto failed_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq;
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   	return;
>   
>   failed_kiq:
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
> +failed_exit:
>   	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);  }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 0a03e2ad5d95..7853dbc3c8bd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>   	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>   
> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				     uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index fc6c2f2bc76c..bdbd92d4fe45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>   
> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 84fcf842316d..f7dd1036986e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4042,14 +4042,20 @@ static int gfx_v9_0_soft_reset(void *handle)  static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)  {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
> -	uint32_t seq;
> +	uint32_t seq, reg_val_offs;
> +	uint64_t value = 0;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, true);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto failed_kiq_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 9 |	/* src: register*/
> @@ -4059,12 +4065,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -4088,10 +4094,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>   
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +	return value;
>   failed_kiq_read:
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +failed_kiq_exit:
>   	pr_err("failed to read gpu clock\n");
>   	return ~0;
>   }
> @@ -5482,10 +5492,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>   
> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -5494,9 +5504,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 30b75d79efdb..6b4dc0ed4fff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -422,20 +422,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
> -		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_lock(kiq, false);
> +		if (r) {
> +			pr_err("failed to lock kiq\n");
> +			return -ETIME;
> +		}
> +
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>   		kiq->pmf->kiq_invalidate_tlbs(ring,
>   					pasid, flush_type, all_hub);
>   		amdgpu_fence_emit_polling(ring, &seq);
>   		amdgpu_ring_commit(ring);
> -		spin_unlock(&adev->gfx.kiq.ring_lock);
> +		amdgpu_gfx_kiq_unlock(kiq);
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
> +			amdgpu_gfx_kiq_restore(kiq, false);
>   			return -ETIME;
>   		}
>   
> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>   		return 0;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index fecdbc471983..6238702c80b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -613,7 +613,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		if (vega20_xgmi_wa)
>   			ndw += kiq->pmf->invalidate_tlbs_size;
>   
> -		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_lock(kiq, false);
> +		if (r) {
> +			pr_err("failed to lock kiq\n");
> +			return -ETIME;
> +		}
> +
> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, ndw);
>   		if (vega20_xgmi_wa)
> @@ -623,13 +629,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   					pasid, flush_type, all_hub);
>   		amdgpu_fence_emit_polling(ring, &seq);
>   		amdgpu_ring_commit(ring);
> -		spin_unlock(&adev->gfx.kiq.ring_lock);
> +		amdgpu_gfx_kiq_unlock(kiq);
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>   			return -ETIME;
>   		}
>   
> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>   		return 0;
>   	}
>   
> --
> 2.17.1
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22  7:23   ` Christian König
@ 2020-04-22  7:35     ` Tao, Yintian
  2020-04-22  7:40       ` Christian König
  0 siblings, 1 reply; 17+ messages in thread
From: Tao, Yintian @ 2020-04-22  7:35 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Liu, Shaoyun, Kuehling, Felix; +Cc: amd-gfx

Hi  Christian


> BUG_ON(in_interrupt());
That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
[yttao]: According to the current drm-next code, I have not found where registers are accessed through KIQ.
		And you also need to wait for free kiq ring buffer space if there is none; waiting like that in interrupt context is illegal.

And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
[yttao]: sorry, can you give me more details about how to use NOP to reserve space? I will use amdgpu_device_wb_get() for the read operation.



-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: 2020年4月22日 15:23
To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq access register

> BUG_ON(in_interrupt());
That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().

And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.

You also don't need to worry to much about overflowing the wb area. 
Since we run in an atomic context we can have at most the number of CPU in the system + interrupt context here.

Regards,
Christian.

Am 22.04.20 um 09:11 schrieb Tao, Yintian:
> Add Felix and Shaoyun
>
> -----Original Message-----
> From: Yintian Tao <yttao@amd.com>
> Sent: 2020年4月22日 12:42
> To: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk 
> <Monk.Liu@amd.com>
> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
> Subject: [PATCH] drm/amdgpu: refine kiq access register
>
> According to the current kiq access register method, there will be race condition when using KIQ to read register if multiple clients want to read at same time just like the example below:
> 1. client-A start to read REG-0 through KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
>     get REG-1 value
>
> And if there are multiple clients to frequently write registers through KIQ which may raise the KIQ ring buffer overwritten problem.
>
> Therefore, allocate fixed number wb slot for rreg use and limit the submit number which depends on the kiq ring_size in order to prevent the overwritten problem.
>
> Signed-off-by: Yintian Tao <yttao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   7 +-
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  12 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  12 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 129 ++++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   6 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   6 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  13 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |   8 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   8 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  34 +++--
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  12 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  12 +-
>   12 files changed, 211 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 4e1d4cfe7a9f..4530e0de4257 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>   /*
>    * Writeback
>    */
> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>   
>   struct amdgpu_wb {
>   	struct amdgpu_bo	*wb_obj;
> @@ -1028,6 +1028,11 @@ bool amdgpu_device_has_dc_support(struct 
> amdgpu_device *adev);
>   
>   int emu_soc_asic_init(struct amdgpu_device *adev);
>   
> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read); void 
> +amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq);
> +
> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs); 
> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
>   /*
>    * Registers read & write functions.
>    */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 691c89705bcd..034c9f416499 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -309,6 +309,7 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   			    uint32_t doorbell_off)
>   {
>   	struct amdgpu_device *adev = get_amdgpu_device(kgd);
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>   	struct v10_compute_mqd *m;
>   	uint32_t mec, pipe;
> @@ -324,13 +325,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>   		 mec, pipe, queue_id);
>   
> -	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_lock(kiq, false);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r);
>   		goto out_unlock;
>   	}
>   
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>   	amdgpu_ring_write(kiq_ring,
>   			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8 +357,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>   	amdgpu_ring_commit(kiq_ring);
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   out_unlock:
> -	spin_unlock(&adev->gfx.kiq.ring_lock);
> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>   	release_queue(kgd);
>   
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index df841c2ac5e7..f243d9990ced 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -307,6 +307,7 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   			    uint32_t doorbell_off)
>   {
>   	struct amdgpu_device *adev = get_amdgpu_device(kgd);
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>   	struct v9_mqd *m;
>   	uint32_t mec, pipe;
> @@ -322,13 +323,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>   		 mec, pipe, queue_id);
>   
> -	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_lock(kiq, false);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r);
>   		goto out_unlock;
>   	}
>   
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>   	amdgpu_ring_write(kiq_ring,
>   			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8 +355,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>   	amdgpu_ring_commit(kiq_ring);
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   out_unlock:
> -	spin_unlock(&adev->gfx.kiq.ring_lock);
> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>   	release_queue(kgd);
>   
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ea576b4260a4..c6b1c77b5eac 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -295,23 +295,99 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>   	return -EINVAL;
>   }
>   
> +void amdgpu_gfx_kiq_reg_offs_get(struct amdgpu_kiq *kiq,
> +				 uint32_t *offset)
> +{
> +	uint32_t tmp = 0;
> +
> +	tmp = find_first_zero_bit(kiq->used,
> +				      kiq->num_slot);
> +	if (tmp < kiq->num_slot) {
> +		__set_bit(tmp, kiq->used);
> +		*offset = kiq->reg_val_offs[tmp];
> +	}
> +}
> +
> +void amdgpu_gfx_kiq_reg_offs_free(struct amdgpu_kiq *kiq,
> +				 uint32_t offset)
> +{
> +	uint32_t tmp = 0;
> +
> +	tmp = (offset - kiq->reg_val_offs[0]) >> 3;
> +	if (tmp < kiq->num_slot)
> +		__clear_bit(tmp, kiq->used);
> +}
> +
> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read) {
> +	int retry = MAX_KIQ_REG_TRY;
> +
> +	BUG_ON(in_interrupt());
> +	while (retry-- > 0) {
> +		spin_lock(&kiq->ring_lock);
> +		if ((atomic64_read(&kiq->submit_avail) > 0) &&
> +		    (read ? find_first_zero_bit(kiq->used, kiq->num_slot) <
> +		     kiq->num_slot : 1)) {
> +				break;
> +		} else {
> +			spin_unlock(&kiq->ring_lock);
> +			msleep(2);
> +			continue;
> +		}
> +
> +	}
> +
> +	if (retry > 0)
> +		return 0;
> +	else
> +		return -ETIME;
> +}
> +
> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq) {
> +	spin_unlock(&kiq->ring_lock);
> +}
> +
> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
> +	atomic64_dec(&kiq->submit_avail);
> +	if (offs)
> +		amdgpu_gfx_kiq_reg_offs_get(kiq, offs);
> +
> +}
> +
> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
> +	atomic64_inc(&kiq->submit_avail);
> +	if (offs)
> +		amdgpu_gfx_kiq_reg_offs_free(kiq, *offs); }
> +
>   int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   			     struct amdgpu_ring *ring,
>   			     struct amdgpu_irq_src *irq)
>   {
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> -	int r = 0;
> +	int r = 0, i, j;
>   
>   	spin_lock_init(&kiq->ring_lock);
>   
> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
> -	if (r)
> -		return r;
> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++) {
> +		r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs[i]);
> +		if (r) {
> +			for (j = 0; j < i; j++)
> +				amdgpu_device_wb_free(adev, kiq->reg_val_offs[j]);
> +			return r;
> +		}
> +	}
>   
>   	ring->adev = NULL;
>   	ring->ring_obj = NULL;
>   	ring->use_doorbell = true;
>   	ring->doorbell_index = adev->doorbell_index.kiq;
> +	kiq->num_slot = MAX_KIQ_RREG_NUM;
> +	memset(&kiq->used, 0, sizeof(kiq->used));
> +
>   
>   	r = amdgpu_gfx_kiq_acquire(adev, ring);
>   	if (r)
> @@ -325,13 +401,20 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   			     AMDGPU_RING_PRIO_DEFAULT);
>   	if (r)
>   		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
> +	else
> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
> +			     (ring->funcs->align_mask + 1));
>   
>   	return r;
>   }
>   
>   void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
> +	struct amdgpu_kiq *kiq = &ring->adev->gfx.kiq;
> +	int i = 0;
> +
> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++)
> +		amdgpu_device_wb_free(ring->adev, kiq->reg_val_offs[i]);
>   	amdgpu_ring_fini(ring);
>   }
>   
> @@ -671,19 +754,24 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,  uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
> -	uint32_t seq;
> +	uint32_t seq, reg_val_offs = 0, value = 0;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, true);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto kiq_read_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>   	amdgpu_ring_alloc(ring, 32);
> -	amdgpu_ring_emit_rreg(ring, reg);
> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -707,9 +795,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return adev->wb.wb[kiq->reg_val_offs];
> +	mb();
> +	value = adev->wb.wb[reg_val_offs];
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +	return value;
>   
>   failed_kiq_read:
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +kiq_read_exit:
>   	pr_err("failed to read reg:%x\n", reg);
>   	return ~0;
>   }
> @@ -717,19 +810,24 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)  {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
>   	uint32_t seq;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_wreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, false);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto kiq_write_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_wreg(ring, reg, v);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -754,8 +852,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_write;
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   	return;
>   
>   failed_kiq_write:
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
> +kiq_write_exit:
>   	pr_err("failed to write reg:%x\n", reg);  } diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 634746829024..bb05594c27e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>   	int invalidate_tlbs_size;
>   };
>   
> +#define MAX_KIQ_RREG_NUM 64
>   struct amdgpu_kiq {
>   	u64			eop_gpu_addr;
>   	struct amdgpu_bo	*eop_obj;
> @@ -103,7 +104,10 @@ struct amdgpu_kiq {
>   	struct amdgpu_ring	ring;
>   	struct amdgpu_irq_src	irq;
>   	const struct kiq_pm4_funcs *pmf;
> -	uint32_t			reg_val_offs;
> +	uint32_t		reg_val_offs[MAX_KIQ_RREG_NUM];
> +	atomic64_t		submit_avail;
> +	uint32_t		num_slot;
> +	unsigned long		used[DIV_ROUND_UP(MAX_KIQ_RREG_NUM, BITS_PER_LONG)];
>   };
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index f61664ee4940..b0ea4f67a6e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>   	void (*end_use)(struct amdgpu_ring *ring);
>   	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>   	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
> +			  uint32_t reg_val_offs);
>   	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>   	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>   			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct 
> amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>   #define amdgpu_ring_emit_cntxcntl(r, d) 
> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r, 
> d) (r)->funcs->emit_rreg((r), (d))
> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
> +(d),
> +(o))
>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) @@ -308,6 +309,7 @@ static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)  {
>   	if (ring->count_dw <= 0)
>   		DRM_ERROR("amdgpu: writing more dwords to the ring than 
> expected!\n");
> +
>   	ring->ring[ring->wptr++ & ring->buf_mask] = v;
>   	ring->wptr &= ring->ptr_mask;
>   	ring->count_dw--;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 8c10084f44ef..dd54ea802a9a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	unsigned long flags;
>   	uint32_t seq;
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, false);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto failed_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq;
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   	return;
>   
>   failed_kiq:
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
> +failed_exit:
>   	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);  }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 0a03e2ad5d95..7853dbc3c8bd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>   	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>   
> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				     uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index fc6c2f2bc76c..bdbd92d4fe45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>   
> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 84fcf842316d..f7dd1036986e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4042,14 +4042,20 @@ static int gfx_v9_0_soft_reset(void *handle)  static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)  {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
> -	uint32_t seq;
> +	uint32_t seq, reg_val_offs;
> +	uint64_t value = 0;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, true);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto failed_kiq_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 9 |	/* src: register*/
> @@ -4059,12 +4065,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -4088,10 +4094,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>   
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +	return value;
>   failed_kiq_read:
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +failed_kiq_exit:
>   	pr_err("failed to read gpu clock\n");
>   	return ~0;
>   }
> @@ -5482,10 +5492,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>   
> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -5494,9 +5504,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 30b75d79efdb..6b4dc0ed4fff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -422,20 +422,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
> -		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_lock(kiq, false);
> +		if (r) {
> +			pr_err("failed to lock kiq\n");
> +			return -ETIME;
> +		}
> +
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>   		kiq->pmf->kiq_invalidate_tlbs(ring,
>   					pasid, flush_type, all_hub);
>   		amdgpu_fence_emit_polling(ring, &seq);
>   		amdgpu_ring_commit(ring);
> -		spin_unlock(&adev->gfx.kiq.ring_lock);
> +		amdgpu_gfx_kiq_unlock(kiq);
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
> +			amdgpu_gfx_kiq_restore(kiq, false);
>   			return -ETIME;
>   		}
>   
> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>   		return 0;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index fecdbc471983..6238702c80b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -613,7 +613,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		if (vega20_xgmi_wa)
>   			ndw += kiq->pmf->invalidate_tlbs_size;
>   
> -		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_lock(kiq, false);
> +		if (r) {
> +			pr_err("failed to lock kiq\n");
> +			return -ETIME;
> +		}
> +
> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, ndw);
>   		if (vega20_xgmi_wa)
> @@ -623,13 +629,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   					pasid, flush_type, all_hub);
>   		amdgpu_fence_emit_polling(ring, &seq);
>   		amdgpu_ring_commit(ring);
> -		spin_unlock(&adev->gfx.kiq.ring_lock);
> +		amdgpu_gfx_kiq_unlock(kiq);
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>   			return -ETIME;
>   		}
>   
> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>   		return 0;
>   	}
>   
> --
> 2.17.1
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22  7:35     ` Tao, Yintian
@ 2020-04-22  7:40       ` Christian König
  2020-04-22  7:49         ` Tao, Yintian
  0 siblings, 1 reply; 17+ messages in thread
From: Christian König @ 2020-04-22  7:40 UTC (permalink / raw)
  To: Tao, Yintian, Koenig, Christian, Liu, Monk, Liu, Shaoyun,
	Kuehling, Felix
  Cc: amd-gfx

Am 22.04.20 um 09:35 schrieb Tao, Yintian:
> Hi  Christian
>
>
>> BUG_ON(in_interrupt());
> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
> [yttao]: According to the current drm-next code, I have not found anywhere that registers are accessed through KIQ in interrupt context.
> 		And you need to wait for free kiq ring buffer space if there is no free kiq ring buffer; waiting here in interrupt context is illegal.

Waiting in atomic context is illegal as well, but we don't have much 
other choice.

We just need to make sure that waiting never happens by making the 
buffers large enough and if it still happens print and error.

> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
> [yttao]: sorry, can you give me more details about how to use NOP to reserve space? I will use amdgpu_device_wb_get() for the read operation.

We could use the NOP PM4 command as Felix suggested, this command has a 
header+length and says that the next X dw should be ignore on the ring 
buffer.

But I think using amdgpu_device_wb_get() is better anyway.

Regards,
Christian.

>
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: 2020年4月22日 15:23
> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>
>> BUG_ON(in_interrupt());
> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
>
> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
>
> You also don't need to worry to much about overflowing the wb area.
> Since we run in an atomic context we can have at most the number of CPU in the system + interrupt context here.
>
> Regards,
> Christian.
>
> Am 22.04.20 um 09:11 schrieb Tao, Yintian:
>> Add Felix and Shaoyun
>>
>> -----Original Message-----
>> From: Yintian Tao <yttao@amd.com>
>> Sent: 2020年4月22日 12:42
>> To: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk
>> <Monk.Liu@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
>> Subject: [PATCH] drm/amdgpu: refine kiq access register
>>
>> According to the current kiq access register method, there will be a race condition when using KIQ to read a register if multiple clients want to read at the same time, just like the example below:
>> 1. client-A starts to read REG-0 through KIQ 2. client-A polls the seqno-0 3. client-B starts to read REG-1 through KIQ 4. client-B polls the seqno-1 5. the kiq completes these two read operations 6. client-A reads the register at the wb buffer and
>>      gets REG-1 value
>>
>> And if there are multiple clients to frequently write registers through KIQ which may raise the KIQ ring buffer overwritten problem.
>>
>> Therefore, allocate fixed number wb slot for rreg use and limit the submit number which depends on the kiq ring_size in order to prevent the overwritten problem.
>>
>> Signed-off-by: Yintian Tao <yttao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   7 +-
>>    .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  12 +-
>>    .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  12 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 129 ++++++++++++++++--
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   6 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   6 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  13 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |   8 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   8 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  34 +++--
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  12 +-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  12 +-
>>    12 files changed, 211 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 4e1d4cfe7a9f..4530e0de4257 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>>    /*
>>     * Writeback
>>     */
>> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
>> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>>    
>>    struct amdgpu_wb {
>>    	struct amdgpu_bo	*wb_obj;
>> @@ -1028,6 +1028,11 @@ bool amdgpu_device_has_dc_support(struct
>> amdgpu_device *adev);
>>    
>>    int emu_soc_asic_init(struct amdgpu_device *adev);
>>    
>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read); void
>> +amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq);
>> +
>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs);
>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
>>    /*
>>     * Registers read & write functions.
>>     */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> index 691c89705bcd..034c9f416499 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> @@ -309,6 +309,7 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    			    uint32_t doorbell_off)
>>    {
>>    	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>    	struct v10_compute_mqd *m;
>>    	uint32_t mec, pipe;
>> @@ -324,13 +325,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>    		 mec, pipe, queue_id);
>>    
>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto out_unlock;
>> +	}
>> +
>>    	r = amdgpu_ring_alloc(kiq_ring, 7);
>>    	if (r) {
>>    		pr_err("Failed to alloc KIQ (%d).\n", r);
>>    		goto out_unlock;
>>    	}
>>    
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>    	amdgpu_ring_write(kiq_ring,
>>    			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8 +357,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>    	amdgpu_ring_commit(kiq_ring);
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    out_unlock:
>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>    	release_queue(kgd);
>>    
>>    	return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> index df841c2ac5e7..f243d9990ced 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> @@ -307,6 +307,7 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    			    uint32_t doorbell_off)
>>    {
>>    	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>    	struct v9_mqd *m;
>>    	uint32_t mec, pipe;
>> @@ -322,13 +323,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>    		 mec, pipe, queue_id);
>>    
>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto out_unlock;
>> +	}
>> +
>>    	r = amdgpu_ring_alloc(kiq_ring, 7);
>>    	if (r) {
>>    		pr_err("Failed to alloc KIQ (%d).\n", r);
>>    		goto out_unlock;
>>    	}
>>    
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>    	amdgpu_ring_write(kiq_ring,
>>    			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8 +355,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>    	amdgpu_ring_commit(kiq_ring);
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    out_unlock:
>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>    	release_queue(kgd);
>>    
>>    	return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index ea576b4260a4..c6b1c77b5eac 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -295,23 +295,99 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>>    	return -EINVAL;
>>    }
>>    
>> +void amdgpu_gfx_kiq_reg_offs_get(struct amdgpu_kiq *kiq,
>> +				 uint32_t *offset)
>> +{
>> +	uint32_t tmp = 0;
>> +
>> +	tmp = find_first_zero_bit(kiq->used,
>> +				      kiq->num_slot);
>> +	if (tmp < kiq->num_slot) {
>> +		__set_bit(tmp, kiq->used);
>> +		*offset = kiq->reg_val_offs[tmp];
>> +	}
>> +}
>> +
>> +void amdgpu_gfx_kiq_reg_offs_free(struct amdgpu_kiq *kiq,
>> +				 uint32_t offset)
>> +{
>> +	uint32_t tmp = 0;
>> +
>> +	tmp = (offset - kiq->reg_val_offs[0]) >> 3;
>> +	if (tmp < kiq->num_slot)
>> +		__clear_bit(tmp, kiq->used);
>> +}
>> +
>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read) {
>> +	int retry = MAX_KIQ_REG_TRY;
>> +
>> +	BUG_ON(in_interrupt());
>> +	while (retry-- > 0) {
>> +		spin_lock(&kiq->ring_lock);
>> +		if ((atomic64_read(&kiq->submit_avail) > 0) &&
>> +		    (read ? find_first_zero_bit(kiq->used, kiq->num_slot) <
>> +		     kiq->num_slot : 1)) {
>> +				break;
>> +		} else {
>> +			spin_unlock(&kiq->ring_lock);
>> +			msleep(2);
>> +			continue;
>> +		}
>> +
>> +	}
>> +
>> +	if (retry > 0)
>> +		return 0;
>> +	else
>> +		return -ETIME;
>> +}
>> +
>> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq) {
>> +	spin_unlock(&kiq->ring_lock);
>> +}
>> +
>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
>> +	atomic64_dec(&kiq->submit_avail);
>> +	if (offs)
>> +		amdgpu_gfx_kiq_reg_offs_get(kiq, offs);
>> +
>> +}
>> +
>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
>> +	atomic64_inc(&kiq->submit_avail);
>> +	if (offs)
>> +		amdgpu_gfx_kiq_reg_offs_free(kiq, *offs); }
>> +
>>    int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>    			     struct amdgpu_ring *ring,
>>    			     struct amdgpu_irq_src *irq)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -	int r = 0;
>> +	int r = 0, i, j;
>>    
>>    	spin_lock_init(&kiq->ring_lock);
>>    
>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>> -	if (r)
>> -		return r;
>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++) {
>> +		r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs[i]);
>> +		if (r) {
>> +			for (j = 0; j < i; j++)
>> +				amdgpu_device_wb_free(adev, kiq->reg_val_offs[j]);
>> +			return r;
>> +		}
>> +	}
>>    
>>    	ring->adev = NULL;
>>    	ring->ring_obj = NULL;
>>    	ring->use_doorbell = true;
>>    	ring->doorbell_index = adev->doorbell_index.kiq;
>> +	kiq->num_slot = MAX_KIQ_RREG_NUM;
>> +	memset(&kiq->used, 0, sizeof(kiq->used));
>> +
>>    
>>    	r = amdgpu_gfx_kiq_acquire(adev, ring);
>>    	if (r)
>> @@ -325,13 +401,20 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>    			     AMDGPU_RING_PRIO_DEFAULT);
>>    	if (r)
>>    		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
>> +	else
>> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
>> +			     (ring->funcs->align_mask + 1));
>>    
>>    	return r;
>>    }
>>    
>>    void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>> +	struct amdgpu_kiq *kiq = &ring->adev->gfx.kiq;
>> +	int i = 0;
>> +
>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++)
>> +		amdgpu_device_wb_free(ring->adev, kiq->reg_val_offs[i]);
>>    	amdgpu_ring_fini(ring);
>>    }
>>    
>> @@ -671,19 +754,24 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,  uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>> -	uint32_t seq;
>> +	uint32_t seq, reg_val_offs = 0, value = 0;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto kiq_read_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>    	amdgpu_ring_alloc(ring, 32);
>> -	amdgpu_ring_emit_rreg(ring, reg);
>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -707,9 +795,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return adev->wb.wb[kiq->reg_val_offs];
>> +	mb();
>> +	value = adev->wb.wb[reg_val_offs];
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +	return value;
>>    
>>    failed_kiq_read:
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +kiq_read_exit:
>>    	pr_err("failed to read reg:%x\n", reg);
>>    	return ~0;
>>    }
>> @@ -717,19 +810,24 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)  {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>>    	uint32_t seq;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_wreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto kiq_write_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_emit_wreg(ring, reg, v);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -754,8 +852,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_write;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    	return;
>>    
>>    failed_kiq_write:
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>> +kiq_write_exit:
>>    	pr_err("failed to write reg:%x\n", reg);  } diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index 634746829024..bb05594c27e0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>>    	int invalidate_tlbs_size;
>>    };
>>    
>> +#define MAX_KIQ_RREG_NUM 64
>>    struct amdgpu_kiq {
>>    	u64			eop_gpu_addr;
>>    	struct amdgpu_bo	*eop_obj;
>> @@ -103,7 +104,10 @@ struct amdgpu_kiq {
>>    	struct amdgpu_ring	ring;
>>    	struct amdgpu_irq_src	irq;
>>    	const struct kiq_pm4_funcs *pmf;
>> -	uint32_t			reg_val_offs;
>> +	uint32_t		reg_val_offs[MAX_KIQ_RREG_NUM];
>> +	atomic64_t		submit_avail;
>> +	uint32_t		num_slot;
>> +	unsigned long		used[DIV_ROUND_UP(MAX_KIQ_RREG_NUM, BITS_PER_LONG)];
>>    };
>>    
>>    /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index f61664ee4940..b0ea4f67a6e1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>    	void (*end_use)(struct amdgpu_ring *ring);
>>    	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>    	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>> +			  uint32_t reg_val_offs);
>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>    			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct
>> amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>    #define amdgpu_ring_emit_cntxcntl(r, d)
>> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r,
>> d) (r)->funcs->emit_rreg((r), (d))
>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r),
>> +(d),
>> +(o))
>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) @@ -308,6 +309,7 @@ static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)  {
>>    	if (ring->count_dw <= 0)
>>    		DRM_ERROR("amdgpu: writing more dwords to the ring than
>> expected!\n");
>> +
>>    	ring->ring[ring->wptr++ & ring->buf_mask] = v;
>>    	ring->wptr &= ring->ptr_mask;
>>    	ring->count_dw--;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index 8c10084f44ef..dd54ea802a9a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	unsigned long flags;
>>    	uint32_t seq;
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto failed_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    	return;
>>    
>>    failed_kiq:
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>> +failed_exit:
>>    	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);  }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 0a03e2ad5d95..7853dbc3c8bd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>    	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>>    
>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				     uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index fc6c2f2bc76c..bdbd92d4fe45 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>>    
>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 84fcf842316d..f7dd1036986e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4042,14 +4042,20 @@ static int gfx_v9_0_soft_reset(void *handle)  static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)  {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>> -	uint32_t seq;
>> +	uint32_t seq, reg_val_offs;
>> +	uint64_t value = 0;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto failed_kiq_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 9 |	/* src: register*/
>> @@ -4059,12 +4065,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -4088,10 +4094,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
>> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +	return value;
>>    failed_kiq_read:
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +failed_kiq_exit:
>>    	pr_err("failed to read gpu clock\n");
>>    	return ~0;
>>    }
>> @@ -5482,10 +5492,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>>    
>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -5494,9 +5504,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 30b75d79efdb..6b4dc0ed4fff 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -422,20 +422,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>> +		if (r) {
>> +			pr_err("failed to lock kiq\n");
>> +			return -ETIME;
>> +		}
>> +
>>    		/* 2 dwords flush + 8 dwords fence */
>>    		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>    		kiq->pmf->kiq_invalidate_tlbs(ring,
>>    					pasid, flush_type, all_hub);
>>    		amdgpu_fence_emit_polling(ring, &seq);
>>    		amdgpu_ring_commit(ring);
>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>> +		amdgpu_gfx_kiq_unlock(kiq);
>>    		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>> +			amdgpu_gfx_kiq_restore(kiq, false);
>>    			return -ETIME;
>>    		}
>>    
>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>    		return 0;
>>    	}
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index fecdbc471983..6238702c80b4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -613,7 +613,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    		if (vega20_xgmi_wa)
>>    			ndw += kiq->pmf->invalidate_tlbs_size;
>>    
>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>> +		if (r) {
>> +			pr_err("failed to lock kiq\n");
>> +			return -ETIME;
>> +		}
>> +
>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>    		/* 2 dwords flush + 8 dwords fence */
>>    		amdgpu_ring_alloc(ring, ndw);
>>    		if (vega20_xgmi_wa)
>> @@ -623,13 +629,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    					pasid, flush_type, all_hub);
>>    		amdgpu_fence_emit_polling(ring, &seq);
>>    		amdgpu_ring_commit(ring);
>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>> +		amdgpu_gfx_kiq_unlock(kiq);
>>    		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>>    			return -ETIME;
>>    		}
>>    
>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>    		return 0;
>>    	}
>>    
>> --
>> 2.17.1
>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22  7:40       ` Christian König
@ 2020-04-22  7:49         ` Tao, Yintian
  2020-04-22  7:53           ` Christian König
  0 siblings, 1 reply; 17+ messages in thread
From: Tao, Yintian @ 2020-04-22  7:49 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Liu, Shaoyun, Kuehling, Felix; +Cc: amd-gfx

Hi Christian


Please see inline comments.
-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: 2020年4月22日 15:40
To: Tao, Yintian <Yintian.Tao@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq access register

Am 22.04.20 um 09:35 schrieb Tao, Yintian:
> Hi  Christian
>
>
>> BUG_ON(in_interrupt());
> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
> [yttao]: According to the current drm-next code, I have not found where registers are accessed through KIQ.
> 		And you need to wait for free kiq ring buffer space if there is no free kiq ring buffer; waiting in interrupt context here is illegal.

Waiting in atomic context is illegal as well, but we don't have much other choice.
[yttao]: no, there is no sleep in atomic context in my patch.

We just need to make sure that waiting never happens by making the buffers large enough and if it still happens print and error.
[yttao]: this is not a good choice because the KMD needs to protect it instead of hoping the user will not frequently invoke KIQ access.

> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
> [yttao]: sorry, can you give me more details about how to use NOP to reserve space? I will use amdgpu_device_wb_get() for the read operation.

We could use the NOP PM4 command as Felix suggested, this command has a 
header+length and says that the next X dw should be ignore on the ring
buffer.

But I think using amdgpu_device_wb_get() is better anyway.
[yttao]: yes, I agree with the amdgpu_device_wb_get() method because it will prevent the potential read race condition, while the NOP method will not prevent it

Regards,
Christian.

>
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: 2020年4月22日 15:23
> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>
>> BUG_ON(in_interrupt());
> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
>
> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
>
> You also don't need to worry to much about overflowing the wb area.
> Since we run in an atomic context we can have at most the number of CPU in the system + interrupt context here.
>
> Regards,
> Christian.
>
> Am 22.04.20 um 09:11 schrieb Tao, Yintian:
>> Add Felix and Shaoyun
>>
>> -----Original Message-----
>> From: Yintian Tao <yttao@amd.com>
>> Sent: 2020年4月22日 12:42
>> To: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk
>> <Monk.Liu@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
>> Subject: [PATCH] drm/amdgpu: refine kiq access register
>>
>> According to the current kiq access register method, there will be a race condition when using KIQ to read a register if multiple clients want to read at the same time, just like the example below:
>> 1. client-A start to read REG-0 through KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
>>      get REG-1 value
>>
>> And if there are multiple clients to frequently write registers through KIQ which may raise the KIQ ring buffer overwritten problem.
>>
>> Therefore, allocate fixed number wb slot for rreg use and limit the submit number which depends on the kiq ring_size in order to prevent the overwritten problem.
>>
>> Signed-off-by: Yintian Tao <yttao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   7 +-
>>    .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  12 +-
>>    .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  12 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 129 ++++++++++++++++--
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   6 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   6 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  13 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |   8 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   8 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  34 +++--
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  12 +-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  12 +-
>>    12 files changed, 211 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 4e1d4cfe7a9f..4530e0de4257 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>>    /*
>>     * Writeback
>>     */
>> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
>> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>>    
>>    struct amdgpu_wb {
>>    	struct amdgpu_bo	*wb_obj;
>> @@ -1028,6 +1028,11 @@ bool amdgpu_device_has_dc_support(struct
>> amdgpu_device *adev);
>>    
>>    int emu_soc_asic_init(struct amdgpu_device *adev);
>>    
>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read); void
>> +amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq);
>> +
>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs);
>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
>>    /*
>>     * Registers read & write functions.
>>     */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> index 691c89705bcd..034c9f416499 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> @@ -309,6 +309,7 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    			    uint32_t doorbell_off)
>>    {
>>    	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>    	struct v10_compute_mqd *m;
>>    	uint32_t mec, pipe;
>> @@ -324,13 +325,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>    		 mec, pipe, queue_id);
>>    
>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto out_unlock;
>> +	}
>> +
>>    	r = amdgpu_ring_alloc(kiq_ring, 7);
>>    	if (r) {
>>    		pr_err("Failed to alloc KIQ (%d).\n", r);
>>    		goto out_unlock;
>>    	}
>>    
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>    	amdgpu_ring_write(kiq_ring,
>>    			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8 +357,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>    	amdgpu_ring_commit(kiq_ring);
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    out_unlock:
>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>    	release_queue(kgd);
>>    
>>    	return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> index df841c2ac5e7..f243d9990ced 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> @@ -307,6 +307,7 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    			    uint32_t doorbell_off)
>>    {
>>    	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>    	struct v9_mqd *m;
>>    	uint32_t mec, pipe;
>> @@ -322,13 +323,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>    		 mec, pipe, queue_id);
>>    
>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto out_unlock;
>> +	}
>> +
>>    	r = amdgpu_ring_alloc(kiq_ring, 7);
>>    	if (r) {
>>    		pr_err("Failed to alloc KIQ (%d).\n", r);
>>    		goto out_unlock;
>>    	}
>>    
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>    	amdgpu_ring_write(kiq_ring,
>>    			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8 +355,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>    	amdgpu_ring_commit(kiq_ring);
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    out_unlock:
>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>    	release_queue(kgd);
>>    
>>    	return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index ea576b4260a4..c6b1c77b5eac 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -295,23 +295,99 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>>    	return -EINVAL;
>>    }
>>    
>> +void amdgpu_gfx_kiq_reg_offs_get(struct amdgpu_kiq *kiq,
>> +				 uint32_t *offset)
>> +{
>> +	uint32_t tmp = 0;
>> +
>> +	tmp = find_first_zero_bit(kiq->used,
>> +				      kiq->num_slot);
>> +	if (tmp < kiq->num_slot) {
>> +		__set_bit(tmp, kiq->used);
>> +		*offset = kiq->reg_val_offs[tmp];
>> +	}
>> +}
>> +
>> +void amdgpu_gfx_kiq_reg_offs_free(struct amdgpu_kiq *kiq,
>> +				 uint32_t offset)
>> +{
>> +	uint32_t tmp = 0;
>> +
>> +	tmp = (offset - kiq->reg_val_offs[0]) >> 3;
>> +	if (tmp < kiq->num_slot)
>> +		__clear_bit(tmp, kiq->used);
>> +}
>> +
>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read) {
>> +	int retry = MAX_KIQ_REG_TRY;
>> +
>> +	BUG_ON(in_interrupt());
>> +	while (retry-- > 0) {
>> +		spin_lock(&kiq->ring_lock);
>> +		if ((atomic64_read(&kiq->submit_avail) > 0) &&
>> +		    (read ? find_first_zero_bit(kiq->used, kiq->num_slot) <
>> +		     kiq->num_slot : 1)) {
>> +				break;
>> +		} else {
>> +			spin_unlock(&kiq->ring_lock);
>> +			msleep(2);
>> +			continue;
>> +		}
>> +
>> +	}
>> +
>> +	if (retry > 0)
>> +		return 0;
>> +	else
>> +		return -ETIME;
>> +}
>> +
>> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq) {
>> +	spin_unlock(&kiq->ring_lock);
>> +}
>> +
>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
>> +	atomic64_dec(&kiq->submit_avail);
>> +	if (offs)
>> +		amdgpu_gfx_kiq_reg_offs_get(kiq, offs);
>> +
>> +}
>> +
>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
>> +	atomic64_inc(&kiq->submit_avail);
>> +	if (offs)
>> +		amdgpu_gfx_kiq_reg_offs_free(kiq, *offs); }
>> +
>>    int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>    			     struct amdgpu_ring *ring,
>>    			     struct amdgpu_irq_src *irq)
>>    {
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> -	int r = 0;
>> +	int r = 0, i, j;
>>    
>>    	spin_lock_init(&kiq->ring_lock);
>>    
>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>> -	if (r)
>> -		return r;
>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++) {
>> +		r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs[i]);
>> +		if (r) {
>> +			for (j = 0; j < i; j++)
>> +				amdgpu_device_wb_free(adev, kiq->reg_val_offs[j]);
>> +			return r;
>> +		}
>> +	}
>>    
>>    	ring->adev = NULL;
>>    	ring->ring_obj = NULL;
>>    	ring->use_doorbell = true;
>>    	ring->doorbell_index = adev->doorbell_index.kiq;
>> +	kiq->num_slot = MAX_KIQ_RREG_NUM;
>> +	memset(&kiq->used, 0, sizeof(kiq->used));
>> +
>>    
>>    	r = amdgpu_gfx_kiq_acquire(adev, ring);
>>    	if (r)
>> @@ -325,13 +401,20 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>    			     AMDGPU_RING_PRIO_DEFAULT);
>>    	if (r)
>>    		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
>> +	else
>> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
>> +			     (ring->funcs->align_mask + 1));
>>    
>>    	return r;
>>    }
>>    
>>    void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>> +	struct amdgpu_kiq *kiq = &ring->adev->gfx.kiq;
>> +	int i = 0;
>> +
>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++)
>> +		amdgpu_device_wb_free(ring->adev, kiq->reg_val_offs[i]);
>>    	amdgpu_ring_fini(ring);
>>    }
>>    
>> @@ -671,19 +754,24 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,  uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>> -	uint32_t seq;
>> +	uint32_t seq, reg_val_offs = 0, value = 0;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto kiq_read_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>    	amdgpu_ring_alloc(ring, 32);
>> -	amdgpu_ring_emit_rreg(ring, reg);
>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -707,9 +795,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return adev->wb.wb[kiq->reg_val_offs];
>> +	mb();
>> +	value = adev->wb.wb[reg_val_offs];
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +	return value;
>>    
>>    failed_kiq_read:
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +kiq_read_exit:
>>    	pr_err("failed to read reg:%x\n", reg);
>>    	return ~0;
>>    }
>> @@ -717,19 +810,24 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)  {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>>    	uint32_t seq;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_wreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto kiq_write_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_emit_wreg(ring, reg, v);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -754,8 +852,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_write;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    	return;
>>    
>>    failed_kiq_write:
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>> +kiq_write_exit:
>>    	pr_err("failed to write reg:%x\n", reg);  } diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index 634746829024..bb05594c27e0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>>    	int invalidate_tlbs_size;
>>    };
>>    
>> +#define MAX_KIQ_RREG_NUM 64
>>    struct amdgpu_kiq {
>>    	u64			eop_gpu_addr;
>>    	struct amdgpu_bo	*eop_obj;
>> @@ -103,7 +104,10 @@ struct amdgpu_kiq {
>>    	struct amdgpu_ring	ring;
>>    	struct amdgpu_irq_src	irq;
>>    	const struct kiq_pm4_funcs *pmf;
>> -	uint32_t			reg_val_offs;
>> +	uint32_t		reg_val_offs[MAX_KIQ_RREG_NUM];
>> +	atomic64_t		submit_avail;
>> +	uint32_t		num_slot;
>> +	unsigned long		used[DIV_ROUND_UP(MAX_KIQ_RREG_NUM, BITS_PER_LONG)];
>>    };
>>    
>>    /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index f61664ee4940..b0ea4f67a6e1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>    	void (*end_use)(struct amdgpu_ring *ring);
>>    	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>    	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>> +			  uint32_t reg_val_offs);
>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>    			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct
>> amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>    #define amdgpu_ring_emit_cntxcntl(r, d)
>> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r,
>> d) (r)->funcs->emit_rreg((r), (d))
>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r),
>> +(d),
>> +(o))
>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) @@ -308,6 +309,7 @@ static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)  {
>>    	if (ring->count_dw <= 0)
>>    		DRM_ERROR("amdgpu: writing more dwords to the ring than
>> expected!\n");
>> +
>>    	ring->ring[ring->wptr++ & ring->buf_mask] = v;
>>    	ring->wptr &= ring->ptr_mask;
>>    	ring->count_dw--;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index 8c10084f44ef..dd54ea802a9a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	unsigned long flags;
>>    	uint32_t seq;
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto failed_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    	return;
>>    
>>    failed_kiq:
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>> +failed_exit:
>>    	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);  }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 0a03e2ad5d95..7853dbc3c8bd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>    	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>>    
>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				     uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index fc6c2f2bc76c..bdbd92d4fe45 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>>    
>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 84fcf842316d..f7dd1036986e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4042,14 +4042,20 @@ static int gfx_v9_0_soft_reset(void *handle)  static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)  {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>> -	uint32_t seq;
>> +	uint32_t seq, reg_val_offs;
>> +	uint64_t value = 0;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto failed_kiq_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 9 |	/* src: register*/
>> @@ -4059,12 +4065,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -4088,10 +4094,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
>> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +	return value;
>>    failed_kiq_read:
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +failed_kiq_exit:
>>    	pr_err("failed to read gpu clock\n");
>>    	return ~0;
>>    }
>> @@ -5482,10 +5492,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>>    
>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -5494,9 +5504,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 30b75d79efdb..6b4dc0ed4fff 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -422,20 +422,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>> +		if (r) {
>> +			pr_err("failed to lock kiq\n");
>> +			return -ETIME;
>> +		}
>> +
>>    		/* 2 dwords flush + 8 dwords fence */
>>    		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>    		kiq->pmf->kiq_invalidate_tlbs(ring,
>>    					pasid, flush_type, all_hub);
>>    		amdgpu_fence_emit_polling(ring, &seq);
>>    		amdgpu_ring_commit(ring);
>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>> +		amdgpu_gfx_kiq_unlock(kiq);
>>    		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>> +			amdgpu_gfx_kiq_restore(kiq, false);
>>    			return -ETIME;
>>    		}
>>    
>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>    		return 0;
>>    	}
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index fecdbc471983..6238702c80b4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -613,7 +613,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    		if (vega20_xgmi_wa)
>>    			ndw += kiq->pmf->invalidate_tlbs_size;
>>    
>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>> +		if (r) {
>> +			pr_err("failed to lock kiq\n");
>> +			return -ETIME;
>> +		}
>> +
>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>    		/* 2 dwords flush + 8 dwords fence */
>>    		amdgpu_ring_alloc(ring, ndw);
>>    		if (vega20_xgmi_wa)
>> @@ -623,13 +629,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    					pasid, flush_type, all_hub);
>>    		amdgpu_fence_emit_polling(ring, &seq);
>>    		amdgpu_ring_commit(ring);
>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>> +		amdgpu_gfx_kiq_unlock(kiq);
>>    		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>>    			return -ETIME;
>>    		}
>>    
>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>    		return 0;
>>    	}
>>    
>> --
>> 2.17.1
>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CYintian.Tao%40amd.com%7C16d5caeab344478a395808d7e69070d3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637231380366556593&amp;sdata=mRnRzlSlFufeHytPmCLjKlSon2V6SSSC%2Fxx9c55F2gU%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22  7:49         ` Tao, Yintian
@ 2020-04-22  7:53           ` Christian König
  2020-04-22  8:06             ` Tao, Yintian
  0 siblings, 1 reply; 17+ messages in thread
From: Christian König @ 2020-04-22  7:53 UTC (permalink / raw)
  To: Tao, Yintian, Liu, Monk, Liu, Shaoyun, Kuehling, Felix; +Cc: amd-gfx

Am 22.04.20 um 09:49 schrieb Tao, Yintian:
> Hi Christian
>
>
> Please see inline comments.
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: 2020年4月22日 15:40
> To: Tao, Yintian <Yintian.Tao@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>
> Am 22.04.20 um 09:35 schrieb Tao, Yintian:
>> Hi  Christian
>>
>>
>>> BUG_ON(in_interrupt());
>> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
>> [yttao]: According to the current drm-next code, I have not find where to access register through KIQ.
>> 		And you need to wait for free kiq ring buffer space if there is no free kiq ring buffer; here, waiting in interrupt context is illegal.
> Waiting in atomic context is illegal as well, but we don't have much other choice.
> [yttao]: no, there is no sleep in atomic context in my patch.

I'm not talking about sleeping, but about busy waiting.

> We just need to make sure that waiting never happens by making the buffers large enough and if it still happens print and error.
> [yttao]: this is not a good choice because the KMD needs to protect it instead of hoping the user will not frequently invoke KIQ access.

The only other choice we have is busy waiting, e.g. loop until we get a 
free slot.

Regards,
Christian.

>
>> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
>> [yttao]: sorry, can you give me more details about how to use NOP to reserve space? I will use amdgpu_device_wb_get() for the read operation.
> We could use the NOP PM4 command as Felix suggested, this command has a
> header+length and says that the next X dw should be ignore on the ring
> buffer.
>
> But I think using amdgpu_device_wb_get() is better anyway.
> [yttao]: yes, I agreed with amdgpu_device_wb_get() method because it will fix prevent potential read race condition but NOP method will not prevent it
>
> Regards,
> Christian.
>
>>
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: 2020年4月22日 15:23
>> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>>
>>> BUG_ON(in_interrupt());
>> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
>>
>> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
>>
>> You also don't need to worry to much about overflowing the wb area.
>> Since we run in an atomic context we can have at most the number of CPU in the system + interrupt context here.
>>
>> Regards,
>> Christian.
>>
>> Am 22.04.20 um 09:11 schrieb Tao, Yintian:
>>> Add Felix and Shaoyun
>>>
>>> -----Original Message-----
>>> From: Yintian Tao <yttao@amd.com>
>>> Sent: 2020年4月22日 12:42
>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk
>>> <Monk.Liu@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian <Yintian.Tao@amd.com>
>>> Subject: [PATCH] drm/amdgpu: refine kiq access register
>>>
>>> According to the current kiq access register method, there will be a race condition when using KIQ to read a register if multiple clients want to read at the same time, just like the example below:
>>> 1. client-A starts to read REG-0 through KIQ 2. client-A polls the seqno-0 3. client-B starts to read REG-1 through KIQ 4. client-B polls the seqno-1 5. the kiq completes these two read operations 6. client-A reads the register at the wb buffer and
>>>       get REG-1 value
>>>
>>> And if there are multiple clients to frequently write registers through KIQ which may raise the KIQ ring buffer overwritten problem.
>>>
>>> Therefore, allocate fixed number wb slot for rreg use and limit the submit number which depends on the kiq ring_size in order to prevent the overwritten problem.
>>>
>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   7 +-
>>>     .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  12 +-
>>>     .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  12 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 129 ++++++++++++++++--
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   6 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   6 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  13 +-
>>>     drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |   8 +-
>>>     drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   8 +-
>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  34 +++--
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  12 +-
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  12 +-
>>>     12 files changed, 211 insertions(+), 48 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 4e1d4cfe7a9f..4530e0de4257 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>>>     /*
>>>      * Writeback
>>>      */
>>> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
>>> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>>>     
>>>     struct amdgpu_wb {
>>>     	struct amdgpu_bo	*wb_obj;
>>> @@ -1028,6 +1028,11 @@ bool amdgpu_device_has_dc_support(struct
>>> amdgpu_device *adev);
>>>     
>>>     int emu_soc_asic_init(struct amdgpu_device *adev);
>>>     
>>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read); void
>>> +amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq);
>>> +
>>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs);
>>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
>>>     /*
>>>      * Registers read & write functions.
>>>      */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>> index 691c89705bcd..034c9f416499 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>> @@ -309,6 +309,7 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     			    uint32_t doorbell_off)
>>>     {
>>>     	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>>     	struct v10_compute_mqd *m;
>>>     	uint32_t mec, pipe;
>>> @@ -324,13 +325,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>>     		 mec, pipe, queue_id);
>>>     
>>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto out_unlock;
>>> +	}
>>> +
>>>     	r = amdgpu_ring_alloc(kiq_ring, 7);
>>>     	if (r) {
>>>     		pr_err("Failed to alloc KIQ (%d).\n", r);
>>>     		goto out_unlock;
>>>     	}
>>>     
>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>>     	amdgpu_ring_write(kiq_ring,
>>>     			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8 +357,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>>     	amdgpu_ring_commit(kiq_ring);
>>>     
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     out_unlock:
>>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>>     	release_queue(kgd);
>>>     
>>>     	return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>> index df841c2ac5e7..f243d9990ced 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>> @@ -307,6 +307,7 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     			    uint32_t doorbell_off)
>>>     {
>>>     	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>>     	struct v9_mqd *m;
>>>     	uint32_t mec, pipe;
>>> @@ -322,13 +323,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>>     		 mec, pipe, queue_id);
>>>     
>>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto out_unlock;
>>> +	}
>>> +
>>>     	r = amdgpu_ring_alloc(kiq_ring, 7);
>>>     	if (r) {
>>>     		pr_err("Failed to alloc KIQ (%d).\n", r);
>>>     		goto out_unlock;
>>>     	}
>>>     
>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>>     	amdgpu_ring_write(kiq_ring,
>>>     			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8 +355,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>>     	amdgpu_ring_commit(kiq_ring);
>>>     
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     out_unlock:
>>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>>     	release_queue(kgd);
>>>     
>>>     	return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> index ea576b4260a4..c6b1c77b5eac 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> @@ -295,23 +295,99 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>>>     	return -EINVAL;
>>>     }
>>>     
>>> +void amdgpu_gfx_kiq_reg_offs_get(struct amdgpu_kiq *kiq,
>>> +				 uint32_t *offset)
>>> +{
>>> +	uint32_t tmp = 0;
>>> +
>>> +	tmp = find_first_zero_bit(kiq->used,
>>> +				      kiq->num_slot);
>>> +	if (tmp < kiq->num_slot) {
>>> +		__set_bit(tmp, kiq->used);
>>> +		*offset = kiq->reg_val_offs[tmp];
>>> +	}
>>> +}
>>> +
>>> +void amdgpu_gfx_kiq_reg_offs_free(struct amdgpu_kiq *kiq,
>>> +				 uint32_t offset)
>>> +{
>>> +	uint32_t tmp = 0;
>>> +
>>> +	tmp = (offset - kiq->reg_val_offs[0]) >> 3;
>>> +	if (tmp < kiq->num_slot)
>>> +		__clear_bit(tmp, kiq->used);
>>> +}
>>> +
>>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read) {
>>> +	int retry = MAX_KIQ_REG_TRY;
>>> +
>>> +	BUG_ON(in_interrupt());
>>> +	while (retry-- > 0) {
>>> +		spin_lock(&kiq->ring_lock);
>>> +		if ((atomic64_read(&kiq->submit_avail) > 0) &&
>>> +		    (read ? find_first_zero_bit(kiq->used, kiq->num_slot) <
>>> +		     kiq->num_slot : 1)) {
>>> +				break;
>>> +		} else {
>>> +			spin_unlock(&kiq->ring_lock);
>>> +			msleep(2);
>>> +			continue;
>>> +		}
>>> +
>>> +	}
>>> +
>>> +	/* retry is post-decremented, so a lock acquired on the final
>>> +	 * attempt leaves retry == 0; use >= 0 or the caller gets -ETIME
>>> +	 * while the spinlock is still held. */
>>> +	if (retry >= 0)
>>> +		return 0;
>>> +	else
>>> +		return -ETIME;
>>> +}
>>> +
>>> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq) {
>>> +	spin_unlock(&kiq->ring_lock);
>>> +}
>>> +
>>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
>>> +	atomic64_dec(&kiq->submit_avail);
>>> +	if (offs)
>>> +		amdgpu_gfx_kiq_reg_offs_get(kiq, offs);
>>> +
>>> +}
>>> +
>>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
>>> +	atomic64_inc(&kiq->submit_avail);
>>> +	if (offs)
>>> +		amdgpu_gfx_kiq_reg_offs_free(kiq, *offs); }
>>> +
>>>     int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>     			     struct amdgpu_ring *ring,
>>>     			     struct amdgpu_irq_src *irq)
>>>     {
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -	int r = 0;
>>> +	int r = 0, i, j;
>>>     
>>>     	spin_lock_init(&kiq->ring_lock);
>>>     
>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>> -	if (r)
>>> -		return r;
>>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++) {
>>> +		r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs[i]);
>>> +		if (r) {
>>> +			for (j = 0; j < i; j++)
>>> +				amdgpu_device_wb_free(adev, kiq->reg_val_offs[j]);
>>> +			return r;
>>> +		}
>>> +	}
>>>     
>>>     	ring->adev = NULL;
>>>     	ring->ring_obj = NULL;
>>>     	ring->use_doorbell = true;
>>>     	ring->doorbell_index = adev->doorbell_index.kiq;
>>> +	kiq->num_slot = MAX_KIQ_RREG_NUM;
>>> +	memset(&kiq->used, 0, sizeof(kiq->used));
>>> +
>>>     
>>>     	r = amdgpu_gfx_kiq_acquire(adev, ring);
>>>     	if (r)
>>> @@ -325,13 +401,20 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>     			     AMDGPU_RING_PRIO_DEFAULT);
>>>     	if (r)
>>>     		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
>>> +	else
>>> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
>>> +			     (ring->funcs->align_mask + 1));
>>>     
>>>     	return r;
>>>     }
>>>     
>>>     void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>> +	struct amdgpu_kiq *kiq = &ring->adev->gfx.kiq;
>>> +	int i = 0;
>>> +
>>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++)
>>> +		amdgpu_device_wb_free(ring->adev, kiq->reg_val_offs[i]);
>>>     	amdgpu_ring_fini(ring);
>>>     }
>>>     
>>> @@ -671,19 +754,24 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,  uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  {
>>>     	signed long r, cnt = 0;
>>> -	unsigned long flags;
>>> -	uint32_t seq;
>>> +	uint32_t seq, reg_val_offs = 0, value = 0;
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring;
>>>     
>>>     	BUG_ON(!ring->funcs->emit_rreg);
>>>     
>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto kiq_read_exit;
>>> +	}
>>> +
>>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>>     	amdgpu_ring_alloc(ring, 32);
>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>     
>>>     	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>     
>>> @@ -707,9 +795,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>     		goto failed_kiq_read;
>>>     
>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>> +	mb();
>>> +	value = adev->wb.wb[reg_val_offs];
>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>> +	return value;
>>>     
>>>     failed_kiq_read:
>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>> +kiq_read_exit:
>>>     	pr_err("failed to read reg:%x\n", reg);
>>>     	return ~0;
>>>     }
>>> @@ -717,19 +810,24 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)  {
>>>     	signed long r, cnt = 0;
>>> -	unsigned long flags;
>>>     	uint32_t seq;
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring;
>>>     
>>>     	BUG_ON(!ring->funcs->emit_wreg);
>>>     
>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto kiq_write_exit;
>>> +	}
>>> +
>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     	amdgpu_ring_alloc(ring, 32);
>>>     	amdgpu_ring_emit_wreg(ring, reg, v);
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>     
>>>     	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>     
>>> @@ -754,8 +852,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>     		goto failed_kiq_write;
>>>     
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     	return;
>>>     
>>>     failed_kiq_write:
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>> +kiq_write_exit:
>>>     	pr_err("failed to write reg:%x\n", reg);  } diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> index 634746829024..bb05594c27e0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>>>     	int invalidate_tlbs_size;
>>>     };
>>>     
>>> +#define MAX_KIQ_RREG_NUM 64
>>>     struct amdgpu_kiq {
>>>     	u64			eop_gpu_addr;
>>>     	struct amdgpu_bo	*eop_obj;
>>> @@ -103,7 +104,10 @@ struct amdgpu_kiq {
>>>     	struct amdgpu_ring	ring;
>>>     	struct amdgpu_irq_src	irq;
>>>     	const struct kiq_pm4_funcs *pmf;
>>> -	uint32_t			reg_val_offs;
>>> +	uint32_t		reg_val_offs[MAX_KIQ_RREG_NUM];
>>> +	atomic64_t		submit_avail;
>>> +	uint32_t		num_slot;
>>> +	unsigned long		used[DIV_ROUND_UP(MAX_KIQ_RREG_NUM, BITS_PER_LONG)];
>>>     };
>>>     
>>>     /*
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index f61664ee4940..b0ea4f67a6e1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>     	void (*end_use)(struct amdgpu_ring *ring);
>>>     	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>     	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>> +			  uint32_t reg_val_offs);
>>>     	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>     	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>     			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct
>>> amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>     #define amdgpu_ring_emit_cntxcntl(r, d)
>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r,
>>> d) (r)->funcs->emit_rreg((r), (d))
>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r),
>>> +(d),
>>> +(o))
>>>     #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) @@ -308,6 +309,7 @@ static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)  {
>>>     	if (ring->count_dw <= 0)
>>>     		DRM_ERROR("amdgpu: writing more dwords to the ring than
>>> expected!\n");
>>> +
>>>     	ring->ring[ring->wptr++ & ring->buf_mask] = v;
>>>     	ring->wptr &= ring->ptr_mask;
>>>     	ring->count_dw--;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index 8c10084f44ef..dd54ea802a9a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	unsigned long flags;
>>>     	uint32_t seq;
>>>     
>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto failed_exit;
>>> +	}
>>> +
>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     	amdgpu_ring_alloc(ring, 32);
>>>     	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>     					    ref, mask);
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>     
>>>     	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>     
>>> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>     		goto failed_kiq;
>>>     
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     	return;
>>>     
>>>     failed_kiq:
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>> +failed_exit:
>>>     	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);  }
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index 0a03e2ad5d95..7853dbc3c8bd 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>     	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>>>     
>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>> uint32_t reg)
>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				     uint32_t reg_val_offs)
>>>     {
>>>     	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     
>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>     	amdgpu_ring_write(ring, reg);
>>>     	amdgpu_ring_write(ring, 0);
>>>     	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     }
>>>     
>>>     static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index fc6c2f2bc76c..bdbd92d4fe45 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>     		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>>>     
>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>> uint32_t reg)
>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint32_t reg_val_offs)
>>>     {
>>>     	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     
>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>     	amdgpu_ring_write(ring, reg);
>>>     	amdgpu_ring_write(ring, 0);
>>>     	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     }
>>>     
>>>     static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 84fcf842316d..f7dd1036986e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -4042,14 +4042,20 @@ static int gfx_v9_0_soft_reset(void *handle)  static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)  {
>>>     	signed long r, cnt = 0;
>>> -	unsigned long flags;
>>> -	uint32_t seq;
>>> +	uint32_t seq, reg_val_offs;
>>> +	uint64_t value = 0;
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring;
>>>     
>>>     	BUG_ON(!ring->funcs->emit_rreg);
>>>     
>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto failed_kiq_exit;
>>> +	}
>>> +
>>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>>     	amdgpu_ring_alloc(ring, 32);
>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>     	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>> @@ -4059,12 +4065,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>     	amdgpu_ring_write(ring, 0);
>>>     	amdgpu_ring_write(ring, 0);
>>>     	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>     
>>>     	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>     
>>> @@ -4088,10 +4094,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>     		goto failed_kiq_read;
>>>     
>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
>>> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>>>     
>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>> +	return value;
>>>     failed_kiq_read:
>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>> +failed_kiq_exit:
>>>     	pr_err("failed to read gpu clock\n");
>>>     	return ~0;
>>>     }
>>> @@ -5482,10 +5492,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>     		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>>>     
>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>> uint32_t reg)
>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint32_t reg_val_offs)
>>>     {
>>>     	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     
>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>> @@ -5494,9 +5504,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>     	amdgpu_ring_write(ring, reg);
>>>     	amdgpu_ring_write(ring, 0);
>>>     	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     }
>>>     
>>>     static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 30b75d79efdb..6b4dc0ed4fff 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -422,20 +422,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     
>>>     	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
>>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +		if (r) {
>>> +			pr_err("failed to lock kiq\n");
>>> +			return -ETIME;
>>> +		}
>>> +
>>>     		/* 2 dwords flush + 8 dwords fence */
>>>     		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
>>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     		kiq->pmf->kiq_invalidate_tlbs(ring,
>>>     					pasid, flush_type, all_hub);
>>>     		amdgpu_fence_emit_polling(ring, &seq);
>>>     		amdgpu_ring_commit(ring);
>>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>>> +		amdgpu_gfx_kiq_unlock(kiq);
>>>     		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>>     		if (r < 1) {
>>>     			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>>> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     			return -ETIME;
>>>     		}
>>>     
>>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     		return 0;
>>>     	}
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index fecdbc471983..6238702c80b4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -613,7 +613,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>     		if (vega20_xgmi_wa)
>>>     			ndw += kiq->pmf->invalidate_tlbs_size;
>>>     
>>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +		if (r) {
>>> +			pr_err("failed to lock kiq\n");
>>> +			return -ETIME;
>>> +		}
>>> +
>>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     		/* 2 dwords flush + 8 dwords fence */
>>>     		amdgpu_ring_alloc(ring, ndw);
>>>     		if (vega20_xgmi_wa)
>>> @@ -623,13 +629,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>     					pasid, flush_type, all_hub);
>>>     		amdgpu_fence_emit_polling(ring, &seq);
>>>     		amdgpu_ring_commit(ring);
>>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>>> +		amdgpu_gfx_kiq_unlock(kiq);
>>>     		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>>     		if (r < 1) {
>>>     			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>>> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     			return -ETIME;
>>>     		}
>>>     
>>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     		return 0;
>>>     	}
>>>     
>>> --
>>> 2.17.1
>>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CYintian.Tao%40amd.com%7C16d5caeab344478a395808d7e69070d3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637231380366556593&amp;sdata=mRnRzlSlFufeHytPmCLjKlSon2V6SSSC%2Fxx9c55F2gU%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22  7:53           ` Christian König
@ 2020-04-22  8:06             ` Tao, Yintian
  2020-04-22  8:23               ` Christian König
  0 siblings, 1 reply; 17+ messages in thread
From: Tao, Yintian @ 2020-04-22  8:06 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Liu, Shaoyun, Kuehling, Felix; +Cc: amd-gfx

Hi  Christian

Please see inline comments

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: 2020年4月22日 15:54
To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq access register

Am 22.04.20 um 09:49 schrieb Tao, Yintian:
> Hi Christian
>
>
> Please see inline comments.
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: 2020年4月22日 15:40
> To: Tao, Yintian <Yintian.Tao@amd.com>; Koenig, Christian 
> <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, Shaoyun 
> <Shaoyun.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>
> Am 22.04.20 um 09:35 schrieb Tao, Yintian:
>> Hi  Christian
>>
>>
>>> BUG_ON(in_interrupt());
>> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
>> [yttao]: According to the current drm-next code, I have not found anywhere that registers are accessed through KIQ in interrupt context.
>> 		And you need to wait for free kiq ring buffer space if there is no free kiq ring buffer; waiting in interrupt context here is illegal.
> Waiting in atomic context is illegal as well, but we don't have much other choice.
> [yttao]: no, there is no sleep in atomic context at my patch.

I'm not talking about sleeping, but about busy waiting.

> We just need to make sure that waiting never happens by making the buffers large enough and if it still happens print and error.
> [yttao]: this is not the good choice because KMD need to protect it instead of hoping user not frequently invoke KIQ acess.

The only other choice we have is busy waiting, e.g. loop until we get a free slot.
[yttao]: Yes, right now my patch uses msleep() for busy waiting. Or do you mean we need to use udelay()? If we use udelay(), it will be a nightmare under multi-VF.
		Because it is assumed that there are 16 VFs with a 6ms world-switch, in the worst case one VF will udelay(16*6ms = 96ms) to get one free slot.


Regards,
Christian.

>
>> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
>> [yttao]: sorry, can you give me more details about how to use NOP to reserve space? I will use amdgpu_device_wb_get() for the read operation.
> We could use the NOP PM4 command as Felix suggested, this command has 
> a
> header+length and says that the next X dw should be ignore on the ring
> buffer.
>
> But I think using amdgpu_device_wb_get() is better anyway.
> [yttao]: yes, I agreed with amdgpu_device_wb_get() method because it 
> will fix prevent potential read race condition but NOP method will not 
> prevent it
>
> Regards,
> Christian.
>
>>
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: 2020年4月22日 15:23
>> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; 
>> Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix 
>> <Felix.Kuehling@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>>
>>> BUG_ON(in_interrupt());
>> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
>>
>> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
>>
>> You also don't need to worry to much about overflowing the wb area.
>> Since we run in an atomic context we can have at most the number of CPU in the system + interrupt context here.
>>
>> Regards,
>> Christian.
>>
>> Am 22.04.20 um 09:11 schrieb Tao, Yintian:
>>> Add Felix and Shaoyun
>>>
>>> -----Original Message-----
>>> From: Yintian Tao <yttao@amd.com>
>>> Sent: 2020年4月22日 12:42
>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk 
>>> <Monk.Liu@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian 
>>> <Yintian.Tao@amd.com>
>>> Subject: [PATCH] drm/amdgpu: refine kiq access register
>>>
>>> According to the current kiq access register method, there will be a race condition when using KIQ to read a register if multiple clients want to read at the same time, just like the example below:
>>> 1. client-A start to read REG-0 through KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
>>>       get REG-1 value
>>>
>>> And if there are multiple clients to frequently write registers through KIQ which may raise the KIQ ring buffer overwritten problem.
>>>
>>> Therefore, allocate fixed number wb slot for rreg use and limit the submit number which depends on the kiq ring_size in order to prevent the overwritten problem.
>>>
>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   7 +-
>>>     .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  12 +-
>>>     .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  12 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 129 ++++++++++++++++--
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   6 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   6 +-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  13 +-
>>>     drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |   8 +-
>>>     drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   8 +-
>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  34 +++--
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  12 +-
>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  12 +-
>>>     12 files changed, 211 insertions(+), 48 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 4e1d4cfe7a9f..4530e0de4257 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>>>     /*
>>>      * Writeback
>>>      */
>>> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
>>> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>>>     
>>>     struct amdgpu_wb {
>>>     	struct amdgpu_bo	*wb_obj;
>>> @@ -1028,6 +1028,11 @@ bool amdgpu_device_has_dc_support(struct
>>> amdgpu_device *adev);
>>>     
>>>     int emu_soc_asic_init(struct amdgpu_device *adev);
>>>     
>>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read); void 
>>> +amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq);
>>> +
>>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t 
>>> +*offs); void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, 
>>> +uint32_t *offs);
>>>     /*
>>>      * Registers read & write functions.
>>>      */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>> index 691c89705bcd..034c9f416499 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>> @@ -309,6 +309,7 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     			    uint32_t doorbell_off)
>>>     {
>>>     	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>>     	struct v10_compute_mqd *m;
>>>     	uint32_t mec, pipe;
>>> @@ -324,13 +325,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>>     		 mec, pipe, queue_id);
>>>     
>>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto out_unlock;
>>> +	}
>>> +
>>>     	r = amdgpu_ring_alloc(kiq_ring, 7);
>>>     	if (r) {
>>>     		pr_err("Failed to alloc KIQ (%d).\n", r);
>>>     		goto out_unlock;
>>>     	}
>>>     
>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>>     	amdgpu_ring_write(kiq_ring,
>>>     			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8 +357,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>>     	amdgpu_ring_commit(kiq_ring);
>>>     
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     out_unlock:
>>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>>     	release_queue(kgd);
>>>     
>>>     	return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>> index df841c2ac5e7..f243d9990ced 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>> @@ -307,6 +307,7 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     			    uint32_t doorbell_off)
>>>     {
>>>     	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>>     	struct v9_mqd *m;
>>>     	uint32_t mec, pipe;
>>> @@ -322,13 +323,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>>     		 mec, pipe, queue_id);
>>>     
>>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto out_unlock;
>>> +	}
>>> +
>>>     	r = amdgpu_ring_alloc(kiq_ring, 7);
>>>     	if (r) {
>>>     		pr_err("Failed to alloc KIQ (%d).\n", r);
>>>     		goto out_unlock;
>>>     	}
>>>     
>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>>     	amdgpu_ring_write(kiq_ring,
>>>     			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8 +355,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>     	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>>     	amdgpu_ring_commit(kiq_ring);
>>>     
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     out_unlock:
>>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>>     	release_queue(kgd);
>>>     
>>>     	return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> index ea576b4260a4..c6b1c77b5eac 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> @@ -295,23 +295,99 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>>>     	return -EINVAL;
>>>     }
>>>     
>>> +void amdgpu_gfx_kiq_reg_offs_get(struct amdgpu_kiq *kiq,
>>> +				 uint32_t *offset)
>>> +{
>>> +	uint32_t tmp = 0;
>>> +
>>> +	tmp = find_first_zero_bit(kiq->used,
>>> +				      kiq->num_slot);
>>> +	if (tmp < kiq->num_slot) {
>>> +		__set_bit(tmp, kiq->used);
>>> +		*offset = kiq->reg_val_offs[tmp];
>>> +	}
>>> +}
>>> +
>>> +void amdgpu_gfx_kiq_reg_offs_free(struct amdgpu_kiq *kiq,
>>> +				 uint32_t offset)
>>> +{
>>> +	uint32_t tmp = 0;
>>> +
>>> +	tmp = (offset - kiq->reg_val_offs[0]) >> 3;
>>> +	if (tmp < kiq->num_slot)
>>> +		__clear_bit(tmp, kiq->used);
>>> +}
>>> +
>>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read) {
>>> +	int retry = MAX_KIQ_REG_TRY;
>>> +
>>> +	BUG_ON(in_interrupt());
>>> +	while (retry-- > 0) {
>>> +		spin_lock(&kiq->ring_lock);
>>> +		if ((atomic64_read(&kiq->submit_avail) > 0) &&
>>> +		    (read ? find_first_zero_bit(kiq->used, kiq->num_slot) <
>>> +		     kiq->num_slot : 1)) {
>>> +				break;
>>> +		} else {
>>> +			spin_unlock(&kiq->ring_lock);
>>> +			msleep(2);
>>> +			continue;
>>> +		}
>>> +
>>> +	}
>>> +
>>> +	if (retry > 0)
>>> +		return 0;
>>> +	else
>>> +		return -ETIME;
>>> +}
>>> +
>>> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq) {
>>> +	spin_unlock(&kiq->ring_lock);
>>> +}
>>> +
>>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
>>> +	atomic64_dec(&kiq->submit_avail);
>>> +	if (offs)
>>> +		amdgpu_gfx_kiq_reg_offs_get(kiq, offs);
>>> +
>>> +}
>>> +
>>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
>>> +	atomic64_inc(&kiq->submit_avail);
>>> +	if (offs)
>>> +		amdgpu_gfx_kiq_reg_offs_free(kiq, *offs); }
>>> +
>>>     int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>     			     struct amdgpu_ring *ring,
>>>     			     struct amdgpu_irq_src *irq)
>>>     {
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> -	int r = 0;
>>> +	int r = 0, i, j;
>>>     
>>>     	spin_lock_init(&kiq->ring_lock);
>>>     
>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>> -	if (r)
>>> -		return r;
>>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++) {
>>> +		r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs[i]);
>>> +		if (r) {
>>> +			for (j = 0; j < i; j++)
>>> +				amdgpu_device_wb_free(adev, kiq->reg_val_offs[j]);
>>> +			return r;
>>> +		}
>>> +	}
>>>     
>>>     	ring->adev = NULL;
>>>     	ring->ring_obj = NULL;
>>>     	ring->use_doorbell = true;
>>>     	ring->doorbell_index = adev->doorbell_index.kiq;
>>> +	kiq->num_slot = MAX_KIQ_RREG_NUM;
>>> +	memset(&kiq->used, 0, sizeof(kiq->used));
>>> +
>>>     
>>>     	r = amdgpu_gfx_kiq_acquire(adev, ring);
>>>     	if (r)
>>> @@ -325,13 +401,20 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>     			     AMDGPU_RING_PRIO_DEFAULT);
>>>     	if (r)
>>>     		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
>>> +	else
>>> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
>>> +			     (ring->funcs->align_mask + 1));
>>>     
>>>     	return r;
>>>     }
>>>     
>>>     void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>> +	struct amdgpu_kiq *kiq = &ring->adev->gfx.kiq;
>>> +	int i = 0;
>>> +
>>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++)
>>> +		amdgpu_device_wb_free(ring->adev, kiq->reg_val_offs[i]);
>>>     	amdgpu_ring_fini(ring);
>>>     }
>>>     
>>> @@ -671,19 +754,24 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,  uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  {
>>>     	signed long r, cnt = 0;
>>> -	unsigned long flags;
>>> -	uint32_t seq;
>>> +	uint32_t seq, reg_val_offs = 0, value = 0;
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring;
>>>     
>>>     	BUG_ON(!ring->funcs->emit_rreg);
>>>     
>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto kiq_read_exit;
>>> +	}
>>> +
>>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>>     	amdgpu_ring_alloc(ring, 32);
>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>     
>>>     	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>     
>>> @@ -707,9 +795,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>     		goto failed_kiq_read;
>>>     
>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>> +	mb();
>>> +	value = adev->wb.wb[reg_val_offs];
>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>> +	return value;
>>>     
>>>     failed_kiq_read:
>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>> +kiq_read_exit:
>>>     	pr_err("failed to read reg:%x\n", reg);
>>>     	return ~0;
>>>     }
>>> @@ -717,19 +810,24 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)  {
>>>     	signed long r, cnt = 0;
>>> -	unsigned long flags;
>>>     	uint32_t seq;
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring;
>>>     
>>>     	BUG_ON(!ring->funcs->emit_wreg);
>>>     
>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto kiq_write_exit;
>>> +	}
>>> +
>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     	amdgpu_ring_alloc(ring, 32);
>>>     	amdgpu_ring_emit_wreg(ring, reg, v);
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>     
>>>     	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>     
>>> @@ -754,8 +852,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>     		goto failed_kiq_write;
>>>     
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     	return;
>>>     
>>>     failed_kiq_write:
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>> +kiq_write_exit:
>>>     	pr_err("failed to write reg:%x\n", reg);  } diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> index 634746829024..bb05594c27e0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>>>     	int invalidate_tlbs_size;
>>>     };
>>>     
>>> +#define MAX_KIQ_RREG_NUM 64
>>>     struct amdgpu_kiq {
>>>     	u64			eop_gpu_addr;
>>>     	struct amdgpu_bo	*eop_obj;
>>> @@ -103,7 +104,10 @@ struct amdgpu_kiq {
>>>     	struct amdgpu_ring	ring;
>>>     	struct amdgpu_irq_src	irq;
>>>     	const struct kiq_pm4_funcs *pmf;
>>> -	uint32_t			reg_val_offs;
>>> +	uint32_t		reg_val_offs[MAX_KIQ_RREG_NUM];
>>> +	atomic64_t		submit_avail;
>>> +	uint32_t		num_slot;
>>> +	unsigned long		used[DIV_ROUND_UP(MAX_KIQ_RREG_NUM, BITS_PER_LONG)];
>>>     };
>>>     
>>>     /*
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index f61664ee4940..b0ea4f67a6e1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>     	void (*end_use)(struct amdgpu_ring *ring);
>>>     	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>     	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>> +			  uint32_t reg_val_offs);
>>>     	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>     	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>     			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ 
>>> struct amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>     #define amdgpu_ring_emit_cntxcntl(r, d) 
>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define 
>>> amdgpu_ring_emit_rreg(r,
>>> d) (r)->funcs->emit_rreg((r), (d))
>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
>>> +(d),
>>> +(o))
>>>     #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) @@ -308,6 +309,7 @@ static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)  {
>>>     	if (ring->count_dw <= 0)
>>>     		DRM_ERROR("amdgpu: writing more dwords to the ring than 
>>> expected!\n");
>>> +
>>>     	ring->ring[ring->wptr++ & ring->buf_mask] = v;
>>>     	ring->wptr &= ring->ptr_mask;
>>>     	ring->count_dw--;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index 8c10084f44ef..dd54ea802a9a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	unsigned long flags;
>>>     	uint32_t seq;
>>>     
>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto failed_exit;
>>> +	}
>>> +
>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     	amdgpu_ring_alloc(ring, 32);
>>>     	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>     					    ref, mask);
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>     
>>>     	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>     
>>> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>     		goto failed_kiq;
>>>     
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     	return;
>>>     
>>>     failed_kiq:
>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>> +failed_exit:
>>>     	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);  }
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index 0a03e2ad5d95..7853dbc3c8bd 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>     	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>>>     
>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				     uint32_t reg_val_offs)
>>>     {
>>>     	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     
>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>     	amdgpu_ring_write(ring, reg);
>>>     	amdgpu_ring_write(ring, 0);
>>>     	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     }
>>>     
>>>     static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index fc6c2f2bc76c..bdbd92d4fe45 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>     		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>>>     
>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint32_t reg_val_offs)
>>>     {
>>>     	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     
>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>     	amdgpu_ring_write(ring, reg);
>>>     	amdgpu_ring_write(ring, 0);
>>>     	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     }
>>>     
>>>     static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 84fcf842316d..f7dd1036986e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -4042,14 +4042,20 @@ static int gfx_v9_0_soft_reset(void *handle)  static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)  {
>>>     	signed long r, cnt = 0;
>>> -	unsigned long flags;
>>> -	uint32_t seq;
>>> +	uint32_t seq, reg_val_offs;
>>> +	uint64_t value = 0;
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     	struct amdgpu_ring *ring = &kiq->ring;
>>>     
>>>     	BUG_ON(!ring->funcs->emit_rreg);
>>>     
>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>>> +	if (r) {
>>> +		pr_err("failed to lock kiq\n");
>>> +		goto failed_kiq_exit;
>>> +	}
>>> +
>>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>>     	amdgpu_ring_alloc(ring, 32);
>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>     	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>> @@ -4059,12 +4065,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>     	amdgpu_ring_write(ring, 0);
>>>     	amdgpu_ring_write(ring, 0);
>>>     	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     	amdgpu_fence_emit_polling(ring, &seq);
>>>     	amdgpu_ring_commit(ring);
>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>     
>>>     	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>     
>>> @@ -4088,10 +4094,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>     	if (cnt > MAX_KIQ_REG_TRY)
>>>     		goto failed_kiq_read;
>>>     
>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
>>> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>>>     
>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>> +	return value;
>>>     failed_kiq_read:
>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>> +failed_kiq_exit:
>>>     	pr_err("failed to read gpu clock\n");
>>>     	return ~0;
>>>     }
>>> @@ -5482,10 +5492,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>     		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>>>     
>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>> uint32_t reg)
>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>> +				    uint32_t reg_val_offs)
>>>     {
>>>     	struct amdgpu_device *adev = ring->adev;
>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     
>>>     	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>     	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>> @@ -5494,9 +5504,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>     	amdgpu_ring_write(ring, reg);
>>>     	amdgpu_ring_write(ring, 0);
>>>     	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>> -				kiq->reg_val_offs * 4));
>>> +				reg_val_offs * 4));
>>>     }
>>>     
>>>     static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 30b75d79efdb..6b4dc0ed4fff 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -422,20 +422,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>     	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>     
>>>     	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
>>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +		if (r) {
>>> +			pr_err("failed to lock kiq\n");
>>> +			return -ETIME;
>>> +		}
>>> +
>>>     		/* 2 dwords flush + 8 dwords fence */
>>>     		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
>>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     		kiq->pmf->kiq_invalidate_tlbs(ring,
>>>     					pasid, flush_type, all_hub);
>>>     		amdgpu_fence_emit_polling(ring, &seq);
>>>     		amdgpu_ring_commit(ring);
>>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>>> +		amdgpu_gfx_kiq_unlock(kiq);
>>>     		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>>     		if (r < 1) {
>>>     			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>>> +			amdgpu_gfx_kiq_restore(kiq, false);
>>>     			return -ETIME;
>>>     		}
>>>     
>>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     		return 0;
>>>     	}
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index fecdbc471983..6238702c80b4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -613,7 +613,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>     		if (vega20_xgmi_wa)
>>>     			ndw += kiq->pmf->invalidate_tlbs_size;
>>>     
>>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>>> +		if (r) {
>>> +			pr_err("failed to lock kiq\n");
>>> +			return -ETIME;
>>> +		}
>>> +
>>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>>     		/* 2 dwords flush + 8 dwords fence */
>>>     		amdgpu_ring_alloc(ring, ndw);
>>>     		if (vega20_xgmi_wa)
>>> @@ -623,13 +629,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>     					pasid, flush_type, all_hub);
>>>     		amdgpu_fence_emit_polling(ring, &seq);
>>>     		amdgpu_ring_commit(ring);
>>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>>> +		amdgpu_gfx_kiq_unlock(kiq);
>>>     		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>>     		if (r < 1) {
>>>     			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>>> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     			return -ETIME;
>>>     		}
>>>     
>>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>>     		return 0;
>>>     	}
>>>     
>>> --
>>> 2.17.1
>>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>> ts.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7C
>> Yintian.Tao%40amd.com%7C16d5caeab344478a395808d7e69070d3%7C3dd8961fe4
>> 884e608e11a82d994e183d%7C0%7C0%7C637231380366556593&amp;sdata=mRnRzlS
>> lFufeHytPmCLjKlSon2V6SSSC%2Fxx9c55F2gU%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22  8:06             ` Tao, Yintian
@ 2020-04-22  8:23               ` Christian König
  2020-04-22  8:34                 ` Tao, Yintian
  0 siblings, 1 reply; 17+ messages in thread
From: Christian König @ 2020-04-22  8:23 UTC (permalink / raw)
  To: Tao, Yintian, Liu, Monk, Liu, Shaoyun, Kuehling, Felix; +Cc: amd-gfx

Am 22.04.20 um 10:06 schrieb Tao, Yintian:
> Hi  Christian
>
> Please see inline comments
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: 2020年4月22日 15:54
> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>
> Am 22.04.20 um 09:49 schrieb Tao, Yintian:
>> Hi Christian
>>
>>
>> Please see inline comments.
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: 2020年4月22日 15:40
>> To: Tao, Yintian <Yintian.Tao@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, Shaoyun
>> <Shaoyun.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>>
>> Am 22.04.20 um 09:35 schrieb Tao, Yintian:
>>> Hi  Christian
>>>
>>>
>>>> BUG_ON(in_interrupt());
>>> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
>>> [yttao]: According to the current drm-next code, I have not found where registers are accessed through KIQ in interrupt context.
>>> 		And you need to wait for free kiq ring buffer space if there is no free kiq ring buffer; waiting in interrupt context here is illegal.
>> Waiting in atomic context is illegal as well, but we don't have much other choice.
>> [yttao]: no, there is no sleep in atomic context in my patch.
> I'm not talking about a sleeping, but busy waiting.
>
>> We just need to make sure that waiting never happens by making the buffers large enough and if it still happens print and error.
>> [yttao]: this is not a good choice because KMD needs to protect it instead of hoping the user will not frequently invoke KIQ access.
> The only other choice we have is busy waiting, e.g. loop until we get a free slot.
> [yttao]: Yes, now my patch uses msleep() for busy waiting. Or do you mean we need to use udelay()? If we use udelay(), it will be a nightmare under multi-VF.
> 		Because it is assumed that there are 16 VFs within a 6ms world-switch, the worst case is that one VF will udelay(16*6ms = 96ms) to get one free slot.

You can't use msleep() here since sleeping in atomic or interrupt 
context is forbidden.

The trick is that in atomic context the CPU can't switch to a different 
process, so we have a very limited number of concurrent KIQ reads which 
can happen.

With a MAX_WB of 256 we can easily have 128 CPUs and don't run into 
problems.

Regards,
Christian.

>
>
> Regards,
> Christian.
>
>>> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
>>> [yttao]: sorry, can you give me more details about how to use NOP to reserve space? I will use amdgpu_device_wb_get() for the read operation.
>> We could use the NOP PM4 command as Felix suggested, this command has
>> a
>> header+length and says that the next X dw should be ignore on the ring
>> buffer.
>>
>> But I think using amdgpu_device_wb_get() is better anyway.
>> [yttao]: yes, I agreed with amdgpu_device_wb_get() method because it
>> will fix prevent potential read race condition but NOP method will not
>> prevent it
>>
>> Regards,
>> Christian.
>>
>>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: 2020年4月22日 15:23
>>> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>;
>>> Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix
>>> <Felix.Kuehling@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>>>
>>>> BUG_ON(in_interrupt());
>>> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
>>>
>>> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
>>>
>>> You also don't need to worry to much about overflowing the wb area.
>>> Since we run in an atomic context we can have at most the number of CPU in the system + interrupt context here.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 22.04.20 um 09:11 schrieb Tao, Yintian:
>>>> Add Felix and Shaoyun
>>>>
>>>> -----Original Message-----
>>>> From: Yintian Tao <yttao@amd.com>
>>>> Sent: 2020年4月22日 12:42
>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk
>>>> <Monk.Liu@amd.com>
>>>> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian
>>>> <Yintian.Tao@amd.com>
>>>> Subject: [PATCH] drm/amdgpu: refine kiq access register
>>>>
>>>> According to the current kiq access register method, there will be a race condition when using KIQ to read a register if multiple clients want to read at the same time, just like the example below:
>>>> 1. client-A start to read REG-0 through KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
>>>>        get REG-1 value
>>>>
>>>> And if there are multiple clients to frequently write registers through KIQ which may raise the KIQ ring buffer overwritten problem.
>>>>
>>>> Therefore, allocate fixed number wb slot for rreg use and limit the submit number which depends on the kiq ring_size in order to prevent the overwritten problem.
>>>>
>>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   7 +-
>>>>      .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  12 +-
>>>>      .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  12 +-
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 129 ++++++++++++++++--
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   6 +-
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   6 +-
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  13 +-
>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |   8 +-
>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   8 +-
>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  34 +++--
>>>>      drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  12 +-
>>>>      drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  12 +-
>>>>      12 files changed, 211 insertions(+), 48 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index 4e1d4cfe7a9f..4530e0de4257 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>>>>      /*
>>>>       * Writeback
>>>>       */
>>>> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
>>>> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>>>>      
>>>>      struct amdgpu_wb {
>>>>      	struct amdgpu_bo	*wb_obj;
>>>> @@ -1028,6 +1028,11 @@ bool amdgpu_device_has_dc_support(struct
>>>> amdgpu_device *adev);
>>>>      
>>>>      int emu_soc_asic_init(struct amdgpu_device *adev);
>>>>      
>>>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read); void
>>>> +amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq);
>>>> +
>>>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t
>>>> +*offs); void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq,
>>>> +uint32_t *offs);
>>>>      /*
>>>>       * Registers read & write functions.
>>>>       */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>>> index 691c89705bcd..034c9f416499 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>>> @@ -309,6 +309,7 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      			    uint32_t doorbell_off)
>>>>      {
>>>>      	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>>>      	struct v10_compute_mqd *m;
>>>>      	uint32_t mec, pipe;
>>>> @@ -324,13 +325,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>>>      		 mec, pipe, queue_id);
>>>>      
>>>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto out_unlock;
>>>> +	}
>>>> +
>>>>      	r = amdgpu_ring_alloc(kiq_ring, 7);
>>>>      	if (r) {
>>>>      		pr_err("Failed to alloc KIQ (%d).\n", r);
>>>>      		goto out_unlock;
>>>>      	}
>>>>      
>>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>>>      	amdgpu_ring_write(kiq_ring,
>>>>      			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8 +357,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>>>      	amdgpu_ring_commit(kiq_ring);
>>>>      
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      out_unlock:
>>>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>>>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>>>      	release_queue(kgd);
>>>>      
>>>>      	return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>>> index df841c2ac5e7..f243d9990ced 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>>> @@ -307,6 +307,7 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      			    uint32_t doorbell_off)
>>>>      {
>>>>      	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>>>      	struct v9_mqd *m;
>>>>      	uint32_t mec, pipe;
>>>> @@ -322,13 +323,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>>>      		 mec, pipe, queue_id);
>>>>      
>>>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto out_unlock;
>>>> +	}
>>>> +
>>>>      	r = amdgpu_ring_alloc(kiq_ring, 7);
>>>>      	if (r) {
>>>>      		pr_err("Failed to alloc KIQ (%d).\n", r);
>>>>      		goto out_unlock;
>>>>      	}
>>>>      
>>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>>>      	amdgpu_ring_write(kiq_ring,
>>>>      			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8 +355,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>>>      	amdgpu_ring_commit(kiq_ring);
>>>>      
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      out_unlock:
>>>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>>>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>>>      	release_queue(kgd);
>>>>      
>>>>      	return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> index ea576b4260a4..c6b1c77b5eac 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> @@ -295,23 +295,99 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>>>>      	return -EINVAL;
>>>>      }
>>>>      
>>>> +void amdgpu_gfx_kiq_reg_offs_get(struct amdgpu_kiq *kiq,
>>>> +				 uint32_t *offset)
>>>> +{
>>>> +	uint32_t tmp = 0;
>>>> +
>>>> +	tmp = find_first_zero_bit(kiq->used,
>>>> +				      kiq->num_slot);
>>>> +	if (tmp < kiq->num_slot) {
>>>> +		__set_bit(tmp, kiq->used);
>>>> +		*offset = kiq->reg_val_offs[tmp];
>>>> +	}
>>>> +}
>>>> +
>>>> +void amdgpu_gfx_kiq_reg_offs_free(struct amdgpu_kiq *kiq,
>>>> +				 uint32_t offset)
>>>> +{
>>>> +	uint32_t tmp = 0;
>>>> +
>>>> +	tmp = (offset - kiq->reg_val_offs[0]) >> 3;
>>>> +	if (tmp < kiq->num_slot)
>>>> +		__clear_bit(tmp, kiq->used);
>>>> +}
>>>> +
>>>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read) {
>>>> +	int retry = MAX_KIQ_REG_TRY;
>>>> +
>>>> +	BUG_ON(in_interrupt());
>>>> +	while (retry-- > 0) {
>>>> +		spin_lock(&kiq->ring_lock);
>>>> +		if ((atomic64_read(&kiq->submit_avail) > 0) &&
>>>> +		    (read ? find_first_zero_bit(kiq->used, kiq->num_slot) <
>>>> +		     kiq->num_slot : 1)) {
>>>> +				break;
>>>> +		} else {
>>>> +			spin_unlock(&kiq->ring_lock);
>>>> +			msleep(2);
>>>> +			continue;
>>>> +		}
>>>> +
>>>> +	}
>>>> +
>>>> +	if (retry > 0)
>>>> +		return 0;
>>>> +	else
>>>> +		return -ETIME;
>>>> +}
>>>> +
>>>> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq) {
>>>> +	spin_unlock(&kiq->ring_lock);
>>>> +}
>>>> +
>>>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
>>>> +	atomic64_dec(&kiq->submit_avail);
>>>> +	if (offs)
>>>> +		amdgpu_gfx_kiq_reg_offs_get(kiq, offs);
>>>> +
>>>> +}
>>>> +
>>>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
>>>> +	atomic64_inc(&kiq->submit_avail);
>>>> +	if (offs)
>>>> +		amdgpu_gfx_kiq_reg_offs_free(kiq, *offs); }
>>>> +
>>>>      int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>>      			     struct amdgpu_ring *ring,
>>>>      			     struct amdgpu_irq_src *irq)
>>>>      {
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>> -	int r = 0;
>>>> +	int r = 0, i, j;
>>>>      
>>>>      	spin_lock_init(&kiq->ring_lock);
>>>>      
>>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>>> -	if (r)
>>>> -		return r;
>>>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++) {
>>>> +		r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs[i]);
>>>> +		if (r) {
>>>> +			for (j = 0; j < i; j++)
>>>> +				amdgpu_device_wb_free(adev, kiq->reg_val_offs[j]);
>>>> +			return r;
>>>> +		}
>>>> +	}
>>>>      
>>>>      	ring->adev = NULL;
>>>>      	ring->ring_obj = NULL;
>>>>      	ring->use_doorbell = true;
>>>>      	ring->doorbell_index = adev->doorbell_index.kiq;
>>>> +	kiq->num_slot = MAX_KIQ_RREG_NUM;
>>>> +	memset(&kiq->used, 0, sizeof(kiq->used));
>>>> +
>>>>      
>>>>      	r = amdgpu_gfx_kiq_acquire(adev, ring);
>>>>      	if (r)
>>>> @@ -325,13 +401,20 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>>      			     AMDGPU_RING_PRIO_DEFAULT);
>>>>      	if (r)
>>>>      		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
>>>> +	else
>>>> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
>>>> +			     (ring->funcs->align_mask + 1));
>>>>      
>>>>      	return r;
>>>>      }
>>>>      
>>>>      void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
>>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>> +	struct amdgpu_kiq *kiq = &ring->adev->gfx.kiq;
>>>> +	int i = 0;
>>>> +
>>>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++)
>>>> +		amdgpu_device_wb_free(ring->adev, kiq->reg_val_offs[i]);
>>>>      	amdgpu_ring_fini(ring);
>>>>      }
>>>>      
>>>> @@ -671,19 +754,24 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,  uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  {
>>>>      	signed long r, cnt = 0;
>>>> -	unsigned long flags;
>>>> -	uint32_t seq;
>>>> +	uint32_t seq, reg_val_offs = 0, value = 0;
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *ring = &kiq->ring;
>>>>      
>>>>      	BUG_ON(!ring->funcs->emit_rreg);
>>>>      
>>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto kiq_read_exit;
>>>> +	}
>>>> +
>>>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>>>      	amdgpu_ring_alloc(ring, 32);
>>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>>      
>>>>      	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>>      
>>>> @@ -707,9 +795,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>      		goto failed_kiq_read;
>>>>      
>>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>>> +	mb();
>>>> +	value = adev->wb.wb[reg_val_offs];
>>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>>> +	return value;
>>>>      
>>>>      failed_kiq_read:
>>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>>> +kiq_read_exit:
>>>>      	pr_err("failed to read reg:%x\n", reg);
>>>>      	return ~0;
>>>>      }
>>>> @@ -717,19 +810,24 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)  {
>>>>      	signed long r, cnt = 0;
>>>> -	unsigned long flags;
>>>>      	uint32_t seq;
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *ring = &kiq->ring;
>>>>      
>>>>      	BUG_ON(!ring->funcs->emit_wreg);
>>>>      
>>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto kiq_write_exit;
>>>> +	}
>>>> +
>>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      	amdgpu_ring_alloc(ring, 32);
>>>>      	amdgpu_ring_emit_wreg(ring, reg, v);
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>>      
>>>>      	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>>      
>>>> @@ -754,8 +852,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>      		goto failed_kiq_write;
>>>>      
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      	return;
>>>>      
>>>>      failed_kiq_write:
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>> +kiq_write_exit:
>>>>      	pr_err("failed to write reg:%x\n", reg);  } diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> index 634746829024..bb05594c27e0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>>>>      	int invalidate_tlbs_size;
>>>>      };
>>>>      
>>>> +#define MAX_KIQ_RREG_NUM 64
>>>>      struct amdgpu_kiq {
>>>>      	u64			eop_gpu_addr;
>>>>      	struct amdgpu_bo	*eop_obj;
>>>> @@ -103,7 +104,10 @@ struct amdgpu_kiq {
>>>>      	struct amdgpu_ring	ring;
>>>>      	struct amdgpu_irq_src	irq;
>>>>      	const struct kiq_pm4_funcs *pmf;
>>>> -	uint32_t			reg_val_offs;
>>>> +	uint32_t		reg_val_offs[MAX_KIQ_RREG_NUM];
>>>> +	atomic64_t		submit_avail;
>>>> +	uint32_t		num_slot;
>>>> +	unsigned long		used[DIV_ROUND_UP(MAX_KIQ_RREG_NUM, BITS_PER_LONG)];
>>>>      };
>>>>      
>>>>      /*
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index f61664ee4940..b0ea4f67a6e1 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>>      	void (*end_use)(struct amdgpu_ring *ring);
>>>>      	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>>      	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>>> +			  uint32_t reg_val_offs);
>>>>      	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>>      	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>>      			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@
>>>> struct amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>>      #define amdgpu_ring_emit_cntxcntl(r, d)
>>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define
>>>> amdgpu_ring_emit_rreg(r,
>>>> d) (r)->funcs->emit_rreg((r), (d))
>>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r),
>>>> +(d),
>>>> +(o))
>>>>      #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) @@ -308,6 +309,7 @@ static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)  {
>>>>      	if (ring->count_dw <= 0)
>>>>      		DRM_ERROR("amdgpu: writing more dwords to the ring than
>>>> expected!\n");
>>>> +
>>>>      	ring->ring[ring->wptr++ & ring->buf_mask] = v;
>>>>      	ring->wptr &= ring->ptr_mask;
>>>>      	ring->count_dw--;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> index 8c10084f44ef..dd54ea802a9a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      	unsigned long flags;
>>>>      	uint32_t seq;
>>>>      
>>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto failed_exit;
>>>> +	}
>>>> +
>>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      	amdgpu_ring_alloc(ring, 32);
>>>>      	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>>      					    ref, mask);
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>>      
>>>>      	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>>      
>>>> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>      		goto failed_kiq;
>>>>      
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      	return;
>>>>      
>>>>      failed_kiq:
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>> +failed_exit:
>>>>      	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);  }
>>>>      
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> index 0a03e2ad5d95..7853dbc3c8bd 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>>      	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>>>>      
>>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>> uint32_t reg)
>>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				     uint32_t reg_val_offs)
>>>>      {
>>>>      	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      
>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>>      	amdgpu_ring_write(ring, reg);
>>>>      	amdgpu_ring_write(ring, 0);
>>>>      	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      }
>>>>      
>>>>      static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> index fc6c2f2bc76c..bdbd92d4fe45 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>      		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  }
>>>>      
>>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>> uint32_t reg)
>>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				    uint32_t reg_val_offs)
>>>>      {
>>>>      	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      
>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>>      	amdgpu_ring_write(ring, reg);
>>>>      	amdgpu_ring_write(ring, 0);
>>>>      	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      }
>>>>      
>>>>      static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 84fcf842316d..f7dd1036986e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -4042,14 +4042,20 @@ static int gfx_v9_0_soft_reset(void *handle)  static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)  {
>>>>      	signed long r, cnt = 0;
>>>> -	unsigned long flags;
>>>> -	uint32_t seq;
>>>> +	uint32_t seq, reg_val_offs;
>>>> +	uint64_t value = 0;
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *ring = &kiq->ring;
>>>>      
>>>>      	BUG_ON(!ring->funcs->emit_rreg);
>>>>      
>>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto failed_kiq_exit;
>>>> +	}
>>>> +
>>>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>>>      	amdgpu_ring_alloc(ring, 32);
>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>      	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>> @@ -4059,12 +4065,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>      	amdgpu_ring_write(ring, 0);
>>>>      	amdgpu_ring_write(ring, 0);
>>>>      	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>>      
>>>>      	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>>      
>>>> @@ -4088,10 +4094,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>      		goto failed_kiq_read;
>>>>      
>>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>>> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
>>>> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>>>>      
>>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>>> +	return value;
>>>>      failed_kiq_read:
>>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>>> +failed_kiq_exit:
>>>>      	pr_err("failed to read gpu clock\n");
>>>>      	return ~0;
>>>>      }
>>>> @@ -5482,10 +5492,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>      		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>>>>      
>>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring,
>>>> uint32_t reg)
>>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				    uint32_t reg_val_offs)
>>>>      {
>>>>      	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      
>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>> @@ -5494,9 +5504,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>>      	amdgpu_ring_write(ring, reg);
>>>>      	amdgpu_ring_write(ring, 0);
>>>>      	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      }
>>>>      
>>>>      static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring,
>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> index 30b75d79efdb..6b4dc0ed4fff 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> @@ -422,20 +422,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      
>>>>      	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
>>>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>>>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +		if (r) {
>>>> +			pr_err("failed to lock kiq\n");
>>>> +			return -ETIME;
>>>> +		}
>>>> +
>>>>      		/* 2 dwords flush + 8 dwords fence */
>>>>      		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
>>>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      		kiq->pmf->kiq_invalidate_tlbs(ring,
>>>>      					pasid, flush_type, all_hub);
>>>>      		amdgpu_fence_emit_polling(ring, &seq);
>>>>      		amdgpu_ring_commit(ring);
>>>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>>>> +		amdgpu_gfx_kiq_unlock(kiq);
>>>>      		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>>>      		if (r < 1) {
>>>>      			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>>>> +			amdgpu_gfx_kiq_restore(kiq, false);
>>>>      			return -ETIME;
>>>>      		}
>>>>      
>>>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      		return 0;
>>>>      	}
>>>>      
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index fecdbc471983..6238702c80b4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -613,7 +613,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>>      		if (vega20_xgmi_wa)
>>>>      			ndw += kiq->pmf->invalidate_tlbs_size;
>>>>      
>>>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>>>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +		if (r) {
>>>> +			pr_err("failed to lock kiq\n");
>>>> +			return -ETIME;
>>>> +		}
>>>> +
>>>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      		/* 2 dwords flush + 8 dwords fence */
>>>>      		amdgpu_ring_alloc(ring, ndw);
>>>>      		if (vega20_xgmi_wa)
>>>> @@ -623,13 +629,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>>      					pasid, flush_type, all_hub);
>>>>      		amdgpu_fence_emit_polling(ring, &seq);
>>>>      		amdgpu_ring_commit(ring);
>>>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>>>> +		amdgpu_gfx_kiq_unlock(kiq);
>>>>      		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>>>      		if (r < 1) {
>>>>      			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>>>> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      			return -ETIME;
>>>>      		}
>>>>      
>>>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      		return 0;
>>>>      	}
>>>>      
>>>> --
>>>> 2.17.1
>>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>>> ts.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7C
>>> Yintian.Tao%40amd.com%7C16d5caeab344478a395808d7e69070d3%7C3dd8961fe4
>>> 884e608e11a82d994e183d%7C0%7C0%7C637231380366556593&amp;sdata=mRnRzlS
>>> lFufeHytPmCLjKlSon2V6SSSC%2Fxx9c55F2gU%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22  8:23               ` Christian König
@ 2020-04-22  8:34                 ` Tao, Yintian
  0 siblings, 0 replies; 17+ messages in thread
From: Tao, Yintian @ 2020-04-22  8:34 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Liu, Shaoyun, Kuehling, Felix; +Cc: amd-gfx

Hi  Christian

Please see inline comments.

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: 2020年4月22日 16:23
To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq access register

Am 22.04.20 um 10:06 schrieb Tao, Yintian:
> Hi  Christian
>
> Please see inline comments
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: 2020年4月22日 15:54
> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; 
> Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix 
> <Felix.Kuehling@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>
> Am 22.04.20 um 09:49 schrieb Tao, Yintian:
>> Hi Christian
>>
>>
>> Please see inline comments.
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: 2020年4月22日 15:40
>> To: Tao, Yintian <Yintian.Tao@amd.com>; Koenig, Christian 
>> <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Liu, 
>> Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, Felix 
>> <Felix.Kuehling@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>>
>> Am 22.04.20 um 09:35 schrieb Tao, Yintian:
>>> Hi  Christian
>>>
>>>
>>>> BUG_ON(in_interrupt());
>>> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
>>> [yttao]: According to the current drm-next code, I have not found where registers are accessed through KIQ.
>>> 		And you need to wait for free kiq ring buffer space if there is no free kiq ring buffer; waiting in interrupt context here is illegal.
>> Waiting in atomic context is illegal as well, but we don't have much other choice.
>> [yttao]: no, there is no sleep in atomic context in my patch.
> I'm not talking about sleeping, but busy waiting.
>
>> We just need to make sure that waiting never happens by making the buffers large enough and if it still happens print and error.
>> [yttao]: this is not a good choice because the KMD needs to protect against it instead of hoping users will not frequently invoke KIQ access.
> The only other choice we have is busy waiting, e.g. loop until we get a free slot.
> [yttao]: Yes, my patch currently uses msleep() for the busy waiting. Or do you mean we need to use udelay()? If we use udelay(), it will be a nightmare under multi-VF.
> 		Because it is assumed that there are 16 VFs with a 6ms world-switch each, so in the worst case one VF will udelay(16 * 6ms = 96ms) to get one free slot.

You can't use msleep() here since sleeping in atomic or interrupt context is forbidden.

The trick is that in atomic context the CPU can't switch to a different process, so we have a very limited number of concurrent KIQ reads which can happen.

With a MAX_WB of 256 we can easily have 128 CPUs and don't run into problems.
[yttao]: fine, this is a good idea. But it seems that in the current drm-next code, KIQ access still uses msleep to wait for the fence, which is not correct according to your comments.
		I think we need to submit another patch adding one more condition, "in_atomic()", to prevent it, but that function cannot know about held spinlocks in non-preemptible kernels.

Regards,
Christian.

>
>
> Regards,
> Christian.
>
>>> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
>>> [yttao]: sorry, can you give me more details about how to use NOP to reserve space? I will use amdgpu_device_wb_get() for the read operation.
>> We could use the NOP PM4 command as Felix suggested, this command has 
>> a
>> header+length and says that the next X dw should be ignore on the 
>> header+ring
>> buffer.
>>
>> But I think using amdgpu_device_wb_get() is better anyway.
>> [yttao]: yes, I agreed with amdgpu_device_wb_get() method because it 
>> will fix prevent potential read race condition but NOP method will 
>> not prevent it
>>
>> Regards,
>> Christian.
>>
>>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: 2020年4月22日 15:23
>>> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk 
>>> <Monk.Liu@amd.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com>; Kuehling, 
>>> Felix <Felix.Kuehling@amd.com>
>>> Cc: amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>>>
>>>> BUG_ON(in_interrupt());
>>> That won't work like this. The KIQ is also used in interrupt context in the driver, that's why we used spin_lock_irqsave().
>>>
>>> And I would either say that we should use the trick with the NOP to reserve space on the ring buffer or call amdgpu_device_wb_get() for each read. amdgpu_device_wb_get() also uses find_first_zero_bit() and should work equally well.
>>>
>>> You also don't need to worry to much about overflowing the wb area.
>>> Since we run in an atomic context we can have at most the number of CPU in the system + interrupt context here.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 22.04.20 um 09:11 schrieb Tao, Yintian:
>>>> Add Felix and Shaoyun
>>>>
>>>> -----Original Message-----
>>>> From: Yintian Tao <yttao@amd.com>
>>>> Sent: 2020年4月22日 12:42
>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk 
>>>> <Monk.Liu@amd.com>
>>>> Cc: amd-gfx@lists.freedesktop.org; Tao, Yintian 
>>>> <Yintian.Tao@amd.com>
>>>> Subject: [PATCH] drm/amdgpu: refine kiq access register
>>>>
>>>> According to the current kiq access register method, there will be race condition when using KIQ to read register if multiple clients want to read at same time just like the example below:
>>>> 1. client-A start to read REG-0 through KIQ 2. client-A poll the seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll the seqno-1 5. the kiq complete these two read operation 6. client-A to read the register at the wb buffer and
>>>>        get REG-1 value
>>>>
>>>> And if there are multiple clients to frequently write registers through KIQ which may raise the KIQ ring buffer overwritten problem.
>>>>
>>>> Therefore, allocate fixed number wb slot for rreg use and limit the submit number which depends on the kiq ring_size in order to prevent the overwritten problem.
>>>>
>>>> Signed-off-by: Yintian Tao <yttao@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   7 +-
>>>>      .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  12 +-
>>>>      .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  12 +-
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 129 ++++++++++++++++--
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   6 +-
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |   6 +-
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  13 +-
>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |   8 +-
>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   8 +-
>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  34 +++--
>>>>      drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  12 +-
>>>>      drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  12 +-
>>>>      12 files changed, 211 insertions(+), 48 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index 4e1d4cfe7a9f..4530e0de4257 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>>>>      /*
>>>>       * Writeback
>>>>       */
>>>> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
>>>> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>>>>      
>>>>      struct amdgpu_wb {
>>>>      	struct amdgpu_bo	*wb_obj;
>>>> @@ -1028,6 +1028,11 @@ bool amdgpu_device_has_dc_support(struct
>>>> amdgpu_device *adev);
>>>>      
>>>>      int emu_soc_asic_init(struct amdgpu_device *adev);
>>>>      
>>>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read); void 
>>>> +amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq);
>>>> +
>>>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t 
>>>> +*offs); void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, 
>>>> +uint32_t *offs);
>>>>      /*
>>>>       * Registers read & write functions.
>>>>       */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>>> index 691c89705bcd..034c9f416499 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>>>> @@ -309,6 +309,7 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      			    uint32_t doorbell_off)
>>>>      {
>>>>      	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>>>      	struct v10_compute_mqd *m;
>>>>      	uint32_t mec, pipe;
>>>> @@ -324,13 +325,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>>>      		 mec, pipe, queue_id);
>>>>      
>>>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto out_unlock;
>>>> +	}
>>>> +
>>>>      	r = amdgpu_ring_alloc(kiq_ring, 7);
>>>>      	if (r) {
>>>>      		pr_err("Failed to alloc KIQ (%d).\n", r);
>>>>      		goto out_unlock;
>>>>      	}
>>>>      
>>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>>>      	amdgpu_ring_write(kiq_ring,
>>>>      			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8 +357,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>>>      	amdgpu_ring_commit(kiq_ring);
>>>>      
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      out_unlock:
>>>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>>>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>>>      	release_queue(kgd);
>>>>      
>>>>      	return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>>> index df841c2ac5e7..f243d9990ced 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>>>> @@ -307,6 +307,7 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      			    uint32_t doorbell_off)
>>>>      {
>>>>      	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>>>      	struct v9_mqd *m;
>>>>      	uint32_t mec, pipe;
>>>> @@ -322,13 +323,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>>>      		 mec, pipe, queue_id);
>>>>      
>>>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto out_unlock;
>>>> +	}
>>>> +
>>>>      	r = amdgpu_ring_alloc(kiq_ring, 7);
>>>>      	if (r) {
>>>>      		pr_err("Failed to alloc KIQ (%d).\n", r);
>>>>      		goto out_unlock;
>>>>      	}
>>>>      
>>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>>>      	amdgpu_ring_write(kiq_ring,
>>>>      			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8 +355,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>>>      	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>>>      	amdgpu_ring_commit(kiq_ring);
>>>>      
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      out_unlock:
>>>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>>>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq);
>>>>      	release_queue(kgd);
>>>>      
>>>>      	return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> index ea576b4260a4..c6b1c77b5eac 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> @@ -295,23 +295,99 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>>>>      	return -EINVAL;
>>>>      }
>>>>      
>>>> +void amdgpu_gfx_kiq_reg_offs_get(struct amdgpu_kiq *kiq,
>>>> +				 uint32_t *offset)
>>>> +{
>>>> +	uint32_t tmp = 0;
>>>> +
>>>> +	tmp = find_first_zero_bit(kiq->used,
>>>> +				      kiq->num_slot);
>>>> +	if (tmp < kiq->num_slot) {
>>>> +		__set_bit(tmp, kiq->used);
>>>> +		*offset = kiq->reg_val_offs[tmp];
>>>> +	}
>>>> +}
>>>> +
>>>> +void amdgpu_gfx_kiq_reg_offs_free(struct amdgpu_kiq *kiq,
>>>> +				 uint32_t offset)
>>>> +{
>>>> +	uint32_t tmp = 0;
>>>> +
>>>> +	tmp = (offset - kiq->reg_val_offs[0]) >> 3;
>>>> +	if (tmp < kiq->num_slot)
>>>> +		__clear_bit(tmp, kiq->used);
>>>> +}
>>>> +
>>>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read) {
>>>> +	int retry = MAX_KIQ_REG_TRY;
>>>> +
>>>> +	BUG_ON(in_interrupt());
>>>> +	while (retry-- > 0) {
>>>> +		spin_lock(&kiq->ring_lock);
>>>> +		if ((atomic64_read(&kiq->submit_avail) > 0) &&
>>>> +		    (read ? find_first_zero_bit(kiq->used, kiq->num_slot) <
>>>> +		     kiq->num_slot : 1)) {
>>>> +				break;
>>>> +		} else {
>>>> +			spin_unlock(&kiq->ring_lock);
>>>> +			msleep(2);
>>>> +			continue;
>>>> +		}
>>>> +
>>>> +	}
>>>> +
>>>> +	if (retry > 0)
>>>> +		return 0;
>>>> +	else
>>>> +		return -ETIME;
>>>> +}
>>>> +
>>>> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq) {
>>>> +	spin_unlock(&kiq->ring_lock);
>>>> +}
>>>> +
>>>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
>>>> +	atomic64_dec(&kiq->submit_avail);
>>>> +	if (offs)
>>>> +		amdgpu_gfx_kiq_reg_offs_get(kiq, offs);
>>>> +
>>>> +}
>>>> +
>>>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
>>>> +	atomic64_inc(&kiq->submit_avail);
>>>> +	if (offs)
>>>> +		amdgpu_gfx_kiq_reg_offs_free(kiq, *offs); }
>>>> +
>>>>      int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>>      			     struct amdgpu_ring *ring,
>>>>      			     struct amdgpu_irq_src *irq)
>>>>      {
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>> -	int r = 0;
>>>> +	int r = 0, i, j;
>>>>      
>>>>      	spin_lock_init(&kiq->ring_lock);
>>>>      
>>>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>>>> -	if (r)
>>>> -		return r;
>>>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++) {
>>>> +		r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs[i]);
>>>> +		if (r) {
>>>> +			for (j = 0; j < i; j++)
>>>> +				amdgpu_device_wb_free(adev, kiq->reg_val_offs[j]);
>>>> +			return r;
>>>> +		}
>>>> +	}
>>>>      
>>>>      	ring->adev = NULL;
>>>>      	ring->ring_obj = NULL;
>>>>      	ring->use_doorbell = true;
>>>>      	ring->doorbell_index = adev->doorbell_index.kiq;
>>>> +	kiq->num_slot = MAX_KIQ_RREG_NUM;
>>>> +	memset(&kiq->used, 0, sizeof(kiq->used));
>>>> +
>>>>      
>>>>      	r = amdgpu_gfx_kiq_acquire(adev, ring);
>>>>      	if (r)
>>>> @@ -325,13 +401,20 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>>      			     AMDGPU_RING_PRIO_DEFAULT);
>>>>      	if (r)
>>>>      		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
>>>> +	else
>>>> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
>>>> +			     (ring->funcs->align_mask + 1));
>>>>      
>>>>      	return r;
>>>>      }
>>>>      
>>>>      void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)  {
>>>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>>> +	struct amdgpu_kiq *kiq = &ring->adev->gfx.kiq;
>>>> +	int i = 0;
>>>> +
>>>> +	for (i = 0; i < MAX_KIQ_RREG_NUM; i++)
>>>> +		amdgpu_device_wb_free(ring->adev, kiq->reg_val_offs[i]);
>>>>      	amdgpu_ring_fini(ring);
>>>>      }
>>>>      
>>>> @@ -671,19 +754,24 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,  uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  {
>>>>      	signed long r, cnt = 0;
>>>> -	unsigned long flags;
>>>> -	uint32_t seq;
>>>> +	uint32_t seq, reg_val_offs = 0, value = 0;
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *ring = &kiq->ring;
>>>>      
>>>>      	BUG_ON(!ring->funcs->emit_rreg);
>>>>      
>>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto kiq_read_exit;
>>>> +	}
>>>> +
>>>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>>>      	amdgpu_ring_alloc(ring, 32);
>>>> -	amdgpu_ring_emit_rreg(ring, reg);
>>>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>>      
>>>>      	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>>      
>>>> @@ -707,9 +795,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>      		goto failed_kiq_read;
>>>>      
>>>> -	return adev->wb.wb[kiq->reg_val_offs];
>>>> +	mb();
>>>> +	value = adev->wb.wb[reg_val_offs];
>>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>>> +	return value;
>>>>      
>>>>      failed_kiq_read:
>>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>>> +kiq_read_exit:
>>>>      	pr_err("failed to read reg:%x\n", reg);
>>>>      	return ~0;
>>>>      }
>>>> @@ -717,19 +810,24 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)  {
>>>>      	signed long r, cnt = 0;
>>>> -	unsigned long flags;
>>>>      	uint32_t seq;
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *ring = &kiq->ring;
>>>>      
>>>>      	BUG_ON(!ring->funcs->emit_wreg);
>>>>      
>>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto kiq_write_exit;
>>>> +	}
>>>> +
>>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      	amdgpu_ring_alloc(ring, 32);
>>>>      	amdgpu_ring_emit_wreg(ring, reg, v);
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>>      
>>>>      	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>>      
>>>> @@ -754,8 +852,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>      		goto failed_kiq_write;
>>>>      
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      	return;
>>>>      
>>>>      failed_kiq_write:
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>> +kiq_write_exit:
>>>>      	pr_err("failed to write reg:%x\n", reg);  } diff --git 
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> index 634746829024..bb05594c27e0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>>> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>>>>      	int invalidate_tlbs_size;
>>>>      };
>>>>      
>>>> +#define MAX_KIQ_RREG_NUM 64
>>>>      struct amdgpu_kiq {
>>>>      	u64			eop_gpu_addr;
>>>>      	struct amdgpu_bo	*eop_obj;
>>>> @@ -103,7 +104,10 @@ struct amdgpu_kiq {
>>>>      	struct amdgpu_ring	ring;
>>>>      	struct amdgpu_irq_src	irq;
>>>>      	const struct kiq_pm4_funcs *pmf;
>>>> -	uint32_t			reg_val_offs;
>>>> +	uint32_t		reg_val_offs[MAX_KIQ_RREG_NUM];
>>>> +	atomic64_t		submit_avail;
>>>> +	uint32_t		num_slot;
>>>> +	unsigned long		used[DIV_ROUND_UP(MAX_KIQ_RREG_NUM, BITS_PER_LONG)];
>>>>      };
>>>>      
>>>>      /*
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index f61664ee4940..b0ea4f67a6e1 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>>>      	void (*end_use)(struct amdgpu_ring *ring);
>>>>      	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>>      	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>>>> +			  uint32_t reg_val_offs);
>>>>      	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>>      	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>>      			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ 
>>>> struct amdgpu_ring {  #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))  #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>>>      #define amdgpu_ring_emit_cntxcntl(r, d) 
>>>> (r)->funcs->emit_cntxcntl((r), (d)) -#define 
>>>> amdgpu_ring_emit_rreg(r,
>>>> d) (r)->funcs->emit_rreg((r), (d))
>>>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
>>>> +(d),
>>>> +(o))
>>>>      #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) @@ -308,6 +309,7 @@ static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)  {
>>>>      	if (ring->count_dw <= 0)
>>>>      		DRM_ERROR("amdgpu: writing more dwords to the ring than 
>>>> expected!\n");
>>>> +
>>>>      	ring->ring[ring->wptr++ & ring->buf_mask] = v;
>>>>      	ring->wptr &= ring->ptr_mask;
>>>>      	ring->count_dw--;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> index 8c10084f44ef..dd54ea802a9a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      	unsigned long flags;
>>>>      	uint32_t seq;
>>>>      
>>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto failed_exit;
>>>> +	}
>>>> +
>>>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      	amdgpu_ring_alloc(ring, 32);
>>>>      	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>>>      					    ref, mask);
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>>      
>>>>      	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>>      
>>>> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>      		goto failed_kiq;
>>>>      
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      	return;
>>>>      
>>>>      failed_kiq:
>>>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>>> +failed_exit:
>>>>      	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);  
>>>> }
>>>>      
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> index 0a03e2ad5d95..7853dbc3c8bd 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>>> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>>>      	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));  }
>>>>      
>>>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg)
>>>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				     uint32_t reg_val_offs)
>>>>      {
>>>>      	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      
>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>>      	amdgpu_ring_write(ring, reg);
>>>>      	amdgpu_ring_write(ring, 0);
>>>>      	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      }
>>>>      
>>>>      static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> index fc6c2f2bc76c..bdbd92d4fe45 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>      		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;  
>>>> }
>>>>      
>>>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg)
>>>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				    uint32_t reg_val_offs)
>>>>      {
>>>>      	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      
>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>>      	amdgpu_ring_write(ring, reg);
>>>>      	amdgpu_ring_write(ring, 0);
>>>>      	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      }
>>>>      
>>>>      static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 84fcf842316d..f7dd1036986e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -4042,14 +4042,20 @@ static int gfx_v9_0_soft_reset(void *handle)  static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)  {
>>>>      	signed long r, cnt = 0;
>>>> -	unsigned long flags;
>>>> -	uint32_t seq;
>>>> +	uint32_t seq, reg_val_offs;
>>>> +	uint64_t value = 0;
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      	struct amdgpu_ring *ring = &kiq->ring;
>>>>      
>>>>      	BUG_ON(!ring->funcs->emit_rreg);
>>>>      
>>>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>> +	r = amdgpu_gfx_kiq_lock(kiq, true);
>>>> +	if (r) {
>>>> +		pr_err("failed to lock kiq\n");
>>>> +		goto failed_kiq_exit;
>>>> +	}
>>>> +
>>>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>>>      	amdgpu_ring_alloc(ring, 32);
>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>      	amdgpu_ring_write(ring, 9 |	/* src: register*/
>>>> @@ -4059,12 +4065,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>      	amdgpu_ring_write(ring, 0);
>>>>      	amdgpu_ring_write(ring, 0);
>>>>      	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      	amdgpu_fence_emit_polling(ring, &seq);
>>>>      	amdgpu_ring_commit(ring);
>>>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>>> +	amdgpu_gfx_kiq_unlock(kiq);
>>>>      
>>>>      	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>>      
>>>> @@ -4088,10 +4094,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>>>      	if (cnt > MAX_KIQ_REG_TRY)
>>>>      		goto failed_kiq_read;
>>>>      
>>>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>>>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>>>> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
>>>> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>>>>      
>>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>>> +	return value;
>>>>      failed_kiq_read:
>>>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>>>> +failed_kiq_exit:
>>>>      	pr_err("failed to read gpu clock\n");
>>>>      	return ~0;
>>>>      }
>>>> @@ -5482,10 +5492,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>>>      		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;  }
>>>>      
>>>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg)
>>>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>>>> +				    uint32_t reg_val_offs)
>>>>      {
>>>>      	struct amdgpu_device *adev = ring->adev;
>>>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      
>>>>      	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>>>      	amdgpu_ring_write(ring, 0 |	/* src: register*/
>>>> @@ -5494,9 +5504,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>>>      	amdgpu_ring_write(ring, reg);
>>>>      	amdgpu_ring_write(ring, 0);
>>>>      	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>>>> -				kiq->reg_val_offs * 4));
>>>> +				reg_val_offs * 4));
>>>>      }
>>>>      
>>>>      static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>>>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> index 30b75d79efdb..6b4dc0ed4fff 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> @@ -422,20 +422,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>>      	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>      
>>>>      	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
>>>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>>>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +		if (r) {
>>>> +			pr_err("failed to lock kiq\n");
>>>> +			return -ETIME;
>>>> +		}
>>>> +
>>>>      		/* 2 dwords flush + 8 dwords fence */
>>>>      		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
>>>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      		kiq->pmf->kiq_invalidate_tlbs(ring,
>>>>      					pasid, flush_type, all_hub);
>>>>      		amdgpu_fence_emit_polling(ring, &seq);
>>>>      		amdgpu_ring_commit(ring);
>>>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>>>> +		amdgpu_gfx_kiq_unlock(kiq);
>>>>      		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>>>      		if (r < 1) {
>>>>      			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>>>> +			amdgpu_gfx_kiq_restore(kiq, false);
>>>>      			return -ETIME;
>>>>      		}
>>>>      
>>>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      		return 0;
>>>>      	}
>>>>      
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index fecdbc471983..6238702c80b4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -613,7 +613,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>>      		if (vega20_xgmi_wa)
>>>>      			ndw += kiq->pmf->invalidate_tlbs_size;
>>>>      
>>>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>>>> +		r = amdgpu_gfx_kiq_lock(kiq, false);
>>>> +		if (r) {
>>>> +			pr_err("failed to lock kiq\n");
>>>> +			return -ETIME;
>>>> +		}
>>>> +
>>>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>>>      		/* 2 dwords flush + 8 dwords fence */
>>>>      		amdgpu_ring_alloc(ring, ndw);
>>>>      		if (vega20_xgmi_wa)
>>>> @@ -623,13 +629,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>>>      					pasid, flush_type, all_hub);
>>>>      		amdgpu_fence_emit_polling(ring, &seq);
>>>>      		amdgpu_ring_commit(ring);
>>>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>>>> +		amdgpu_gfx_kiq_unlock(kiq);
>>>>      		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>>>      		if (r < 1) {
>>>>      			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>>>> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      			return -ETIME;
>>>>      		}
>>>>      
>>>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>>>      		return 0;
>>>>      	}
>>>>      
>>>> --
>>>> 2.17.1
>>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fli
>>> s 
>>> ts.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7
>>> C
>>> Yintian.Tao%40amd.com%7C16d5caeab344478a395808d7e69070d3%7C3dd8961fe
>>> 4 
>>> 884e608e11a82d994e183d%7C0%7C0%7C637231380366556593&amp;sdata=mRnRzl
>>> S
>>> lFufeHytPmCLjKlSon2V6SSSC%2Fxx9c55F2gU%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22 12:33         ` Christian König
@ 2020-04-22 12:40           ` Tao, Yintian
  0 siblings, 0 replies; 17+ messages in thread
From: Tao, Yintian @ 2020-04-22 12:40 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Kuehling, Felix; +Cc: amd-gfx

Hi  Christian

Thanks, I got it. I will send another patch for the KIQ overrun problem

Best Regards
Yintian Tao
-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: 2020年4月22日 20:33
To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq access register

Am 22.04.20 um 14:20 schrieb Tao, Yintian:
> Hi  Christian
>
>
> Please see inline commetns.
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: 2020年4月22日 19:57
> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; 
> Kuehling, Felix <Felix.Kuehling@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>
> Am 22.04.20 um 13:49 schrieb Tao, Yintian:
>> Hi  Christian
>>
>>
>> Can you help answer the questions below? Thanks in advance.
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: 2020年4月22日 19:03
>> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; 
>> Kuehling, Felix <Felix.Kuehling@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>>
>> Am 22.04.20 um 11:29 schrieb Yintian Tao:
>>> According to the current kiq access register method, there will be 
>>> race condition when using KIQ to read register if multiple clients 
>>> want to read at same time just like the expample below:
>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll 
>>> the seqno-1 5. the kiq complete these two read operation 6. client-A 
>>> to read the register at the wb buffer and
>>>       get REG-1 value
>>>
>>> And if there are multiple clients to frequently write registers 
>>> through KIQ which may raise the KIQ ring buffer overwritten problem.
>>>
>>> Therefore, allocate fixed number wb slot for rreg use and limit the 
>>> submit number which depends on the kiq ring_size in order to prevent 
>>> the overwritten problem.
>>>
>>> v2: directly use amdgpu_device_wb_get() for each read instead
>>>        of to reserve fixde number slot.
>>>        if there is no enough kiq ring buffer or rreg slot then
>>>        directly print error log and return instead of busy waiting
>> I would split that into three patches. One for each problem we have here:
>>
>> 1. Fix kgd_hiq_mqd_load() and maybe other occasions to use spin_lock_irqsave().
>> [yttao]: Do you mean that we need to use spin_lock_irqsave for the functions just like kgd_hiq_mqd_load()?
> Yes, I strongly think so.
>
> See when you have one spin lock you either need always need to lock it with irqs disabled or never.
>
> In other words we always need to either use spin_lock() or spin_lock_irqsave(), but never mix them with the same lock.
>
> The only exception to this rule is when you take multiple locks, e.g.
> you can do:
>
> spin_lock_irqsave(&a, flags);
> spin_lock(&b, flags);
> spin_lock(&c, flags);
> ....
> spin_unlock_irqsave(&a, flags);
>
> Here you don't need to use spin_lock_irqsave for b and c. But we rarely have that case in the code.
> [yttao]: thanks , I got it. I will submit another patch for it.
>
>> 2. Prevent the overrung of the KIQ. Please drop the approach with the 
>> atomic here. Instead just add a amdgpu_fence_wait_polling() into
>> amdgpu_fence_emit_polling() as I discussed with Monk.
>> [yttao]: Sorry, I can't get your original idea for the amdgpu_fence_wait_polling(). Can you give more details about it? Thanks in advance.
>>
>> "That is actually only a problem because the KIQ uses polling waits.
>>
>> See amdgpu_fence_emit() waits for the oldest possible fence to be signaled before emitting a new one.
>>
>> I suggest that we do the same in amdgpu_fence_emit_polling(). A one liner like the following should be enough:
>>
>> amdgpu_fence_wait_polling(ring, seq - ring->fence_drv.num_fences_mask, timeout);"
>> [yttao]: there is no usage of num_fences_mask at kiq fence polling, the num_fences_mask is only effective at dma_fence architecture.
>> 		If I understand correctly, do you want the protype code below? If the protype code is wrong, can you help give one sample? Thanks in advance.
>>
>> int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s) {
>>           uint32_t seq;
>>
>>           if (!s)
>>                   return -EINVAL;
>> +		amdgpu_fence_wait_polling(ring, seq, timeout);
>>           seq = ++ring->fence_drv.sync_seq;
> Your understanding sounds more or less correct. The code should look something like this:
>
> seq = ++ring->fence_drv.sync_seq;
> amdgpu_fence_wait_polling(ring, seq -
> number_of_allowed_submissions_to_the_kiq, timeout);
> [yttao]: whether we need directly wait at the first just like below? Otherwise, amdgpu_ring_emit_wreg may overwrite the KIQ ring buffer.

There should always be room for at least one more submission.

As long as we always submit a fence checking the free room, we should be fine.

Regards,
Christian.

> +		amdgpu_fence_wait_polling(ring, seq - 
> +number_of_allowed_submissions_to_the_kiq, timeout);
> 		spin_lock_irqsave(&kiq->ring_lock, flags);
>          amdgpu_ring_alloc(ring, 32);
>          amdgpu_ring_emit_wreg(ring, reg, v);
>          amdgpu_fence_emit_polling(ring, &seq); /* wait */
>          amdgpu_ring_commit(ring);
>          spin_unlock_irqrestore(&kiq->ring_lock, flags);
>
> I just used num_fences_mask as 
> number_of_allowed_submissions_to_the_kiq
> because it is probably a good value to start with.
>
> But you could give that as parameter as well if you think that makes more sense.
>
>>           amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>>                           ¦      seq, 0);
>>
>>           *s = seq;
>>
>>           return 0;
>> }
>>
>>
>>
>>
>> 3. Use amdgpu_device_wb_get() each time we need to submit a read.
>> [yttao]: yes, I will do it.
> Thanks,
> Christian.
>
>> Regards,
>> Christian.
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22 12:20       ` Tao, Yintian
@ 2020-04-22 12:33         ` Christian König
  2020-04-22 12:40           ` Tao, Yintian
  0 siblings, 1 reply; 17+ messages in thread
From: Christian König @ 2020-04-22 12:33 UTC (permalink / raw)
  To: Tao, Yintian, Liu, Monk, Kuehling, Felix; +Cc: amd-gfx

Am 22.04.20 um 14:20 schrieb Tao, Yintian:
> Hi  Christian
>
>
> Please see inline commetns.
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: 2020年4月22日 19:57
> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>
> Am 22.04.20 um 13:49 schrieb Tao, Yintian:
>> Hi  Christian
>>
>>
>> Can you help answer the questions below? Thanks in advance.
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: 2020年4月22日 19:03
>> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>;
>> Kuehling, Felix <Felix.Kuehling@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>>
>> Am 22.04.20 um 11:29 schrieb Yintian Tao:
>>> According to the current kiq access register method, there will be
>>> race condition when using KIQ to read register if multiple clients
>>> want to read at same time just like the expample below:
>>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll
>>> the seqno-1 5. the kiq complete these two read operation 6. client-A
>>> to read the register at the wb buffer and
>>>       get REG-1 value
>>>
>>> And if there are multiple clients to frequently write registers
>>> through KIQ which may raise the KIQ ring buffer overwritten problem.
>>>
>>> Therefore, allocate fixed number wb slot for rreg use and limit the
>>> submit number which depends on the kiq ring_size in order to prevent
>>> the overwritten problem.
>>>
>>> v2: directly use amdgpu_device_wb_get() for each read instead
>>>        of to reserve fixde number slot.
>>>        if there is no enough kiq ring buffer or rreg slot then
>>>        directly print error log and return instead of busy waiting
>> I would split that into three patches. One for each problem we have here:
>>
>> 1. Fix kgd_hiq_mqd_load() and maybe other occasions to use spin_lock_irqsave().
>> [yttao]: Do you mean that we need to use spin_lock_irqsave for the functions just like kgd_hiq_mqd_load()?
> Yes, I strongly think so.
>
> See when you have one spin lock you either need always need to lock it with irqs disabled or never.
>
> In other words we always need to either use spin_lock() or spin_lock_irqsave(), but never mix them with the same lock.
>
> The only exception to this rule is when you take multiple locks, e.g.
> you can do:
>
> spin_lock_irqsave(&a, flags);
> spin_lock(&b, flags);
> spin_lock(&c, flags);
> ....
> spin_unlock_irqsave(&a, flags);
>
> Here you don't need to use spin_lock_irqsave for b and c. But we rarely have that case in the code.
> [yttao]: thanks , I got it. I will submit another patch for it.
>
>> 2. Prevent the overrung of the KIQ. Please drop the approach with the
>> atomic here. Instead just add a amdgpu_fence_wait_polling() into
>> amdgpu_fence_emit_polling() as I discussed with Monk.
>> [yttao]: Sorry, I can't get your original idea for the amdgpu_fence_wait_polling(). Can you give more details about it? Thanks in advance.
>>
>> "That is actually only a problem because the KIQ uses polling waits.
>>
>> See amdgpu_fence_emit() waits for the oldest possible fence to be signaled before emitting a new one.
>>
>> I suggest that we do the same in amdgpu_fence_emit_polling(). A one liner like the following should be enough:
>>
>> amdgpu_fence_wait_polling(ring, seq - ring->fence_drv.num_fences_mask, timeout);"
>> [yttao]: there is no usage of num_fences_mask at kiq fence polling, the num_fences_mask is only effective at dma_fence architecture.
>> 		If I understand correctly, do you want the protype code below? If the protype code is wrong, can you help give one sample? Thanks in advance.
>>
>> int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s) {
>>           uint32_t seq;
>>
>>           if (!s)
>>                   return -EINVAL;
>> +		amdgpu_fence_wait_polling(ring, seq, timeout);
>>           seq = ++ring->fence_drv.sync_seq;
> Your understanding sounds more or less correct. The code should look something like this:
>
> seq = ++ring->fence_drv.sync_seq;
> amdgpu_fence_wait_polling(ring, seq -
> number_of_allowed_submissions_to_the_kiq, timeout);
> [yttao]: whether we need directly wait at the first just like below? Otherwise, amdgpu_ring_emit_wreg may overwrite the KIQ ring buffer.

There should always be room for at least one more submission.

As long as we always submit a fence checking the free room, we should 
be fine.

Regards,
Christian.

> +		amdgpu_fence_wait_polling(ring, seq - number_of_allowed_submissions_to_the_kiq, timeout);
> 		spin_lock_irqsave(&kiq->ring_lock, flags);
>          amdgpu_ring_alloc(ring, 32);
>          amdgpu_ring_emit_wreg(ring, reg, v);
>          amdgpu_fence_emit_polling(ring, &seq); /* wait */
>          amdgpu_ring_commit(ring);
>          spin_unlock_irqrestore(&kiq->ring_lock, flags);
>
> I just used num_fences_mask as number_of_allowed_submissions_to_the_kiq
> because it is probably a good value to start with.
>
> But you could give that as parameter as well if you think that makes more sense.
>
>>           amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>>                           ¦      seq, 0);
>>
>>           *s = seq;
>>
>>           return 0;
>> }
>>
>>
>>
>>
>> 3. Use amdgpu_device_wb_get() each time we need to submit a read.
>> [yttao]: yes, I will do it.
> Thanks,
> Christian.
>
>> Regards,
>> Christian.
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22 11:57     ` Christian König
@ 2020-04-22 12:20       ` Tao, Yintian
  2020-04-22 12:33         ` Christian König
  0 siblings, 1 reply; 17+ messages in thread
From: Tao, Yintian @ 2020-04-22 12:20 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Kuehling, Felix; +Cc: amd-gfx

Hi  Christian


Please see inline comments.
-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: 2020年4月22日 19:57
To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq access register

Am 22.04.20 um 13:49 schrieb Tao, Yintian:
> Hi  Christian
>
>
> Can you help answer the questions below? Thanks in advance.
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: 2020年4月22日 19:03
> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; 
> Kuehling, Felix <Felix.Kuehling@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>
> Am 22.04.20 um 11:29 schrieb Yintian Tao:
>> According to the current kiq access register method, there will be 
>> race condition when using KIQ to read register if multiple clients 
>> want to read at same time just like the expample below:
>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll 
>> the seqno-1 5. the kiq complete these two read operation 6. client-A 
>> to read the register at the wb buffer and
>>      get REG-1 value
>>
>> And if there are multiple clients to frequently write registers 
>> through KIQ which may raise the KIQ ring buffer overwritten problem.
>>
>> Therefore, allocate fixed number wb slot for rreg use and limit the 
>> submit number which depends on the kiq ring_size in order to prevent 
>> the overwritten problem.
>>
>> v2: directly use amdgpu_device_wb_get() for each read instead
>>       of to reserve fixde number slot.
>>       if there is no enough kiq ring buffer or rreg slot then
>>       directly print error log and return instead of busy waiting
> I would split that into three patches. One for each problem we have here:
>
> 1. Fix kgd_hiq_mqd_load() and maybe other occasions to use spin_lock_irqsave().
> [yttao]: Do you mean that we need to use spin_lock_irqsave for the functions just like kgd_hiq_mqd_load()?

Yes, I strongly think so.

See, when you have one spin lock you either always need to lock it with irqs disabled or never.

In other words we always need to either use spin_lock() or spin_lock_irqsave(), but never mix them with the same lock.

The only exception to this rule is when you take multiple locks, e.g. 
you can do:

spin_lock_irqsave(&a, flags);
spin_lock(&b, flags);
spin_lock(&c, flags);
....
spin_unlock_irqsave(&a, flags);

Here you don't need to use spin_lock_irqsave for b and c. But we rarely have that case in the code.
[yttao]: thanks, I got it. I will submit another patch for it.

> 2. Prevent the overrung of the KIQ. Please drop the approach with the 
> atomic here. Instead just add a amdgpu_fence_wait_polling() into
> amdgpu_fence_emit_polling() as I discussed with Monk.
> [yttao]: Sorry, I can't get your original idea for the amdgpu_fence_wait_polling(). Can you give more details about it? Thanks in advance.
>
> "That is actually only a problem because the KIQ uses polling waits.
>
> See amdgpu_fence_emit() waits for the oldest possible fence to be signaled before emitting a new one.
>
> I suggest that we do the same in amdgpu_fence_emit_polling(). A one liner like the following should be enough:
>
> amdgpu_fence_wait_polling(ring, seq - ring->fence_drv.num_fences_mask, timeout);"
> [yttao]: there is no usage of num_fences_mask at kiq fence polling, the num_fences_mask is only effective at dma_fence architecture.
> 		If I understand correctly, do you want the protype code below? If the protype code is wrong, can you help give one sample? Thanks in advance.
>
> int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s) {
>          uint32_t seq;
>
>          if (!s)
>                  return -EINVAL;
> +		amdgpu_fence_wait_polling(ring, seq, timeout);
>          seq = ++ring->fence_drv.sync_seq;

Your understanding sounds more or less correct. The code should look something like this:

seq = ++ring->fence_drv.sync_seq;
amdgpu_fence_wait_polling(ring, seq -
number_of_allowed_submissions_to_the_kiq, timeout);
[yttao]: do we need to wait directly at the start, just like below? Otherwise, amdgpu_ring_emit_wreg may overwrite the KIQ ring buffer.
+		amdgpu_fence_wait_polling(ring, seq - number_of_allowed_submissions_to_the_kiq, timeout);        
		spin_lock_irqsave(&kiq->ring_lock, flags);
        amdgpu_ring_alloc(ring, 32);
        amdgpu_ring_emit_wreg(ring, reg, v);
        amdgpu_fence_emit_polling(ring, &seq); /* wait */
        amdgpu_ring_commit(ring);
        spin_unlock_irqrestore(&kiq->ring_lock, flags);

I just used num_fences_mask as number_of_allowed_submissions_to_the_kiq
because it is probably a good value to start with.

But you could give that as parameter as well if you think that makes more sense.

>          amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>                          ¦      seq, 0);
>
>          *s = seq;
>
>          return 0;
> }
>
>
>
>
> 3. Use amdgpu_device_wb_get() each time we need to submit a read.
> [yttao]: yes, I will do it.

Thanks,
Christian.

>
> Regards,
> Christian.
>
>> Signed-off-by: Yintian Tao <yttao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  8 +-
>>    .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 13 ++-
>>    .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 13 ++-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 83 +++++++++++++++----
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |  3 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |  5 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      | 13 ++-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |  8 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |  8 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 35 +++++---
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        | 13 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         | 13 ++-
>>    12 files changed, 167 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 4e1d4cfe7a9f..1157c1a0b888 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>>    /*
>>     * Writeback
>>     */
>> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
>> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>>    
>>    struct amdgpu_wb {
>>    	struct amdgpu_bo	*wb_obj;
>> @@ -1028,6 +1028,12 @@ bool amdgpu_device_has_dc_support(struct
>> amdgpu_device *adev);
>>    
>>    int emu_soc_asic_init(struct amdgpu_device *adev);
>>    
>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read,
>> +			unsigned long *flags);
>> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq, unsigned long 
>> +*flags);
>> +
>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs); 
>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
>>    /*
>>     * Registers read & write functions.
>>     */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> index 691c89705bcd..a65d6a1abc04 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> @@ -309,9 +309,11 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    			    uint32_t doorbell_off)
>>    {
>>    	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>    	struct v10_compute_mqd *m;
>>    	uint32_t mec, pipe;
>> +	unsigned long flags = 0;
>>    	int r;
>>    
>>    	m = get_mqd(mqd);
>> @@ -324,13 +326,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>    		 mec, pipe, queue_id);
>>    
>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto out_unlock;
>> +	}
>> +
>>    	r = amdgpu_ring_alloc(kiq_ring, 7);
>>    	if (r) {
>>    		pr_err("Failed to alloc KIQ (%d).\n", r);
>>    		goto out_unlock;
>>    	}
>>    
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>    	amdgpu_ring_write(kiq_ring,
>>    			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8
>> +358,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>    	amdgpu_ring_commit(kiq_ring);
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    out_unlock:
>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq, &flags);
>>    	release_queue(kgd);
>>    
>>    	return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> index df841c2ac5e7..4435bd716edd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> @@ -307,9 +307,11 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    			    uint32_t doorbell_off)
>>    {
>>    	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>    	struct v9_mqd *m;
>>    	uint32_t mec, pipe;
>> +	unsigned long flags = 0;
>>    	int r;
>>    
>>    	m = get_mqd(mqd);
>> @@ -322,13 +324,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>    		 mec, pipe, queue_id);
>>    
>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto out_unlock;
>> +	}
>> +
>>    	r = amdgpu_ring_alloc(kiq_ring, 7);
>>    	if (r) {
>>    		pr_err("Failed to alloc KIQ (%d).\n", r);
>>    		goto out_unlock;
>>    	}
>>    
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>    	amdgpu_ring_write(kiq_ring,
>>    			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8
>> +356,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>    	amdgpu_ring_commit(kiq_ring);
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    out_unlock:
>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq, &flags);
>>    	release_queue(kgd);
>>    
>>    	return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index ea576b4260a4..c0dc7f1849c4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -295,6 +295,43 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>>    	return -EINVAL;
>>    }
>>    
>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read,
>> +			unsigned long *flags)
>> +{
>> +	struct amdgpu_wb *wb = &(kiq->ring.adev)->wb;
>> +
>> +	spin_lock_irqsave(&kiq->ring_lock, *flags);
>> +	if ((atomic64_read(&kiq->submit_avail) > 0) &&
>> +	    (read ? find_first_zero_bit(wb->used, wb->num_wb) <
>> +	     wb->num_wb : 1)) {
>> +		return 0;
>> +	} else {
>> +		spin_unlock_irqrestore(&kiq->ring_lock, *flags);
>> +		pr_err("critical! too more kiq accessers\n");
>> +		return -EDEADLK;
>> +	}
>> +}
>> +
>> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq, unsigned long
>> +*flags) {
>> +	spin_unlock_irqrestore(&kiq->ring_lock, *flags); }
>> +
>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
>> +	atomic64_dec(&kiq->submit_avail);
>> +	if (offs)
>> +		amdgpu_device_wb_get(kiq->ring.adev, offs);
>> +
>> +}
>> +
>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
>> +	atomic64_inc(&kiq->submit_avail);
>> +	if (offs)
>> +		amdgpu_device_wb_free(kiq->ring.adev, *offs); }
>> +
>>    int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>    			     struct amdgpu_ring *ring,
>>    			     struct amdgpu_irq_src *irq) @@ -304,10 +341,6 @@ int 
>> amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>    
>>    	spin_lock_init(&kiq->ring_lock);
>>    
>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>> -	if (r)
>> -		return r;
>> -
>>    	ring->adev = NULL;
>>    	ring->ring_obj = NULL;
>>    	ring->use_doorbell = true;
>> @@ -325,13 +358,15 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>    			     AMDGPU_RING_PRIO_DEFAULT);
>>    	if (r)
>>    		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
>> +	else
>> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
>> +			     (ring->funcs->align_mask + 1));
>>    
>>    	return r;
>>    }
>>    
>>    void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>    {
>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>    	amdgpu_ring_fini(ring);
>>    }
>>    
>> @@ -671,19 +706,25 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
>>    uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>> -	uint32_t seq;
>> +	unsigned long flags = 0;
>> +	uint32_t seq, reg_val_offs = 0, value = 0;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, true, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto kiq_read_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>    	amdgpu_ring_alloc(ring, 32);
>> -	amdgpu_ring_emit_rreg(ring, reg);
>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -707,9 +748,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return adev->wb.wb[kiq->reg_val_offs];
>> +	mb();
>> +	value = adev->wb.wb[reg_val_offs];
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +	return value;
>>    
>>    failed_kiq_read:
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +kiq_read_exit:
>>    	pr_err("failed to read reg:%x\n", reg);
>>    	return ~0;
>>    }
>> @@ -717,19 +763,25 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>    {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>> +	unsigned long flags = 0;
>>    	uint32_t seq;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_wreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto kiq_write_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_emit_wreg(ring, reg, v);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -754,8 +806,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_write;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    	return;
>>    
>>    failed_kiq_write:
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>> +kiq_write_exit:
>>    	pr_err("failed to write reg:%x\n", reg);
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index 634746829024..ff7597ca6cad 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>>    	int invalidate_tlbs_size;
>>    };
>>    
>> +#define MAX_KIQ_RREG_NUM 64
>>    struct amdgpu_kiq {
>>    	u64			eop_gpu_addr;
>>    	struct amdgpu_bo	*eop_obj;
>> @@ -103,7 +104,7 @@ struct amdgpu_kiq {
>>    	struct amdgpu_ring	ring;
>>    	struct amdgpu_irq_src	irq;
>>    	const struct kiq_pm4_funcs *pmf;
>> -	uint32_t			reg_val_offs;
>> +	atomic64_t		submit_avail;
>>    };
>>    
>>    /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index f61664ee4940..137d3d2b46e8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>    	void (*end_use)(struct amdgpu_ring *ring);
>>    	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>    	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>> +			  uint32_t reg_val_offs);
>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>    			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct 
>> amdgpu_ring {
>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>    #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>    #define amdgpu_ring_emit_cntxcntl(r, d) 
>> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r,
>> d) (r)->funcs->emit_rreg((r), (d))
>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
>> +(d), (o))
>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>    #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>    #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff 
>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index 8c10084f44ef..a4c3f284691c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	unsigned long flags;
>>    	uint32_t seq;
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto failed_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    	return;
>>    
>>    failed_kiq:
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>> +failed_exit:
>>    	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 0a03e2ad5d95..7853dbc3c8bd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>    	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>    }
>>    
>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				     uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index fc6c2f2bc76c..bdbd92d4fe45 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>    }
>>    
>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 84fcf842316d..293219452c0f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4042,14 +4042,21 @@ static int gfx_v9_0_soft_reset(void *handle)
>>    static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>> -	uint32_t seq;
>> +	unsigned long flags = 0;
>> +	uint32_t seq, reg_val_offs;
>> +	uint64_t value = 0;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, true, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto failed_kiq_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 9 |	/* src: register*/
>> @@ -4059,12 +4066,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -4088,10 +4095,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
>> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +	return value;
>>    failed_kiq_read:
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +failed_kiq_exit:
>>    	pr_err("failed to read gpu clock\n");
>>    	return ~0;
>>    }
>> @@ -5482,10 +5493,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>    }
>>    
>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
>> uint32_t reg)
>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -5494,9 +5505,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 30b75d79efdb..ab960b052c0d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -415,6 +415,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    {
>>    	int vmid, i;
>>    	signed long r;
>> +	unsigned long flags = 0;
>>    	uint32_t seq;
>>    	uint16_t queried_pasid;
>>    	bool ret;
>> @@ -422,20 +423,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>> +		r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +		if (r) {
>> +			pr_err("failed to lock kiq\n");
>> +			return -ETIME;
>> +		}
>> +
>>    		/* 2 dwords flush + 8 dwords fence */
>>    		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>    		kiq->pmf->kiq_invalidate_tlbs(ring,
>>    					pasid, flush_type, all_hub);
>>    		amdgpu_fence_emit_polling(ring, &seq);
>>    		amdgpu_ring_commit(ring);
>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>> +		amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>> +			amdgpu_gfx_kiq_restore(kiq, false);
>>    			return -ETIME;
>>    		}
>>    
>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>    		return 0;
>>    	}
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index fecdbc471983..d14fd56a959e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -589,6 +589,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    {
>>    	int vmid, i;
>>    	signed long r;
>> +	unsigned long flags = 0;
>>    	uint32_t seq;
>>    	uint16_t queried_pasid;
>>    	bool ret;
>> @@ -613,7 +614,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    		if (vega20_xgmi_wa)
>>    			ndw += kiq->pmf->invalidate_tlbs_size;
>>    
>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>> +		r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +		if (r) {
>> +			pr_err("failed to lock kiq\n");
>> +			return -ETIME;
>> +		}
>> +
>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>    		/* 2 dwords flush + 8 dwords fence */
>>    		amdgpu_ring_alloc(ring, ndw);
>>    		if (vega20_xgmi_wa)
>> @@ -623,13 +630,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    					pasid, flush_type, all_hub);
>>    		amdgpu_fence_emit_polling(ring, &seq);
>>    		amdgpu_ring_commit(ring);
>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>> +		amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>>    			return -ETIME;
>>    		}
>>    
>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>    		return 0;
>>    	}
>>    

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22 11:49   ` Tao, Yintian
@ 2020-04-22 11:57     ` Christian König
  2020-04-22 12:20       ` Tao, Yintian
  0 siblings, 1 reply; 17+ messages in thread
From: Christian König @ 2020-04-22 11:57 UTC (permalink / raw)
  To: Tao, Yintian, Liu, Monk, Kuehling, Felix; +Cc: amd-gfx

Am 22.04.20 um 13:49 schrieb Tao, Yintian:
> Hi  Christian
>
>
> Can you help answer the questions below? Thanks in advance.
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: 2020年4月22日 19:03
> To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: refine kiq access register
>
> Am 22.04.20 um 11:29 schrieb Yintian Tao:
>> According to the current kiq access register method, there will be
>> race condition when using KIQ to read register if multiple clients
>> want to read at same time just like the expample below:
>> 1. client-A start to read REG-0 throguh KIQ 2. client-A poll the
>> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll
>> the seqno-1 5. the kiq complete these two read operation 6. client-A
>> to read the register at the wb buffer and
>>      get REG-1 value
>>
>> And if there are multiple clients to frequently write registers
>> through KIQ which may raise the KIQ ring buffer overwritten problem.
>>
>> Therefore, allocate fixed number wb slot for rreg use and limit the
>> submit number which depends on the kiq ring_size in order to prevent
>> the overwritten problem.
>>
>> v2: directly use amdgpu_device_wb_get() for each read instead
>>       of reserving a fixed number of slots.
>>       if there is no enough kiq ring buffer or rreg slot then
>>       directly print error log and return instead of busy waiting
> I would split that into three patches. One for each problem we have here:
>
> 1. Fix kgd_hiq_mqd_load() and maybe other occasions to use spin_lock_irqsave().
> [yttao]: Do you mean that we need to use spin_lock_irqsave for the functions just like kgd_hiq_mqd_load()?

Yes, I strongly think so.

See when you have one spin lock you either always need to lock it 
with irqs disabled or never.

In other words we always need to either use spin_lock() or 
spin_lock_irqsave(), but never mix them with the same lock.

The only exception to this rule is when you take multiple locks, e.g. 
you can do:

spin_lock_irqsave(&a, flags);
spin_lock(&b);
spin_lock(&c);
....
spin_unlock_irqrestore(&a, flags);

Here you don't need to use spin_lock_irqsave for b and c. But we rarely 
have that case in the code.

> 2. Prevent the overrun of the KIQ. Please drop the approach with the atomic here. Instead just add an amdgpu_fence_wait_polling() into
> amdgpu_fence_emit_polling() as I discussed with Monk.
> [yttao]: Sorry, I can't get your original idea for the amdgpu_fence_wait_polling(). Can you give more details about it? Thanks in advance.
>
> "That is actually only a problem because the KIQ uses polling waits.
>
> See amdgpu_fence_emit() waits for the oldest possible fence to be signaled before emitting a new one.
>
> I suggest that we do the same in amdgpu_fence_emit_polling(). A one liner like the following should be enough:
>
> amdgpu_fence_wait_polling(ring, seq - ring->fence_drv.num_fences_mask, timeout);"
> [yttao]: there is no usage of num_fences_mask at kiq fence polling, the num_fences_mask is only effective at dma_fence architecture.
> 		If I understand correctly, do you want the prototype code below? If the prototype code is wrong, can you help give one sample? Thanks in advance.
>
> int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s)
> {
>          uint32_t seq;
>
>          if (!s)
>                  return -EINVAL;
> +		amdgpu_fence_wait_polling(ring, seq, timeout);
>          seq = ++ring->fence_drv.sync_seq;

Your understanding sounds more or less correct. The code should look 
something like this:

seq = ++ring->fence_drv.sync_seq;
amdgpu_fence_wait_polling(ring, seq - 
number_of_allowed_submissions_to_the_kiq, timeout);

I just used num_fences_mask as number_of_allowed_submissions_to_the_kiq 
because it is probably a good value to start with.

But you could give that as parameter as well if you think that makes 
more sense.

>          amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>                          ¦      seq, 0);
>
>          *s = seq;
>
>          return 0;
> }
>
>
>
>
> 3. Use amdgpu_device_wb_get() each time we need to submit a read.
> [yttao]: yes, I will do it.

Thanks,
Christian.

>
> Regards,
> Christian.
>
>> Signed-off-by: Yintian Tao <yttao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  8 +-
>>    .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 13 ++-
>>    .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 13 ++-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 83 +++++++++++++++----
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |  3 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |  5 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      | 13 ++-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |  8 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |  8 +-
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 35 +++++---
>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        | 13 ++-
>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         | 13 ++-
>>    12 files changed, 167 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 4e1d4cfe7a9f..1157c1a0b888 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>>    /*
>>     * Writeback
>>     */
>> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
>> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>>    
>>    struct amdgpu_wb {
>>    	struct amdgpu_bo	*wb_obj;
>> @@ -1028,6 +1028,12 @@ bool amdgpu_device_has_dc_support(struct
>> amdgpu_device *adev);
>>    
>>    int emu_soc_asic_init(struct amdgpu_device *adev);
>>    
>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read,
>> +			unsigned long *flags);
>> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq, unsigned long
>> +*flags);
>> +
>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs);
>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
>>    /*
>>     * Registers read & write functions.
>>     */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> index 691c89705bcd..a65d6a1abc04 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> @@ -309,9 +309,11 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    			    uint32_t doorbell_off)
>>    {
>>    	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>    	struct v10_compute_mqd *m;
>>    	uint32_t mec, pipe;
>> +	unsigned long flags = 0;
>>    	int r;
>>    
>>    	m = get_mqd(mqd);
>> @@ -324,13 +326,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>    		 mec, pipe, queue_id);
>>    
>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto out_unlock;
>> +	}
>> +
>>    	r = amdgpu_ring_alloc(kiq_ring, 7);
>>    	if (r) {
>>    		pr_err("Failed to alloc KIQ (%d).\n", r);
>>    		goto out_unlock;
>>    	}
>>    
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>    	amdgpu_ring_write(kiq_ring,
>>    			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8
>> +358,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>    	amdgpu_ring_commit(kiq_ring);
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    out_unlock:
>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq, &flags);
>>    	release_queue(kgd);
>>    
>>    	return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> index df841c2ac5e7..4435bd716edd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> @@ -307,9 +307,11 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    			    uint32_t doorbell_off)
>>    {
>>    	struct amdgpu_device *adev = get_amdgpu_device(kgd);
>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>>    	struct v9_mqd *m;
>>    	uint32_t mec, pipe;
>> +	unsigned long flags = 0;
>>    	int r;
>>    
>>    	m = get_mqd(mqd);
>> @@ -322,13 +324,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>>    		 mec, pipe, queue_id);
>>    
>> -	spin_lock(&adev->gfx.kiq.ring_lock);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto out_unlock;
>> +	}
>> +
>>    	r = amdgpu_ring_alloc(kiq_ring, 7);
>>    	if (r) {
>>    		pr_err("Failed to alloc KIQ (%d).\n", r);
>>    		goto out_unlock;
>>    	}
>>    
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>>    	amdgpu_ring_write(kiq_ring,
>>    			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8
>> +356,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>>    	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>>    	amdgpu_ring_commit(kiq_ring);
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    out_unlock:
>> -	spin_unlock(&adev->gfx.kiq.ring_lock);
>> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq, &flags);
>>    	release_queue(kgd);
>>    
>>    	return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index ea576b4260a4..c0dc7f1849c4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -295,6 +295,43 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>>    	return -EINVAL;
>>    }
>>    
>> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read,
>> +			unsigned long *flags)
>> +{
>> +	struct amdgpu_wb *wb = &(kiq->ring.adev)->wb;
>> +
>> +	spin_lock_irqsave(&kiq->ring_lock, *flags);
>> +	if ((atomic64_read(&kiq->submit_avail) > 0) &&
>> +	    (read ? find_first_zero_bit(wb->used, wb->num_wb) <
>> +	     wb->num_wb : 1)) {
>> +		return 0;
>> +	} else {
>> +		spin_unlock_irqrestore(&kiq->ring_lock, *flags);
>> +		pr_err("critical! too more kiq accessers\n");
>> +		return -EDEADLK;
>> +	}
>> +}
>> +
>> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq, unsigned long
>> +*flags) {
>> +	spin_unlock_irqrestore(&kiq->ring_lock, *flags); }
>> +
>> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
>> +	atomic64_dec(&kiq->submit_avail);
>> +	if (offs)
>> +		amdgpu_device_wb_get(kiq->ring.adev, offs);
>> +
>> +}
>> +
>> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
>> +	atomic64_inc(&kiq->submit_avail);
>> +	if (offs)
>> +		amdgpu_device_wb_free(kiq->ring.adev, *offs); }
>> +
>>    int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>    			     struct amdgpu_ring *ring,
>>    			     struct amdgpu_irq_src *irq)
>> @@ -304,10 +341,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device
>> *adev,
>>    
>>    	spin_lock_init(&kiq->ring_lock);
>>    
>> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
>> -	if (r)
>> -		return r;
>> -
>>    	ring->adev = NULL;
>>    	ring->ring_obj = NULL;
>>    	ring->use_doorbell = true;
>> @@ -325,13 +358,15 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>    			     AMDGPU_RING_PRIO_DEFAULT);
>>    	if (r)
>>    		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
>> +	else
>> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
>> +			     (ring->funcs->align_mask + 1));
>>    
>>    	return r;
>>    }
>>    
>>    void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>>    {
>> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>>    	amdgpu_ring_fini(ring);
>>    }
>>    
>> @@ -671,19 +706,25 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
>>    uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>> -	uint32_t seq;
>> +	unsigned long flags = 0;
>> +	uint32_t seq, reg_val_offs = 0, value = 0;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, true, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto kiq_read_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>    	amdgpu_ring_alloc(ring, 32);
>> -	amdgpu_ring_emit_rreg(ring, reg);
>> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -707,9 +748,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return adev->wb.wb[kiq->reg_val_offs];
>> +	mb();
>> +	value = adev->wb.wb[reg_val_offs];
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +	return value;
>>    
>>    failed_kiq_read:
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +kiq_read_exit:
>>    	pr_err("failed to read reg:%x\n", reg);
>>    	return ~0;
>>    }
>> @@ -717,19 +763,25 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>    void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>    {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>> +	unsigned long flags = 0;
>>    	uint32_t seq;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_wreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto kiq_write_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_emit_wreg(ring, reg, v);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -754,8 +806,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_write;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    	return;
>>    
>>    failed_kiq_write:
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>> +kiq_write_exit:
>>    	pr_err("failed to write reg:%x\n", reg);
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index 634746829024..ff7597ca6cad 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>>    	int invalidate_tlbs_size;
>>    };
>>    
>> +#define MAX_KIQ_RREG_NUM 64
>>    struct amdgpu_kiq {
>>    	u64			eop_gpu_addr;
>>    	struct amdgpu_bo	*eop_obj;
>> @@ -103,7 +104,7 @@ struct amdgpu_kiq {
>>    	struct amdgpu_ring	ring;
>>    	struct amdgpu_irq_src	irq;
>>    	const struct kiq_pm4_funcs *pmf;
>> -	uint32_t			reg_val_offs;
>> +	atomic64_t		submit_avail;
>>    };
>>    
>>    /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index f61664ee4940..137d3d2b46e8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>>    	void (*end_use)(struct amdgpu_ring *ring);
>>    	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>    	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
>> +			  uint32_t reg_val_offs);
>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>    			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct
>> amdgpu_ring {
>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>    #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>>    #define amdgpu_ring_emit_cntxcntl(r, d)
>> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r,
>> d) (r)->funcs->emit_rreg((r), (d))
>> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r),
>> +(d), (o))
>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>    #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>    #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m)
>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff
>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index 8c10084f44ef..a4c3f284691c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	unsigned long flags;
>>    	uint32_t seq;
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto failed_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>>    					    ref, mask);
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>>    	return;
>>    
>>    failed_kiq:
>> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>> +failed_exit:
>>    	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 0a03e2ad5d95..7853dbc3c8bd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>>    	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>    }
>>    
>> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				     uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index fc6c2f2bc76c..bdbd92d4fe45 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>>    }
>>    
>> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 84fcf842316d..293219452c0f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4042,14 +4042,21 @@ static int gfx_v9_0_soft_reset(void *handle)
>>    static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    {
>>    	signed long r, cnt = 0;
>> -	unsigned long flags;
>> -	uint32_t seq;
>> +	unsigned long flags = 0;
>> +	uint32_t seq, reg_val_offs;
>> +	uint64_t value = 0;
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    	struct amdgpu_ring *ring = &kiq->ring;
>>    
>>    	BUG_ON(!ring->funcs->emit_rreg);
>>    
>> -	spin_lock_irqsave(&kiq->ring_lock, flags);
>> +	r = amdgpu_gfx_kiq_lock(kiq, true, &flags);
>> +	if (r) {
>> +		pr_err("failed to lock kiq\n");
>> +		goto failed_kiq_exit;
>> +	}
>> +
>> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>>    	amdgpu_ring_alloc(ring, 32);
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 9 |	/* src: register*/
>> @@ -4059,12 +4066,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_fence_emit_polling(ring, &seq);
>>    	amdgpu_ring_commit(ring);
>> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    
>>    	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>    
>> @@ -4088,10 +4095,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>    	if (cnt > MAX_KIQ_REG_TRY)
>>    		goto failed_kiq_read;
>>    
>> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
>> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
>> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
>> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>>    
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +	return value;
>>    failed_kiq_read:
>> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
>> +failed_kiq_exit:
>>    	pr_err("failed to read gpu clock\n");
>>    	return ~0;
>>    }
>> @@ -5482,10 +5493,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>>    		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>>    }
>>    
>> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring,
>> uint32_t reg)
>> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
>> +				    uint32_t reg_val_offs)
>>    {
>>    	struct amdgpu_device *adev = ring->adev;
>> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>>    	amdgpu_ring_write(ring, 0 |	/* src: register*/
>> @@ -5494,9 +5505,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>>    	amdgpu_ring_write(ring, reg);
>>    	amdgpu_ring_write(ring, 0);
>>    	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
>> -				kiq->reg_val_offs * 4));
>> +				reg_val_offs * 4));
>>    }
>>    
>>    static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring,
>> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 30b75d79efdb..ab960b052c0d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -415,6 +415,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    {
>>    	int vmid, i;
>>    	signed long r;
>> +	unsigned long flags = 0;
>>    	uint32_t seq;
>>    	uint16_t queried_pasid;
>>    	bool ret;
>> @@ -422,20 +423,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>    
>>    	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>> +		r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +		if (r) {
>> +			pr_err("failed to lock kiq\n");
>> +			return -ETIME;
>> +		}
>> +
>>    		/* 2 dwords flush + 8 dwords fence */
>>    		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>    		kiq->pmf->kiq_invalidate_tlbs(ring,
>>    					pasid, flush_type, all_hub);
>>    		amdgpu_fence_emit_polling(ring, &seq);
>>    		amdgpu_ring_commit(ring);
>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>> +		amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>> +			amdgpu_gfx_kiq_restore(kiq, false);
>>    			return -ETIME;
>>    		}
>>    
>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>    		return 0;
>>    	}
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index fecdbc471983..d14fd56a959e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -589,6 +589,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    {
>>    	int vmid, i;
>>    	signed long r;
>> +	unsigned long flags = 0;
>>    	uint32_t seq;
>>    	uint16_t queried_pasid;
>>    	bool ret;
>> @@ -613,7 +614,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    		if (vega20_xgmi_wa)
>>    			ndw += kiq->pmf->invalidate_tlbs_size;
>>    
>> -		spin_lock(&adev->gfx.kiq.ring_lock);
>> +		r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
>> +		if (r) {
>> +			pr_err("failed to lock kiq\n");
>> +			return -ETIME;
>> +		}
>> +
>> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>>    		/* 2 dwords flush + 8 dwords fence */
>>    		amdgpu_ring_alloc(ring, ndw);
>>    		if (vega20_xgmi_wa)
>> @@ -623,13 +630,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>    					pasid, flush_type, all_hub);
>>    		amdgpu_fence_emit_polling(ring, &seq);
>>    		amdgpu_ring_commit(ring);
>> -		spin_unlock(&adev->gfx.kiq.ring_lock);
>> +		amdgpu_gfx_kiq_unlock(kiq, &flags);
>>    		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>    		if (r < 1) {
>>    			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>>    			return -ETIME;
>>    		}
>>    
>> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>>    		return 0;
>>    	}
>>    

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22 11:02 ` Christian König
@ 2020-04-22 11:49   ` Tao, Yintian
  2020-04-22 11:57     ` Christian König
  0 siblings, 1 reply; 17+ messages in thread
From: Tao, Yintian @ 2020-04-22 11:49 UTC (permalink / raw)
  To: Koenig, Christian, Liu, Monk, Kuehling, Felix; +Cc: amd-gfx

Hi  Christian


Can you help answer the questions below? Thanks in advance.
-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: 2020年4月22日 19:03
To: Tao, Yintian <Yintian.Tao@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: refine kiq access register

Am 22.04.20 um 11:29 schrieb Yintian Tao:
> According to the current kiq access register method, there will be 
> race condition when using KIQ to read register if multiple clients 
> want to read at the same time just like the example below:
> 1. client-A start to read REG-0 through KIQ 2. client-A poll the
> seqno-0 3. client-B start to read REG-1 through KIQ 4. client-B poll 
> the seqno-1 5. the kiq complete these two read operation 6. client-A 
> to read the register at the wb buffer and
>     get REG-1 value
>
> And if there are multiple clients to frequently write registers 
> through KIQ which may raise the KIQ ring buffer overwritten problem.
>
> Therefore, allocate fixed number wb slot for rreg use and limit the 
> submit number which depends on the kiq ring_size in order to prevent 
> the overwritten problem.
>
> v2: directly use amdgpu_device_wb_get() for each read instead
>      of reserving a fixed number of slots.
>      if there is no enough kiq ring buffer or rreg slot then
>      directly print error log and return instead of busy waiting

I would split that into three patches. One for each problem we have here:

1. Fix kgd_hiq_mqd_load() and maybe other occasions to use spin_lock_irqsave().
[yttao]: Do you mean that we need to use spin_lock_irqsave for the functions just like kgd_hiq_mqd_load()?

2. Prevent the overrun of the KIQ. Please drop the approach with the atomic here. Instead just add a amdgpu_fence_wait_polling() into
amdgpu_fence_emit_polling() as I discussed with Monk.
[yttao]: Sorry, I can't get your original idea for the amdgpu_fence_wait_polling(). Can you give more details about it? Thanks in advance.

"That is actually only a problem because the KIQ uses polling waits.

See amdgpu_fence_emit() waits for the oldest possible fence to be signaled before emitting a new one.

I suggest that we do the same in amdgpu_fence_emit_polling(). A one liner like the following should be enough:

amdgpu_fence_wait_polling(ring, seq - ring->fence_drv.num_fences_mask, timeout);"
[yttao]: there is no usage of num_fences_mask at kiq fence polling, the num_fences_mask is only effective at dma_fence architecture.
		If I understand correctly, do you want the prototype code below? If the prototype code is wrong, can you help give one sample? Thanks in advance.

int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s) 
{
        uint32_t seq;

        if (!s)
                return -EINVAL;
+		amdgpu_fence_wait_polling(ring, seq, timeout); 
        seq = ++ring->fence_drv.sync_seq;
        amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
                        ¦      seq, 0); 

        *s = seq;

        return 0;
}




3. Use amdgpu_device_wb_get() each time we need to submit a read.
[yttao]: yes, I will do it.

Regards,
Christian.

>
> Signed-off-by: Yintian Tao <yttao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  8 +-
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 13 ++-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 13 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 83 +++++++++++++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |  3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |  5 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      | 13 ++-
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |  8 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |  8 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 35 +++++---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        | 13 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         | 13 ++-
>   12 files changed, 167 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 4e1d4cfe7a9f..1157c1a0b888 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>   /*
>    * Writeback
>    */
> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>   
>   struct amdgpu_wb {
>   	struct amdgpu_bo	*wb_obj;
> @@ -1028,6 +1028,12 @@ bool amdgpu_device_has_dc_support(struct 
> amdgpu_device *adev);
>   
>   int emu_soc_asic_init(struct amdgpu_device *adev);
>   
> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read,
> +			unsigned long *flags);
> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq, unsigned long 
> +*flags);
> +
> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs); 
> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
>   /*
>    * Registers read & write functions.
>    */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 691c89705bcd..a65d6a1abc04 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -309,9 +309,11 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   			    uint32_t doorbell_off)
>   {
>   	struct amdgpu_device *adev = get_amdgpu_device(kgd);
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>   	struct v10_compute_mqd *m;
>   	uint32_t mec, pipe;
> +	unsigned long flags = 0;
>   	int r;
>   
>   	m = get_mqd(mqd);
> @@ -324,13 +326,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>   		 mec, pipe, queue_id);
>   
> -	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r);
>   		goto out_unlock;
>   	}
>   
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>   	amdgpu_ring_write(kiq_ring,
>   			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -350,8 
> +358,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>   	amdgpu_ring_commit(kiq_ring);
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   out_unlock:
> -	spin_unlock(&adev->gfx.kiq.ring_lock);
> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq, &flags);
>   	release_queue(kgd);
>   
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index df841c2ac5e7..4435bd716edd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -307,9 +307,11 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   			    uint32_t doorbell_off)
>   {
>   	struct amdgpu_device *adev = get_amdgpu_device(kgd);
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>   	struct v9_mqd *m;
>   	uint32_t mec, pipe;
> +	unsigned long flags = 0;
>   	int r;
>   
>   	m = get_mqd(mqd);
> @@ -322,13 +324,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>   		 mec, pipe, queue_id);
>   
> -	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r);
>   		goto out_unlock;
>   	}
>   
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>   	amdgpu_ring_write(kiq_ring,
>   			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ @@ -348,8 
> +356,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>   	amdgpu_ring_commit(kiq_ring);
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   out_unlock:
> -	spin_unlock(&adev->gfx.kiq.ring_lock);
> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq, &flags);
>   	release_queue(kgd);
>   
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ea576b4260a4..c0dc7f1849c4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -295,6 +295,43 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>   	return -EINVAL;
>   }
>   
> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read,
> +			unsigned long *flags)
> +{
> +	struct amdgpu_wb *wb = &(kiq->ring.adev)->wb;
> +
> +	spin_lock_irqsave(&kiq->ring_lock, *flags);
> +	if ((atomic64_read(&kiq->submit_avail) > 0) &&
> +	    (read ? find_first_zero_bit(wb->used, wb->num_wb) <
> +	     wb->num_wb : 1)) {
> +		return 0;
> +	} else {
> +		spin_unlock_irqrestore(&kiq->ring_lock, *flags);
> +		pr_err("critical! too more kiq accessers\n");
> +		return -EDEADLK;
> +	}
> +}
> +
> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq, unsigned long 
> +*flags) {
> +	spin_unlock_irqrestore(&kiq->ring_lock, *flags); }
> +
> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs) {
> +	atomic64_dec(&kiq->submit_avail);
> +	if (offs)
> +		amdgpu_device_wb_get(kiq->ring.adev, offs);
> +
> +}
> +
> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs) {
> +	atomic64_inc(&kiq->submit_avail);
> +	if (offs)
> +		amdgpu_device_wb_free(kiq->ring.adev, *offs); }
> +
>   int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   			     struct amdgpu_ring *ring,
>   			     struct amdgpu_irq_src *irq)
> @@ -304,10 +341,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device 
> *adev,
>   
>   	spin_lock_init(&kiq->ring_lock);
>   
> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
> -	if (r)
> -		return r;
> -
>   	ring->adev = NULL;
>   	ring->ring_obj = NULL;
>   	ring->use_doorbell = true;
> @@ -325,13 +358,15 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   			     AMDGPU_RING_PRIO_DEFAULT);
>   	if (r)
>   		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
> +	else
> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
> +			     (ring->funcs->align_mask + 1));
>   
>   	return r;
>   }
>   
>   void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>   {
> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>   	amdgpu_ring_fini(ring);
>   }
>   
> @@ -671,19 +706,25 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
>   uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
> -	uint32_t seq;
> +	unsigned long flags = 0;
> +	uint32_t seq, reg_val_offs = 0, value = 0;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, true, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto kiq_read_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>   	amdgpu_ring_alloc(ring, 32);
> -	amdgpu_ring_emit_rreg(ring, reg);
> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -707,9 +748,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return adev->wb.wb[kiq->reg_val_offs];
> +	mb();
> +	value = adev->wb.wb[reg_val_offs];
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +	return value;
>   
>   failed_kiq_read:
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +kiq_read_exit:
>   	pr_err("failed to read reg:%x\n", reg);
>   	return ~0;
>   }
> @@ -717,19 +763,25 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
> +	unsigned long flags = 0;
>   	uint32_t seq;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_wreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto kiq_write_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_wreg(ring, reg, v);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -754,8 +806,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_write;
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   	return;
>   
>   failed_kiq_write:
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
> +kiq_write_exit:
>   	pr_err("failed to write reg:%x\n", reg);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 634746829024..ff7597ca6cad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>   	int invalidate_tlbs_size;
>   };
>   
> +#define MAX_KIQ_RREG_NUM 64
>   struct amdgpu_kiq {
>   	u64			eop_gpu_addr;
>   	struct amdgpu_bo	*eop_obj;
> @@ -103,7 +104,7 @@ struct amdgpu_kiq {
>   	struct amdgpu_ring	ring;
>   	struct amdgpu_irq_src	irq;
>   	const struct kiq_pm4_funcs *pmf;
> -	uint32_t			reg_val_offs;
> +	atomic64_t		submit_avail;
>   };
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index f61664ee4940..137d3d2b46e8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>   	void (*end_use)(struct amdgpu_ring *ring);
>   	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>   	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
> +			  uint32_t reg_val_offs);
>   	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>   	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>   			      uint32_t val, uint32_t mask); @@ -265,7 +266,7 @@ struct 
> amdgpu_ring {
>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>   #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>   #define amdgpu_ring_emit_cntxcntl(r, d) 
> (r)->funcs->emit_cntxcntl((r), (d)) -#define amdgpu_ring_emit_rreg(r, 
> d) (r)->funcs->emit_rreg((r), (d))
> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), 
> +(d), (o))
>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>   #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>   #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) 
> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) diff 
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 8c10084f44ef..a4c3f284691c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	unsigned long flags;
>   	uint32_t seq;
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto failed_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq;
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   	return;
>   
>   failed_kiq:
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
> +failed_exit:
>   	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 0a03e2ad5d95..7853dbc3c8bd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>   	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>   }
>   
> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				     uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index fc6c2f2bc76c..bdbd92d4fe45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>   }
>   
> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 84fcf842316d..293219452c0f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4042,14 +4042,21 @@ static int gfx_v9_0_soft_reset(void *handle)
>   static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
> -	uint32_t seq;
> +	unsigned long flags = 0;
> +	uint32_t seq, reg_val_offs;
> +	uint64_t value = 0;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, true, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto failed_kiq_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 9 |	/* src: register*/
> @@ -4059,12 +4066,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -4088,10 +4095,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>   
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +	return value;
>   failed_kiq_read:
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +failed_kiq_exit:
>   	pr_err("failed to read gpu clock\n");
>   	return ~0;
>   }
> @@ -5482,10 +5493,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>   }
>   
> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, 
> uint32_t reg)
> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -5494,9 +5505,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, 
> uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 30b75d79efdb..ab960b052c0d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -415,6 +415,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   {
>   	int vmid, i;
>   	signed long r;
> +	unsigned long flags = 0;
>   	uint32_t seq;
>   	uint16_t queried_pasid;
>   	bool ret;
> @@ -422,20 +423,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
> -		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +		if (r) {
> +			pr_err("failed to lock kiq\n");
> +			return -ETIME;
> +		}
> +
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>   		kiq->pmf->kiq_invalidate_tlbs(ring,
>   					pasid, flush_type, all_hub);
>   		amdgpu_fence_emit_polling(ring, &seq);
>   		amdgpu_ring_commit(ring);
> -		spin_unlock(&adev->gfx.kiq.ring_lock);
> +		amdgpu_gfx_kiq_unlock(kiq, &flags);
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
> +			amdgpu_gfx_kiq_restore(kiq, false);
>   			return -ETIME;
>   		}
>   
> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>   		return 0;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index fecdbc471983..d14fd56a959e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -589,6 +589,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   {
>   	int vmid, i;
>   	signed long r;
> +	unsigned long flags = 0;
>   	uint32_t seq;
>   	uint16_t queried_pasid;
>   	bool ret;
> @@ -613,7 +614,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		if (vega20_xgmi_wa)
>   			ndw += kiq->pmf->invalidate_tlbs_size;
>   
> -		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +		if (r) {
> +			pr_err("failed to lock kiq\n");
> +			return -ETIME;
> +		}
> +
> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, ndw);
>   		if (vega20_xgmi_wa)
> @@ -623,13 +630,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   					pasid, flush_type, all_hub);
>   		amdgpu_fence_emit_polling(ring, &seq);
>   		amdgpu_ring_commit(ring);
> -		spin_unlock(&adev->gfx.kiq.ring_lock);
> +		amdgpu_gfx_kiq_unlock(kiq, &flags);
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>   			return -ETIME;
>   		}
>   
> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>   		return 0;
>   	}
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/amdgpu: refine kiq access register
  2020-04-22  9:29 Yintian Tao
@ 2020-04-22 11:02 ` Christian König
  2020-04-22 11:49   ` Tao, Yintian
  0 siblings, 1 reply; 17+ messages in thread
From: Christian König @ 2020-04-22 11:02 UTC (permalink / raw)
  To: Yintian Tao, monk.liu, Felix.Kuehling; +Cc: amd-gfx

Am 22.04.20 um 11:29 schrieb Yintian Tao:
> According to the current kiq access register method,
> there will be race condition when using KIQ to read
> register if multiple clients want to read at same time
> just like the example below:
> 1. client-A start to read REG-0 through KIQ
> 2. client-A poll the seqno-0
> 3. client-B start to read REG-1 through KIQ
> 4. client-B poll the seqno-1
> 5. the kiq complete these two read operation
> 6. client-A to read the register at the wb buffer and
>     get REG-1 value
>
> And if there are multiple clients to frequently write
> registers through KIQ which may raise the KIQ ring buffer
> overwritten problem.
>
> Therefore, allocate fixed number wb slot for rreg use
> and limit the submit number which depends on the kiq
> ring_size in order to prevent the overwritten problem.
>
> v2: directly use amdgpu_device_wb_get() for each read instead
>      of reserving a fixed number of slots.
>      if there is no enough kiq ring buffer or rreg slot then
>      directly print error log and return instead of busy waiting

I would split that into three patches. One for each problem we have here:

1. Fix kgd_hiq_mqd_load() and maybe other occasions to use 
spin_lock_irqsave().

2. Prevent the overrun of the KIQ. Please drop the approach with the 
atomic here. Instead just add a amdgpu_fence_wait_polling() into 
amdgpu_fence_emit_polling() as I discussed with Monk.

3. Use amdgpu_device_wb_get() each time we need to submit a read.

Regards,
Christian.

>
> Signed-off-by: Yintian Tao <yttao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  8 +-
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 13 ++-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 13 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 83 +++++++++++++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |  3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |  5 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      | 13 ++-
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |  8 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |  8 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 35 +++++---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        | 13 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         | 13 ++-
>   12 files changed, 167 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 4e1d4cfe7a9f..1157c1a0b888 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
>   /*
>    * Writeback
>    */
> -#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
> +#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
>   
>   struct amdgpu_wb {
>   	struct amdgpu_bo	*wb_obj;
> @@ -1028,6 +1028,12 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
>   
>   int emu_soc_asic_init(struct amdgpu_device *adev);
>   
> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read,
> +			unsigned long *flags);
> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq, unsigned long *flags);
> +
> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs);
> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
>   /*
>    * Registers read & write functions.
>    */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 691c89705bcd..a65d6a1abc04 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -309,9 +309,11 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   			    uint32_t doorbell_off)
>   {
>   	struct amdgpu_device *adev = get_amdgpu_device(kgd);
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>   	struct v10_compute_mqd *m;
>   	uint32_t mec, pipe;
> +	unsigned long flags = 0;
>   	int r;
>   
>   	m = get_mqd(mqd);
> @@ -324,13 +326,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>   		 mec, pipe, queue_id);
>   
> -	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r);
>   		goto out_unlock;
>   	}
>   
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>   	amdgpu_ring_write(kiq_ring,
>   			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
> @@ -350,8 +358,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>   	amdgpu_ring_commit(kiq_ring);
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   out_unlock:
> -	spin_unlock(&adev->gfx.kiq.ring_lock);
> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq, &flags);
>   	release_queue(kgd);
>   
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index df841c2ac5e7..4435bd716edd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -307,9 +307,11 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   			    uint32_t doorbell_off)
>   {
>   	struct amdgpu_device *adev = get_amdgpu_device(kgd);
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
>   	struct v9_mqd *m;
>   	uint32_t mec, pipe;
> +	unsigned long flags = 0;
>   	int r;
>   
>   	m = get_mqd(mqd);
> @@ -322,13 +324,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
>   		 mec, pipe, queue_id);
>   
> -	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r);
>   		goto out_unlock;
>   	}
>   
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
>   	amdgpu_ring_write(kiq_ring,
>   			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
> @@ -348,8 +356,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
>   	amdgpu_ring_commit(kiq_ring);
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   out_unlock:
> -	spin_unlock(&adev->gfx.kiq.ring_lock);
> +	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq, &flags);
>   	release_queue(kgd);
>   
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ea576b4260a4..c0dc7f1849c4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -295,6 +295,43 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
>   	return -EINVAL;
>   }
>   
> +int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read,
> +			unsigned long *flags)
> +{
> +	struct amdgpu_wb *wb = &(kiq->ring.adev)->wb;
> +
> +	spin_lock_irqsave(&kiq->ring_lock, *flags);
> +	if ((atomic64_read(&kiq->submit_avail) > 0) &&
> +	    (read ? find_first_zero_bit(wb->used, wb->num_wb) <
> +	     wb->num_wb : 1)) {
> +		return 0;
> +	} else {
> +		spin_unlock_irqrestore(&kiq->ring_lock, *flags);
> +		pr_err("critical! too more kiq accessers\n");
> +		return -EDEADLK;
> +	}
> +}
> +
> +void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq, unsigned long *flags)
> +{
> +	spin_unlock_irqrestore(&kiq->ring_lock, *flags);
> +}
> +
> +void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs)
> +{
> +	atomic64_dec(&kiq->submit_avail);
> +	if (offs)
> +		amdgpu_device_wb_get(kiq->ring.adev, offs);
> +
> +}
> +
> +void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs)
> +{
> +	atomic64_inc(&kiq->submit_avail);
> +	if (offs)
> +		amdgpu_device_wb_free(kiq->ring.adev, *offs);
> +}
> +
>   int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   			     struct amdgpu_ring *ring,
>   			     struct amdgpu_irq_src *irq)
> @@ -304,10 +341,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   
>   	spin_lock_init(&kiq->ring_lock);
>   
> -	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
> -	if (r)
> -		return r;
> -
>   	ring->adev = NULL;
>   	ring->ring_obj = NULL;
>   	ring->use_doorbell = true;
> @@ -325,13 +358,15 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   			     AMDGPU_RING_PRIO_DEFAULT);
>   	if (r)
>   		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
> +	else
> +		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
> +			     (ring->funcs->align_mask + 1));
>   
>   	return r;
>   }
>   
>   void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
>   {
> -	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
>   	amdgpu_ring_fini(ring);
>   }
>   
> @@ -671,19 +706,25 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
>   uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
> -	uint32_t seq;
> +	unsigned long flags = 0;
> +	uint32_t seq, reg_val_offs = 0, value = 0;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, true, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto kiq_read_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>   	amdgpu_ring_alloc(ring, 32);
> -	amdgpu_ring_emit_rreg(ring, reg);
> +	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -707,9 +748,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return adev->wb.wb[kiq->reg_val_offs];
> +	mb();
> +	value = adev->wb.wb[reg_val_offs];
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +	return value;
>   
>   failed_kiq_read:
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +kiq_read_exit:
>   	pr_err("failed to read reg:%x\n", reg);
>   	return ~0;
>   }
> @@ -717,19 +763,25 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
> +	unsigned long flags = 0;
>   	uint32_t seq;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_wreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto kiq_write_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_wreg(ring, reg, v);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -754,8 +806,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_write;
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   	return;
>   
>   failed_kiq_write:
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
> +kiq_write_exit:
>   	pr_err("failed to write reg:%x\n", reg);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 634746829024..ff7597ca6cad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
>   	int invalidate_tlbs_size;
>   };
>   
> +#define MAX_KIQ_RREG_NUM 64
>   struct amdgpu_kiq {
>   	u64			eop_gpu_addr;
>   	struct amdgpu_bo	*eop_obj;
> @@ -103,7 +104,7 @@ struct amdgpu_kiq {
>   	struct amdgpu_ring	ring;
>   	struct amdgpu_irq_src	irq;
>   	const struct kiq_pm4_funcs *pmf;
> -	uint32_t			reg_val_offs;
> +	atomic64_t		submit_avail;
>   };
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index f61664ee4940..137d3d2b46e8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
>   	void (*end_use)(struct amdgpu_ring *ring);
>   	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>   	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
> -	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
> +	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
> +			  uint32_t reg_val_offs);
>   	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>   	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>   			      uint32_t val, uint32_t mask);
> @@ -265,7 +266,7 @@ struct amdgpu_ring {
>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>   #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>   #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
> -#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
> +#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), (o))
>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>   #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>   #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 8c10084f44ef..a4c3f284691c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	unsigned long flags;
>   	uint32_t seq;
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto failed_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, NULL);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq;
>   
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
>   	return;
>   
>   failed_kiq:
> +	amdgpu_gfx_kiq_restore(kiq, NULL);
> +failed_exit:
>   	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 0a03e2ad5d95..7853dbc3c8bd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>   	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>   }
>   
> -static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				     uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index fc6c2f2bc76c..bdbd92d4fe45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
>   }
>   
> -static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 84fcf842316d..293219452c0f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4042,14 +4042,21 @@ static int gfx_v9_0_soft_reset(void *handle)
>   static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   {
>   	signed long r, cnt = 0;
> -	unsigned long flags;
> -	uint32_t seq;
> +	unsigned long flags = 0;
> +	uint32_t seq, reg_val_offs;
> +	uint64_t value = 0;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   	struct amdgpu_ring *ring = &kiq->ring;
>   
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
> -	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_lock(kiq, true, &flags);
> +	if (r) {
> +		pr_err("failed to lock kiq\n");
> +		goto failed_kiq_exit;
> +	}
> +
> +	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 9 |	/* src: register*/
> @@ -4059,12 +4066,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_fence_emit_polling(ring, &seq);
>   	amdgpu_ring_commit(ring);
> -	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +	amdgpu_gfx_kiq_unlock(kiq, &flags);
>   
>   	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>   
> @@ -4088,10 +4095,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	if (cnt > MAX_KIQ_REG_TRY)
>   		goto failed_kiq_read;
>   
> -	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
> -		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
> +	value = (uint64_t)adev->wb.wb[reg_val_offs] |
> +		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
>   
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +	return value;
>   failed_kiq_read:
> +	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
> +failed_kiq_exit:
>   	pr_err("failed to read gpu clock\n");
>   	return ~0;
>   }
> @@ -5482,10 +5493,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
>   		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
>   }
>   
> -static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> +static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
> +				    uint32_t reg_val_offs)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
>   	amdgpu_ring_write(ring, 0 |	/* src: register*/
> @@ -5494,9 +5505,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>   	amdgpu_ring_write(ring, reg);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
> -				kiq->reg_val_offs * 4));
> +				reg_val_offs * 4));
>   }
>   
>   static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 30b75d79efdb..ab960b052c0d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -415,6 +415,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   {
>   	int vmid, i;
>   	signed long r;
> +	unsigned long flags = 0;
>   	uint32_t seq;
>   	uint16_t queried_pasid;
>   	bool ret;
> @@ -422,20 +423,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>   
>   	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
> -		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +		if (r) {
> +			pr_err("failed to lock kiq\n");
> +			return -ETIME;
> +		}
> +
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>   		kiq->pmf->kiq_invalidate_tlbs(ring,
>   					pasid, flush_type, all_hub);
>   		amdgpu_fence_emit_polling(ring, &seq);
>   		amdgpu_ring_commit(ring);
> -		spin_unlock(&adev->gfx.kiq.ring_lock);
> +		amdgpu_gfx_kiq_unlock(kiq, &flags);
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
> +			amdgpu_gfx_kiq_restore(kiq, false);
>   			return -ETIME;
>   		}
>   
> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>   		return 0;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index fecdbc471983..d14fd56a959e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -589,6 +589,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   {
>   	int vmid, i;
>   	signed long r;
> +	unsigned long flags = 0;
>   	uint32_t seq;
>   	uint16_t queried_pasid;
>   	bool ret;
> @@ -613,7 +614,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		if (vega20_xgmi_wa)
>   			ndw += kiq->pmf->invalidate_tlbs_size;
>   
> -		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
> +		if (r) {
> +			pr_err("failed to lock kiq\n");
> +			return -ETIME;
> +		}
> +
> +		amdgpu_gfx_kiq_consume(kiq, NULL);
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, ndw);
>   		if (vega20_xgmi_wa)
> @@ -623,13 +630,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   					pasid, flush_type, all_hub);
>   		amdgpu_fence_emit_polling(ring, &seq);
>   		amdgpu_ring_commit(ring);
> -		spin_unlock(&adev->gfx.kiq.ring_lock);
> +		amdgpu_gfx_kiq_unlock(kiq, &flags);
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
> +			amdgpu_gfx_kiq_restore(kiq, NULL);
>   			return -ETIME;
>   		}
>   
> +		amdgpu_gfx_kiq_restore(kiq, NULL);
>   		return 0;
>   	}
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] drm/amdgpu: refine kiq access register
@ 2020-04-22  9:29 Yintian Tao
  2020-04-22 11:02 ` Christian König
  0 siblings, 1 reply; 17+ messages in thread
From: Yintian Tao @ 2020-04-22  9:29 UTC (permalink / raw)
  To: christian.koenig, monk.liu, Felix.Kuehling; +Cc: amd-gfx, Yintian Tao

According to the current kiq access register method,
there will be race condition when using KIQ to read
register if multiple clients want to read at same time
just like the example below:
1. client-A start to read REG-0 through KIQ
2. client-A poll the seqno-0
3. client-B start to read REG-1 through KIQ
4. client-B poll the seqno-1
5. the kiq complete these two read operation
6. client-A to read the register at the wb buffer and
   get REG-1 value

And if there are multiple clients to frequently write
registers through KIQ which may raise the KIQ ring buffer
overwrite problem.

Therefore, allocate fixed number wb slot for rreg use
and limit the submit number which depends on the kiq
ring_size in order to prevent the overwritten problem.

v2: directly use amdgpu_device_wb_get() for each read instead
    of reserving a fixed number of slots.
    if there is no enough kiq ring buffer or rreg slot then
    directly print error log and return instead of busy waiting

Signed-off-by: Yintian Tao <yttao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  8 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 13 ++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 13 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 83 +++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      | 13 ++-
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |  8 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |  8 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         | 35 +++++---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        | 13 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         | 13 ++-
 12 files changed, 167 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4e1d4cfe7a9f..1157c1a0b888 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -526,7 +526,7 @@ static inline void amdgpu_set_ib_value(struct amdgpu_cs_parser *p,
 /*
  * Writeback
  */
-#define AMDGPU_MAX_WB 128	/* Reserve at most 128 WB slots for amdgpu-owned rings. */
+#define AMDGPU_MAX_WB 256	/* Reserve at most 256 WB slots for amdgpu-owned rings. */
 
 struct amdgpu_wb {
 	struct amdgpu_bo	*wb_obj;
@@ -1028,6 +1028,12 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
 
 int emu_soc_asic_init(struct amdgpu_device *adev);
 
+int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read,
+			unsigned long *flags);
+void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq, unsigned long *flags);
+
+void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs);
+void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs);
 /*
  * Registers read & write functions.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..a65d6a1abc04 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -309,9 +309,11 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 			    uint32_t doorbell_off)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
 	struct v10_compute_mqd *m;
 	uint32_t mec, pipe;
+	unsigned long flags = 0;
 	int r;
 
 	m = get_mqd(mqd);
@@ -324,13 +326,19 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
 		 mec, pipe, queue_id);
 
-	spin_lock(&adev->gfx.kiq.ring_lock);
+	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto out_unlock;
+	}
+
 	r = amdgpu_ring_alloc(kiq_ring, 7);
 	if (r) {
 		pr_err("Failed to alloc KIQ (%d).\n", r);
 		goto out_unlock;
 	}
 
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
 	amdgpu_ring_write(kiq_ring,
 			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
@@ -350,8 +358,9 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
 	amdgpu_ring_commit(kiq_ring);
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 out_unlock:
-	spin_unlock(&adev->gfx.kiq.ring_lock);
+	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq, &flags);
 	release_queue(kgd);
 
 	return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index df841c2ac5e7..4435bd716edd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -307,9 +307,11 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 			    uint32_t doorbell_off)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
 	struct v9_mqd *m;
 	uint32_t mec, pipe;
+	unsigned long flags = 0;
 	int r;
 
 	m = get_mqd(mqd);
@@ -322,13 +324,19 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
 		 mec, pipe, queue_id);
 
-	spin_lock(&adev->gfx.kiq.ring_lock);
+	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto out_unlock;
+	}
+
 	r = amdgpu_ring_alloc(kiq_ring, 7);
 	if (r) {
 		pr_err("Failed to alloc KIQ (%d).\n", r);
 		goto out_unlock;
 	}
 
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
 	amdgpu_ring_write(kiq_ring,
 			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
@@ -348,8 +356,9 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
 	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
 	amdgpu_ring_commit(kiq_ring);
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 out_unlock:
-	spin_unlock(&adev->gfx.kiq.ring_lock);
+	amdgpu_gfx_kiq_unlock(&adev->gfx.kiq, &flags);
 	release_queue(kgd);
 
 	return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index ea576b4260a4..c0dc7f1849c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -295,6 +295,43 @@ static int amdgpu_gfx_kiq_acquire(struct amdgpu_device *adev,
 	return -EINVAL;
 }
 
+int amdgpu_gfx_kiq_lock(struct amdgpu_kiq *kiq, bool read,
+			unsigned long *flags)
+{
+	struct amdgpu_wb *wb = &(kiq->ring.adev)->wb;
+
+	spin_lock_irqsave(&kiq->ring_lock, *flags);
+	if ((atomic64_read(&kiq->submit_avail) > 0) &&
+	    (read ? find_first_zero_bit(wb->used, wb->num_wb) <
+	     wb->num_wb : 1)) {
+		return 0;
+	} else {
+		spin_unlock_irqrestore(&kiq->ring_lock, *flags);
+		pr_err("critical! too many kiq accessors\n");
+		return -EDEADLK;
+	}
+}
+
+void amdgpu_gfx_kiq_unlock(struct amdgpu_kiq *kiq, unsigned long *flags)
+{
+	spin_unlock_irqrestore(&kiq->ring_lock, *flags);
+}
+
+void amdgpu_gfx_kiq_consume(struct amdgpu_kiq *kiq, uint32_t *offs)
+{
+	atomic64_dec(&kiq->submit_avail);
+	if (offs)
+		amdgpu_device_wb_get(kiq->ring.adev, offs);
+
+}
+
+void amdgpu_gfx_kiq_restore(struct amdgpu_kiq *kiq, uint32_t *offs)
+{
+	atomic64_inc(&kiq->submit_avail);
+	if (offs)
+		amdgpu_device_wb_free(kiq->ring.adev, *offs);
+}
+
 int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 			     struct amdgpu_ring *ring,
 			     struct amdgpu_irq_src *irq)
@@ -304,10 +341,6 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 
 	spin_lock_init(&kiq->ring_lock);
 
-	r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
-	if (r)
-		return r;
-
 	ring->adev = NULL;
 	ring->ring_obj = NULL;
 	ring->use_doorbell = true;
@@ -325,13 +358,15 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
 			     AMDGPU_RING_PRIO_DEFAULT);
 	if (r)
 		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
+	else
+		atomic64_set(&kiq->submit_avail, ring->ring_size / 4 /
+			     (ring->funcs->align_mask + 1));
 
 	return r;
 }
 
 void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
 {
-	amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
 	amdgpu_ring_fini(ring);
 }
 
@@ -671,19 +706,25 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
 uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 {
 	signed long r, cnt = 0;
-	unsigned long flags;
-	uint32_t seq;
+	unsigned long flags = 0;
+	uint32_t seq, reg_val_offs = 0, value = 0;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, true, &flags);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto kiq_read_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
 	amdgpu_ring_alloc(ring, 32);
-	amdgpu_ring_emit_rreg(ring, reg);
+	amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq, &flags);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -707,9 +748,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return adev->wb.wb[kiq->reg_val_offs];
+	mb();
+	value = adev->wb.wb[reg_val_offs];
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+	return value;
 
 failed_kiq_read:
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+kiq_read_exit:
 	pr_err("failed to read reg:%x\n", reg);
 	return ~0;
 }
@@ -717,19 +763,25 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 {
 	signed long r, cnt = 0;
-	unsigned long flags;
+	unsigned long flags = 0;
 	uint32_t seq;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
 
 	BUG_ON(!ring->funcs->emit_wreg);
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto kiq_write_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_emit_wreg(ring, reg, v);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq, &flags);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -754,8 +806,11 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_write;
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 	return;
 
 failed_kiq_write:
+	amdgpu_gfx_kiq_restore(kiq, NULL);
+kiq_write_exit:
 	pr_err("failed to write reg:%x\n", reg);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 634746829024..ff7597ca6cad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -96,6 +96,7 @@ struct kiq_pm4_funcs {
 	int invalidate_tlbs_size;
 };
 
+#define MAX_KIQ_RREG_NUM 64
 struct amdgpu_kiq {
 	u64			eop_gpu_addr;
 	struct amdgpu_bo	*eop_obj;
@@ -103,7 +104,7 @@ struct amdgpu_kiq {
 	struct amdgpu_ring	ring;
 	struct amdgpu_irq_src	irq;
 	const struct kiq_pm4_funcs *pmf;
-	uint32_t			reg_val_offs;
+	atomic64_t		submit_avail;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index f61664ee4940..137d3d2b46e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -181,7 +181,8 @@ struct amdgpu_ring_funcs {
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
 	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
-	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
+	void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
+			  uint32_t reg_val_offs);
 	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
 	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
 			      uint32_t val, uint32_t mask);
@@ -265,7 +266,7 @@ struct amdgpu_ring {
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
-#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
+#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), (o))
 #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
 #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
 #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 8c10084f44ef..a4c3f284691c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -56,13 +56,19 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	unsigned long flags;
 	uint32_t seq;
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto failed_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, NULL);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
 					    ref, mask);
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq, &flags);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -80,9 +86,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq;
 
+	amdgpu_gfx_kiq_restore(kiq, NULL);
 	return;
 
 failed_kiq:
+	amdgpu_gfx_kiq_restore(kiq, NULL);
+failed_exit:
 	pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 0a03e2ad5d95..7853dbc3c8bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -7594,10 +7594,10 @@ static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
 	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
 }
 
-static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				     uint32_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
@@ -7606,9 +7606,9 @@ static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 }
 
 static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index fc6c2f2bc76c..bdbd92d4fe45 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6383,10 +6383,10 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
 }
 
-static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint32_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
@@ -6395,9 +6395,9 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 }
 
 static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 84fcf842316d..293219452c0f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4042,14 +4042,21 @@ static int gfx_v9_0_soft_reset(void *handle)
 static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 {
 	signed long r, cnt = 0;
-	unsigned long flags;
-	uint32_t seq;
+	unsigned long flags = 0;
+	uint32_t seq, reg_val_offs = 0;
+	uint64_t value = 0;
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
 
 	BUG_ON(!ring->funcs->emit_rreg);
 
-	spin_lock_irqsave(&kiq->ring_lock, flags);
+	r = amdgpu_gfx_kiq_lock(kiq, true, &flags);
+	if (r) {
+		pr_err("failed to lock kiq\n");
+		goto failed_kiq_exit;
+	}
+
+	amdgpu_gfx_kiq_consume(kiq, &reg_val_offs);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 9 |	/* src: register*/
@@ -4059,12 +4066,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_fence_emit_polling(ring, &seq);
 	amdgpu_ring_commit(ring);
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	amdgpu_gfx_kiq_unlock(kiq, &flags);
 
 	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
 
@@ -4088,10 +4095,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
 	if (cnt > MAX_KIQ_REG_TRY)
 		goto failed_kiq_read;
 
-	return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
-		(uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
+	value = (uint64_t)adev->wb.wb[reg_val_offs] |
+		(uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
 
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+	return value;
 failed_kiq_read:
+	amdgpu_gfx_kiq_restore(kiq, &reg_val_offs);
+failed_kiq_exit:
 	pr_err("failed to read gpu clock\n");
 	return ~0;
 }
@@ -5482,10 +5493,10 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 		ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
 }
 
-static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+				    uint32_t reg_val_offs)
 {
 	struct amdgpu_device *adev = ring->adev;
-	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
 	amdgpu_ring_write(ring, 0 |	/* src: register*/
@@ -5494,9 +5505,9 @@ static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 	amdgpu_ring_write(ring, reg);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 	amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-				kiq->reg_val_offs * 4));
+				reg_val_offs * 4));
 }
 
 static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 30b75d79efdb..ab960b052c0d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -415,6 +415,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 {
 	int vmid, i;
 	signed long r;
+	unsigned long flags = 0;
 	uint32_t seq;
 	uint16_t queried_pasid;
 	bool ret;
@@ -422,20 +423,28 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
 	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
-		spin_lock(&adev->gfx.kiq.ring_lock);
+		r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
+		if (r) {
+			pr_err("failed to lock kiq\n");
+			return -ETIME;
+		}
+
 		/* 2 dwords flush + 8 dwords fence */
 		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
+		amdgpu_gfx_kiq_consume(kiq, NULL);
 		kiq->pmf->kiq_invalidate_tlbs(ring,
 					pasid, flush_type, all_hub);
 		amdgpu_fence_emit_polling(ring, &seq);
 		amdgpu_ring_commit(ring);
-		spin_unlock(&adev->gfx.kiq.ring_lock);
+		amdgpu_gfx_kiq_unlock(kiq, &flags);
 		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
+			amdgpu_gfx_kiq_restore(kiq, NULL);
 			return -ETIME;
 		}
 
+		amdgpu_gfx_kiq_restore(kiq, NULL);
 		return 0;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index fecdbc471983..d14fd56a959e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -589,6 +589,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 {
 	int vmid, i;
 	signed long r;
+	unsigned long flags = 0;
 	uint32_t seq;
 	uint16_t queried_pasid;
 	bool ret;
@@ -613,7 +614,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 		if (vega20_xgmi_wa)
 			ndw += kiq->pmf->invalidate_tlbs_size;
 
-		spin_lock(&adev->gfx.kiq.ring_lock);
+		r = amdgpu_gfx_kiq_lock(kiq, false, &flags);
+		if (r) {
+			pr_err("failed to lock kiq\n");
+			return -ETIME;
+		}
+
+		amdgpu_gfx_kiq_consume(kiq, NULL);
 		/* 2 dwords flush + 8 dwords fence */
 		amdgpu_ring_alloc(ring, ndw);
 		if (vega20_xgmi_wa)
@@ -623,13 +630,15 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 					pasid, flush_type, all_hub);
 		amdgpu_fence_emit_polling(ring, &seq);
 		amdgpu_ring_commit(ring);
-		spin_unlock(&adev->gfx.kiq.ring_lock);
+		amdgpu_gfx_kiq_unlock(kiq, &flags);
 		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			DRM_ERROR("wait for kiq fence error: %ld.\n", r);
+			amdgpu_gfx_kiq_restore(kiq, NULL);
 			return -ETIME;
 		}
 
+		amdgpu_gfx_kiq_restore(kiq, NULL);
 		return 0;
 	}
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2020-04-22 12:40 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-22  4:42 [PATCH] drm/amdgpu: refine kiq access register Yintian Tao
2020-04-22  7:11 ` Tao, Yintian
2020-04-22  7:23   ` Christian König
2020-04-22  7:35     ` Tao, Yintian
2020-04-22  7:40       ` Christian König
2020-04-22  7:49         ` Tao, Yintian
2020-04-22  7:53           ` Christian König
2020-04-22  8:06             ` Tao, Yintian
2020-04-22  8:23               ` Christian König
2020-04-22  8:34                 ` Tao, Yintian
2020-04-22  9:29 Yintian Tao
2020-04-22 11:02 ` Christian König
2020-04-22 11:49   ` Tao, Yintian
2020-04-22 11:57     ` Christian König
2020-04-22 12:20       ` Tao, Yintian
2020-04-22 12:33         ` Christian König
2020-04-22 12:40           ` Tao, Yintian

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.