All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 1/3] drm/amdgpu: Fetch MES scheduler/KIQ versions
@ 2022-06-13 15:19 Graham Sider
  2022-06-13 15:20 ` [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription Graham Sider
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Graham Sider @ 2022-06-13 15:19 UTC (permalink / raw)
  To: amd-gfx; +Cc: Mukul.Joshi, Felix.Kuehling, Graham Sider, Philip.Yang

Store MES scheduler and MES KIQ version numbers in amdgpu_mes for GFX11.

Signed-off-by: Graham Sider <Graham.Sider@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  3 +++
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 92ddee5e33db..aa06c8396ee0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -64,6 +64,9 @@ struct amdgpu_mes {
 
 	spinlock_t                      queue_id_lock;
 
+	uint32_t			sched_version;
+	uint32_t			kiq_version;
+
 	uint32_t                        total_max_queue;
 	uint32_t                        doorbell_id_offset;
 	uint32_t                        max_doorbell_slices;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index e4eb87689f7f..2a9ef308e71c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -890,6 +890,18 @@ static int mes_v11_0_queue_init(struct amdgpu_device *adev,
 		mes_v11_0_queue_init_register(ring);
 	}
 
+	/* get MES scheduler/KIQ versions */
+	mutex_lock(&adev->srbm_mutex);
+	soc21_grbm_select(adev, 3, pipe, 0, 0);
+
+	if (pipe == AMDGPU_MES_SCHED_PIPE)
+		adev->mes.sched_version = RREG32_SOC15(GC, 0, regCP_MES_GP3_LO);
+	else if (pipe == AMDGPU_MES_KIQ_PIPE && adev->enable_mes_kiq)
+		adev->mes.kiq_version = RREG32_SOC15(GC, 0, regCP_MES_GP3_LO);
+
+	soc21_grbm_select(adev, 0, 0, 0, 0);
+	mutex_unlock(&adev->srbm_mutex);
+
 	return 0;
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription
  2022-06-13 15:19 [PATCH v3 1/3] drm/amdgpu: Fetch MES scheduler/KIQ versions Graham Sider
@ 2022-06-13 15:20 ` Graham Sider
  2022-06-14 18:22   ` philip yang
  2022-06-15  7:28   ` Christian König
  2022-06-13 15:20 ` [PATCH v3 3/3] drm/amdgpu: Update mes_v11_api_def.h Graham Sider
  2022-06-14  8:25 ` [PATCH v3 1/3] drm/amdgpu: Fetch MES scheduler/KIQ versions Xiao, Jack
  2 siblings, 2 replies; 13+ messages in thread
From: Graham Sider @ 2022-06-13 15:20 UTC (permalink / raw)
  To: amd-gfx; +Cc: Mukul.Joshi, Felix.Kuehling, Graham Sider, Philip.Yang

Starting with GFX11, MES requires wptr BOs to be GTT allocated/mapped to
GART for usermode queues in order to support oversubscription. In the
case that work is submitted to an unmapped queue, MES must have a GART
wptr address to determine whether the queue should be mapped.

This change is accompanied with changes in MES and is applicable for
MES_VERSION >= 3.

v2:
- Update MES_VERSION check from 2 to 3.
v3:
- Use amdgpu_vm_bo_lookup_mapping for wptr_bo mapping lookup
- Move wptr_bo refcount increment to amdgpu_amdkfd_map_gtt_bo_to_gart
- Remove list_del_init from amdgpu_amdkfd_map_gtt_bo_to_gart
- Cleanup/fix create_queue wptr_bo error handling

Signed-off-by: Graham Sider <Graham.Sider@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 49 +++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 37 +++++++++++++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  9 +++-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  3 ++
 .../amd/amdkfd/kfd_process_queue_manager.c    | 17 +++++--
 7 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 429b16ba10bf..dba26d1e3be9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -301,6 +301,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device *adev,
 		struct kgd_mem *mem, void **kptr, uint64_t *size);
 void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct amdgpu_device *adev,
 		struct kgd_mem *mem);
+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo);
 
 int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
 					    struct dma_fence **ef);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index efab923056f4..888d08128a94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2030,6 +2030,55 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
 	return ret;
 }
 
+/**
+ * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count
+ * @adev: Device to which allocated BO belongs
+ * @bo: Buffer object to be mapped
+ *
+ * Before return, bo reference count is incremented. To release the reference and unpin/
+ * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
+ */
+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo)
+{
+	int ret;
+
+	ret = amdgpu_bo_reserve(bo, true);
+	if (ret) {
+		pr_err("Failed to reserve bo. ret %d\n", ret);
+		goto err_reserve_bo_failed;
+	}
+
+	ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
+	if (ret) {
+		pr_err("Failed to pin bo. ret %d\n", ret);
+		goto err_pin_bo_failed;
+	}
+
+	ret = amdgpu_ttm_alloc_gart(&bo->tbo);
+	if (ret) {
+		pr_err("Failed to bind bo to GART. ret %d\n", ret);
+		goto err_map_bo_gart_failed;
+	}
+
+	amdgpu_amdkfd_remove_eviction_fence(
+		bo, bo->kfd_bo->process_info->eviction_fence);
+	list_del_init(&bo->kfd_bo->validate_list.head);
+
+	amdgpu_bo_unreserve(bo);
+
+	bo = amdgpu_bo_ref(bo);
+
+	return 0;
+
+err_map_bo_gart_failed:
+	amdgpu_bo_unpin(bo);
+err_pin_bo_failed:
+	amdgpu_bo_unreserve(bo);
+err_reserve_bo_failed:
+
+	return ret;
+}
+
 int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device *adev,
 		struct kgd_mem *mem, void **kptr, uint64_t *size)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e9766e165c38..1789ed8b79f5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -289,6 +289,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 	struct kfd_process_device *pdd;
 	struct queue_properties q_properties;
 	uint32_t doorbell_offset_in_process = 0;
+	struct amdgpu_bo *wptr_bo = NULL;
 
 	memset(&q_properties, 0, sizeof(struct queue_properties));
 
@@ -316,12 +317,41 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 		goto err_bind_process;
 	}
 
+	/* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work
+	 * on unmapped queues for usermode queue oversubscription (no aggregated doorbell)
+	 */
+	if (dev->shared_resources.enable_mes && (dev->adev->mes.sched_version & 0xff) >= 3) {
+		struct amdgpu_bo_va_mapping *wptr_mapping;
+		struct amdgpu_vm *wptr_vm;
+
+		wptr_vm = drm_priv_to_vm(pdd->drm_priv);
+		err = amdgpu_bo_reserve(wptr_vm->root.bo, false);
+		if (err)
+			goto err_wptr_map_gart;
+
+		wptr_mapping = amdgpu_vm_bo_lookup_mapping(
+				wptr_vm, args->write_pointer_address >> PAGE_SHIFT);
+		amdgpu_bo_unreserve(wptr_vm->root.bo);
+		if (!wptr_mapping) {
+			pr_err("Failed to lookup wptr bo\n");
+			err = -EINVAL;
+			goto err_wptr_map_gart;
+		}
+
+		wptr_bo = wptr_mapping->bo_va->base.bo;
+		err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev, wptr_bo);
+		if (err) {
+			pr_err("Failed to map wptr bo to GART\n");
+			goto err_wptr_map_gart;
+		}
+	}
+
 	pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
 			p->pasid,
 			dev->id);
 
-	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL, NULL, NULL,
-			&doorbell_offset_in_process);
+	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, wptr_bo,
+			NULL, NULL, NULL, &doorbell_offset_in_process);
 	if (err != 0)
 		goto err_create_queue;
 
@@ -354,6 +384,9 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 	return 0;
 
 err_create_queue:
+	if (wptr_bo)
+		amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
+err_wptr_map_gart:
 err_bind_process:
 err_pdd:
 	mutex_unlock(&p->mutex);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index b39d89c52887..d8de2fbdfc7d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -208,6 +208,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
 	struct mes_add_queue_input queue_input;
 	int r, queue_type;
+	uint64_t wptr_addr_off;
 
 	if (dqm->is_hws_hang)
 		return -EIO;
@@ -227,7 +228,13 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 					AMDGPU_MES_PRIORITY_LEVEL_NORMAL;
 	queue_input.doorbell_offset = q->properties.doorbell_off;
 	queue_input.mqd_addr = q->gart_mqd_addr;
-	queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
+
+	if (q->wptr_bo) {
+		wptr_addr_off = (uint64_t)q->properties.write_ptr - (uint64_t)q->wptr_bo->kfd_bo->va;
+		queue_input.wptr_addr = ((uint64_t)q->wptr_bo->tbo.resource->start << PAGE_SHIFT) + wptr_addr_off;
+	} else
+		queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
+
 	queue_input.paging = false;
 	queue_input.tba_addr = qpd->tba_addr;
 	queue_input.tma_addr = qpd->tma_addr;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
index f1654b4da856..35e74bdd81da 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
@@ -377,6 +377,8 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
 	m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
 	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
 	m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+	m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
+	m->sdmax_rlcx_rb_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
 	m->sdmax_rlcx_doorbell_offset =
 		q->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a5d3963537d7..dcddee0d6f06 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -639,6 +639,8 @@ struct queue {
 	void *gang_ctx_bo;
 	uint64_t gang_ctx_gpu_addr;
 	void *gang_ctx_cpu_ptr;
+
+	struct amdgpu_bo *wptr_bo;
 };
 
 enum KFD_MQD_TYPE {
@@ -1404,6 +1406,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 			    struct file *f,
 			    struct queue_properties *properties,
 			    unsigned int *qid,
+			    struct amdgpu_bo *wptr_bo,
 			    const struct kfd_criu_queue_priv_data *q_data,
 			    const void *restore_mqd,
 			    const void *restore_ctl_stack,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index f99e09dc43ea..3a17c1ebc527 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -190,7 +190,8 @@ void pqm_uninit(struct process_queue_manager *pqm)
 static int init_user_queue(struct process_queue_manager *pqm,
 				struct kfd_dev *dev, struct queue **q,
 				struct queue_properties *q_properties,
-				struct file *f, unsigned int qid)
+				struct file *f, struct amdgpu_bo *wptr_bo,
+				unsigned int qid)
 {
 	int retval;
 
@@ -221,6 +222,7 @@ static int init_user_queue(struct process_queue_manager *pqm,
 			goto cleanup;
 		}
 		memset((*q)->gang_ctx_cpu_ptr, 0, AMDGPU_MES_GANG_CTX_SIZE);
+		(*q)->wptr_bo = wptr_bo;
 	}
 
 	pr_debug("PQM After init queue");
@@ -237,6 +239,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 			    struct file *f,
 			    struct queue_properties *properties,
 			    unsigned int *qid,
+			    struct amdgpu_bo *wptr_bo,
 			    const struct kfd_criu_queue_priv_data *q_data,
 			    const void *restore_mqd,
 			    const void *restore_ctl_stack,
@@ -299,7 +302,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 		 * allocate_sdma_queue() in create_queue() has the
 		 * corresponding check logic.
 		 */
-		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
+		retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid);
 		if (retval != 0)
 			goto err_create_queue;
 		pqn->q = q;
@@ -320,7 +323,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 			goto err_create_queue;
 		}
 
-		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
+		retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid);
 		if (retval != 0)
 			goto err_create_queue;
 		pqn->q = q;
@@ -457,9 +460,13 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 			pdd->qpd.num_gws = 0;
 		}
 
-		if (dev->shared_resources.enable_mes)
+		if (dev->shared_resources.enable_mes) {
 			amdgpu_amdkfd_free_gtt_mem(dev->adev,
 						   pqn->q->gang_ctx_bo);
+			if (pqn->q->wptr_bo)
+				amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->wptr_bo);
+
+		}
 		uninit_queue(pqn->q);
 	}
 
@@ -900,7 +907,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
 
 	print_queue_properties(&qp);
 
-	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, q_data, mqd, ctl_stack,
+	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, NULL, q_data, mqd, ctl_stack,
 				NULL);
 	if (ret) {
 		pr_err("Failed to create new queue err:%d\n", ret);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v3 3/3] drm/amdgpu: Update mes_v11_api_def.h
  2022-06-13 15:19 [PATCH v3 1/3] drm/amdgpu: Fetch MES scheduler/KIQ versions Graham Sider
  2022-06-13 15:20 ` [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription Graham Sider
@ 2022-06-13 15:20 ` Graham Sider
  2022-06-14  8:29   ` Xiao, Jack
  2022-06-14  8:25 ` [PATCH v3 1/3] drm/amdgpu: Fetch MES scheduler/KIQ versions Xiao, Jack
  2 siblings, 1 reply; 13+ messages in thread
From: Graham Sider @ 2022-06-13 15:20 UTC (permalink / raw)
  To: amd-gfx; +Cc: Mukul.Joshi, Felix.Kuehling, Graham Sider, Philip.Yang

Update MES API to support oversubscription without aggregated doorbell
for usermode queues.

Signed-off-by: Graham Sider <Graham.Sider@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c               | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h               | 1 +
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c                | 3 +++
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 ++
 drivers/gpu/drm/amd/include/mes_v11_api_def.h         | 4 +++-
 5 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 2e86baa32c55..3d9a81a8fa1c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -681,6 +681,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
 	queue_input.wptr_addr = qprops->wptr_gpu_addr;
 	queue_input.queue_type = qprops->queue_type;
 	queue_input.paging = qprops->paging;
+	queue_input.oversubscription_no_aggregated_en = 0;
 
 	r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
 	if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index aa06c8396ee0..26765a9946a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -207,6 +207,7 @@ struct mes_add_queue_input {
 	uint32_t        debug_vmid;
 	uint64_t	tba_addr;
 	uint64_t	tma_addr;
+	uint64_t	oversubscription_no_aggregated_en;
 };
 
 struct mes_remove_queue_input {
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 2a9ef308e71c..95a1394d3943 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -163,6 +163,8 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
 	mes_add_queue_pkt.gws_size = input->gws_size;
 	mes_add_queue_pkt.trap_handler_addr = input->tba_addr;
 	mes_add_queue_pkt.tma_addr = input->tma_addr;
+	mes_add_queue_pkt.oversubscription_no_aggregated_en =
+		input->oversubscription_no_aggregated_en;
 
 	mes_add_queue_pkt.api_status.api_completion_fence_addr =
 		mes->ring.fence_drv.gpu_addr;
@@ -341,6 +343,7 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
 	mes_set_hw_res_pkt.disable_reset = 1;
 	mes_set_hw_res_pkt.disable_mes_log = 1;
 	mes_set_hw_res_pkt.use_different_vmid_compute = 1;
+	mes_set_hw_res_pkt.oversubscription_timer = 50;
 
 	mes_set_hw_res_pkt.api_status.api_completion_fence_addr =
 		mes->ring.fence_drv.gpu_addr;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index d8de2fbdfc7d..762bc6059387 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -235,6 +235,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	} else
 		queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
 
+	queue_input.oversubscription_no_aggregated_en = 1;
+
 	queue_input.paging = false;
 	queue_input.tba_addr = qpd->tba_addr;
 	queue_input.tma_addr = qpd->tma_addr;
diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
index f9d02d7bdf77..95f0246eb045 100644
--- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
+++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
@@ -226,6 +226,7 @@ union MESAPI_SET_HW_RESOURCES {
 			};
 			uint32_t	uint32_t_all;
 		};
+		uint32_t	oversubscription_timer;
 	};
 
 	uint32_t	max_dwords_in_api[API_FRAME_SIZE_IN_DWORDS];
@@ -265,7 +266,8 @@ union MESAPI__ADD_QUEUE {
 			uint32_t is_gang_suspended	: 1;
 			uint32_t is_tmz_queue		: 1;
 			uint32_t map_kiq_utility_queue  : 1;
-			uint32_t reserved		: 23;
+			uint32_t oversubscription_no_aggregated_en : 1;
+			uint32_t reserved		: 22;
 		};
 		struct MES_API_STATUS		api_status;
 		uint64_t                        tma_addr;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 1/3] drm/amdgpu: Fetch MES scheduler/KIQ versions
  2022-06-13 15:19 [PATCH v3 1/3] drm/amdgpu: Fetch MES scheduler/KIQ versions Graham Sider
  2022-06-13 15:20 ` [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription Graham Sider
  2022-06-13 15:20 ` [PATCH v3 3/3] drm/amdgpu: Update mes_v11_api_def.h Graham Sider
@ 2022-06-14  8:25 ` Xiao, Jack
  2 siblings, 0 replies; 13+ messages in thread
From: Xiao, Jack @ 2022-06-14  8:25 UTC (permalink / raw)
  To: Sider, Graham, amd-gfx; +Cc: Joshi, Mukul, Kuehling, Felix, Yang, Philip

[-- Attachment #1: Type: text/plain, Size: 2403 bytes --]

[AMD Official Use Only - General]

Reviewed-by: Jack Xiao <Jack.Xiao@amd.com>

Regards,
Jack
________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Graham Sider <Graham.Sider@amd.com>
Sent: Monday, 13 June 2022 23:19
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Joshi, Mukul <Mukul.Joshi@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Sider, Graham <Graham.Sider@amd.com>; Yang, Philip <Philip.Yang@amd.com>
Subject: [PATCH v3 1/3] drm/amdgpu: Fetch MES scheduler/KIQ versions

Store MES scheduler and MES KIQ version numbers in amdgpu_mes for GFX11.

Signed-off-by: Graham Sider <Graham.Sider@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  3 +++
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 92ddee5e33db..aa06c8396ee0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -64,6 +64,9 @@ struct amdgpu_mes {

         spinlock_t                      queue_id_lock;

+       uint32_t                        sched_version;
+       uint32_t                        kiq_version;
+
         uint32_t                        total_max_queue;
         uint32_t                        doorbell_id_offset;
         uint32_t                        max_doorbell_slices;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index e4eb87689f7f..2a9ef308e71c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -890,6 +890,18 @@ static int mes_v11_0_queue_init(struct amdgpu_device *adev,
                 mes_v11_0_queue_init_register(ring);
         }

+       /* get MES scheduler/KIQ versions */
+       mutex_lock(&adev->srbm_mutex);
+       soc21_grbm_select(adev, 3, pipe, 0, 0);
+
+       if (pipe == AMDGPU_MES_SCHED_PIPE)
+               adev->mes.sched_version = RREG32_SOC15(GC, 0, regCP_MES_GP3_LO);
+       else if (pipe == AMDGPU_MES_KIQ_PIPE && adev->enable_mes_kiq)
+               adev->mes.kiq_version = RREG32_SOC15(GC, 0, regCP_MES_GP3_LO);
+
+       soc21_grbm_select(adev, 0, 0, 0, 0);
+       mutex_unlock(&adev->srbm_mutex);
+
         return 0;
 }

--
2.25.1


[-- Attachment #2: Type: text/html, Size: 5405 bytes --]

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 3/3] drm/amdgpu: Update mes_v11_api_def.h
  2022-06-13 15:20 ` [PATCH v3 3/3] drm/amdgpu: Update mes_v11_api_def.h Graham Sider
@ 2022-06-14  8:29   ` Xiao, Jack
  0 siblings, 0 replies; 13+ messages in thread
From: Xiao, Jack @ 2022-06-14  8:29 UTC (permalink / raw)
  To: Sider, Graham, amd-gfx; +Cc: Joshi, Mukul, Kuehling, Felix, Yang, Philip

[-- Attachment #1: Type: text/plain, Size: 5324 bytes --]

[AMD Official Use Only - General]

>> +       uint64_t        oversubscription_no_aggregated_en;

uint64_t is unnecessary here; a 32-bit or bool type would be better.

With that fixed, the patch is Reviewed-by: Jack Xiao <Jack.Xiao@amd.com>

Regards,
Jack
________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Graham Sider <Graham.Sider@amd.com>
Sent: Monday, 13 June 2022 23:20
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Joshi, Mukul <Mukul.Joshi@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Sider, Graham <Graham.Sider@amd.com>; Yang, Philip <Philip.Yang@amd.com>
Subject: [PATCH v3 3/3] drm/amdgpu: Update mes_v11_api_def.h

Update MES API to support oversubscription without aggregated doorbell
for usermode queues.

Signed-off-by: Graham Sider <Graham.Sider@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c               | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h               | 1 +
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c                | 3 +++
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 ++
 drivers/gpu/drm/amd/include/mes_v11_api_def.h         | 4 +++-
 5 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 2e86baa32c55..3d9a81a8fa1c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -681,6 +681,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
         queue_input.wptr_addr = qprops->wptr_gpu_addr;
         queue_input.queue_type = qprops->queue_type;
         queue_input.paging = qprops->paging;
+       queue_input.oversubscription_no_aggregated_en = 0;

         r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
         if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index aa06c8396ee0..26765a9946a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -207,6 +207,7 @@ struct mes_add_queue_input {
         uint32_t        debug_vmid;
         uint64_t        tba_addr;
         uint64_t        tma_addr;
+       uint64_t        oversubscription_no_aggregated_en;
 };

 struct mes_remove_queue_input {
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 2a9ef308e71c..95a1394d3943 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -163,6 +163,8 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
         mes_add_queue_pkt.gws_size = input->gws_size;
         mes_add_queue_pkt.trap_handler_addr = input->tba_addr;
         mes_add_queue_pkt.tma_addr = input->tma_addr;
+       mes_add_queue_pkt.oversubscription_no_aggregated_en =
+               input->oversubscription_no_aggregated_en;

         mes_add_queue_pkt.api_status.api_completion_fence_addr =
                 mes->ring.fence_drv.gpu_addr;
@@ -341,6 +343,7 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
         mes_set_hw_res_pkt.disable_reset = 1;
         mes_set_hw_res_pkt.disable_mes_log = 1;
         mes_set_hw_res_pkt.use_different_vmid_compute = 1;
+       mes_set_hw_res_pkt.oversubscription_timer = 50;

         mes_set_hw_res_pkt.api_status.api_completion_fence_addr =
                 mes->ring.fence_drv.gpu_addr;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index d8de2fbdfc7d..762bc6059387 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -235,6 +235,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
         } else
                 queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;

+       queue_input.oversubscription_no_aggregated_en = 1;
+
         queue_input.paging = false;
         queue_input.tba_addr = qpd->tba_addr;
         queue_input.tma_addr = qpd->tma_addr;
diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
index f9d02d7bdf77..95f0246eb045 100644
--- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
+++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
@@ -226,6 +226,7 @@ union MESAPI_SET_HW_RESOURCES {
                         };
                         uint32_t        uint32_t_all;
                 };
+               uint32_t        oversubscription_timer;
         };

         uint32_t        max_dwords_in_api[API_FRAME_SIZE_IN_DWORDS];
@@ -265,7 +266,8 @@ union MESAPI__ADD_QUEUE {
                         uint32_t is_gang_suspended      : 1;
                         uint32_t is_tmz_queue           : 1;
                         uint32_t map_kiq_utility_queue  : 1;
-                       uint32_t reserved               : 23;
+                       uint32_t oversubscription_no_aggregated_en : 1;
+                       uint32_t reserved               : 22;
                 };
                 struct MES_API_STATUS           api_status;
                 uint64_t                        tma_addr;
--
2.25.1


[-- Attachment #2: Type: text/html, Size: 11532 bytes --]

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription
  2022-06-13 15:20 ` [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription Graham Sider
@ 2022-06-14 18:22   ` philip yang
  2022-06-14 19:46     ` Sider, Graham
  2022-06-15  7:28   ` Christian König
  1 sibling, 1 reply; 13+ messages in thread
From: philip yang @ 2022-06-14 18:22 UTC (permalink / raw)
  To: Graham Sider, amd-gfx; +Cc: Mukul.Joshi, Felix.Kuehling, Philip.Yang

[-- Attachment #1: Type: text/html, Size: 14848 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription
  2022-06-14 18:22   ` philip yang
@ 2022-06-14 19:46     ` Sider, Graham
  0 siblings, 0 replies; 13+ messages in thread
From: Sider, Graham @ 2022-06-14 19:46 UTC (permalink / raw)
  To: Yang, Philip, amd-gfx; +Cc: Joshi, Mukul, Kuehling, Felix

[AMD Official Use Only - General]

>> From: Yang, Philip <Philip.Yang@amd.com> 
>> Sent: Tuesday, June 14, 2022 2:22 PM
>> To: Sider, Graham <Graham.Sider@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Joshi, Mukul <Mukul.Joshi@amd.com>; Yang, Philip <Philip.Yang@amd.com>
>> Subject: Re: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription
>>
>>
>> On 2022-06-13 11:20, Graham Sider wrote:
>> Starting with GFX11, MES requires wptr BOs to be GTT allocated/mapped to
>> GART for usermode queues in order to support oversubscription. In the
>> case that work is submitted to an unmapped queue, MES must have a GART
>> wptr address to determine whether the queue should be mapped.
>>
>> This change is accompanied with changes in MES and is applicable for
>> MES_VERSION >= 3.
>>
>> v2:
>> - Update MES_VERSION check from 2 to 3.
>> v3:
>> - Use amdgpu_vm_bo_lookup_mapping for wptr_bo mapping lookup
>> - Move wptr_bo refcount increment to amdgpu_amdkfd_map_gtt_bo_to_gart
>> - Remove list_del_init from amdgpu_amdkfd_map_gtt_bo_to_gart
>> - Cleanup/fix create_queue wptr_bo error handling
>> Two nit-picks below; with those fixed, this patch is
>> Reviewed-by: Philip Yang <Philip.Yang@amd.com>
>>
>> Signed-off-by: Graham Sider <Graham.Sider@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 49 +++++++++++++++++++
>>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 37 +++++++++++++-
>> .../drm/amd/amdkfd/kfd_device_queue_manager.c |  9 +++-
>> .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  2 +
>>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  3 ++
>> .../amd/amdkfd/kfd_process_queue_manager.c    | 17 +++++--
>>  7 files changed, 110 insertions(+), 8 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 429b16ba10bf..dba26d1e3be9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -301,6 +301,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device *adev,
>>  		struct kgd_mem *mem, void **kptr, uint64_t *size);
>>  void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct amdgpu_device *adev,
>>  		struct kgd_mem *mem);
>> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo);
>>  
>>  int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
>>  					    struct dma_fence **ef);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index efab923056f4..888d08128a94 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -2030,6 +2030,55 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
>>  	return ret;
>>  }
>>  
>> +/**
>> + * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count
>> + * @adev: Device to which allocated BO belongs
>> + * @bo: Buffer object to be mapped
>> + *
>> + * Before return, bo reference count is incremented. To release the reference and unpin/
>> + * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
>> + */
>> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo)
>> +{
>> +	int ret;
>> +
>> +	ret = amdgpu_bo_reserve(bo, true);
>> +	if (ret) {
>> +		pr_err("Failed to reserve bo. ret %d\n", ret);
>> +		goto err_reserve_bo_failed;
>> +	}
>> +
>> +	ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
>> +	if (ret) {
>> +		pr_err("Failed to pin bo. ret %d\n", ret);
>> +		goto err_pin_bo_failed;
>> +	}
>> +
>> +	ret = amdgpu_ttm_alloc_gart(&bo->tbo);
>> +	if (ret) {
>> +		pr_err("Failed to bind bo to GART. ret %d\n", ret);
>> +		goto err_map_bo_gart_failed;
>> +	}
>> +
>> +	amdgpu_amdkfd_remove_eviction_fence(
>> +		bo, bo->kfd_bo->process_info->eviction_fence);
>> +	list_del_init(&bo->kfd_bo->validate_list.head);
>
> A pinned BO should be kept in validate_list, because the PDB/PTB may move and be updated; please remove list_del_init here.
>

Thought I deleted this line - good catch.

>> +
>> +	amdgpu_bo_unreserve(bo);
>> +
>> +	bo = amdgpu_bo_ref(bo);
>> +
>> +	return 0;
>> +
>> +err_map_bo_gart_failed:
>> +	amdgpu_bo_unpin(bo);
>> +err_pin_bo_failed:
>> +	amdgpu_bo_unreserve(bo);
>> +err_reserve_bo_failed:
>> +
>> +	return ret;
>> +}
>> +
>>  int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device *adev,
>>  		struct kgd_mem *mem, void **kptr, uint64_t *size)
>>  {
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index e9766e165c38..1789ed8b79f5 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -289,6 +289,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>>  	struct kfd_process_device *pdd;
>>  	struct queue_properties q_properties;
>>  	uint32_t doorbell_offset_in_process = 0;
>> +	struct amdgpu_bo *wptr_bo = NULL;
>>  
>>  	memset(&q_properties, 0, sizeof(struct queue_properties));
>>  
>> @@ -316,12 +317,41 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>>  		goto err_bind_process;
>>  	}
>>  
>> +	/* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work
>> +	 * on unmapped queues for usermode queue oversubscription (no aggregated doorbell)
>> +	 */
>> +	if (dev->shared_resources.enable_mes && (dev->adev->mes.sched_version & 0xff) >= 3) {
>
> Should we check the IP version so that this applies to GFX11 only? GFX10 also sets enable_mes, and may set adev->mes.sched_version later as well.
> Regards,
> Philip
>

I purposefully omitted setting sched/kiq_version for GFX10 in part for this reason. The MES version numbers for GFX11 don't build on top of GFX10, so it could get very confusing if we go back to set this for GFX10 now. With that in mind I think it's okay to base this check on the MES scheduler version without explicitly checking for GFX11+, but if it's preferred the other way, I'm also fine with that.

Best,
Graham

>> +		struct amdgpu_bo_va_mapping *wptr_mapping;
>> +		struct amdgpu_vm *wptr_vm;
>> +
>> +		wptr_vm = drm_priv_to_vm(pdd->drm_priv);
>> +		err = amdgpu_bo_reserve(wptr_vm->root.bo, false);
>> +		if (err)
>> +			goto err_wptr_map_gart;
>> +
>> +		wptr_mapping = amdgpu_vm_bo_lookup_mapping(
>> +				wptr_vm, args->write_pointer_address >> PAGE_SHIFT);
>> +		amdgpu_bo_unreserve(wptr_vm->root.bo);
>> +		if (!wptr_mapping) {
>> +			pr_err("Failed to lookup wptr bo\n");
>> +			err = -EINVAL;
>> +			goto err_wptr_map_gart;
>> +		}
>> +
>> +		wptr_bo = wptr_mapping->bo_va->base.bo;
>> +		err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev, wptr_bo);
>> +		if (err) {
>> +			pr_err("Failed to map wptr bo to GART\n");
>> +			goto err_wptr_map_gart;
>> +		}
>> +	}
>> +
>>  	pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
>>  			p->pasid,
>>  			dev->id);
>>  
>> -	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL, NULL, NULL,
>> -			&doorbell_offset_in_process);
>> +	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, wptr_bo,
>> +			NULL, NULL, NULL, &doorbell_offset_in_process);
>>  	if (err != 0)
>>  		goto err_create_queue;
>>  
>> @@ -354,6 +384,9 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>>  	return 0;
>>  
>>  err_create_queue:
>> +	if (wptr_bo)
>> +		amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
>> +err_wptr_map_gart:
>>  err_bind_process:
>>  err_pdd:
>>  	mutex_unlock(&p->mutex);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index b39d89c52887..d8de2fbdfc7d 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -208,6 +208,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>>  	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>>  	struct mes_add_queue_input queue_input;
>>  	int r, queue_type;
>> +	uint64_t wptr_addr_off;
>>  
>>  	if (dqm->is_hws_hang)
>>  		return -EIO;
>> @@ -227,7 +228,13 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>>  					AMDGPU_MES_PRIORITY_LEVEL_NORMAL;
>>  	queue_input.doorbell_offset = q->properties.doorbell_off;
>>  	queue_input.mqd_addr = q->gart_mqd_addr;
>> -	queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
>> +
>> +	if (q->wptr_bo) {
>> +		wptr_addr_off = (uint64_t)q->properties.write_ptr - (uint64_t)q->wptr_bo->kfd_bo->va;
>> +		queue_input.wptr_addr = ((uint64_t)q->wptr_bo->tbo.resource->start << PAGE_SHIFT) + wptr_addr_off;
>> +	} else
>> +		queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
>> +
>>  	queue_input.paging = false;
>>  	queue_input.tba_addr = qpd->tba_addr;
>>  	queue_input.tma_addr = qpd->tma_addr;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>> index f1654b4da856..35e74bdd81da 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>> @@ -377,6 +377,8 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
>>  	m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
>>  	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
>>  	m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
>> +	m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
>> +	m->sdmax_rlcx_rb_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
>>  	m->sdmax_rlcx_doorbell_offset =
>>  		q->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
>>  
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index a5d3963537d7..dcddee0d6f06 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -639,6 +639,8 @@ struct queue {
>>  	void *gang_ctx_bo;
>>  	uint64_t gang_ctx_gpu_addr;
>>  	void *gang_ctx_cpu_ptr;
>> +
>> +	struct amdgpu_bo *wptr_bo;
>>  };
>>  
>>  enum KFD_MQD_TYPE {
>> @@ -1404,6 +1406,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>>  			    struct file *f,
>>  			    struct queue_properties *properties,
>>  			    unsigned int *qid,
>> +			    struct amdgpu_bo *wptr_bo,
>>  			    const struct kfd_criu_queue_priv_data *q_data,
>>  			    const void *restore_mqd,
>>  			    const void *restore_ctl_stack,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index f99e09dc43ea..3a17c1ebc527 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -190,7 +190,8 @@ void pqm_uninit(struct process_queue_manager *pqm)
>>  static int init_user_queue(struct process_queue_manager *pqm,
>>  				struct kfd_dev *dev, struct queue **q,
>>  				struct queue_properties *q_properties,
>> -				struct file *f, unsigned int qid)
>> +				struct file *f, struct amdgpu_bo *wptr_bo,
>> +				unsigned int qid)
>>  {
>>  	int retval;
>>  
>> @@ -221,6 +222,7 @@ static int init_user_queue(struct process_queue_manager *pqm,
>>  			goto cleanup;
>>  		}
>>  		memset((*q)->gang_ctx_cpu_ptr, 0, AMDGPU_MES_GANG_CTX_SIZE);
>> +		(*q)->wptr_bo = wptr_bo;
>>  	}
>>  
>>  	pr_debug("PQM After init queue");
>> @@ -237,6 +239,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>>  			    struct file *f,
>>  			    struct queue_properties *properties,
>>  			    unsigned int *qid,
>> +			    struct amdgpu_bo *wptr_bo,
>>  			    const struct kfd_criu_queue_priv_data *q_data,
>>  			    const void *restore_mqd,
>>  			    const void *restore_ctl_stack,
>> @@ -299,7 +302,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>>  		 * allocate_sdma_queue() in create_queue() has the
>>  		 * corresponding check logic.
>>  		 */
>> -		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
>> +		retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid);
>>  		if (retval != 0)
>>  			goto err_create_queue;
>>  		pqn->q = q;
>> @@ -320,7 +323,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>>  			goto err_create_queue;
>>  		}
>>  
>> -		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
>> +		retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid);
>>  		if (retval != 0)
>>  			goto err_create_queue;
>>  		pqn->q = q;
>> @@ -457,9 +460,13 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>>  			pdd->qpd.num_gws = 0;
>>  		}
>>  
>> -		if (dev->shared_resources.enable_mes)
>> +		if (dev->shared_resources.enable_mes) {
>>  			amdgpu_amdkfd_free_gtt_mem(dev->adev,
>>  						   pqn->q->gang_ctx_bo);
>> +			if (pqn->q->wptr_bo)
>> +				amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->wptr_bo);
>> +
>> +		}
>>  		uninit_queue(pqn->q);
>>  	}
>>  
>> @@ -900,7 +907,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
>>  
>>  	print_queue_properties(&qp);
>>  
>> -	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, q_data, mqd, ctl_stack,
>> +	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, NULL, q_data, mqd, ctl_stack,
>>  				NULL);
>>  	if (ret) {
>>  		pr_err("Failed to create new queue err:%d\n", ret);
>> --
>> 2.25.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription
  2022-06-13 15:20 ` [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription Graham Sider
  2022-06-14 18:22   ` philip yang
@ 2022-06-15  7:28   ` Christian König
  2022-06-15 13:17     ` Sider, Graham
  1 sibling, 1 reply; 13+ messages in thread
From: Christian König @ 2022-06-15  7:28 UTC (permalink / raw)
  To: Graham Sider, amd-gfx; +Cc: Mukul.Joshi, Felix.Kuehling, Philip.Yang



Am 13.06.22 um 17:20 schrieb Graham Sider:
> Starting with GFX11, MES requires wptr BOs to be GTT allocated/mapped to
> GART for usermode queues in order to support oversubscription. In the
> case that work is submitted to an unmapped queue, MES must have a GART
> wptr address to determine whether the queue should be mapped.
>
> This change is accompanied with changes in MES and is applicable for
> MES_VERSION >= 3.
>
> v2:
> - Update MES_VERSION check from 2 to 3.
> v3:
> - Use amdgpu_vm_bo_lookup_mapping for wptr_bo mapping lookup
> - Move wptr_bo refcount increment to amdgpu_amdkfd_map_gtt_bo_to_gart
> - Remove list_del_init from amdgpu_amdkfd_map_gtt_bo_to_gart
> - Cleanup/fix create_queue wptr_bo error handling
>
> Signed-off-by: Graham Sider <Graham.Sider@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 49 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 37 +++++++++++++-
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c |  9 +++-
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  2 +
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  3 ++
>   .../amd/amdkfd/kfd_process_queue_manager.c    | 17 +++++--
>   7 files changed, 110 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 429b16ba10bf..dba26d1e3be9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -301,6 +301,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device *adev,
>   		struct kgd_mem *mem, void **kptr, uint64_t *size);
>   void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct amdgpu_device *adev,
>   		struct kgd_mem *mem);
> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo);
>   
>   int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
>   					    struct dma_fence **ef);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index efab923056f4..888d08128a94 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -2030,6 +2030,55 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
>   	return ret;
>   }
>   
> +/**
> + * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count
> + * @adev: Device to which allocated BO belongs
> + * @bo: Buffer object to be mapped
> + *
> + * Before return, bo reference count is incremented. To release the reference and unpin/
> + * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
> + */
> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo)
> +{
> +	int ret;
> +
> +	ret = amdgpu_bo_reserve(bo, true);
> +	if (ret) {
> +		pr_err("Failed to reserve bo. ret %d\n", ret);
> +		goto err_reserve_bo_failed;
> +	}
> +
> +	ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
> +	if (ret) {
> +		pr_err("Failed to pin bo. ret %d\n", ret);
> +		goto err_pin_bo_failed;
> +	}


Oh! Is that something we do for every MQD? If yes, then this here is pretty
much a NAK.

We can't do this, or we create a trivial denial-of-service attack against the
kernel driver.

Regards,
Christian.

> +
> +	ret = amdgpu_ttm_alloc_gart(&bo->tbo);
> +	if (ret) {
> +		pr_err("Failed to bind bo to GART. ret %d\n", ret);
> +		goto err_map_bo_gart_failed;
> +	}
> +
> +	amdgpu_amdkfd_remove_eviction_fence(
> +		bo, bo->kfd_bo->process_info->eviction_fence);
> +	list_del_init(&bo->kfd_bo->validate_list.head);
> +
> +	amdgpu_bo_unreserve(bo);
> +
> +	bo = amdgpu_bo_ref(bo);
> +
> +	return 0;
> +
> +err_map_bo_gart_failed:
> +	amdgpu_bo_unpin(bo);
> +err_pin_bo_failed:
> +	amdgpu_bo_unreserve(bo);
> +err_reserve_bo_failed:
> +
> +	return ret;
> +}
> +
>   int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device *adev,
>   		struct kgd_mem *mem, void **kptr, uint64_t *size)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index e9766e165c38..1789ed8b79f5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -289,6 +289,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>   	struct kfd_process_device *pdd;
>   	struct queue_properties q_properties;
>   	uint32_t doorbell_offset_in_process = 0;
> +	struct amdgpu_bo *wptr_bo = NULL;
>   
>   	memset(&q_properties, 0, sizeof(struct queue_properties));
>   
> @@ -316,12 +317,41 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>   		goto err_bind_process;
>   	}
>   
> +	/* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work
> +	 * on unmapped queues for usermode queue oversubscription (no aggregated doorbell)
> +	 */
> +	if (dev->shared_resources.enable_mes && (dev->adev->mes.sched_version & 0xff) >= 3) {
> +		struct amdgpu_bo_va_mapping *wptr_mapping;
> +		struct amdgpu_vm *wptr_vm;
> +
> +		wptr_vm = drm_priv_to_vm(pdd->drm_priv);
> +		err = amdgpu_bo_reserve(wptr_vm->root.bo, false);
> +		if (err)
> +			goto err_wptr_map_gart;
> +
> +		wptr_mapping = amdgpu_vm_bo_lookup_mapping(
> +				wptr_vm, args->write_pointer_address >> PAGE_SHIFT);
> +		amdgpu_bo_unreserve(wptr_vm->root.bo);
> +		if (!wptr_mapping) {
> +			pr_err("Failed to lookup wptr bo\n");
> +			err = -EINVAL;
> +			goto err_wptr_map_gart;
> +		}
> +
> +		wptr_bo = wptr_mapping->bo_va->base.bo;
> +		err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev, wptr_bo);
> +		if (err) {
> +			pr_err("Failed to map wptr bo to GART\n");
> +			goto err_wptr_map_gart;
> +		}
> +	}
> +
>   	pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
>   			p->pasid,
>   			dev->id);
>   
> -	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL, NULL, NULL,
> -			&doorbell_offset_in_process);
> +	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, wptr_bo,
> +			NULL, NULL, NULL, &doorbell_offset_in_process);
>   	if (err != 0)
>   		goto err_create_queue;
>   
> @@ -354,6 +384,9 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>   	return 0;
>   
>   err_create_queue:
> +	if (wptr_bo)
> +		amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
> +err_wptr_map_gart:
>   err_bind_process:
>   err_pdd:
>   	mutex_unlock(&p->mutex);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index b39d89c52887..d8de2fbdfc7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -208,6 +208,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>   	struct mes_add_queue_input queue_input;
>   	int r, queue_type;
> +	uint64_t wptr_addr_off;
>   
>   	if (dqm->is_hws_hang)
>   		return -EIO;
> @@ -227,7 +228,13 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   					AMDGPU_MES_PRIORITY_LEVEL_NORMAL;
>   	queue_input.doorbell_offset = q->properties.doorbell_off;
>   	queue_input.mqd_addr = q->gart_mqd_addr;
> -	queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
> +
> +	if (q->wptr_bo) {
> +		wptr_addr_off = (uint64_t)q->properties.write_ptr - (uint64_t)q->wptr_bo->kfd_bo->va;
> +		queue_input.wptr_addr = ((uint64_t)q->wptr_bo->tbo.resource->start << PAGE_SHIFT) + wptr_addr_off;
> +	} else
> +		queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
> +
>   	queue_input.paging = false;
>   	queue_input.tba_addr = qpd->tba_addr;
>   	queue_input.tma_addr = qpd->tma_addr;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> index f1654b4da856..35e74bdd81da 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> @@ -377,6 +377,8 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
>   	m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
>   	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
>   	m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
> +	m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
> +	m->sdmax_rlcx_rb_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
>   	m->sdmax_rlcx_doorbell_offset =
>   		q->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index a5d3963537d7..dcddee0d6f06 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -639,6 +639,8 @@ struct queue {
>   	void *gang_ctx_bo;
>   	uint64_t gang_ctx_gpu_addr;
>   	void *gang_ctx_cpu_ptr;
> +
> +	struct amdgpu_bo *wptr_bo;
>   };
>   
>   enum KFD_MQD_TYPE {
> @@ -1404,6 +1406,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>   			    struct file *f,
>   			    struct queue_properties *properties,
>   			    unsigned int *qid,
> +			    struct amdgpu_bo *wptr_bo,
>   			    const struct kfd_criu_queue_priv_data *q_data,
>   			    const void *restore_mqd,
>   			    const void *restore_ctl_stack,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index f99e09dc43ea..3a17c1ebc527 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -190,7 +190,8 @@ void pqm_uninit(struct process_queue_manager *pqm)
>   static int init_user_queue(struct process_queue_manager *pqm,
>   				struct kfd_dev *dev, struct queue **q,
>   				struct queue_properties *q_properties,
> -				struct file *f, unsigned int qid)
> +				struct file *f, struct amdgpu_bo *wptr_bo,
> +				unsigned int qid)
>   {
>   	int retval;
>   
> @@ -221,6 +222,7 @@ static int init_user_queue(struct process_queue_manager *pqm,
>   			goto cleanup;
>   		}
>   		memset((*q)->gang_ctx_cpu_ptr, 0, AMDGPU_MES_GANG_CTX_SIZE);
> +		(*q)->wptr_bo = wptr_bo;
>   	}
>   
>   	pr_debug("PQM After init queue");
> @@ -237,6 +239,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>   			    struct file *f,
>   			    struct queue_properties *properties,
>   			    unsigned int *qid,
> +			    struct amdgpu_bo *wptr_bo,
>   			    const struct kfd_criu_queue_priv_data *q_data,
>   			    const void *restore_mqd,
>   			    const void *restore_ctl_stack,
> @@ -299,7 +302,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>   		 * allocate_sdma_queue() in create_queue() has the
>   		 * corresponding check logic.
>   		 */
> -		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
> +		retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid);
>   		if (retval != 0)
>   			goto err_create_queue;
>   		pqn->q = q;
> @@ -320,7 +323,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>   			goto err_create_queue;
>   		}
>   
> -		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
> +		retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid);
>   		if (retval != 0)
>   			goto err_create_queue;
>   		pqn->q = q;
> @@ -457,9 +460,13 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>   			pdd->qpd.num_gws = 0;
>   		}
>   
> -		if (dev->shared_resources.enable_mes)
> +		if (dev->shared_resources.enable_mes) {
>   			amdgpu_amdkfd_free_gtt_mem(dev->adev,
>   						   pqn->q->gang_ctx_bo);
> +			if (pqn->q->wptr_bo)
> +				amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->wptr_bo);
> +
> +		}
>   		uninit_queue(pqn->q);
>   	}
>   
> @@ -900,7 +907,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
>   
>   	print_queue_properties(&qp);
>   
> -	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, q_data, mqd, ctl_stack,
> +	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, NULL, q_data, mqd, ctl_stack,
>   				NULL);
>   	if (ret) {
>   		pr_err("Failed to create new queue err:%d\n", ret);


^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription
  2022-06-15  7:28   ` Christian König
@ 2022-06-15 13:17     ` Sider, Graham
  2022-06-15 14:06       ` Christian König
  0 siblings, 1 reply; 13+ messages in thread
From: Sider, Graham @ 2022-06-15 13:17 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx; +Cc: Joshi, Mukul, Kuehling, Felix, Yang, Philip

[AMD Official Use Only - General]

> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Wednesday, June 15, 2022 3:29 AM
> To: Sider, Graham <Graham.Sider@amd.com>; amd-
> gfx@lists.freedesktop.org
> Cc: Joshi, Mukul <Mukul.Joshi@amd.com>; Kuehling, Felix
> <Felix.Kuehling@amd.com>; Yang, Philip <Philip.Yang@amd.com>
> Subject: Re: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue
> oversubscription
> 
> 
> 
> Am 13.06.22 um 17:20 schrieb Graham Sider:
> > Starting with GFX11, MES requires wptr BOs to be GTT allocated/mapped
> > to GART for usermode queues in order to support oversubscription. In
> > the case that work is submitted to an unmapped queue, MES must have a
> > GART wptr address to determine whether the queue should be mapped.
> >
> > This change is accompanied with changes in MES and is applicable for
> > MES_VERSION >= 3.
> >
> > v2:
> > - Update MES_VERSION check from 2 to 3.
> > v3:
> > - Use amdgpu_vm_bo_lookup_mapping for wptr_bo mapping lookup
> > - Move wptr_bo refcount increment to
> amdgpu_amdkfd_map_gtt_bo_to_gart
> > - Remove list_del_init from amdgpu_amdkfd_map_gtt_bo_to_gart
> > - Cleanup/fix create_queue wptr_bo error handling
> >
> > Signed-off-by: Graham Sider <Graham.Sider@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
> >   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 49
> +++++++++++++++++++
> >   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 37 +++++++++++++-
> >   .../drm/amd/amdkfd/kfd_device_queue_manager.c |  9 +++-
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  2 +
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  3 ++
> >   .../amd/amdkfd/kfd_process_queue_manager.c    | 17 +++++--
> >   7 files changed, 110 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > index 429b16ba10bf..dba26d1e3be9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > @@ -301,6 +301,7 @@ int
> amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device
> *adev,
> >   		struct kgd_mem *mem, void **kptr, uint64_t *size);
> >   void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct
> amdgpu_device *adev,
> >   		struct kgd_mem *mem);
> > +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev,
> > +struct amdgpu_bo *bo);
> >
> >   int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
> >   					    struct dma_fence **ef);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > index efab923056f4..888d08128a94 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> > @@ -2030,6 +2030,55 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
> >   	return ret;
> >   }
> >
> > +/**
> > + * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and
> increment
> > +reference count
> > + * @adev: Device to which allocated BO belongs
> > + * @bo: Buffer object to be mapped
> > + *
> > + * Before return, bo reference count is incremented. To release the
> > +reference and unpin/
> > + * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
> > + */
> > +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev,
> > +struct amdgpu_bo *bo) {
> > +	int ret;
> > +
> > +	ret = amdgpu_bo_reserve(bo, true);
> > +	if (ret) {
> > +		pr_err("Failed to reserve bo. ret %d\n", ret);
> > +		goto err_reserve_bo_failed;
> > +	}
> > +
> > +	ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
> > +	if (ret) {
> > +		pr_err("Failed to pin bo. ret %d\n", ret);
> > +		goto err_pin_bo_failed;
> > +	}
> 
> 
> Oh! Is that something we do for every MQD? If yes, then this is pretty
> much a NAK.
> 
> We can't do this, or we create a trivial denial-of-service attack against the
> kernel driver.
> 
> Regards,
> Christian.
> 

Hi Christian, could you elaborate on this? Right now this is only being used to pin the queue wptr BO.

Best,
Graham

> > +
> > +	ret = amdgpu_ttm_alloc_gart(&bo->tbo);
> > +	if (ret) {
> > +		pr_err("Failed to bind bo to GART. ret %d\n", ret);
> > +		goto err_map_bo_gart_failed;
> > +	}
> > +
> > +	amdgpu_amdkfd_remove_eviction_fence(
> > +		bo, bo->kfd_bo->process_info->eviction_fence);
> > +	list_del_init(&bo->kfd_bo->validate_list.head);
> > +
> > +	amdgpu_bo_unreserve(bo);
> > +
> > +	bo = amdgpu_bo_ref(bo);
> > +
> > +	return 0;
> > +
> > +err_map_bo_gart_failed:
> > +	amdgpu_bo_unpin(bo);
> > +err_pin_bo_failed:
> > +	amdgpu_bo_unreserve(bo);
> > +err_reserve_bo_failed:
> > +
> > +	return ret;
> > +}
> > +
> >   int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct
> amdgpu_device *adev,
> >   		struct kgd_mem *mem, void **kptr, uint64_t *size)
> >   {
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > index e9766e165c38..1789ed8b79f5 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > @@ -289,6 +289,7 @@ static int kfd_ioctl_create_queue(struct file *filep,
> struct kfd_process *p,
> >   	struct kfd_process_device *pdd;
> >   	struct queue_properties q_properties;
> >   	uint32_t doorbell_offset_in_process = 0;
> > +	struct amdgpu_bo *wptr_bo = NULL;
> >
> >   	memset(&q_properties, 0, sizeof(struct queue_properties));
> >
> > @@ -316,12 +317,41 @@ static int kfd_ioctl_create_queue(struct file
> *filep, struct kfd_process *p,
> >   		goto err_bind_process;
> >   	}
> >
> > +	/* Starting with GFX11, wptr BOs must be mapped to GART for MES
> to determine work
> > +	 * on unmapped queues for usermode queue oversubscription (no
> aggregated doorbell)
> > +	 */
> > +	if (dev->shared_resources.enable_mes && (dev->adev-
> >mes.sched_version & 0xff) >= 3) {
> > +		struct amdgpu_bo_va_mapping *wptr_mapping;
> > +		struct amdgpu_vm *wptr_vm;
> > +
> > +		wptr_vm = drm_priv_to_vm(pdd->drm_priv);
> > +		err = amdgpu_bo_reserve(wptr_vm->root.bo, false);
> > +		if (err)
> > +			goto err_wptr_map_gart;
> > +
> > +		wptr_mapping = amdgpu_vm_bo_lookup_mapping(
> > +				wptr_vm, args->write_pointer_address >>
> PAGE_SHIFT);
> > +		amdgpu_bo_unreserve(wptr_vm->root.bo);
> > +		if (!wptr_mapping) {
> > +			pr_err("Failed to lookup wptr bo\n");
> > +			err = -EINVAL;
> > +			goto err_wptr_map_gart;
> > +		}
> > +
> > +		wptr_bo = wptr_mapping->bo_va->base.bo;
> > +		err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev,
> wptr_bo);
> > +		if (err) {
> > +			pr_err("Failed to map wptr bo to GART\n");
> > +			goto err_wptr_map_gart;
> > +		}
> > +	}
> > +
> >   	pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
> >   			p->pasid,
> >   			dev->id);
> >
> > -	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties,
> &queue_id, NULL, NULL, NULL,
> > -			&doorbell_offset_in_process);
> > +	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties,
> &queue_id, wptr_bo,
> > +			NULL, NULL, NULL, &doorbell_offset_in_process);
> >   	if (err != 0)
> >   		goto err_create_queue;
> >
> > @@ -354,6 +384,9 @@ static int kfd_ioctl_create_queue(struct file *filep,
> struct kfd_process *p,
> >   	return 0;
> >
> >   err_create_queue:
> > +	if (wptr_bo)
> > +		amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
> > +err_wptr_map_gart:
> >   err_bind_process:
> >   err_pdd:
> >   	mutex_unlock(&p->mutex);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index b39d89c52887..d8de2fbdfc7d 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -208,6 +208,7 @@ static int add_queue_mes(struct
> device_queue_manager *dqm, struct queue *q,
> >   	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
> >   	struct mes_add_queue_input queue_input;
> >   	int r, queue_type;
> > +	uint64_t wptr_addr_off;
> >
> >   	if (dqm->is_hws_hang)
> >   		return -EIO;
> > @@ -227,7 +228,13 @@ static int add_queue_mes(struct
> device_queue_manager *dqm, struct queue *q,
> >
> 	AMDGPU_MES_PRIORITY_LEVEL_NORMAL;
> >   	queue_input.doorbell_offset = q->properties.doorbell_off;
> >   	queue_input.mqd_addr = q->gart_mqd_addr;
> > -	queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
> > +
> > +	if (q->wptr_bo) {
> > +		wptr_addr_off = (uint64_t)q->properties.write_ptr -
> (uint64_t)q->wptr_bo->kfd_bo->va;
> > +		queue_input.wptr_addr = ((uint64_t)q->wptr_bo-
> >tbo.resource->start << PAGE_SHIFT) + wptr_addr_off;
> > +	} else
> > +		queue_input.wptr_addr = (uint64_t)q-
> >properties.write_ptr;
> > +
> >   	queue_input.paging = false;
> >   	queue_input.tba_addr = qpd->tba_addr;
> >   	queue_input.tma_addr = qpd->tma_addr; diff --git
> > a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> > index f1654b4da856..35e74bdd81da 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> > @@ -377,6 +377,8 @@ static void update_mqd_sdma(struct mqd_manager
> *mm, void *mqd,
> >   	m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >>
> 8);
> >   	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q-
> >read_ptr);
> >   	m->sdmax_rlcx_rb_rptr_addr_hi =
> > upper_32_bits((uint64_t)q->read_ptr);
> > +	m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q-
> >write_ptr);
> > +	m->sdmax_rlcx_rb_wptr_poll_addr_hi =
> > +upper_32_bits((uint64_t)q->write_ptr);
> >   	m->sdmax_rlcx_doorbell_offset =
> >   		q->doorbell_off <<
> SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index a5d3963537d7..dcddee0d6f06 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -639,6 +639,8 @@ struct queue {
> >   	void *gang_ctx_bo;
> >   	uint64_t gang_ctx_gpu_addr;
> >   	void *gang_ctx_cpu_ptr;
> > +
> > +	struct amdgpu_bo *wptr_bo;
> >   };
> >
> >   enum KFD_MQD_TYPE {
> > @@ -1404,6 +1406,7 @@ int pqm_create_queue(struct
> process_queue_manager *pqm,
> >   			    struct file *f,
> >   			    struct queue_properties *properties,
> >   			    unsigned int *qid,
> > +			    struct amdgpu_bo *wptr_bo,
> >   			    const struct kfd_criu_queue_priv_data *q_data,
> >   			    const void *restore_mqd,
> >   			    const void *restore_ctl_stack, diff --git
> > a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > index f99e09dc43ea..3a17c1ebc527 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > @@ -190,7 +190,8 @@ void pqm_uninit(struct process_queue_manager
> *pqm)
> >   static int init_user_queue(struct process_queue_manager *pqm,
> >   				struct kfd_dev *dev, struct queue **q,
> >   				struct queue_properties *q_properties,
> > -				struct file *f, unsigned int qid)
> > +				struct file *f, struct amdgpu_bo *wptr_bo,
> > +				unsigned int qid)
> >   {
> >   	int retval;
> >
> > @@ -221,6 +222,7 @@ static int init_user_queue(struct
> process_queue_manager *pqm,
> >   			goto cleanup;
> >   		}
> >   		memset((*q)->gang_ctx_cpu_ptr, 0,
> AMDGPU_MES_GANG_CTX_SIZE);
> > +		(*q)->wptr_bo = wptr_bo;
> >   	}
> >
> >   	pr_debug("PQM After init queue");
> > @@ -237,6 +239,7 @@ int pqm_create_queue(struct
> process_queue_manager *pqm,
> >   			    struct file *f,
> >   			    struct queue_properties *properties,
> >   			    unsigned int *qid,
> > +			    struct amdgpu_bo *wptr_bo,
> >   			    const struct kfd_criu_queue_priv_data *q_data,
> >   			    const void *restore_mqd,
> >   			    const void *restore_ctl_stack, @@ -299,7 +302,7
> @@ int
> > pqm_create_queue(struct process_queue_manager *pqm,
> >   		 * allocate_sdma_queue() in create_queue() has the
> >   		 * corresponding check logic.
> >   		 */
> > -		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
> > +		retval = init_user_queue(pqm, dev, &q, properties, f,
> wptr_bo,
> > +*qid);
> >   		if (retval != 0)
> >   			goto err_create_queue;
> >   		pqn->q = q;
> > @@ -320,7 +323,7 @@ int pqm_create_queue(struct
> process_queue_manager *pqm,
> >   			goto err_create_queue;
> >   		}
> >
> > -		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
> > +		retval = init_user_queue(pqm, dev, &q, properties, f,
> wptr_bo,
> > +*qid);
> >   		if (retval != 0)
> >   			goto err_create_queue;
> >   		pqn->q = q;
> > @@ -457,9 +460,13 @@ int pqm_destroy_queue(struct
> process_queue_manager *pqm, unsigned int qid)
> >   			pdd->qpd.num_gws = 0;
> >   		}
> >
> > -		if (dev->shared_resources.enable_mes)
> > +		if (dev->shared_resources.enable_mes) {
> >   			amdgpu_amdkfd_free_gtt_mem(dev->adev,
> >   						   pqn->q->gang_ctx_bo);
> > +			if (pqn->q->wptr_bo)
> > +				amdgpu_amdkfd_free_gtt_mem(dev-
> >adev, pqn->q->wptr_bo);
> > +
> > +		}
> >   		uninit_queue(pqn->q);
> >   	}
> >
> > @@ -900,7 +907,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
> >
> >   	print_queue_properties(&qp);
> >
> > -	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp,
> &queue_id, q_data, mqd, ctl_stack,
> > +	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp,
> &queue_id,
> > +NULL, q_data, mqd, ctl_stack,
> >   				NULL);
> >   	if (ret) {
> >   		pr_err("Failed to create new queue err:%d\n", ret);

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription
  2022-06-15 13:17     ` Sider, Graham
@ 2022-06-15 14:06       ` Christian König
  2022-06-15 22:41         ` philip yang
  0 siblings, 1 reply; 13+ messages in thread
From: Christian König @ 2022-06-15 14:06 UTC (permalink / raw)
  To: Sider, Graham, amd-gfx; +Cc: Joshi, Mukul, Kuehling, Felix, Yang, Philip

Am 15.06.22 um 15:17 schrieb Sider, Graham:
> [AMD Official Use Only - General]
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Wednesday, June 15, 2022 3:29 AM
>> To: Sider, Graham <Graham.Sider@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Cc: Joshi, Mukul <Mukul.Joshi@amd.com>; Kuehling, Felix
>> <Felix.Kuehling@amd.com>; Yang, Philip <Philip.Yang@amd.com>
>> Subject: Re: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue
>> oversubscription
>>
>>
>>
>> Am 13.06.22 um 17:20 schrieb Graham Sider:
>>> Starting with GFX11, MES requires wptr BOs to be GTT allocated/mapped
>>> to GART for usermode queues in order to support oversubscription. In
>>> the case that work is submitted to an unmapped queue, MES must have a
>>> GART wptr address to determine whether the queue should be mapped.
>>>
>>> This change is accompanied with changes in MES and is applicable for
>>> MES_VERSION >= 3.
>>>
>>> v2:
>>> - Update MES_VERSION check from 2 to 3.
>>> v3:
>>> - Use amdgpu_vm_bo_lookup_mapping for wptr_bo mapping lookup
>>> - Move wptr_bo refcount increment to
>> amdgpu_amdkfd_map_gtt_bo_to_gart
>>> - Remove list_del_init from amdgpu_amdkfd_map_gtt_bo_to_gart
>>> - Cleanup/fix create_queue wptr_bo error handling
>>>
>>> Signed-off-by: Graham Sider <Graham.Sider@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>>>    .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 49
>> +++++++++++++++++++
>>>    drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 37 +++++++++++++-
>>>    .../drm/amd/amdkfd/kfd_device_queue_manager.c |  9 +++-
>>>    .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  2 +
>>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  3 ++
>>>    .../amd/amdkfd/kfd_process_queue_manager.c    | 17 +++++--
>>>    7 files changed, 110 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> index 429b16ba10bf..dba26d1e3be9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> @@ -301,6 +301,7 @@ int
>> amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device
>> *adev,
>>>    		struct kgd_mem *mem, void **kptr, uint64_t *size);
>>>    void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct
>> amdgpu_device *adev,
>>>    		struct kgd_mem *mem);
>>> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev,
>>> +struct amdgpu_bo *bo);
>>>
>>>    int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
>>>    					    struct dma_fence **ef);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index efab923056f4..888d08128a94 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -2030,6 +2030,55 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
>>>    	return ret;
>>>    }
>>>
>>> +/**
>>> + * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and
>> increment
>>> +reference count
>>> + * @adev: Device to which allocated BO belongs
>>> + * @bo: Buffer object to be mapped
>>> + *
>>> + * Before return, bo reference count is incremented. To release the
>>> +reference and unpin/
>>> + * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
>>> + */
>>> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev,
>>> +struct amdgpu_bo *bo) {
>>> +	int ret;
>>> +
>>> +	ret = amdgpu_bo_reserve(bo, true);
>>> +	if (ret) {
>>> +		pr_err("Failed to reserve bo. ret %d\n", ret);
>>> +		goto err_reserve_bo_failed;
>>> +	}
>>> +
>>> +	ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
>>> +	if (ret) {
>>> +		pr_err("Failed to pin bo. ret %d\n", ret);
>>> +		goto err_pin_bo_failed;
>>> +	}
>>
>> Oh! Is that something we do for every MQD? If yes, then this is pretty
>> much a NAK.
>>
>> We can't do this, or we create a trivial denial-of-service attack against the
>> kernel driver.
>>
>> Regards,
>> Christian.
>>
> Hi Christian, could you elaborate on this? Right now this is only being used to pin the queue wptr BO.

Well, is this wptr BO per process, per queue, or global?

amdgpu_bo_pin() is only allowed if we pin global resources; otherwise I
have to reject that.

Regards,
Christian.

>
> Best,
> Graham
>
>>> +
>>> +	ret = amdgpu_ttm_alloc_gart(&bo->tbo);
>>> +	if (ret) {
>>> +		pr_err("Failed to bind bo to GART. ret %d\n", ret);
>>> +		goto err_map_bo_gart_failed;
>>> +	}
>>> +
>>> +	amdgpu_amdkfd_remove_eviction_fence(
>>> +		bo, bo->kfd_bo->process_info->eviction_fence);
>>> +	list_del_init(&bo->kfd_bo->validate_list.head);
>>> +
>>> +	amdgpu_bo_unreserve(bo);
>>> +
>>> +	bo = amdgpu_bo_ref(bo);
>>> +
>>> +	return 0;
>>> +
>>> +err_map_bo_gart_failed:
>>> +	amdgpu_bo_unpin(bo);
>>> +err_pin_bo_failed:
>>> +	amdgpu_bo_unreserve(bo);
>>> +err_reserve_bo_failed:
>>> +
>>> +	return ret;
>>> +}
>>> +
>>>    int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct
>> amdgpu_device *adev,
>>>    		struct kgd_mem *mem, void **kptr, uint64_t *size)
>>>    {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index e9766e165c38..1789ed8b79f5 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -289,6 +289,7 @@ static int kfd_ioctl_create_queue(struct file *filep,
>> struct kfd_process *p,
>>>    	struct kfd_process_device *pdd;
>>>    	struct queue_properties q_properties;
>>>    	uint32_t doorbell_offset_in_process = 0;
>>> +	struct amdgpu_bo *wptr_bo = NULL;
>>>
>>>    	memset(&q_properties, 0, sizeof(struct queue_properties));
>>>
>>> @@ -316,12 +317,41 @@ static int kfd_ioctl_create_queue(struct file
>> *filep, struct kfd_process *p,
>>>    		goto err_bind_process;
>>>    	}
>>>
>>> +	/* Starting with GFX11, wptr BOs must be mapped to GART for MES
>> to determine work
>>> +	 * on unmapped queues for usermode queue oversubscription (no
>> aggregated doorbell)
>>> +	 */
>>> +	if (dev->shared_resources.enable_mes && (dev->adev-
>>> mes.sched_version & 0xff) >= 3) {
>>> +		struct amdgpu_bo_va_mapping *wptr_mapping;
>>> +		struct amdgpu_vm *wptr_vm;
>>> +
>>> +		wptr_vm = drm_priv_to_vm(pdd->drm_priv);
>>> +		err = amdgpu_bo_reserve(wptr_vm->root.bo, false);
>>> +		if (err)
>>> +			goto err_wptr_map_gart;
>>> +
>>> +		wptr_mapping = amdgpu_vm_bo_lookup_mapping(
>>> +				wptr_vm, args->write_pointer_address >>
>> PAGE_SHIFT);
>>> +		amdgpu_bo_unreserve(wptr_vm->root.bo);
>>> +		if (!wptr_mapping) {
>>> +			pr_err("Failed to lookup wptr bo\n");
>>> +			err = -EINVAL;
>>> +			goto err_wptr_map_gart;
>>> +		}
>>> +
>>> +		wptr_bo = wptr_mapping->bo_va->base.bo;
>>> +		err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev,
>> wptr_bo);
>>> +		if (err) {
>>> +			pr_err("Failed to map wptr bo to GART\n");
>>> +			goto err_wptr_map_gart;
>>> +		}
>>> +	}
>>> +
>>>    	pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
>>>    			p->pasid,
>>>    			dev->id);
>>>
>>> -	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties,
>> &queue_id, NULL, NULL, NULL,
>>> -			&doorbell_offset_in_process);
>>> +	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties,
>> &queue_id, wptr_bo,
>>> +			NULL, NULL, NULL, &doorbell_offset_in_process);
>>>    	if (err != 0)
>>>    		goto err_create_queue;
>>>
>>> @@ -354,6 +384,9 @@ static int kfd_ioctl_create_queue(struct file *filep,
>> struct kfd_process *p,
>>>    	return 0;
>>>
>>>    err_create_queue:
>>> +	if (wptr_bo)
>>> +		amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
>>> +err_wptr_map_gart:
>>>    err_bind_process:
>>>    err_pdd:
>>>    	mutex_unlock(&p->mutex);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index b39d89c52887..d8de2fbdfc7d 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -208,6 +208,7 @@ static int add_queue_mes(struct
>> device_queue_manager *dqm, struct queue *q,
>>>    	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>>>    	struct mes_add_queue_input queue_input;
>>>    	int r, queue_type;
>>> +	uint64_t wptr_addr_off;
>>>
>>>    	if (dqm->is_hws_hang)
>>>    		return -EIO;
>>> @@ -227,7 +228,13 @@ static int add_queue_mes(struct
>> device_queue_manager *dqm, struct queue *q,
>> 	AMDGPU_MES_PRIORITY_LEVEL_NORMAL;
>>>    	queue_input.doorbell_offset = q->properties.doorbell_off;
>>>    	queue_input.mqd_addr = q->gart_mqd_addr;
>>> -	queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
>>> +
>>> +	if (q->wptr_bo) {
>>> +		wptr_addr_off = (uint64_t)q->properties.write_ptr -
>> (uint64_t)q->wptr_bo->kfd_bo->va;
>>> +		queue_input.wptr_addr = ((uint64_t)q->wptr_bo-
>>> tbo.resource->start << PAGE_SHIFT) + wptr_addr_off;
>>> +	} else
>>> +		queue_input.wptr_addr = (uint64_t)q-
>>> properties.write_ptr;
>>> +
>>>    	queue_input.paging = false;
>>>    	queue_input.tba_addr = qpd->tba_addr;
>>>    	queue_input.tma_addr = qpd->tma_addr; diff --git
>>> a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>>> index f1654b4da856..35e74bdd81da 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>>> @@ -377,6 +377,8 @@ static void update_mqd_sdma(struct mqd_manager
>> *mm, void *mqd,
>>>    	m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >>
>> 8);
>>>    	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q-
>>> read_ptr);
>>>    	m->sdmax_rlcx_rb_rptr_addr_hi =
>>> upper_32_bits((uint64_t)q->read_ptr);
>>> +	m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q-
>>> write_ptr);
>>> +	m->sdmax_rlcx_rb_wptr_poll_addr_hi =
>>> +upper_32_bits((uint64_t)q->write_ptr);
>>>    	m->sdmax_rlcx_doorbell_offset =
>>>    		q->doorbell_off <<
>> SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index a5d3963537d7..dcddee0d6f06 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -639,6 +639,8 @@ struct queue {
>>>    	void *gang_ctx_bo;
>>>    	uint64_t gang_ctx_gpu_addr;
>>>    	void *gang_ctx_cpu_ptr;
>>> +
>>> +	struct amdgpu_bo *wptr_bo;
>>>    };
>>>
>>>    enum KFD_MQD_TYPE {
>>> @@ -1404,6 +1406,7 @@ int pqm_create_queue(struct
>> process_queue_manager *pqm,
>>>    			    struct file *f,
>>>    			    struct queue_properties *properties,
>>>    			    unsigned int *qid,
>>> +			    struct amdgpu_bo *wptr_bo,
>>>    			    const struct kfd_criu_queue_priv_data *q_data,
>>>    			    const void *restore_mqd,
>>>    			    const void *restore_ctl_stack, diff --git
>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index f99e09dc43ea..3a17c1ebc527 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -190,7 +190,8 @@ void pqm_uninit(struct process_queue_manager
>> *pqm)
>>>    static int init_user_queue(struct process_queue_manager *pqm,
>>>    				struct kfd_dev *dev, struct queue **q,
>>>    				struct queue_properties *q_properties,
>>> -				struct file *f, unsigned int qid)
>>> +				struct file *f, struct amdgpu_bo *wptr_bo,
>>> +				unsigned int qid)
>>>    {
>>>    	int retval;
>>>
>>> @@ -221,6 +222,7 @@ static int init_user_queue(struct
>> process_queue_manager *pqm,
>>>    			goto cleanup;
>>>    		}
>>>    		memset((*q)->gang_ctx_cpu_ptr, 0,
>> AMDGPU_MES_GANG_CTX_SIZE);
>>> +		(*q)->wptr_bo = wptr_bo;
>>>    	}
>>>
>>>    	pr_debug("PQM After init queue");
>>> @@ -237,6 +239,7 @@ int pqm_create_queue(struct
>> process_queue_manager *pqm,
>>>    			    struct file *f,
>>>    			    struct queue_properties *properties,
>>>    			    unsigned int *qid,
>>> +			    struct amdgpu_bo *wptr_bo,
>>>    			    const struct kfd_criu_queue_priv_data *q_data,
>>>    			    const void *restore_mqd,
>>>    			    const void *restore_ctl_stack, @@ -299,7 +302,7
>> @@ int
>>> pqm_create_queue(struct process_queue_manager *pqm,
>>>    		 * allocate_sdma_queue() in create_queue() has the
>>>    		 * corresponding check logic.
>>>    		 */
>>> -		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
>>> +		retval = init_user_queue(pqm, dev, &q, properties, f,
>> wptr_bo,
>>> +*qid);
>>>    		if (retval != 0)
>>>    			goto err_create_queue;
>>>    		pqn->q = q;
>>> @@ -320,7 +323,7 @@ int pqm_create_queue(struct
>> process_queue_manager *pqm,
>>>    			goto err_create_queue;
>>>    		}
>>>
>>> -		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
>>> +		retval = init_user_queue(pqm, dev, &q, properties, f,
>> wptr_bo,
>>> +*qid);
>>>    		if (retval != 0)
>>>    			goto err_create_queue;
>>>    		pqn->q = q;
>>> @@ -457,9 +460,13 @@ int pqm_destroy_queue(struct
>> process_queue_manager *pqm, unsigned int qid)
>>>    			pdd->qpd.num_gws = 0;
>>>    		}
>>>
>>> -		if (dev->shared_resources.enable_mes)
>>> +		if (dev->shared_resources.enable_mes) {
>>>    			amdgpu_amdkfd_free_gtt_mem(dev->adev,
>>>    						   pqn->q->gang_ctx_bo);
>>> +			if (pqn->q->wptr_bo)
>>> +				amdgpu_amdkfd_free_gtt_mem(dev-
>>> adev, pqn->q->wptr_bo);
>>> +
>>> +		}
>>>    		uninit_queue(pqn->q);
>>>    	}
>>>
>>> @@ -900,7 +907,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
>>>
>>>    	print_queue_properties(&qp);
>>>
>>> -	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp,
>> &queue_id, q_data, mqd, ctl_stack,
>>> +	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp,
>> &queue_id,
>>> +NULL, q_data, mqd, ctl_stack,
>>>    				NULL);
>>>    	if (ret) {
>>>    		pr_err("Failed to create new queue err:%d\n", ret);


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription
  2022-06-15 14:06       ` Christian König
@ 2022-06-15 22:41         ` philip yang
  2022-06-16 18:21           ` Alex Deucher
  2022-06-17  6:34           ` Christian König
  0 siblings, 2 replies; 13+ messages in thread
From: philip yang @ 2022-06-15 22:41 UTC (permalink / raw)
  To: Christian König, Sider, Graham, amd-gfx
  Cc: Joshi, Mukul, Kuehling, Felix, Yang, Philip

[-- Attachment #1: Type: text/html, Size: 35647 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription
  2022-06-15 22:41         ` philip yang
@ 2022-06-16 18:21           ` Alex Deucher
  2022-06-17  6:34           ` Christian König
  1 sibling, 0 replies; 13+ messages in thread
From: Alex Deucher @ 2022-06-16 18:21 UTC (permalink / raw)
  To: philip yang
  Cc: Joshi, Mukul, Yang, Philip, Kuehling, Felix, amd-gfx,
	Christian König, Sider, Graham

On Wed, Jun 15, 2022 at 6:41 PM philip yang <yangp@amd.com> wrote:
>
>
> On 2022-06-15 10:06, Christian König wrote:
>
> Am 15.06.22 um 15:17 schrieb Sider, Graham:
>
> [AMD Official Use Only - General]
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Wednesday, June 15, 2022 3:29 AM
> To: Sider, Graham <Graham.Sider@amd.com>; amd-
> gfx@lists.freedesktop.org
> Cc: Joshi, Mukul <Mukul.Joshi@amd.com>; Kuehling, Felix
> <Felix.Kuehling@amd.com>; Yang, Philip <Philip.Yang@amd.com>
> Subject: Re: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue
> oversubscription
>
>
>
> Am 13.06.22 um 17:20 schrieb Graham Sider:
>
> Starting with GFX11, MES requires wptr BOs to be GTT allocated/mapped
> to GART for usermode queues in order to support oversubscription. In
> the case that work is submitted to an unmapped queue, MES must have a
> GART wptr address to determine whether the queue should be mapped.
>
> This change is accompanied with changes in MES and is applicable for
> MES_VERSION >= 3.
>
> v2:
> - Update MES_VERSION check from 2 to 3.
> v3:
> - Use amdgpu_vm_bo_lookup_mapping for wptr_bo mapping lookup
> - Move wptr_bo refcount increment to
>
> amdgpu_amdkfd_map_gtt_bo_to_gart
>
> - Remove list_del_init from amdgpu_amdkfd_map_gtt_bo_to_gart
> - Cleanup/fix create_queue wptr_bo error handling
>
> Signed-off-by: Graham Sider <Graham.Sider@amd.com>
> ---
>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>    .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 49
>
> +++++++++++++++++++
>
>    drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 37 +++++++++++++-
>    .../drm/amd/amdkfd/kfd_device_queue_manager.c |  9 +++-
>    .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  2 +
>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  3 ++
>    .../amd/amdkfd/kfd_process_queue_manager.c    | 17 +++++--
>    7 files changed, 110 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 429b16ba10bf..dba26d1e3be9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -301,6 +301,7 @@ int
>
> amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device
> *adev,
>
>            struct kgd_mem *mem, void **kptr, uint64_t *size);
>    void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct
>
> amdgpu_device *adev,
>
>            struct kgd_mem *mem);
> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev,
> +struct amdgpu_bo *bo);
>
>    int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
>                            struct dma_fence **ef);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index efab923056f4..888d08128a94 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -2030,6 +2030,55 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
>        return ret;
>    }
>
> +/**
> + * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and
>
> increment
>
> +reference count
> + * @adev: Device to which allocated BO belongs
> + * @bo: Buffer object to be mapped
> + *
> + * Before return, bo reference count is incremented. To release the
> +reference and unpin/
> + * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
> + */
> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev,
> +struct amdgpu_bo *bo) {
> +    int ret;
> +
> +    ret = amdgpu_bo_reserve(bo, true);
> +    if (ret) {
> +        pr_err("Failed to reserve bo. ret %d\n", ret);
> +        goto err_reserve_bo_failed;
> +    }
> +
> +    ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
> +    if (ret) {
> +        pr_err("Failed to pin bo. ret %d\n", ret);
> +        goto err_pin_bo_failed;
> +    }
>
>
> Oh! Is that something we do for every MQD? If yes, then this is pretty
> much a NAK.
>
> We can't do this, or we create a trivial denial-of-service attack against the kernel
> driver.
>
> Regards,
> Christian.
>
> Hi Christian, could you elaborate on this? Right now this is only being used to pin the queue wptr BO.
>
>
> Well is this wptr BO per process, per queue or global?
>
> amdgpu_bo_pin() is only allowed if we pin global resources, otherwise I have to reject that.
>
> The wptr BO is per queue, allocated as queue structure, 1 page in size, in system memory.
>
> KFD limits the number of queues globally (max_queues = 127; /* HWS limit */), so this will pin at most 508KB and take at most 127 GART page mappings.
>
> The wptr is updated by the app and read by HWS; if we don't pin the wptr BO, we have to evict the queue when the wptr BO is moved in system memory, then update the GART mapping and restore the queue.
>

I talked to Graham about this a bit offline.  This is a requirement
due to changes in the hardware and the way KFD doorbells work.  The
fact that we are limited by the max queues supported by the hardware
scheduler prevents this from getting out of hand.  We would need this
space whether the queues were kernel managed or user managed.

Acked-by: Alex Deucher <alexander.deucher@amd.com>

> Regards,
>
> Philip
>
>
> Regards,
> Christian.
>
>
> Best,
> Graham
>
> +
> +    ret = amdgpu_ttm_alloc_gart(&bo->tbo);
> +    if (ret) {
> +        pr_err("Failed to bind bo to GART. ret %d\n", ret);
> +        goto err_map_bo_gart_failed;
> +    }
> +
> +    amdgpu_amdkfd_remove_eviction_fence(
> +        bo, bo->kfd_bo->process_info->eviction_fence);
> +    list_del_init(&bo->kfd_bo->validate_list.head);
> +
> +    amdgpu_bo_unreserve(bo);
> +
> +    bo = amdgpu_bo_ref(bo);
> +
> +    return 0;
> +
> +err_map_bo_gart_failed:
> +    amdgpu_bo_unpin(bo);
> +err_pin_bo_failed:
> +    amdgpu_bo_unreserve(bo);
> +err_reserve_bo_failed:
> +
> +    return ret;
> +}
> +
>    int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct
>
> amdgpu_device *adev,
>
>            struct kgd_mem *mem, void **kptr, uint64_t *size)
>    {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index e9766e165c38..1789ed8b79f5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -289,6 +289,7 @@ static int kfd_ioctl_create_queue(struct file *filep,
>
> struct kfd_process *p,
>
>        struct kfd_process_device *pdd;
>        struct queue_properties q_properties;
>        uint32_t doorbell_offset_in_process = 0;
> +    struct amdgpu_bo *wptr_bo = NULL;
>
>        memset(&q_properties, 0, sizeof(struct queue_properties));
>
> @@ -316,12 +317,41 @@ static int kfd_ioctl_create_queue(struct file
>
> *filep, struct kfd_process *p,
>
>            goto err_bind_process;
>        }
>
> +    /* Starting with GFX11, wptr BOs must be mapped to GART for MES
>
> to determine work
>
> +     * on unmapped queues for usermode queue oversubscription (no
>
> aggregated doorbell)
>
> +     */
> +    if (dev->shared_resources.enable_mes && (dev->adev-
> mes.sched_version & 0xff) >= 3) {
> +        struct amdgpu_bo_va_mapping *wptr_mapping;
> +        struct amdgpu_vm *wptr_vm;
> +
> +        wptr_vm = drm_priv_to_vm(pdd->drm_priv);
> +        err = amdgpu_bo_reserve(wptr_vm->root.bo, false);
> +        if (err)
> +            goto err_wptr_map_gart;
> +
> +        wptr_mapping = amdgpu_vm_bo_lookup_mapping(
> +                wptr_vm, args->write_pointer_address >>
>
> PAGE_SHIFT);
>
> +        amdgpu_bo_unreserve(wptr_vm->root.bo);
> +        if (!wptr_mapping) {
> +            pr_err("Failed to lookup wptr bo\n");
> +            err = -EINVAL;
> +            goto err_wptr_map_gart;
> +        }
> +
> +        wptr_bo = wptr_mapping->bo_va->base.bo;
> +        err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev,
>
> wptr_bo);
>
> +        if (err) {
> +            pr_err("Failed to map wptr bo to GART\n");
> +            goto err_wptr_map_gart;
> +        }
> +    }
> +
>        pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
>                p->pasid,
>                dev->id);
>
> -    err = pqm_create_queue(&p->pqm, dev, filep, &q_properties,
>
> &queue_id, NULL, NULL, NULL,
>
> -            &doorbell_offset_in_process);
> +    err = pqm_create_queue(&p->pqm, dev, filep, &q_properties,
>
> &queue_id, wptr_bo,
>
> +            NULL, NULL, NULL, &doorbell_offset_in_process);
>        if (err != 0)
>            goto err_create_queue;
>
> @@ -354,6 +384,9 @@ static int kfd_ioctl_create_queue(struct file *filep,
>
> struct kfd_process *p,
>
>        return 0;
>
>    err_create_queue:
> +    if (wptr_bo)
> +        amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
> +err_wptr_map_gart:
>    err_bind_process:
>    err_pdd:
>        mutex_unlock(&p->mutex);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index b39d89c52887..d8de2fbdfc7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -208,6 +208,7 @@ static int add_queue_mes(struct
>
> device_queue_manager *dqm, struct queue *q,
>
>        struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>        struct mes_add_queue_input queue_input;
>        int r, queue_type;
> +    uint64_t wptr_addr_off;
>
>        if (dqm->is_hws_hang)
>            return -EIO;
> @@ -227,7 +228,13 @@ static int add_queue_mes(struct
>
> device_queue_manager *dqm, struct queue *q,
>     AMDGPU_MES_PRIORITY_LEVEL_NORMAL;
>
>        queue_input.doorbell_offset = q->properties.doorbell_off;
>        queue_input.mqd_addr = q->gart_mqd_addr;
> -    queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
> +
> +    if (q->wptr_bo) {
> +        wptr_addr_off = (uint64_t)q->properties.write_ptr -
>
> (uint64_t)q->wptr_bo->kfd_bo->va;
>
> +        queue_input.wptr_addr = ((uint64_t)q->wptr_bo-
> tbo.resource->start << PAGE_SHIFT) + wptr_addr_off;
> +    } else
> +        queue_input.wptr_addr = (uint64_t)q-
> properties.write_ptr;
> +
>        queue_input.paging = false;
>        queue_input.tba_addr = qpd->tba_addr;
>        queue_input.tma_addr = qpd->tma_addr; diff --git
> a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> index f1654b4da856..35e74bdd81da 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
> @@ -377,6 +377,8 @@ static void update_mqd_sdma(struct mqd_manager
>
> *mm, void *mqd,
>
>        m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >>
>
> 8);
>
>        m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q-
> read_ptr);
>        m->sdmax_rlcx_rb_rptr_addr_hi =
> upper_32_bits((uint64_t)q->read_ptr);
> +    m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q-
> write_ptr);
> +    m->sdmax_rlcx_rb_wptr_poll_addr_hi =
> +upper_32_bits((uint64_t)q->write_ptr);
>        m->sdmax_rlcx_doorbell_offset =
>            q->doorbell_off <<
>
> SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index a5d3963537d7..dcddee0d6f06 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -639,6 +639,8 @@ struct queue {
>        void *gang_ctx_bo;
>        uint64_t gang_ctx_gpu_addr;
>        void *gang_ctx_cpu_ptr;
> +
> +    struct amdgpu_bo *wptr_bo;
>    };
>
>    enum KFD_MQD_TYPE {
> @@ -1404,6 +1406,7 @@ int pqm_create_queue(struct
>
> process_queue_manager *pqm,
>
>                    struct file *f,
>                    struct queue_properties *properties,
>                    unsigned int *qid,
> +                struct amdgpu_bo *wptr_bo,
>                    const struct kfd_criu_queue_priv_data *q_data,
>                    const void *restore_mqd,
>                    const void *restore_ctl_stack, diff --git
> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index f99e09dc43ea..3a17c1ebc527 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -190,7 +190,8 @@ void pqm_uninit(struct process_queue_manager
>
> *pqm)
>
>    static int init_user_queue(struct process_queue_manager *pqm,
>                    struct kfd_dev *dev, struct queue **q,
>                    struct queue_properties *q_properties,
> -                struct file *f, unsigned int qid)
> +                struct file *f, struct amdgpu_bo *wptr_bo,
> +                unsigned int qid)
>    {
>        int retval;
>
> @@ -221,6 +222,7 @@ static int init_user_queue(struct
>
> process_queue_manager *pqm,
>
>                goto cleanup;
>            }
>            memset((*q)->gang_ctx_cpu_ptr, 0,
>
> AMDGPU_MES_GANG_CTX_SIZE);
>
> +        (*q)->wptr_bo = wptr_bo;
>        }
>
>        pr_debug("PQM After init queue");
> @@ -237,6 +239,7 @@ int pqm_create_queue(struct
>
> process_queue_manager *pqm,
>
>                    struct file *f,
>                    struct queue_properties *properties,
>                    unsigned int *qid,
> +                struct amdgpu_bo *wptr_bo,
>                    const struct kfd_criu_queue_priv_data *q_data,
>                    const void *restore_mqd,
>                    const void *restore_ctl_stack, @@ -299,7 +302,7
>
> @@ int
>
> pqm_create_queue(struct process_queue_manager *pqm,
>             * allocate_sdma_queue() in create_queue() has the
>             * corresponding check logic.
>             */
> -        retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
> +        retval = init_user_queue(pqm, dev, &q, properties, f,
>
> wptr_bo,
>
> +*qid);
>            if (retval != 0)
>                goto err_create_queue;
>            pqn->q = q;
> @@ -320,7 +323,7 @@ int pqm_create_queue(struct
>
> process_queue_manager *pqm,
>
>                goto err_create_queue;
>            }
>
> -        retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
> +        retval = init_user_queue(pqm, dev, &q, properties, f,
>
> wptr_bo,
>
> +*qid);
>            if (retval != 0)
>                goto err_create_queue;
>            pqn->q = q;
> @@ -457,9 +460,13 @@ int pqm_destroy_queue(struct
>
> process_queue_manager *pqm, unsigned int qid)
>
>                pdd->qpd.num_gws = 0;
>            }
>
> -        if (dev->shared_resources.enable_mes)
> +        if (dev->shared_resources.enable_mes) {
>                amdgpu_amdkfd_free_gtt_mem(dev->adev,
>                               pqn->q->gang_ctx_bo);
> +            if (pqn->q->wptr_bo)
> +                amdgpu_amdkfd_free_gtt_mem(dev-
> adev, pqn->q->wptr_bo);
> +
> +        }
>            uninit_queue(pqn->q);
>        }
>
> @@ -900,7 +907,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
>
>        print_queue_properties(&qp);
>
> -    ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp,
>
> &queue_id, q_data, mqd, ctl_stack,
>
> +    ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp,
>
> &queue_id,
>
> +NULL, q_data, mqd, ctl_stack,
>                    NULL);
>        if (ret) {
>            pr_err("Failed to create new queue err:%d\n", ret);
>
>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription
  2022-06-15 22:41         ` philip yang
  2022-06-16 18:21           ` Alex Deucher
@ 2022-06-17  6:34           ` Christian König
  1 sibling, 0 replies; 13+ messages in thread
From: Christian König @ 2022-06-17  6:34 UTC (permalink / raw)
  To: philip yang, Christian König, Sider, Graham, amd-gfx
  Cc: Joshi, Mukul, Kuehling, Felix, Yang, Philip

Hi Philip,

Am 16.06.22 um 00:41 schrieb philip yang:
> [SNIP]
>>>>> +    ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
>>>>> +    if (ret) {
>>>>> +        pr_err("Failed to pin bo. ret %d\n", ret);
>>>>> +        goto err_pin_bo_failed;
>>>>> +    }
>>>>
>>>> Oh! Is that something we do for every MQD? If yes, then this is
>>>> pretty much a NAK.
>>>>
>>>> We can't do this, or we create a trivial denial-of-service attack
>>>> against the kernel driver.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>> Hi Christian, could you elaborate on this? Right now this is only 
>>> being used to pin the queue wptr BO.
>>
>> Well is this wptr BO per process, per queue or global?
>>
>> amdgpu_bo_pin() is only allowed if we pin global resources, otherwise 
>> I have to reject that.
>
> wptr BO is per queue, allocated as queue structure, 1 page size on 
> system memory.
>

Yeah, I was hoping for this explanation as well. My understanding was
still that the WPTR and RPTR are part of the ring buffer.

We should add a check here that we really only pin a buffer of one page
in size; then that should be ok.

Regards,
Christian.

> KFD limits the number of queues globally (max_queues = 127; /* HWS
> limit */), so this will pin at most 508KB and take at most 127 GART
> page mappings.
>
> The wptr is updated by the app and read by HWS; if we don't pin the
> wptr BO, we have to evict the queue when the wptr BO is moved in system
> memory, then update the GART mapping and restore the queue.
>
> Regards,
>
> Philip
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Best,
>>> Graham
>>>
>>>>> +
>>>>> +    ret = amdgpu_ttm_alloc_gart(&bo->tbo);
>>>>> +    if (ret) {
>>>>> +        pr_err("Failed to bind bo to GART. ret %d\n", ret);
>>>>> +        goto err_map_bo_gart_failed;
>>>>> +    }
>>>>> +
>>>>> +    amdgpu_amdkfd_remove_eviction_fence(
>>>>> +        bo, bo->kfd_bo->process_info->eviction_fence);
>>>>> + list_del_init(&bo->kfd_bo->validate_list.head);
>>>>> +
>>>>> +    amdgpu_bo_unreserve(bo);
>>>>> +
>>>>> +    bo = amdgpu_bo_ref(bo);
>>>>> +
>>>>> +    return 0;
>>>>> +
>>>>> +err_map_bo_gart_failed:
>>>>> +    amdgpu_bo_unpin(bo);
>>>>> +err_pin_bo_failed:
>>>>> +    amdgpu_bo_unreserve(bo);
>>>>> +err_reserve_bo_failed:
>>>>> +
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>>>    int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct
>>>> amdgpu_device *adev,
>>>>>            struct kgd_mem *mem, void **kptr, uint64_t *size)
>>>>>    {
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> index e9766e165c38..1789ed8b79f5 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> @@ -289,6 +289,7 @@ static int kfd_ioctl_create_queue(struct file 
>>>>> *filep,
>>>> struct kfd_process *p,
>>>>>        struct kfd_process_device *pdd;
>>>>>        struct queue_properties q_properties;
>>>>>        uint32_t doorbell_offset_in_process = 0;
>>>>> +    struct amdgpu_bo *wptr_bo = NULL;
>>>>>
>>>>>        memset(&q_properties, 0, sizeof(struct queue_properties));
>>>>>
>>>>> @@ -316,12 +317,41 @@ static int kfd_ioctl_create_queue(struct file
>>>> *filep, struct kfd_process *p,
>>>>>            goto err_bind_process;
>>>>>        }
>>>>>
>>>>> +    /* Starting with GFX11, wptr BOs must be mapped to GART for MES
>>>> to determine work
>>>>> +     * on unmapped queues for usermode queue oversubscription (no
>>>> aggregated doorbell)
>>>>> +     */
>>>>> +    if (dev->shared_resources.enable_mes && (dev->adev-
>>>>> mes.sched_version & 0xff) >= 3) {
>>>>> +        struct amdgpu_bo_va_mapping *wptr_mapping;
>>>>> +        struct amdgpu_vm *wptr_vm;
>>>>> +
>>>>> +        wptr_vm = drm_priv_to_vm(pdd->drm_priv);
>>>>> +        err = amdgpu_bo_reserve(wptr_vm->root.bo, false);
>>>>> +        if (err)
>>>>> +            goto err_wptr_map_gart;
>>>>> +
>>>>> +        wptr_mapping = amdgpu_vm_bo_lookup_mapping(
>>>>> +                wptr_vm, args->write_pointer_address >>
>>>> PAGE_SHIFT);
>>>>> + amdgpu_bo_unreserve(wptr_vm->root.bo);
>>>>> +        if (!wptr_mapping) {
>>>>> +            pr_err("Failed to lookup wptr bo\n");
>>>>> +            err = -EINVAL;
>>>>> +            goto err_wptr_map_gart;
>>>>> +        }
>>>>> +
>>>>> +        wptr_bo = wptr_mapping->bo_va->base.bo;
>>>>> +        err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev,
>>>> wptr_bo);
>>>>> +        if (err) {
>>>>> +            pr_err("Failed to map wptr bo to GART\n");
>>>>> +            goto err_wptr_map_gart;
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>>        pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
>>>>>                p->pasid,
>>>>>                dev->id);
>>>>>
>>>>> -    err = pqm_create_queue(&p->pqm, dev, filep, &q_properties,
>>>> &queue_id, NULL, NULL, NULL,
>>>>> - &doorbell_offset_in_process);
>>>>> +    err = pqm_create_queue(&p->pqm, dev, filep, &q_properties,
>>>> &queue_id, wptr_bo,
>>>>> +            NULL, NULL, NULL, &doorbell_offset_in_process);
>>>>>        if (err != 0)
>>>>>            goto err_create_queue;
>>>>>
>>>>> @@ -354,6 +384,9 @@ static int kfd_ioctl_create_queue(struct file 
>>>>> *filep,
>>>> struct kfd_process *p,
>>>>>        return 0;
>>>>>
>>>>>    err_create_queue:
>>>>> +    if (wptr_bo)
>>>>> +        amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
>>>>> +err_wptr_map_gart:
>>>>>    err_bind_process:
>>>>>    err_pdd:
>>>>>        mutex_unlock(&p->mutex);
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> index b39d89c52887..d8de2fbdfc7d 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> @@ -208,6 +208,7 @@ static int add_queue_mes(struct
>>>> device_queue_manager *dqm, struct queue *q,
>>>>>        struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>>>>>        struct mes_add_queue_input queue_input;
>>>>>        int r, queue_type;
>>>>> +    uint64_t wptr_addr_off;
>>>>>
>>>>>        if (dqm->is_hws_hang)
>>>>>            return -EIO;
>>>>> @@ -227,7 +228,13 @@ static int add_queue_mes(struct
>>>> device_queue_manager *dqm, struct queue *q,
>>>>     AMDGPU_MES_PRIORITY_LEVEL_NORMAL;
>>>>>        queue_input.doorbell_offset = q->properties.doorbell_off;
>>>>>        queue_input.mqd_addr = q->gart_mqd_addr;
>>>>> -    queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
>>>>> +
>>>>> +    if (q->wptr_bo) {
>>>>> +        wptr_addr_off = (uint64_t)q->properties.write_ptr -
>>>> (uint64_t)q->wptr_bo->kfd_bo->va;
>>>>> +        queue_input.wptr_addr = ((uint64_t)q->wptr_bo-
>>>>> tbo.resource->start << PAGE_SHIFT) + wptr_addr_off;
>>>>> +    } else
>>>>> +        queue_input.wptr_addr = (uint64_t)q-
>>>>> properties.write_ptr;
>>>>> +
>>>>>        queue_input.paging = false;
>>>>>        queue_input.tba_addr = qpd->tba_addr;
>>>>>        queue_input.tma_addr = qpd->tma_addr; diff --git
>>>>> a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>>>>> index f1654b4da856..35e74bdd81da 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>>>>> @@ -377,6 +377,8 @@ static void update_mqd_sdma(struct mqd_manager
>>>> *mm, void *mqd,
>>>>>        m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >>
>>>> 8);
>>>>> m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q-
>>>>> read_ptr);
>>>>>        m->sdmax_rlcx_rb_rptr_addr_hi =
>>>>> upper_32_bits((uint64_t)q->read_ptr);
>>>>> +    m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q-
>>>>> write_ptr);
>>>>> +    m->sdmax_rlcx_rb_wptr_poll_addr_hi =
>>>>> +upper_32_bits((uint64_t)q->write_ptr);
>>>>>        m->sdmax_rlcx_doorbell_offset =
>>>>>            q->doorbell_off <<
>>>> SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> index a5d3963537d7..dcddee0d6f06 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> @@ -639,6 +639,8 @@ struct queue {
>>>>>        void *gang_ctx_bo;
>>>>>        uint64_t gang_ctx_gpu_addr;
>>>>>        void *gang_ctx_cpu_ptr;
>>>>> +
>>>>> +    struct amdgpu_bo *wptr_bo;
>>>>>    };
>>>>>
>>>>>    enum KFD_MQD_TYPE {
>>>>> @@ -1404,6 +1406,7 @@ int pqm_create_queue(struct
>>>> process_queue_manager *pqm,
>>>>>                    struct file *f,
>>>>>                    struct queue_properties *properties,
>>>>>                    unsigned int *qid,
>>>>> +                struct amdgpu_bo *wptr_bo,
>>>>>                    const struct kfd_criu_queue_priv_data *q_data,
>>>>>                    const void *restore_mqd,
>>>>>                    const void *restore_ctl_stack, diff --git
>>>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> index f99e09dc43ea..3a17c1ebc527 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> @@ -190,7 +190,8 @@ void pqm_uninit(struct process_queue_manager
>>>> *pqm)
>>>>>    static int init_user_queue(struct process_queue_manager *pqm,
>>>>>                    struct kfd_dev *dev, struct queue **q,
>>>>>                    struct queue_properties *q_properties,
>>>>> -                struct file *f, unsigned int qid)
>>>>> +                struct file *f, struct amdgpu_bo *wptr_bo,
>>>>> +                unsigned int qid)
>>>>>    {
>>>>>        int retval;
>>>>>
>>>>> @@ -221,6 +222,7 @@ static int init_user_queue(struct
>>>> process_queue_manager *pqm,
>>>>>                goto cleanup;
>>>>>            }
>>>>>            memset((*q)->gang_ctx_cpu_ptr, 0,
>>>> AMDGPU_MES_GANG_CTX_SIZE);
>>>>> +        (*q)->wptr_bo = wptr_bo;
>>>>>        }
>>>>>
>>>>>        pr_debug("PQM After init queue");
>>>>> @@ -237,6 +239,7 @@ int pqm_create_queue(struct
>>>> process_queue_manager *pqm,
>>>>>                    struct file *f,
>>>>>                    struct queue_properties *properties,
>>>>>                    unsigned int *qid,
>>>>> +                struct amdgpu_bo *wptr_bo,
>>>>>                    const struct kfd_criu_queue_priv_data *q_data,
>>>>>                    const void *restore_mqd,
>>>>>                    const void *restore_ctl_stack, @@ -299,7 +302,7
>>>> @@ int
>>>>> pqm_create_queue(struct process_queue_manager *pqm,
>>>>>             * allocate_sdma_queue() in create_queue() has the
>>>>>             * corresponding check logic.
>>>>>             */
>>>>> -        retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
>>>>> +        retval = init_user_queue(pqm, dev, &q, properties, f,
>>>> wptr_bo,
>>>>> +*qid);
>>>>>            if (retval != 0)
>>>>>                goto err_create_queue;
>>>>>            pqn->q = q;
>>>>> @@ -320,7 +323,7 @@ int pqm_create_queue(struct
>>>> process_queue_manager *pqm,
>>>>>                goto err_create_queue;
>>>>>            }
>>>>>
>>>>> -        retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
>>>>> +        retval = init_user_queue(pqm, dev, &q, properties, f,
>>>> wptr_bo,
>>>>> +*qid);
>>>>>            if (retval != 0)
>>>>>                goto err_create_queue;
>>>>>            pqn->q = q;
>>>>> @@ -457,9 +460,13 @@ int pqm_destroy_queue(struct
>>>> process_queue_manager *pqm, unsigned int qid)
>>>>>                pdd->qpd.num_gws = 0;
>>>>>            }
>>>>>
>>>>> -        if (dev->shared_resources.enable_mes)
>>>>> +        if (dev->shared_resources.enable_mes) {
>>>>>                amdgpu_amdkfd_free_gtt_mem(dev->adev,
>>>>>                               pqn->q->gang_ctx_bo);
>>>>> +            if (pqn->q->wptr_bo)
>>>>> +                amdgpu_amdkfd_free_gtt_mem(dev-
>>>>> adev, pqn->q->wptr_bo);
>>>>> +
>>>>> +        }
>>>>>            uninit_queue(pqn->q);
>>>>>        }
>>>>>
>>>>> @@ -900,7 +907,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
>>>>>
>>>>>        print_queue_properties(&qp);
>>>>>
>>>>> -    ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp,
>>>> &queue_id, q_data, mqd, ctl_stack,
>>>>> +    ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp,
>>>> &queue_id,
>>>>> +NULL, q_data, mqd, ctl_stack,
>>>>>                    NULL);
>>>>>        if (ret) {
>>>>>            pr_err("Failed to create new queue err:%d\n", ret);
>>


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-06-17  6:34 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-13 15:19 [PATCH v3 1/3] drm/amdgpu: Fetch MES scheduler/KIQ versions Graham Sider
2022-06-13 15:20 ` [PATCH v3 2/3] drm/amdkfd: Enable GFX11 usermode queue oversubscription Graham Sider
2022-06-14 18:22   ` philip yang
2022-06-14 19:46     ` Sider, Graham
2022-06-15  7:28   ` Christian König
2022-06-15 13:17     ` Sider, Graham
2022-06-15 14:06       ` Christian König
2022-06-15 22:41         ` philip yang
2022-06-16 18:21           ` Alex Deucher
2022-06-17  6:34           ` Christian König
2022-06-13 15:20 ` [PATCH v3 3/3] drm/amdgpu: Update mes_v11_api_def.h Graham Sider
2022-06-14  8:29   ` Xiao, Jack
2022-06-14  8:25 ` [PATCH v3 1/3] drm/amdgpu: Fetch MES scheduler/KIQ versions Xiao, Jack

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.