* [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
@ 2021-08-20  5:32 Joseph Greathouse
  2021-08-20  5:32 ` [PATCH 2/3] drm/amdgpu: Use SDMA1 for buffer movement on Aldebaran Joseph Greathouse
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Joseph Greathouse @ 2021-08-20  5:32 UTC (permalink / raw)
  To: amd-gfx; +Cc: Joseph Greathouse

Give every process at most one queue from each SDMA engine.
Previously, we allocated all SDMA engines and queues on a first-
come-first-serve basis. This meant that it was possible for two
processes racing on their allocation requests to each end up with
two queues on the same SDMA engine. That could serialize the
performance of commands to different queues.

This new mechanism allows each process at most a single queue
on each SDMA engine. Processes will check for queue availability on
SDMA engine 0, then 1, then 2, etc. and only allocate on that
engine if there is an available queue slot. If there are no
queue slots available (or if this process has already allocated
a queue on this engine), it moves on and tries the next engine.

The Aldebaran chip has a small quirk that SDMA0 should only be
used by the very first allocation from each process. It is OK to
use any other SDMA engine at any time, but after the first SDMA
allocation request from a process, the resulting engine must
be from SDMA1 or above. This patch handles this case as well.
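
An illustration of the policy (not part of the patch): the standalone
userspace sketch below mimics the per-process engine walk, assuming 2
PCIe-optimized engines with 8 queues each. The names and constants here
(engine_queue_mask, pick_sdma_queue, NUM_ENGINES, QUEUES_PER_ENGINE,
is_aldebaran) are invented for the example; the real code works on
dqm->sdma_bitmap and qpd->sdma_engine_bitmap under the DQM lock.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_ENGINES		2
#define QUEUES_PER_ENGINE	8

/* Queue ids are interleaved across engines (id % NUM_ENGINES == engine),
 * so the queue slots belonging to one engine form a strided bit mask.
 */
static uint64_t engine_queue_mask(unsigned int engine)
{
	uint64_t mask = 0;
	unsigned int bit;

	for (bit = engine; bit < NUM_ENGINES * QUEUES_PER_ENGINE; bit += NUM_ENGINES)
		mask |= 1ULL << bit;
	return mask;
}

/* Walk the engines this process has not used yet, drop SDMA0 from the
 * process after its first pass on "Aldebaran", and take the lowest free
 * queue slot on the first engine that has one. Returns the global queue
 * id, or -1 if no engine has a free slot left for this process.
 */
static int pick_sdma_queue(uint64_t *free_queues, uint64_t *proc_engines,
			   bool is_aldebaran)
{
	unsigned int engine;

	for (engine = 0; engine < NUM_ENGINES; engine++) {
		uint64_t avail;

		if (!(*proc_engines & (1ULL << engine)))
			continue;	/* already holds a queue on this engine */

		if (is_aldebaran && engine == 0)
			*proc_engines &= ~1ULL;	/* SDMA0: first allocation only */

		avail = engine_queue_mask(engine) & *free_queues;
		if (!avail)
			continue;

		*proc_engines &= ~(1ULL << engine);
		*free_queues &= ~(avail & -avail);	/* clear lowest free slot */
		return __builtin_ctzll(avail);
	}
	return -1;
}

int main(void)
{
	uint64_t free_queues = (1ULL << (NUM_ENGINES * QUEUES_PER_ENGINE)) - 1;
	uint64_t proc_engines = (1ULL << NUM_ENGINES) - 1;
	int i;

	/* Prints queue 0 (SDMA0), queue 1 (SDMA1), then -1: the process
	 * already holds a queue on every engine it is allowed to use.
	 */
	for (i = 0; i < 3; i++)
		printf("allocation %d -> queue %d\n", i,
		       pick_sdma_queue(&free_queues, &proc_engines, true));
	return 0;
}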

Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
 3 files changed, 107 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f8fce9d05f50..86bdb765f350 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 static int map_queues_cpsch(struct device_queue_manager *dqm);
 
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
+				struct qcm_process_device *qpd,
 				struct queue *q);
 
 static inline void deallocate_hqd(struct device_queue_manager *dqm,
 				struct queue *q);
 static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
 static int allocate_sdma_queue(struct device_queue_manager *dqm,
+				struct qcm_process_device *qpd,
 				struct queue *q);
 static void kfd_process_hw_exception(struct work_struct *work);
 
@@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 			q->pipe, q->queue);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
 		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
-		retval = allocate_sdma_queue(dqm, q);
+		retval = allocate_sdma_queue(dqm, qpd, q);
 		if (retval)
 			goto deallocate_vmid;
 		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
@@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 		deallocate_hqd(dqm, q);
 	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
 		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue(dqm, qpd, q);
 deallocate_vmid:
 	if (list_empty(&qpd->queues_list))
 		deallocate_vmid(dqm, qpd, q);
@@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
 		deallocate_hqd(dqm, q);
 	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue(dqm, qpd, q);
 	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue(dqm, qpd, q);
 	else {
 		pr_debug("q->properties.type %d is invalid\n",
 				q->properties.type);
@@ -1039,42 +1041,93 @@ static void pre_reset(struct device_queue_manager *dqm)
 	dqm_unlock(dqm);
 }
 
+static uint64_t sdma_engine_mask(int engine, int num_engines)
+{
+	uint64_t mask = 0;
+
+	engine %= num_engines;
+	while (engine < (sizeof(uint64_t)*8)) {
+		mask |= 1ULL << engine;
+		engine += num_engines;
+	}
+	return mask;
+}
+
 static int allocate_sdma_queue(struct device_queue_manager *dqm,
+				struct qcm_process_device *qpd,
 				struct queue *q)
 {
-	int bit;
+	uint64_t available_queue_bitmap;
+	unsigned int bit, engine, num_engines;
+	bool found_sdma = false;
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
-		if (dqm->sdma_bitmap == 0) {
+		num_engines = get_num_sdma_engines(dqm);
+		for_each_set_bit(engine, &(qpd->sdma_engine_bitmap), num_engines) {
+			/* Do not reuse SDMA0 for any subsequent SDMA queue
+			 * requests on Aldebaran. If SDMA0's queues are all
+			 * full, then this process should never use SDMA0
+			 * for any further requests
+			 */
+			if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
+					engine == 0)
+				qpd->sdma_engine_bitmap &= ~(1ULL << engine);
+
+			available_queue_bitmap = sdma_engine_mask(engine, num_engines);
+			available_queue_bitmap &= dqm->sdma_bitmap;
+
+			if (!available_queue_bitmap)
+				continue;
+			/* Take the selected engine off the list so we will not
+			 * allocate two queues onto the same engine
+			 */
+			qpd->sdma_engine_bitmap &= ~(1ULL << engine);
+			found_sdma = true;
+
+			bit = __ffs64(available_queue_bitmap);
+			dqm->sdma_bitmap &= ~(1ULL << bit);
+			q->sdma_id = bit;
+			q->properties.sdma_engine_id = q->sdma_id % num_engines;
+			q->properties.sdma_queue_id = q->sdma_id / num_engines;
+			break;
+		}
+		if (!found_sdma) {
 			pr_err("No more SDMA queue to allocate\n");
 			return -ENOMEM;
 		}
-
-		bit = __ffs64(dqm->sdma_bitmap);
-		dqm->sdma_bitmap &= ~(1ULL << bit);
-		q->sdma_id = bit;
-		q->properties.sdma_engine_id = q->sdma_id %
-				get_num_sdma_engines(dqm);
-		q->properties.sdma_queue_id = q->sdma_id /
-				get_num_sdma_engines(dqm);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
-		if (dqm->xgmi_sdma_bitmap == 0) {
+		num_engines = get_num_xgmi_sdma_engines(dqm);
+		for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap), num_engines) {
+			available_queue_bitmap = sdma_engine_mask(engine, num_engines);
+			available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
+
+			if (!available_queue_bitmap)
+				continue;
+			/* Take the selected engine off the list so we will not
+			 * allocate two queues onto the same engine
+			 */
+			qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
+			found_sdma = true;
+
+			bit = __ffs64(available_queue_bitmap);
+			dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
+			q->sdma_id = bit;
+			/* sdma_engine_id is sdma id including
+			 * both PCIe-optimized SDMAs and XGMI-
+			 * optimized SDMAs. The calculation below
+			 * assumes the first N engines are always
+			 * PCIe-optimized ones
+			 */
+			q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
+				q->sdma_id % get_num_xgmi_sdma_engines(dqm);
+			q->properties.sdma_queue_id = q->sdma_id /
+				get_num_xgmi_sdma_engines(dqm);
+			break;
+		}
+		if (!found_sdma) {
 			pr_err("No more XGMI SDMA queue to allocate\n");
 			return -ENOMEM;
 		}
-		bit = __ffs64(dqm->xgmi_sdma_bitmap);
-		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
-		q->sdma_id = bit;
-		/* sdma_engine_id is sdma id including
-		 * both PCIe-optimized SDMAs and XGMI-
-		 * optimized SDMAs. The calculation below
-		 * assumes the first N engines are always
-		 * PCIe-optimized ones
-		 */
-		q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
-				q->sdma_id % get_num_xgmi_sdma_engines(dqm);
-		q->properties.sdma_queue_id = q->sdma_id /
-				get_num_xgmi_sdma_engines(dqm);
 	}
 
 	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
@@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 }
 
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
+				struct qcm_process_device *qpd,
 				struct queue *q)
 {
+	uint32_t engine = q->properties.sdma_engine_id;
+
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		if (q->sdma_id >= get_num_sdma_queues(dqm))
 			return;
 		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
+		/* Don't give SDMA0 back to be reallocated on Aldebaran.
+		 * It is only OK to use this engine from the first allocation
+		 * within a process.
+		 */
+		if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
+					engine == 0))
+			qpd->sdma_engine_bitmap |= (1ULL << engine);
+
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
 			return;
 		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
+		/* engine ID in the queue properties is the global engine ID.
+		 * The XGMI engine bitmap ignores the PCIe-optimized engines.
+		 */
+		engine -= get_num_sdma_engines(dqm);
+		qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
 	}
 }
 
@@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
 		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		dqm_lock(dqm);
-		retval = allocate_sdma_queue(dqm, q);
+		retval = allocate_sdma_queue(dqm, qpd, q);
 		dqm_unlock(dqm);
 		if (retval)
 			goto out;
@@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
 		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		dqm_lock(dqm);
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue(dqm, qpd, q);
 		dqm_unlock(dqm);
 	}
 out:
@@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
 	    (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue(dqm, qpd, q);
 		pdd->sdma_past_activity_counter += sdma_val;
 	}
 
@@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	/* Clear all user mode queues */
 	list_for_each_entry(q, &qpd->queues_list, list) {
 		if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
-			deallocate_sdma_queue(dqm, q);
+			deallocate_sdma_queue(dqm, qpd, q);
 		else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
-			deallocate_sdma_queue(dqm, q);
+			deallocate_sdma_queue(dqm, qpd, q);
 
 		if (q->properties.is_active) {
 			decrement_queue_count(dqm, q->properties.type);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ab83b0de6b22..c38eebc9db4d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -576,6 +576,8 @@ struct qcm_process_device {
 	struct list_head priv_queue_list;
 
 	unsigned int queue_count;
+	unsigned long sdma_engine_bitmap;
+	unsigned long xgmi_sdma_engine_bitmap;
 	unsigned int vmid;
 	bool is_debug;
 	unsigned int evicted; /* eviction counter, 0=active */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 21ec8a18cad2..13c85624bf7d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1422,6 +1422,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 							struct kfd_process *p)
 {
 	struct kfd_process_device *pdd = NULL;
+	const struct kfd_device_info *dev_info = dev->dqm->dev->device_info;
 
 	if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
 		return NULL;
@@ -1446,6 +1447,8 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 	pdd->qpd.pqm = &p->pqm;
 	pdd->qpd.evicted = 0;
 	pdd->qpd.mapped_gws_queue = false;
+	pdd->qpd.sdma_engine_bitmap = BIT_ULL(dev_info->num_sdma_engines) - 1;
+	pdd->qpd.xgmi_sdma_engine_bitmap = BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
 	pdd->process = p;
 	pdd->bound = PDD_UNBOUND;
 	pdd->already_dequeued = false;
-- 
2.20.1



* [PATCH 2/3] drm/amdgpu: Use SDMA1 for buffer movement on Aldebaran
  2021-08-20  5:32 [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly Joseph Greathouse
@ 2021-08-20  5:32 ` Joseph Greathouse
  2021-08-20  6:59   ` Christian König
  2021-08-20  5:32 ` [PATCH 3/3] drm/amdkfd: Spread out XGMI SDMA allocations Joseph Greathouse
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Joseph Greathouse @ 2021-08-20  5:32 UTC (permalink / raw)
  To: amd-gfx; +Cc: Joseph Greathouse

Aldebaran should not use SDMA0 for buffer funcs such as page migration.
Instead, we move over to SDMA1 for these features. Leave SDMA0 in
charge for all other existing chips to avoid any possibility of
regressions.

Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 8931000dcd41..771630d7bb3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -2689,11 +2689,15 @@ static const struct amdgpu_buffer_funcs sdma_v4_0_buffer_funcs = {
 
 static void sdma_v4_0_set_buffer_funcs(struct amdgpu_device *adev)
 {
+	int engine = 0;
+
+	if (adev->asic_type == CHIP_ALDEBARAN)
+		engine = 1;
 	adev->mman.buffer_funcs = &sdma_v4_0_buffer_funcs;
 	if (adev->sdma.has_page_queue)
-		adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].page;
+		adev->mman.buffer_funcs_ring = &adev->sdma.instance[engine].page;
 	else
-		adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].ring;
+		adev->mman.buffer_funcs_ring = &adev->sdma.instance[engine].ring;
 }
 
 static const struct amdgpu_vm_pte_funcs sdma_v4_0_vm_pte_funcs = {
-- 
2.20.1



* [PATCH 3/3] drm/amdkfd: Spread out XGMI SDMA allocations
  2021-08-20  5:32 [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly Joseph Greathouse
  2021-08-20  5:32 ` [PATCH 2/3] drm/amdgpu: Use SDMA1 for buffer movement on Aldebaran Joseph Greathouse
@ 2021-08-20  5:32 ` Joseph Greathouse
  2021-08-20 22:03 ` [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly Felix Kuehling
  2021-08-23  7:08 ` Lazar, Lijo
  3 siblings, 0 replies; 13+ messages in thread
From: Joseph Greathouse @ 2021-08-20  5:32 UTC (permalink / raw)
  To: amd-gfx; +Cc: Joseph Greathouse

Avoid hotspotting of allocations of SDMA engines from the
XGMI pool by making each process attempt to allocate engines
starting from the engine after the last one that was allocated.
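
A simplified sketch of the rotation idea (not part of the patch), assuming
5 XGMI-optimized engines; the names here are invented, while the real code
keeps the rotation point in dqm->next_xgmi_engine, records the per-process
starting engine in qpd->initial_xgmi_engine, and still walks the
per-process engine bitmap from that offset:

#include <stdio.h>

#define NUM_XGMI_ENGINES 5

/* Device-wide rotation point: the engine the next process should try
 * first for its XGMI SDMA allocations.
 */
static int next_xgmi_engine;

/* A new process records where it starts probing and bumps the shared
 * rotation point, so successive processes begin on different engines
 * instead of all hammering the first XGMI engine.
 */
static int initial_engine_for_new_process(void)
{
	int start = next_xgmi_engine;

	next_xgmi_engine = (next_xgmi_engine + 1) % NUM_XGMI_ENGINES;
	return start;
}

int main(void)
{
	int p;

	/* Prints 0, 1, 2, 3, 4, 0, 1: the starting engine rotates across
	 * processes rather than hotspotting on engine 0.
	 */
	for (p = 0; p < 7; p++)
		printf("process %d starts probing at XGMI engine %d\n",
		       p, initial_engine_for_new_process());
	return 0;
}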

Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 8 +++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 | 1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c              | 1 +
 4 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 86bdb765f350..7f06ad6bdcd2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1096,9 +1096,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 			return -ENOMEM;
 		}
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		if (qpd->initial_xgmi_engine == -1)
+			qpd->initial_xgmi_engine = dqm->next_xgmi_engine;
 		num_engines = get_num_xgmi_sdma_engines(dqm);
 		for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap), num_engines) {
-			available_queue_bitmap = sdma_engine_mask(engine, num_engines);
+			available_queue_bitmap = sdma_engine_mask(
+					qpd->initial_xgmi_engine + engine, num_engines);
 			available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
 
 			if (!available_queue_bitmap)
@@ -1109,6 +1112,9 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 			qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
 			found_sdma = true;
 
+			dqm->next_xgmi_engine = qpd->initial_xgmi_engine + engine + 1;
+			dqm->next_xgmi_engine %= num_engines;
+
 			bit = __ffs64(available_queue_bitmap);
 			dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
 			q->sdma_id = bit;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index c8719682c4da..b5955e7401e5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -183,6 +183,8 @@ struct device_queue_manager {
 	unsigned int		*allocated_queues;
 	uint64_t		sdma_bitmap;
 	uint64_t		xgmi_sdma_bitmap;
+	/* which XGMI engine the next process should attempt to start on */
+	unsigned int		next_xgmi_engine;
 	/* the pasid mapping for each kfd vmid */
 	uint16_t		vmid_pasid[VMID_NUM];
 	uint64_t		pipelines_addr;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index c38eebc9db4d..bcf56280c7ed 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -578,6 +578,7 @@ struct qcm_process_device {
 	unsigned int queue_count;
 	unsigned long sdma_engine_bitmap;
 	unsigned long xgmi_sdma_engine_bitmap;
+	int initial_xgmi_engine;
 	unsigned int vmid;
 	bool is_debug;
 	unsigned int evicted; /* eviction counter, 0=active */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 13c85624bf7d..24ce1b52a1d5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1449,6 +1449,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 	pdd->qpd.mapped_gws_queue = false;
 	pdd->qpd.sdma_engine_bitmap = BIT_ULL(dev_info->num_sdma_engines) - 1;
 	pdd->qpd.xgmi_sdma_engine_bitmap = BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
+	pdd->qpd.initial_xgmi_engine = -1;
 	pdd->process = p;
 	pdd->bound = PDD_UNBOUND;
 	pdd->already_dequeued = false;
-- 
2.20.1



* Re: [PATCH 2/3] drm/amdgpu: Use SDMA1 for buffer movement on Aldebaran
  2021-08-20  5:32 ` [PATCH 2/3] drm/amdgpu: Use SDMA1 for buffer movement on Aldebaran Joseph Greathouse
@ 2021-08-20  6:59   ` Christian König
  2021-08-20 23:27     ` Greathouse, Joseph
  0 siblings, 1 reply; 13+ messages in thread
From: Christian König @ 2021-08-20  6:59 UTC (permalink / raw)
  To: Joseph Greathouse, amd-gfx



Am 20.08.21 um 07:32 schrieb Joseph Greathouse:
> Aldebaran should not use SDMA0 for buffer funcs such as page migration.
> Instead, we move over to SDMA1 for these features. Leave SDMA0 in
> charge for all other existing chips to avoid any possibility of
> regressions.

The part explaining why we do this is missing; apart from that, it looks good to me.

Christian.

>
> Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 8 ++++++--
>   1 file changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 8931000dcd41..771630d7bb3f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -2689,11 +2689,15 @@ static const struct amdgpu_buffer_funcs sdma_v4_0_buffer_funcs = {
>   
>   static void sdma_v4_0_set_buffer_funcs(struct amdgpu_device *adev)
>   {
> +	int engine = 0;
> +
> +	if (adev->asic_type == CHIP_ALDEBARAN)
> +		engine = 1;
>   	adev->mman.buffer_funcs = &sdma_v4_0_buffer_funcs;
>   	if (adev->sdma.has_page_queue)
> -		adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].page;
> +		adev->mman.buffer_funcs_ring = &adev->sdma.instance[engine].page;
>   	else
> -		adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].ring;
> +		adev->mman.buffer_funcs_ring = &adev->sdma.instance[engine].ring;
>   }
>   
>   static const struct amdgpu_vm_pte_funcs sdma_v4_0_vm_pte_funcs = {



* Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
  2021-08-20  5:32 [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly Joseph Greathouse
  2021-08-20  5:32 ` [PATCH 2/3] drm/amdgpu: Use SDMA1 for buffer movement on Aldebaran Joseph Greathouse
  2021-08-20  5:32 ` [PATCH 3/3] drm/amdkfd: Spread out XGMI SDMA allocations Joseph Greathouse
@ 2021-08-20 22:03 ` Felix Kuehling
  2021-08-23  7:08 ` Lazar, Lijo
  3 siblings, 0 replies; 13+ messages in thread
From: Felix Kuehling @ 2021-08-20 22:03 UTC (permalink / raw)
  To: amd-gfx, Greathouse, Joseph

Am 2021-08-20 um 1:32 a.m. schrieb Joseph Greathouse:
> Give every process at most one queue from each SDMA engine.
> Previously, we allocated all SDMA engines and queues on a first-
> come-first-serve basis. This meant that it was possible for two
> processes racing on their allocation requests to each end up with
> two queues on the same SDMA engine. That could serialize the
> performance of commands to different queues.
>
> This new mechanism allows each process at most a single queue
> on each SDMA engine. Processes will check for queue availability on
> SDMA engine 0, then 1, then 2, etc. and only allocate on that
> engine if there is an available queue slot. If there are no
> queue slots available (or if this process has already allocated
> a queue on this engine), it moves on and tries the next engine.
>
> The Aldebaran chip has a small quirk that SDMA0 should only be
> used by the very first allocation from each process. It is OK to
> use any other SDMA engine at any time, but after the first SDMA
> allocation request from a process, the resulting engine must
> be from SDMA1 or above. This patch handles this case as well.
>
> Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>

One final nit-pick inline. With that fixed, the series is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
>  3 files changed, 107 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index f8fce9d05f50..86bdb765f350 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>  static int map_queues_cpsch(struct device_queue_manager *dqm);
>  
>  static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>  				struct queue *q);
>  
>  static inline void deallocate_hqd(struct device_queue_manager *dqm,
>  				struct queue *q);
>  static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
>  static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>  				struct queue *q);
>  static void kfd_process_hw_exception(struct work_struct *work);
>  
> @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>  			q->pipe, q->queue);
>  	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>  		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -		retval = allocate_sdma_queue(dqm, q);
> +		retval = allocate_sdma_queue(dqm, qpd, q);
>  		if (retval)
>  			goto deallocate_vmid;
>  		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>  		deallocate_hqd(dqm, q);
>  	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>  		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>  deallocate_vmid:
>  	if (list_empty(&qpd->queues_list))
>  		deallocate_vmid(dqm, qpd, q);
> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>  	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
>  		deallocate_hqd(dqm, q);
>  	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>  	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>  	else {
>  		pr_debug("q->properties.type %d is invalid\n",
>  				q->properties.type);
> @@ -1039,42 +1041,93 @@ static void pre_reset(struct device_queue_manager *dqm)
>  	dqm_unlock(dqm);
>  }
>  
> +static uint64_t sdma_engine_mask(int engine, int num_engines)
> +{
> +	uint64_t mask = 0;
> +
> +	engine %= num_engines;
> +	while (engine < (sizeof(uint64_t)*8)) {
> +		mask |= 1ULL << engine;
> +		engine += num_engines;
> +	}
> +	return mask;
> +}
> +
>  static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>  				struct queue *q)
>  {
> -	int bit;
> +	uint64_t available_queue_bitmap;
> +	unsigned int bit, engine, num_engines;
> +	bool found_sdma = false;
>  
>  	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> -		if (dqm->sdma_bitmap == 0) {
> +		num_engines = get_num_sdma_engines(dqm);
> +		for_each_set_bit(engine, &(qpd->sdma_engine_bitmap), num_engines) {
> +			/* Do not reuse SDMA0 for any subsequent SDMA queue
> +			 * requests on Aldebaran. If SDMA0's queues are all
> +			 * full, then this process should never use SDMA0
> +			 * for any further requests
> +			 */
> +			if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> +					engine == 0)
> +				qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> +
> +			available_queue_bitmap = sdma_engine_mask(engine, num_engines);
> +			available_queue_bitmap &= dqm->sdma_bitmap;
> +
> +			if (!available_queue_bitmap)
> +				continue;
> +			/* Take the selected engine off the list so we will not
> +			 * allocate two queues onto the same engine
> +			 */
> +			qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> +			found_sdma = true;
> +
> +			bit = __ffs64(available_queue_bitmap);
> +			dqm->sdma_bitmap &= ~(1ULL << bit);
> +			q->sdma_id = bit;
> +			q->properties.sdma_engine_id = q->sdma_id % num_engines;
> +			q->properties.sdma_queue_id = q->sdma_id / num_engines;
> +			break;
> +		}
> +		if (!found_sdma) {
>  			pr_err("No more SDMA queue to allocate\n");
>  			return -ENOMEM;
>  		}
> -
> -		bit = __ffs64(dqm->sdma_bitmap);
> -		dqm->sdma_bitmap &= ~(1ULL << bit);
> -		q->sdma_id = bit;
> -		q->properties.sdma_engine_id = q->sdma_id %
> -				get_num_sdma_engines(dqm);
> -		q->properties.sdma_queue_id = q->sdma_id /
> -				get_num_sdma_engines(dqm);
>  	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -		if (dqm->xgmi_sdma_bitmap == 0) {
> +		num_engines = get_num_xgmi_sdma_engines(dqm);
> +		for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap), num_engines) {
> +			available_queue_bitmap = sdma_engine_mask(engine, num_engines);
> +			available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
> +
> +			if (!available_queue_bitmap)
> +				continue;
> +			/* Take the selected engine off the list so we will not
> +			 * allocate two queues onto the same engine
> +			 */
> +			qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
> +			found_sdma = true;
> +
> +			bit = __ffs64(available_queue_bitmap);
> +			dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> +			q->sdma_id = bit;
> +			/* sdma_engine_id is sdma id including
> +			 * both PCIe-optimized SDMAs and XGMI-
> +			 * optimized SDMAs. The calculation below
> +			 * assumes the first N engines are always
> +			 * PCIe-optimized ones
> +			 */
> +			q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> +				q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> +			q->properties.sdma_queue_id = q->sdma_id /
> +				get_num_xgmi_sdma_engines(dqm);

You could use "num_engines" here instead of get_num_xgmi_sdma_engines(dqm).

Regards,
  Felix


> +			break;
> +		}
> +		if (!found_sdma) {
>  			pr_err("No more XGMI SDMA queue to allocate\n");
>  			return -ENOMEM;
>  		}
> -		bit = __ffs64(dqm->xgmi_sdma_bitmap);
> -		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> -		q->sdma_id = bit;
> -		/* sdma_engine_id is sdma id including
> -		 * both PCIe-optimized SDMAs and XGMI-
> -		 * optimized SDMAs. The calculation below
> -		 * assumes the first N engines are always
> -		 * PCIe-optimized ones
> -		 */
> -		q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> -				q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> -		q->properties.sdma_queue_id = q->sdma_id /
> -				get_num_xgmi_sdma_engines(dqm);
>  	}
>  
>  	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>  }
>  
>  static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>  				struct queue *q)
>  {
> +	uint32_t engine = q->properties.sdma_engine_id;
> +
>  	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>  		if (q->sdma_id >= get_num_sdma_queues(dqm))
>  			return;
>  		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> +		/* Don't give SDMA0 back to be reallocated on Aldebaran.
> +		 * It is only OK to use this engine from the first allocation
> +		 * within a process.
> +		 */
> +		if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> +					engine == 0))
> +			qpd->sdma_engine_bitmap |= (1ULL << engine);
> +
>  	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>  		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>  			return;
>  		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> +		/* engine ID in the queue properties is the global engine ID.
> +		 * The XGMI engine bitmap ignores the PCIe-optimized engines.
> +		 */
> +		engine -= get_num_sdma_engines(dqm);
> +		qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
>  	}
>  }
>  
> @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>  	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>  		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>  		dqm_lock(dqm);
> -		retval = allocate_sdma_queue(dqm, q);
> +		retval = allocate_sdma_queue(dqm, qpd, q);
>  		dqm_unlock(dqm);
>  		if (retval)
>  			goto out;
> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>  	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>  		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>  		dqm_lock(dqm);
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>  		dqm_unlock(dqm);
>  	}
>  out:
> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>  
>  	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>  	    (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>  		pdd->sdma_past_activity_counter += sdma_val;
>  	}
>  
> @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>  	/* Clear all user mode queues */
>  	list_for_each_entry(q, &qpd->queues_list, list) {
>  		if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue(dqm, qpd, q);
>  		else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue(dqm, qpd, q);
>  
>  		if (q->properties.is_active) {
>  			decrement_queue_count(dqm, q->properties.type);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ab83b0de6b22..c38eebc9db4d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -576,6 +576,8 @@ struct qcm_process_device {
>  	struct list_head priv_queue_list;
>  
>  	unsigned int queue_count;
> +	unsigned long sdma_engine_bitmap;
> +	unsigned long xgmi_sdma_engine_bitmap;
>  	unsigned int vmid;
>  	bool is_debug;
>  	unsigned int evicted; /* eviction counter, 0=active */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 21ec8a18cad2..13c85624bf7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1422,6 +1422,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>  							struct kfd_process *p)
>  {
>  	struct kfd_process_device *pdd = NULL;
> +	const struct kfd_device_info *dev_info = dev->dqm->dev->device_info;
>  
>  	if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
>  		return NULL;
> @@ -1446,6 +1447,8 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>  	pdd->qpd.pqm = &p->pqm;
>  	pdd->qpd.evicted = 0;
>  	pdd->qpd.mapped_gws_queue = false;
> +	pdd->qpd.sdma_engine_bitmap = BIT_ULL(dev_info->num_sdma_engines) - 1;
> +	pdd->qpd.xgmi_sdma_engine_bitmap = BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
>  	pdd->process = p;
>  	pdd->bound = PDD_UNBOUND;
>  	pdd->already_dequeued = false;


* RE: [PATCH 2/3] drm/amdgpu: Use SDMA1 for buffer movement on Aldebaran
  2021-08-20  6:59   ` Christian König
@ 2021-08-20 23:27     ` Greathouse, Joseph
  0 siblings, 0 replies; 13+ messages in thread
From: Greathouse, Joseph @ 2021-08-20 23:27 UTC (permalink / raw)
  To: Christian König, amd-gfx

> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Friday, August 20, 2021 2:00 AM
> To: Greathouse, Joseph <Joseph.Greathouse@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 2/3] drm/amdgpu: Use SDMA1 for buffer movement on Aldebaran
> 
> Am 20.08.21 um 07:32 schrieb Joseph Greathouse:
> > Aldebaran should not use SDMA0 for buffer funcs such as page migration.
> > Instead, we move over to SDMA1 for these features. Leave SDMA0 in
> > charge for all other existing chips to avoid any possibility of
> > regressions.
> 
> The part why we do this is missing, apart from that looks good to me.
> 
> Christian.

How about this for a replacement description?

Because of sharing an MMHUB port with other engines, the hardware
design team has advised that Aldebaran should not use SDMA0 for
buffer funcs such as page migration. Instead, we move over to
SDMA1 for these features. Leave SDMA0 in charge for all other
existing chips to avoid any possibility of regressions.
 
Thanks,
-Joe

> >
> > Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 8 ++++++--
> >   1 file changed, 6 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> > index 8931000dcd41..771630d7bb3f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> > @@ -2689,11 +2689,15 @@ static const struct amdgpu_buffer_funcs sdma_v4_0_buffer_funcs = {
> >
> >   static void sdma_v4_0_set_buffer_funcs(struct amdgpu_device *adev)
> >   {
> > +     int engine = 0;
> > +
> > +     if (adev->asic_type == CHIP_ALDEBARAN)
> > +             engine = 1;
> >       adev->mman.buffer_funcs = &sdma_v4_0_buffer_funcs;
> >       if (adev->sdma.has_page_queue)
> > -             adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].page;
> > +             adev->mman.buffer_funcs_ring = &adev->sdma.instance[engine].page;
> >       else
> > -             adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].ring;
> > +             adev->mman.buffer_funcs_ring = &adev->sdma.instance[engine].ring;
> >   }
> >
> >   static const struct amdgpu_vm_pte_funcs sdma_v4_0_vm_pte_funcs = {



* Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
  2021-08-20  5:32 [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly Joseph Greathouse
                   ` (2 preceding siblings ...)
  2021-08-20 22:03 ` [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly Felix Kuehling
@ 2021-08-23  7:08 ` Lazar, Lijo
  2021-08-23 17:04   ` Felix Kuehling
  3 siblings, 1 reply; 13+ messages in thread
From: Lazar, Lijo @ 2021-08-23  7:08 UTC (permalink / raw)
  To: Joseph Greathouse, amd-gfx



On 8/20/2021 11:02 AM, Joseph Greathouse wrote:
> Give every process at most one queue from each SDMA engine.
> Previously, we allocated all SDMA engines and queues on a first-
> come-first-serve basis. This meant that it was possible for two
> processes racing on their allocation requests to each end up with
> two queues on the same SDMA engine. That could serialize the
> performance of commands to different queues.
> 
> This new mechanism allows each process at most a single queue
> on each SDMA engine. Processes will check for queue availability on
> SDMA engine 0, then 1, then 2, etc. and only allocate on that
> engine if there is an available queue slot. If there are no
> queue slots available (or if this process has already allocated
> a queue on this engine), it moves on and tries the next engine.
> 
> The Aldebaran chip has a small quirk that SDMA0 should only be
> used by the very first allocation from each process. It is OK to
> use any other SDMA engine at any time, but after the first SDMA
> allocation request from a process, the resulting engine must
> be from SDMA1 or above. This patch handles this case as well.
> 
> Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
> ---
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
>   3 files changed, 107 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index f8fce9d05f50..86bdb765f350 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   static int map_queues_cpsch(struct device_queue_manager *dqm);
>   
>   static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>   				struct queue *q);
>   
>   static inline void deallocate_hqd(struct device_queue_manager *dqm,
>   				struct queue *q);
>   static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
>   static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>   				struct queue *q);
>   static void kfd_process_hw_exception(struct work_struct *work);
>   
> @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   			q->pipe, q->queue);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -		retval = allocate_sdma_queue(dqm, q);
> +		retval = allocate_sdma_queue(dqm, qpd, q);
>   		if (retval)
>   			goto deallocate_vmid;
>   		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   		deallocate_hqd(dqm, q);
>   	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>   deallocate_vmid:
>   	if (list_empty(&qpd->queues_list))
>   		deallocate_vmid(dqm, qpd, q);
> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>   	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
>   		deallocate_hqd(dqm, q);
>   	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>   	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>   	else {
>   		pr_debug("q->properties.type %d is invalid\n",
>   				q->properties.type);
> @@ -1039,42 +1041,93 @@ static void pre_reset(struct device_queue_manager *dqm)
>   	dqm_unlock(dqm);
>   }
>   
> +static uint64_t sdma_engine_mask(int engine, int num_engines)

Looks more like the queue mask for an engine; the name doesn't make that
clear.

> +{
> +	uint64_t mask = 0;
> +
> +	engine %= num_engines;
> +	while (engine < (sizeof(uint64_t)*8)) {
> +		mask |= 1ULL << engine;
> +		engine += num_engines;
> +	}
> +	return mask;
> +}
> +
>   static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>   				struct queue *q)
>   {
> -	int bit;
> +	uint64_t available_queue_bitmap;
> +	unsigned int bit, engine, num_engines;
> +	bool found_sdma = false;
>   
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> -		if (dqm->sdma_bitmap == 0) {

This is still a valid optimization; there is no need to loop through the
engines if nothing is available anyway. The same is valid for the XGMI loop.

> +		num_engines = get_num_sdma_engines(dqm);
> +		for_each_set_bit(engine, &(qpd->sdma_engine_bitmap), num_engines) {

Probably a naive question -

Theoretically there are 8 queues per engine as per the mask code. In the 
code below, there is an assumption that a process will use at most as many 
queues simultaneously as there are SDMA engines (or XGMI engines). Is that 
always true? For example: with 2 sdma engines and 4 queues per engine, 
can't multiple threads of a process initiate more sdma transactions than 
the number of sdma engines available? This code limits that, but I don't 
know if that is a possible case.

Thanks,
Lijo

> +			/* Do not reuse SDMA0 for any subsequent SDMA queue
> +			 * requests on Aldebaran. If SDMA0's queues are all
> +			 * full, then this process should never use SDMA0
> +			 * for any further requests
> +			 */
> +			if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> +					engine == 0)
> +				qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> +
> +			available_queue_bitmap = sdma_engine_mask(engine, num_engines);
> +			available_queue_bitmap &= dqm->sdma_bitmap;
> +
> +			if (!available_queue_bitmap)
> +				continue;
> +			/* Take the selected engine off the list so we will not
> +			 * allocate two queues onto the same engine
> +			 */
> +			qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> +			found_sdma = true;
> +
> +			bit = __ffs64(available_queue_bitmap);
> +			dqm->sdma_bitmap &= ~(1ULL << bit);
> +			q->sdma_id = bit;
> +			q->properties.sdma_engine_id = q->sdma_id % num_engines;
> +			q->properties.sdma_queue_id = q->sdma_id / num_engines;
> +			break;
> +		}
> +		if (!found_sdma) {
>   			pr_err("No more SDMA queue to allocate\n");
>   			return -ENOMEM;
>   		}
> -
> -		bit = __ffs64(dqm->sdma_bitmap);
> -		dqm->sdma_bitmap &= ~(1ULL << bit);
> -		q->sdma_id = bit;
> -		q->properties.sdma_engine_id = q->sdma_id %
> -				get_num_sdma_engines(dqm);
> -		q->properties.sdma_queue_id = q->sdma_id /
> -				get_num_sdma_engines(dqm);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -		if (dqm->xgmi_sdma_bitmap == 0) {
> +		num_engines = get_num_xgmi_sdma_engines(dqm);
> +		for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap), num_engines) {
> +			available_queue_bitmap = sdma_engine_mask(engine, num_engines);
> +			available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
> +
> +			if (!available_queue_bitmap)
> +				continue;
> +			/* Take the selected engine off the list so we will not
> +			 * allocate two queues onto the same engine
> +			 */
> +			qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
> +			found_sdma = true;
> +
> +			bit = __ffs64(available_queue_bitmap);
> +			dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> +			q->sdma_id = bit;
> +			/* sdma_engine_id is sdma id including
> +			 * both PCIe-optimized SDMAs and XGMI-
> +			 * optimized SDMAs. The calculation below
> +			 * assumes the first N engines are always
> +			 * PCIe-optimized ones
> +			 */
> +			q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> +				q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> +			q->properties.sdma_queue_id = q->sdma_id /
> +				get_num_xgmi_sdma_engines(dqm);
> +			break;
> +		}
> +		if (!found_sdma) {
>   			pr_err("No more XGMI SDMA queue to allocate\n");
>   			return -ENOMEM;
>   		}
> -		bit = __ffs64(dqm->xgmi_sdma_bitmap);
> -		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> -		q->sdma_id = bit;
> -		/* sdma_engine_id is sdma id including
> -		 * both PCIe-optimized SDMAs and XGMI-
> -		 * optimized SDMAs. The calculation below
> -		 * assumes the first N engines are always
> -		 * PCIe-optimized ones
> -		 */
> -		q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> -				q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> -		q->properties.sdma_queue_id = q->sdma_id /
> -				get_num_xgmi_sdma_engines(dqm);
>   	}
>   
>   	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   }
>   
>   static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>   				struct queue *q)
>   {
> +	uint32_t engine = q->properties.sdma_engine_id;
> +
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   		if (q->sdma_id >= get_num_sdma_queues(dqm))
>   			return;
>   		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> +		/* Don't give SDMA0 back to be reallocated on Aldebaran.
> +		 * It is only OK to use this engine from the first allocation
> +		 * within a process.
> +		 */
> +		if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> +					engine == 0))
> +			qpd->sdma_engine_bitmap |= (1ULL << engine);
> +
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>   			return;
>   		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> +		/* engine ID in the queue properties is the global engine ID.
> +		 * The XGMI engine bitmap ignores the PCIe-optimized engines.
> +		 */
> +		engine -= get_num_sdma_engines(dqm);
> +		qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
>   	}
>   }
>   
> @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		dqm_lock(dqm);
> -		retval = allocate_sdma_queue(dqm, q);
> +		retval = allocate_sdma_queue(dqm, qpd, q);
>   		dqm_unlock(dqm);
>   		if (retval)
>   			goto out;
> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		dqm_lock(dqm);
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>   		dqm_unlock(dqm);
>   	}
>   out:
> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   
>   	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>   	    (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>   		pdd->sdma_past_activity_counter += sdma_val;
>   	}
>   
> @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   	/* Clear all user mode queues */
>   	list_for_each_entry(q, &qpd->queues_list, list) {
>   		if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue(dqm, qpd, q);
>   		else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue(dqm, qpd, q);
>   
>   		if (q->properties.is_active) {
>   			decrement_queue_count(dqm, q->properties.type);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ab83b0de6b22..c38eebc9db4d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -576,6 +576,8 @@ struct qcm_process_device {
>   	struct list_head priv_queue_list;
>   
>   	unsigned int queue_count;
> +	unsigned long sdma_engine_bitmap;
> +	unsigned long xgmi_sdma_engine_bitmap;
>   	unsigned int vmid;
>   	bool is_debug;
>   	unsigned int evicted; /* eviction counter, 0=active */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 21ec8a18cad2..13c85624bf7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1422,6 +1422,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>   							struct kfd_process *p)
>   {
>   	struct kfd_process_device *pdd = NULL;
> +	const struct kfd_device_info *dev_info = dev->dqm->dev->device_info;
>   
>   	if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
>   		return NULL;
> @@ -1446,6 +1447,8 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>   	pdd->qpd.pqm = &p->pqm;
>   	pdd->qpd.evicted = 0;
>   	pdd->qpd.mapped_gws_queue = false;
> +	pdd->qpd.sdma_engine_bitmap = BIT_ULL(dev_info->num_sdma_engines) - 1;
> +	pdd->qpd.xgmi_sdma_engine_bitmap = BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
>   	pdd->process = p;
>   	pdd->bound = PDD_UNBOUND;
>   	pdd->already_dequeued = false;
> 


* Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
  2021-08-23  7:08 ` Lazar, Lijo
@ 2021-08-23 17:04   ` Felix Kuehling
  2021-08-24  4:37     ` Lazar, Lijo
  0 siblings, 1 reply; 13+ messages in thread
From: Felix Kuehling @ 2021-08-23 17:04 UTC (permalink / raw)
  To: Lazar, Lijo, Joseph Greathouse, amd-gfx

Am 2021-08-23 um 3:08 a.m. schrieb Lazar, Lijo:
>
>
> On 8/20/2021 11:02 AM, Joseph Greathouse wrote:
>> Give every process at most one queue from each SDMA engine.
>> Previously, we allocated all SDMA engines and queues on a first-
>> come-first-serve basis. This meant that it was possible for two
>> processes racing on their allocation requests to each end up with
>> two queues on the same SDMA engine. That could serialize the
>> performance of commands to different queues.
>>
>> This new mechanism allows each process at most a single queue
>> on each SDMA engine. Processes will check for queue availability on
>> SDMA engine 0, then 1, then 2, etc. and only allocate on that
>> engine if there is an available queue slot. If there are no
>> queue slots available (or if this process has already allocated
>> a queue on this engine), it moves on and tries the next engine.
>>
>> The Aldebaran chip has a small quirk that SDMA0 should only be
>> used by the very first allocation from each process. It is OK to
>> use any other SDMA engine at any time, but after the first SDMA
>> allocation request from a process, the resulting engine must
>> be from SDMA1 or above. This patch handles this case as well.
>>
>> Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
>> ---
>>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
>>   3 files changed, 107 insertions(+), 33 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index f8fce9d05f50..86bdb765f350 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct
>> device_queue_manager *dqm,
>>   static int map_queues_cpsch(struct device_queue_manager *dqm);
>>     static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>> +                struct qcm_process_device *qpd,
>>                   struct queue *q);
>>     static inline void deallocate_hqd(struct device_queue_manager *dqm,
>>                   struct queue *q);
>>   static int allocate_hqd(struct device_queue_manager *dqm, struct
>> queue *q);
>>   static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> +                struct qcm_process_device *qpd,
>>                   struct queue *q);
>>   static void kfd_process_hw_exception(struct work_struct *work);
>>   @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct
>> device_queue_manager *dqm,
>>               q->pipe, q->queue);
>>       } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>           q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> -        retval = allocate_sdma_queue(dqm, q);
>> +        retval = allocate_sdma_queue(dqm, qpd, q);
>>           if (retval)
>>               goto deallocate_vmid;
>>           dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct
>> device_queue_manager *dqm,
>>           deallocate_hqd(dqm, q);
>>       else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>           q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>> -        deallocate_sdma_queue(dqm, q);
>> +        deallocate_sdma_queue(dqm, qpd, q);
>>   deallocate_vmid:
>>       if (list_empty(&qpd->queues_list))
>>           deallocate_vmid(dqm, qpd, q);
>> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct
>> device_queue_manager *dqm,
>>       if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
>>           deallocate_hqd(dqm, q);
>>       else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
>> -        deallocate_sdma_queue(dqm, q);
>> +        deallocate_sdma_queue(dqm, qpd, q);
>>       else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>> -        deallocate_sdma_queue(dqm, q);
>> +        deallocate_sdma_queue(dqm, qpd, q);
>>       else {
>>           pr_debug("q->properties.type %d is invalid\n",
>>                   q->properties.type);
>> @@ -1039,42 +1041,93 @@ static void pre_reset(struct
>> device_queue_manager *dqm)
>>       dqm_unlock(dqm);
>>   }
>>   +static uint64_t sdma_engine_mask(int engine, int num_engines)
>
> Looks more like the queue mask for an engine, the name doesn't make it
> clear.
>
>> +{
>> +    uint64_t mask = 0;
>> +
>> +    engine %= num_engines;
>> +    while (engine < (sizeof(uint64_t)*8)) {
>> +        mask |= 1ULL << engine;
>> +        engine += num_engines;
>> +    }
>> +    return mask;
>> +}
>> +
>>   static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> +                struct qcm_process_device *qpd,
>>                   struct queue *q)
>>   {
>> -    int bit;
>> +    uint64_t available_queue_bitmap;
>> +    unsigned int bit, engine, num_engines;
>> +    bool found_sdma = false;
>>         if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> -        if (dqm->sdma_bitmap == 0) {
>
> This is still a valid optimization and no need to loop through if
> nothing is available anyway. Valid for XGMI loop also.
>
>> +        num_engines = get_num_sdma_engines(dqm);
>> +        for_each_set_bit(engine, &(qpd->sdma_engine_bitmap),
>> num_engines) {
>
> Probably a naive question -
>
> Theoretically there are 8 queues per engine as per the mask code. In
> the below code, there is an assumption that a process will use at best
> number of queues=max num of sdma engines or xgmi engines
> simultaneously. Is that true always? For ex: there are 2 sdma engines
> and 4 queues per engine. Can't multiple threads of a process initiate
> multiple sdma transactions > number of sdma engines available? This
> code limits that, but I don't know if that is a possible case.

When you use multiple SDMA queues on the same engine, the work from the
queues gets serialized. You can either let the SDMA engine serialize
work from multiple queues, or let ROCr serialize work from multiple
threads on the same SDMA queue. There is no obvious benefit to letting the
SDMA engine do it, but there is a drawback: fewer processes get to use SDMA.

Regards,
  Felix


>
> Thanks,
> Lijo
>
>> +            /* Do not reuse SDMA0 for any subsequent SDMA queue
>> +             * requests on Aldebaran. If SDMA0's queues are all
>> +             * full, then this process should never use SDMA0
>> +             * for any further requests
>> +             */
>> +            if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
>> +                    engine == 0)
>> +                qpd->sdma_engine_bitmap &= ~(1ULL << engine);
>> +
>> +            available_queue_bitmap = sdma_engine_mask(engine,
>> num_engines);
>> +            available_queue_bitmap &= dqm->sdma_bitmap;
>> +
>> +            if (!available_queue_bitmap)
>> +                continue;
>> +            /* Take the selected engine off the list so we will not
>> +             * allocate two queues onto the same engine
>> +             */
>> +            qpd->sdma_engine_bitmap &= ~(1ULL << engine);
>> +            found_sdma = true;
>> +
>> +            bit = __ffs64(available_queue_bitmap);
>> +            dqm->sdma_bitmap &= ~(1ULL << bit);
>> +            q->sdma_id = bit;
>> +            q->properties.sdma_engine_id = q->sdma_id % num_engines;
>> +            q->properties.sdma_queue_id = q->sdma_id / num_engines;
>> +            break;
>> +        }
>> +        if (!found_sdma) {
>>               pr_err("No more SDMA queue to allocate\n");
>>               return -ENOMEM;
>>           }
>> -
>> -        bit = __ffs64(dqm->sdma_bitmap);
>> -        dqm->sdma_bitmap &= ~(1ULL << bit);
>> -        q->sdma_id = bit;
>> -        q->properties.sdma_engine_id = q->sdma_id %
>> -                get_num_sdma_engines(dqm);
>> -        q->properties.sdma_queue_id = q->sdma_id /
>> -                get_num_sdma_engines(dqm);
>>       } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> -        if (dqm->xgmi_sdma_bitmap == 0) {
>> +        num_engines = get_num_xgmi_sdma_engines(dqm);
>> +        for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap),
>> num_engines) {
>> +            available_queue_bitmap = sdma_engine_mask(engine,
>> num_engines);
>> +            available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
>> +
>> +            if (!available_queue_bitmap)
>> +                continue;
>> +            /* Take the selected engine off the list so we will not
>> +             * allocate two queues onto the same engine
>> +             */
>> +            qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
>> +            found_sdma = true;
>> +
>> +            bit = __ffs64(available_queue_bitmap);
>> +            dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>> +            q->sdma_id = bit;
>> +            /* sdma_engine_id is sdma id including
>> +             * both PCIe-optimized SDMAs and XGMI-
>> +             * optimized SDMAs. The calculation below
>> +             * assumes the first N engines are always
>> +             * PCIe-optimized ones
>> +             */
>> +            q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
>> +                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
>> +            q->properties.sdma_queue_id = q->sdma_id /
>> +                get_num_xgmi_sdma_engines(dqm);
>> +            break;
>> +        }
>> +        if (!found_sdma) {
>>               pr_err("No more XGMI SDMA queue to allocate\n");
>>               return -ENOMEM;
>>           }
>> -        bit = __ffs64(dqm->xgmi_sdma_bitmap);
>> -        dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>> -        q->sdma_id = bit;
>> -        /* sdma_engine_id is sdma id including
>> -         * both PCIe-optimized SDMAs and XGMI-
>> -         * optimized SDMAs. The calculation below
>> -         * assumes the first N engines are always
>> -         * PCIe-optimized ones
>> -         */
>> -        q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
>> -                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
>> -        q->properties.sdma_queue_id = q->sdma_id /
>> -                get_num_xgmi_sdma_engines(dqm);
>>       }
>>         pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
>> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct
>> device_queue_manager *dqm,
>>   }
>>     static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>> +                struct qcm_process_device *qpd,
>>                   struct queue *q)
>>   {
>> +    uint32_t engine = q->properties.sdma_engine_id;
>> +
>>       if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>           if (q->sdma_id >= get_num_sdma_queues(dqm))
>>               return;
>>           dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> +        /* Don't give SDMA0 back to be reallocated on Aldebaran.
>> +         * It is only OK to use this engine from the first allocation
>> +         * within a process.
>> +         */
>> +        if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
>> +                    engine == 0))
>> +            qpd->sdma_engine_bitmap |= (1ULL << engine);
>> +
>>       } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>           if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>>               return;
>>           dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> +        /* engine ID in the queue properties is the global engine ID.
>> +         * The XGMI engine bitmap ignores the PCIe-optimized engines.
>> +         */
>> +        engine -= get_num_sdma_engines(dqm);
>> +        qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
>>       }
>>   }
>>   @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct
>> device_queue_manager *dqm, struct queue *q,
>>       if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>           q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>           dqm_lock(dqm);
>> -        retval = allocate_sdma_queue(dqm, q);
>> +        retval = allocate_sdma_queue(dqm, qpd, q);
>>           dqm_unlock(dqm);
>>           if (retval)
>>               goto out;
>> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct
>> device_queue_manager *dqm, struct queue *q,
>>       if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>           q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>           dqm_lock(dqm);
>> -        deallocate_sdma_queue(dqm, q);
>> +        deallocate_sdma_queue(dqm, qpd, q);
>>           dqm_unlock(dqm);
>>       }
>>   out:
>> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct
>> device_queue_manager *dqm,
>>         if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>>           (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>> -        deallocate_sdma_queue(dqm, q);
>> +        deallocate_sdma_queue(dqm, qpd, q);
>>           pdd->sdma_past_activity_counter += sdma_val;
>>       }
>>   @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct
>> device_queue_manager *dqm,
>>       /* Clear all user mode queues */
>>       list_for_each_entry(q, &qpd->queues_list, list) {
>>           if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
>> -            deallocate_sdma_queue(dqm, q);
>> +            deallocate_sdma_queue(dqm, qpd, q);
>>           else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>> -            deallocate_sdma_queue(dqm, q);
>> +            deallocate_sdma_queue(dqm, qpd, q);
>>             if (q->properties.is_active) {
>>               decrement_queue_count(dqm, q->properties.type);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index ab83b0de6b22..c38eebc9db4d 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -576,6 +576,8 @@ struct qcm_process_device {
>>       struct list_head priv_queue_list;
>>         unsigned int queue_count;
>> +    unsigned long sdma_engine_bitmap;
>> +    unsigned long xgmi_sdma_engine_bitmap;
>>       unsigned int vmid;
>>       bool is_debug;
>>       unsigned int evicted; /* eviction counter, 0=active */
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 21ec8a18cad2..13c85624bf7d 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -1422,6 +1422,7 @@ struct kfd_process_device
>> *kfd_create_process_device_data(struct kfd_dev *dev,
>>                               struct kfd_process *p)
>>   {
>>       struct kfd_process_device *pdd = NULL;
>> +    const struct kfd_device_info *dev_info =
>> dev->dqm->dev->device_info;
>>         if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
>>           return NULL;
>> @@ -1446,6 +1447,8 @@ struct kfd_process_device
>> *kfd_create_process_device_data(struct kfd_dev *dev,
>>       pdd->qpd.pqm = &p->pqm;
>>       pdd->qpd.evicted = 0;
>>       pdd->qpd.mapped_gws_queue = false;
>> +    pdd->qpd.sdma_engine_bitmap =
>> BIT_ULL(dev_info->num_sdma_engines) - 1;
>> +    pdd->qpd.xgmi_sdma_engine_bitmap =
>> BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
>>       pdd->process = p;
>>       pdd->bound = PDD_UNBOUND;
>>       pdd->already_dequeued = false;
>>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
  2021-08-23 17:04   ` Felix Kuehling
@ 2021-08-24  4:37     ` Lazar, Lijo
  2021-08-24  6:49       ` Greathouse, Joseph
  0 siblings, 1 reply; 13+ messages in thread
From: Lazar, Lijo @ 2021-08-24  4:37 UTC (permalink / raw)
  To: Felix Kuehling, Joseph Greathouse, amd-gfx



On 8/23/2021 10:34 PM, Felix Kuehling wrote:
> Am 2021-08-23 um 3:08 a.m. schrieb Lazar, Lijo:
>>
>>
>> On 8/20/2021 11:02 AM, Joseph Greathouse wrote:
>>> Give every process at most one queue from each SDMA engine.
>>> Previously, we allocated all SDMA engines and queues on a first-
>>> come-first-serve basis. This meant that it was possible for two
>>> processes racing on their allocation requests to each end up with
>>> two queues on the same SDMA engine. That could serialize the
>>> performance of commands to different queues.
>>>
>>> This new mechanism allows each process at most a single queue
>>> on each SDMA engine. Processes will check for queue availability on
>>> SDMA engine 0, then 1, then 2, etc. and only allocate on that
>>> engine if there is an available queue slot. If there are no
>>> queue slots available (or if this process has already allocated
>>> a queue on this engine), it moves on and tries the next engine.
>>>
>>> The Aldebaran chip has a small quirk that SDMA0 should only be
>>> used by the very first allocation from each process. It is OK to
>>> use any other SDMA engine at any time, but after the first SDMA
>>> allocation request from a process, the resulting engine must
>>> be from SDMA1 or above. This patch handles this case as well.
>>>
>>> Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
>>> ---
>>>    .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
>>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
>>>    drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
>>>    3 files changed, 107 insertions(+), 33 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index f8fce9d05f50..86bdb765f350 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct
>>> device_queue_manager *dqm,
>>>    static int map_queues_cpsch(struct device_queue_manager *dqm);
>>>      static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>> +                struct qcm_process_device *qpd,
>>>                    struct queue *q);
>>>      static inline void deallocate_hqd(struct device_queue_manager *dqm,
>>>                    struct queue *q);
>>>    static int allocate_hqd(struct device_queue_manager *dqm, struct
>>> queue *q);
>>>    static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>> +                struct qcm_process_device *qpd,
>>>                    struct queue *q);
>>>    static void kfd_process_hw_exception(struct work_struct *work);
>>>    @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct
>>> device_queue_manager *dqm,
>>>                q->pipe, q->queue);
>>>        } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>            q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>> -        retval = allocate_sdma_queue(dqm, q);
>>> +        retval = allocate_sdma_queue(dqm, qpd, q);
>>>            if (retval)
>>>                goto deallocate_vmid;
>>>            dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>>> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct
>>> device_queue_manager *dqm,
>>>            deallocate_hqd(dqm, q);
>>>        else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>            q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>> -        deallocate_sdma_queue(dqm, q);
>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>    deallocate_vmid:
>>>        if (list_empty(&qpd->queues_list))
>>>            deallocate_vmid(dqm, qpd, q);
>>> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct
>>> device_queue_manager *dqm,
>>>        if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
>>>            deallocate_hqd(dqm, q);
>>>        else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
>>> -        deallocate_sdma_queue(dqm, q);
>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>        else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>> -        deallocate_sdma_queue(dqm, q);
>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>        else {
>>>            pr_debug("q->properties.type %d is invalid\n",
>>>                    q->properties.type);
>>> @@ -1039,42 +1041,93 @@ static void pre_reset(struct
>>> device_queue_manager *dqm)
>>>        dqm_unlock(dqm);
>>>    }
>>>    +static uint64_t sdma_engine_mask(int engine, int num_engines)
>>
>> Looks more like the queue mask for an engine, the name doesn't make it
>> clear.
>>
>>> +{
>>> +    uint64_t mask = 0;
>>> +
>>> +    engine %= num_engines;
>>> +    while (engine < (sizeof(uint64_t)*8)) {
>>> +        mask |= 1ULL << engine;
>>> +        engine += num_engines;
>>> +    }
>>> +    return mask;
>>> +}
>>> +
>>>    static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>> +                struct qcm_process_device *qpd,
>>>                    struct queue *q)
>>>    {
>>> -    int bit;
>>> +    uint64_t available_queue_bitmap;
>>> +    unsigned int bit, engine, num_engines;
>>> +    bool found_sdma = false;
>>>          if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>> -        if (dqm->sdma_bitmap == 0) {
>>
>> This is still a valid optimization and no need to loop through if
>> nothing is available anyway. Valid for XGMI loop also.
>>
>>> +        num_engines = get_num_sdma_engines(dqm);
>>> +        for_each_set_bit(engine, &(qpd->sdma_engine_bitmap),
>>> num_engines) {
>>
>> Probably a naive question -
>>
>> Theoretically there are 8 queues per engine as per the mask code. In
>> the below code, there is an assumption that a process will use at best
>> number of queues=max num of sdma engines or xgmi engines
>> simultaneously. Is that true always? For ex: there are 2 sdma engines
>> and 4 queues per engine. Can't multiple threads of a process initiate
>> multiple sdma transactions > number of sdma engines available? This
>> code limits that, but I don't know if that is a possible case.
> 
> When you use multiple SDMA queues on the same engine, the work from the
> queues gets serialized. You can either let the SDMA engine serialize
> work from multiple queues, or let ROCr serialize work from multiple
> threads on the same SDMA queue. There is no obvious benefit to let the
> SDMA engine do it. But there is a drawback: Fewer processes get to use SDMA.
> 

The fact that multiple queues exist, and that no such assumption was
made in the existing logic (prior to the patch), suggests that there is
indeed some advantage to making use of the hardware queues. For example,
switching to a different ring buffer may not need a context
save/resubmission.

Just like different processes, different threads of the same process may
take advantage of multiple queues. I am more interested in the impact in
cases where the focus is on a single process, e.g. the impact on running
some benchmarks, or in HPC where the entire hardware is dedicated to a
specific compute purpose.

Thanks,
Lijo

> Regards,
>    Felix
> 
> 
>>
>> Thanks,
>> Lijo
>>
>>> +            /* Do not reuse SDMA0 for any subsequent SDMA queue
>>> +             * requests on Aldebaran. If SDMA0's queues are all
>>> +             * full, then this process should never use SDMA0
>>> +             * for any further requests
>>> +             */
>>> +            if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
>>> +                    engine == 0)
>>> +                qpd->sdma_engine_bitmap &= ~(1ULL << engine);
>>> +
>>> +            available_queue_bitmap = sdma_engine_mask(engine,
>>> num_engines);
>>> +            available_queue_bitmap &= dqm->sdma_bitmap;
>>> +
>>> +            if (!available_queue_bitmap)
>>> +                continue;
>>> +            /* Take the selected engine off the list so we will not
>>> +             * allocate two queues onto the same engine
>>> +             */
>>> +            qpd->sdma_engine_bitmap &= ~(1ULL << engine);
>>> +            found_sdma = true;
>>> +
>>> +            bit = __ffs64(available_queue_bitmap);
>>> +            dqm->sdma_bitmap &= ~(1ULL << bit);
>>> +            q->sdma_id = bit;
>>> +            q->properties.sdma_engine_id = q->sdma_id % num_engines;
>>> +            q->properties.sdma_queue_id = q->sdma_id / num_engines;
>>> +            break;
>>> +        }
>>> +        if (!found_sdma) {
>>>                pr_err("No more SDMA queue to allocate\n");
>>>                return -ENOMEM;
>>>            }
>>> -
>>> -        bit = __ffs64(dqm->sdma_bitmap);
>>> -        dqm->sdma_bitmap &= ~(1ULL << bit);
>>> -        q->sdma_id = bit;
>>> -        q->properties.sdma_engine_id = q->sdma_id %
>>> -                get_num_sdma_engines(dqm);
>>> -        q->properties.sdma_queue_id = q->sdma_id /
>>> -                get_num_sdma_engines(dqm);
>>>        } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>> -        if (dqm->xgmi_sdma_bitmap == 0) {
>>> +        num_engines = get_num_xgmi_sdma_engines(dqm);
>>> +        for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap),
>>> num_engines) {
>>> +            available_queue_bitmap = sdma_engine_mask(engine,
>>> num_engines);
>>> +            available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
>>> +
>>> +            if (!available_queue_bitmap)
>>> +                continue;
>>> +            /* Take the selected engine off the list so we will not
>>> +             * allocate two queues onto the same engine
>>> +             */
>>> +            qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
>>> +            found_sdma = true;
>>> +
>>> +            bit = __ffs64(available_queue_bitmap);
>>> +            dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>>> +            q->sdma_id = bit;
>>> +            /* sdma_engine_id is sdma id including
>>> +             * both PCIe-optimized SDMAs and XGMI-
>>> +             * optimized SDMAs. The calculation below
>>> +             * assumes the first N engines are always
>>> +             * PCIe-optimized ones
>>> +             */
>>> +            q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
>>> +                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
>>> +            q->properties.sdma_queue_id = q->sdma_id /
>>> +                get_num_xgmi_sdma_engines(dqm);
>>> +            break;
>>> +        }
>>> +        if (!found_sdma) {
>>>                pr_err("No more XGMI SDMA queue to allocate\n");
>>>                return -ENOMEM;
>>>            }
>>> -        bit = __ffs64(dqm->xgmi_sdma_bitmap);
>>> -        dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>>> -        q->sdma_id = bit;
>>> -        /* sdma_engine_id is sdma id including
>>> -         * both PCIe-optimized SDMAs and XGMI-
>>> -         * optimized SDMAs. The calculation below
>>> -         * assumes the first N engines are always
>>> -         * PCIe-optimized ones
>>> -         */
>>> -        q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
>>> -                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
>>> -        q->properties.sdma_queue_id = q->sdma_id /
>>> -                get_num_xgmi_sdma_engines(dqm);
>>>        }
>>>          pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
>>> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct
>>> device_queue_manager *dqm,
>>>    }
>>>      static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>> +                struct qcm_process_device *qpd,
>>>                    struct queue *q)
>>>    {
>>> +    uint32_t engine = q->properties.sdma_engine_id;
>>> +
>>>        if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>>            if (q->sdma_id >= get_num_sdma_queues(dqm))
>>>                return;
>>>            dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>>> +        /* Don't give SDMA0 back to be reallocated on Aldebaran.
>>> +         * It is only OK to use this engine from the first allocation
>>> +         * within a process.
>>> +         */
>>> +        if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
>>> +                    engine == 0))
>>> +            qpd->sdma_engine_bitmap |= (1ULL << engine);
>>> +
>>>        } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>            if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>>>                return;
>>>            dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>>> +        /* engine ID in the queue properties is the global engine ID.
>>> +         * The XGMI engine bitmap ignores the PCIe-optimized engines.
>>> +         */
>>> +        engine -= get_num_sdma_engines(dqm);
>>> +        qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
>>>        }
>>>    }
>>>    @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct
>>> device_queue_manager *dqm, struct queue *q,
>>>        if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>            q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>            dqm_lock(dqm);
>>> -        retval = allocate_sdma_queue(dqm, q);
>>> +        retval = allocate_sdma_queue(dqm, qpd, q);
>>>            dqm_unlock(dqm);
>>>            if (retval)
>>>                goto out;
>>> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct
>>> device_queue_manager *dqm, struct queue *q,
>>>        if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>            q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>            dqm_lock(dqm);
>>> -        deallocate_sdma_queue(dqm, q);
>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>            dqm_unlock(dqm);
>>>        }
>>>    out:
>>> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct
>>> device_queue_manager *dqm,
>>>          if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>>>            (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>>> -        deallocate_sdma_queue(dqm, q);
>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>            pdd->sdma_past_activity_counter += sdma_val;
>>>        }
>>>    @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct
>>> device_queue_manager *dqm,
>>>        /* Clear all user mode queues */
>>>        list_for_each_entry(q, &qpd->queues_list, list) {
>>>            if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
>>> -            deallocate_sdma_queue(dqm, q);
>>> +            deallocate_sdma_queue(dqm, qpd, q);
>>>            else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>> -            deallocate_sdma_queue(dqm, q);
>>> +            deallocate_sdma_queue(dqm, qpd, q);
>>>              if (q->properties.is_active) {
>>>                decrement_queue_count(dqm, q->properties.type);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index ab83b0de6b22..c38eebc9db4d 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -576,6 +576,8 @@ struct qcm_process_device {
>>>        struct list_head priv_queue_list;
>>>          unsigned int queue_count;
>>> +    unsigned long sdma_engine_bitmap;
>>> +    unsigned long xgmi_sdma_engine_bitmap;
>>>        unsigned int vmid;
>>>        bool is_debug;
>>>        unsigned int evicted; /* eviction counter, 0=active */
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index 21ec8a18cad2..13c85624bf7d 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -1422,6 +1422,7 @@ struct kfd_process_device
>>> *kfd_create_process_device_data(struct kfd_dev *dev,
>>>                                struct kfd_process *p)
>>>    {
>>>        struct kfd_process_device *pdd = NULL;
>>> +    const struct kfd_device_info *dev_info =
>>> dev->dqm->dev->device_info;
>>>          if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
>>>            return NULL;
>>> @@ -1446,6 +1447,8 @@ struct kfd_process_device
>>> *kfd_create_process_device_data(struct kfd_dev *dev,
>>>        pdd->qpd.pqm = &p->pqm;
>>>        pdd->qpd.evicted = 0;
>>>        pdd->qpd.mapped_gws_queue = false;
>>> +    pdd->qpd.sdma_engine_bitmap =
>>> BIT_ULL(dev_info->num_sdma_engines) - 1;
>>> +    pdd->qpd.xgmi_sdma_engine_bitmap =
>>> BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
>>>        pdd->process = p;
>>>        pdd->bound = PDD_UNBOUND;
>>>        pdd->already_dequeued = false;
>>>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
  2021-08-24  4:37     ` Lazar, Lijo
@ 2021-08-24  6:49       ` Greathouse, Joseph
  2021-08-24  7:23         ` Lazar, Lijo
  0 siblings, 1 reply; 13+ messages in thread
From: Greathouse, Joseph @ 2021-08-24  6:49 UTC (permalink / raw)
  To: Lazar, Lijo, Kuehling, Felix, amd-gfx

> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar@amd.com>
> Sent: Monday, August 23, 2021 11:37 PM
> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph <Joseph.Greathouse@amd.com>; amd-
> gfx@lists.freedesktop.org
> Subject: Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
> 
> On 8/23/2021 10:34 PM, Felix Kuehling wrote:
> > Am 2021-08-23 um 3:08 a.m. schrieb Lazar, Lijo:
> >>
> >>
> >> On 8/20/2021 11:02 AM, Joseph Greathouse wrote:
> >>> Give every process at most one queue from each SDMA engine.
> >>> Previously, we allocated all SDMA engines and queues on a first-
> >>> come-first-serve basis. This meant that it was possible for two
> >>> processes racing on their allocation requests to each end up with
> >>> two queues on the same SDMA engine. That could serialize the
> >>> performance of commands to different queues.
> >>>
> >>> This new mechanism allows each process at most a single queue
> >>> on each SDMA engine. Processes will check for queue availability on
> >>> SDMA engine 0, then 1, then 2, etc. and only allocate on that
> >>> engine if there is an available queue slot. If there are no
> >>> queue slots available (or if this process has already allocated
> >>> a queue on this engine), it moves on and tries the next engine.
> >>>
> >>> The Aldebaran chip has a small quirk that SDMA0 should only be
> >>> used by the very first allocation from each process. It is OK to
> >>> use any other SDMA engine at any time, but after the first SDMA
> >>> allocation request from a process, the resulting engine must
> >>> be from SDMA1 or above. This patch handles this case as well.
> >>>
> >>> Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
> >>> ---
> >>>    .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
> >>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
> >>>    drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
> >>>    3 files changed, 107 insertions(+), 33 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>> index f8fce9d05f50..86bdb765f350 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct
> >>> device_queue_manager *dqm,
> >>>    static int map_queues_cpsch(struct device_queue_manager *dqm);
> >>>      static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> >>> +                struct qcm_process_device *qpd,
> >>>                    struct queue *q);
> >>>      static inline void deallocate_hqd(struct device_queue_manager *dqm,
> >>>                    struct queue *q);
> >>>    static int allocate_hqd(struct device_queue_manager *dqm, struct
> >>> queue *q);
> >>>    static int allocate_sdma_queue(struct device_queue_manager *dqm,
> >>> +                struct qcm_process_device *qpd,
> >>>                    struct queue *q);
> >>>    static void kfd_process_hw_exception(struct work_struct *work);
> >>>    @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct
> >>> device_queue_manager *dqm,
> >>>                q->pipe, q->queue);
> >>>        } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> >>>            q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> >>> -        retval = allocate_sdma_queue(dqm, q);
> >>> +        retval = allocate_sdma_queue(dqm, qpd, q);
> >>>            if (retval)
> >>>                goto deallocate_vmid;
> >>>            dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> >>> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct
> >>> device_queue_manager *dqm,
> >>>            deallocate_hqd(dqm, q);
> >>>        else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> >>>            q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> >>> -        deallocate_sdma_queue(dqm, q);
> >>> +        deallocate_sdma_queue(dqm, qpd, q);
> >>>    deallocate_vmid:
> >>>        if (list_empty(&qpd->queues_list))
> >>>            deallocate_vmid(dqm, qpd, q);
> >>> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct
> >>> device_queue_manager *dqm,
> >>>        if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> >>>            deallocate_hqd(dqm, q);
> >>>        else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> >>> -        deallocate_sdma_queue(dqm, q);
> >>> +        deallocate_sdma_queue(dqm, qpd, q);
> >>>        else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> >>> -        deallocate_sdma_queue(dqm, q);
> >>> +        deallocate_sdma_queue(dqm, qpd, q);
> >>>        else {
> >>>            pr_debug("q->properties.type %d is invalid\n",
> >>>                    q->properties.type);
> >>> @@ -1039,42 +1041,93 @@ static void pre_reset(struct
> >>> device_queue_manager *dqm)
> >>>        dqm_unlock(dqm);
> >>>    }
> >>>    +static uint64_t sdma_engine_mask(int engine, int num_engines)
> >>
> >> Looks more like the queue mask for an engine, the name doesn't make it
> >> clear.
> >>
> >>> +{
> >>> +    uint64_t mask = 0;
> >>> +
> >>> +    engine %= num_engines;
> >>> +    while (engine < (sizeof(uint64_t)*8)) {
> >>> +        mask |= 1ULL << engine;
> >>> +        engine += num_engines;
> >>> +    }
> >>> +    return mask;
> >>> +}
> >>> +
> >>>    static int allocate_sdma_queue(struct device_queue_manager *dqm,
> >>> +                struct qcm_process_device *qpd,
> >>>                    struct queue *q)
> >>>    {
> >>> -    int bit;
> >>> +    uint64_t available_queue_bitmap;
> >>> +    unsigned int bit, engine, num_engines;
> >>> +    bool found_sdma = false;
> >>>          if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> >>> -        if (dqm->sdma_bitmap == 0) {
> >>
> >> This is still a valid optimization and no need to loop through if
> >> nothing is available anyway. Valid for XGMI loop also.
> >>
> >>> +        num_engines = get_num_sdma_engines(dqm);
> >>> +        for_each_set_bit(engine, &(qpd->sdma_engine_bitmap),
> >>> num_engines) {
> >>
> >> Probably a naive question -
> >>
> >> Theoretically there are 8 queues per engine as per the mask code. In
> >> the below code, there is an assumption that a process will use at best
> >> number of queues=max num of sdma engines or xgmi engines
> >> simultaneously. Is that true always? For ex: there are 2 sdma engines
> >> and 4 queues per engine. Can't multiple threads of a process initiate
> >> multiple sdma transactions > number of sdma engines available? This
> >> code limits that, but I don't know if that is a possible case.
> >
> > When you use multiple SDMA queues on the same engine, the work from the
> > queues gets serialized. You can either let the SDMA engine serialize
> > work from multiple queues, or let ROCr serialize work from multiple
> > threads on the same SDMA queue. There is no obvious benefit to let the
> > SDMA engine do it. But there is a drawback: Fewer processes get to use SDMA.
> >
> 
> The fact that there exists multiple queues and there is no such
> assumption made in existing logic (prior to the patch) tells that there
> is indeed some advantage of making use of hardware queues. For ex:
> switching to different ring buffer may not need context save/resubmission.
> 
> Just like different processes, different threads of same process may
> take advantage of multiple queues. More interested to see the impact in
> cases where the focus is on single process ex: impact on running some
> benchmarks or in HPC where entire hardware is dedicated for a specific
> compute purpose.
 
Each engine has multiple user-mode queues because we expect to see multiple processes sharing the device at the same time. The internal datasheets from the hardware team strongly suggest using one user-mode queue per engine per process.

With that in mind, the ROCr user-mode runtime attempts to create, at most, one queue per SDMA engine within each process. For instance, ROCr creates one queue for H->D transfers and one queue for D->H transfers. We want these queues to be on different engines so that we can get full-duplex PCIe transfers. If both queues are on the same engine, the engine will serialize the transfers (because it can only work on one command at a time), and we will get half-duplex.

The queue allocation algorithm that this patch replaces could accidentally yield two queues on the same engine. If Process A allocates its first queue, it would receive sdma0_queue0. Process B could then allocate its first queue, and get sdma1_queue0. Then Process A could allocate its second queue and get sdma0_queue1. This would yield undesirable and unexpected serialization between H->D and D->H transfers in Process A. In addition, I believe the old algorithm would allow an uncooperative process to allocate all SDMA queues to itself. Looking at the git history, I believe the old allocator is basically the same as the initial KFD SDMA allocator built for Kaveri. I'm confident we've learned more about how multiple processes should share our GPUs since then. :)
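
To make that interleaving of requests concrete, here is a minimal
user-space sketch (illustrative only, not kernel code, and the names are
made up) that replays the A, B, A request order against a first-fit
allocator and against a per-process engine bitmap like the one in this
patch, using 2 engines with 2 queues each:

#include <stdio.h>
#include <stdint.h>

#define NUM_ENGINES       2
#define QUEUES_PER_ENGINE 2
#define NUM_QUEUES        (NUM_ENGINES * QUEUES_PER_ENGINE)

/* Queue bit i lives on engine i % NUM_ENGINES, as in the driver. */
static uint64_t global_queue_bitmap;

/* Old behaviour: hand out the lowest free queue slot, engine-blind. */
static int alloc_first_fit(void)
{
        int bit;

        for (bit = 0; bit < NUM_QUEUES; bit++) {
                if (global_queue_bitmap & (1ULL << bit)) {
                        global_queue_bitmap &= ~(1ULL << bit);
                        return bit;
                }
        }
        return -1;
}

/* New behaviour: walk engines this process has not used yet and take a
 * free queue slot belonging to that engine.
 */
static int alloc_per_engine(uint64_t *proc_engines)
{
        unsigned int engine, bit;

        for (engine = 0; engine < NUM_ENGINES; engine++) {
                if (!(*proc_engines & (1ULL << engine)))
                        continue;
                for (bit = engine; bit < NUM_QUEUES; bit += NUM_ENGINES) {
                        if (global_queue_bitmap & (1ULL << bit)) {
                                global_queue_bitmap &= ~(1ULL << bit);
                                *proc_engines &= ~(1ULL << engine);
                                return bit;
                        }
                }
        }
        return -1;
}

int main(void)
{
        uint64_t proc_a = (1ULL << NUM_ENGINES) - 1;
        uint64_t proc_b = (1ULL << NUM_ENGINES) - 1;
        int a1, b1, a2;

        global_queue_bitmap = (1ULL << NUM_QUEUES) - 1;
        a1 = alloc_first_fit();         /* A's 1st queue */
        b1 = alloc_first_fit();         /* B's 1st queue */
        a2 = alloc_first_fit();         /* A's 2nd queue */
        /* Prints "first-fit:  A on engines 0,0  B on engine 1" */
        printf("first-fit:  A on engines %d,%d  B on engine %d\n",
               a1 % NUM_ENGINES, a2 % NUM_ENGINES, b1 % NUM_ENGINES);

        global_queue_bitmap = (1ULL << NUM_QUEUES) - 1;
        a1 = alloc_per_engine(&proc_a);
        b1 = alloc_per_engine(&proc_b);
        a2 = alloc_per_engine(&proc_a);
        /* Prints "per-engine: A on engines 0,1  B on engine 0" */
        printf("per-engine: A on engines %d,%d  B on engine %d\n",
               a1 % NUM_ENGINES, a2 % NUM_ENGINES, b1 % NUM_ENGINES);

        return 0;
}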

To your last point: even if we have different threads in the same process submitting transfer requests, the ROCr runtime within the process will submit all like transfers (e.g. all H->D transfers) into the same queue. Submitting those same commands into different queues on the same engine will not yield any clear performance benefit, since the engine will serialize the commands, and it will likely cost some performance due to queue-switching overheads in the engine. I think this patch yields a fairer allocation mechanism (e.g., no process can starve out other processes by taking all the queues on all engines), and it also more closely matches what user-space expects (e.g., different SDMA queues can run commands in parallel).

And thanks for the code suggestions above; I will make those changes as requested.

Thanks!
-Joe

> Thanks,
> Lijo
> 
> > Regards,
> >    Felix
> >
> >
> >>
> >> Thanks,
> >> Lijo
> >>
> >>> +            /* Do not reuse SDMA0 for any subsequent SDMA queue
> >>> +             * requests on Aldebaran. If SDMA0's queues are all
> >>> +             * full, then this process should never use SDMA0
> >>> +             * for any further requests
> >>> +             */
> >>> +            if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> >>> +                    engine == 0)
> >>> +                qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> >>> +
> >>> +            available_queue_bitmap = sdma_engine_mask(engine,
> >>> num_engines);
> >>> +            available_queue_bitmap &= dqm->sdma_bitmap;
> >>> +
> >>> +            if (!available_queue_bitmap)
> >>> +                continue;
> >>> +            /* Take the selected engine off the list so we will not
> >>> +             * allocate two queues onto the same engine
> >>> +             */
> >>> +            qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> >>> +            found_sdma = true;
> >>> +
> >>> +            bit = __ffs64(available_queue_bitmap);
> >>> +            dqm->sdma_bitmap &= ~(1ULL << bit);
> >>> +            q->sdma_id = bit;
> >>> +            q->properties.sdma_engine_id = q->sdma_id % num_engines;
> >>> +            q->properties.sdma_queue_id = q->sdma_id / num_engines;
> >>> +            break;
> >>> +        }
> >>> +        if (!found_sdma) {
> >>>                pr_err("No more SDMA queue to allocate\n");
> >>>                return -ENOMEM;
> >>>            }
> >>> -
> >>> -        bit = __ffs64(dqm->sdma_bitmap);
> >>> -        dqm->sdma_bitmap &= ~(1ULL << bit);
> >>> -        q->sdma_id = bit;
> >>> -        q->properties.sdma_engine_id = q->sdma_id %
> >>> -                get_num_sdma_engines(dqm);
> >>> -        q->properties.sdma_queue_id = q->sdma_id /
> >>> -                get_num_sdma_engines(dqm);
> >>>        } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> >>> -        if (dqm->xgmi_sdma_bitmap == 0) {
> >>> +        num_engines = get_num_xgmi_sdma_engines(dqm);
> >>> +        for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap),
> >>> num_engines) {
> >>> +            available_queue_bitmap = sdma_engine_mask(engine,
> >>> num_engines);
> >>> +            available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
> >>> +
> >>> +            if (!available_queue_bitmap)
> >>> +                continue;
> >>> +            /* Take the selected engine off the list so we will not
> >>> +             * allocate two queues onto the same engine
> >>> +             */
> >>> +            qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
> >>> +            found_sdma = true;
> >>> +
> >>> +            bit = __ffs64(available_queue_bitmap);
> >>> +            dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> >>> +            q->sdma_id = bit;
> >>> +            /* sdma_engine_id is sdma id including
> >>> +             * both PCIe-optimized SDMAs and XGMI-
> >>> +             * optimized SDMAs. The calculation below
> >>> +             * assumes the first N engines are always
> >>> +             * PCIe-optimized ones
> >>> +             */
> >>> +            q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> >>> +                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> >>> +            q->properties.sdma_queue_id = q->sdma_id /
> >>> +                get_num_xgmi_sdma_engines(dqm);
> >>> +            break;
> >>> +        }
> >>> +        if (!found_sdma) {
> >>>                pr_err("No more XGMI SDMA queue to allocate\n");
> >>>                return -ENOMEM;
> >>>            }
> >>> -        bit = __ffs64(dqm->xgmi_sdma_bitmap);
> >>> -        dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> >>> -        q->sdma_id = bit;
> >>> -        /* sdma_engine_id is sdma id including
> >>> -         * both PCIe-optimized SDMAs and XGMI-
> >>> -         * optimized SDMAs. The calculation below
> >>> -         * assumes the first N engines are always
> >>> -         * PCIe-optimized ones
> >>> -         */
> >>> -        q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> >>> -                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> >>> -        q->properties.sdma_queue_id = q->sdma_id /
> >>> -                get_num_xgmi_sdma_engines(dqm);
> >>>        }
> >>>          pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
> >>> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct
> >>> device_queue_manager *dqm,
> >>>    }
> >>>      static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> >>> +                struct qcm_process_device *qpd,
> >>>                    struct queue *q)
> >>>    {
> >>> +    uint32_t engine = q->properties.sdma_engine_id;
> >>> +
> >>>        if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> >>>            if (q->sdma_id >= get_num_sdma_queues(dqm))
> >>>                return;
> >>>            dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> >>> +        /* Don't give SDMA0 back to be reallocated on Aldebaran.
> >>> +         * It is only OK to use this engine from the first allocation
> >>> +         * within a process.
> >>> +         */
> >>> +        if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> >>> +                    engine == 0))
> >>> +            qpd->sdma_engine_bitmap |= (1ULL << engine);
> >>> +
> >>>        } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> >>>            if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
> >>>                return;
> >>>            dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> >>> +        /* engine ID in the queue properties is the global engine ID.
> >>> +         * The XGMI engine bitmap ignores the PCIe-optimized engines.
> >>> +         */
> >>> +        engine -= get_num_sdma_engines(dqm);
> >>> +        qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
> >>>        }
> >>>    }
> >>>    @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct
> >>> device_queue_manager *dqm, struct queue *q,
> >>>        if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> >>>            q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> >>>            dqm_lock(dqm);
> >>> -        retval = allocate_sdma_queue(dqm, q);
> >>> +        retval = allocate_sdma_queue(dqm, qpd, q);
> >>>            dqm_unlock(dqm);
> >>>            if (retval)
> >>>                goto out;
> >>> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct
> >>> device_queue_manager *dqm, struct queue *q,
> >>>        if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> >>>            q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> >>>            dqm_lock(dqm);
> >>> -        deallocate_sdma_queue(dqm, q);
> >>> +        deallocate_sdma_queue(dqm, qpd, q);
> >>>            dqm_unlock(dqm);
> >>>        }
> >>>    out:
> >>> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct
> >>> device_queue_manager *dqm,
> >>>          if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
> >>>            (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> >>> -        deallocate_sdma_queue(dqm, q);
> >>> +        deallocate_sdma_queue(dqm, qpd, q);
> >>>            pdd->sdma_past_activity_counter += sdma_val;
> >>>        }
> >>>    @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct
> >>> device_queue_manager *dqm,
> >>>        /* Clear all user mode queues */
> >>>        list_for_each_entry(q, &qpd->queues_list, list) {
> >>>            if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> >>> -            deallocate_sdma_queue(dqm, q);
> >>> +            deallocate_sdma_queue(dqm, qpd, q);
> >>>            else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> >>> -            deallocate_sdma_queue(dqm, q);
> >>> +            deallocate_sdma_queue(dqm, qpd, q);
> >>>              if (q->properties.is_active) {
> >>>                decrement_queue_count(dqm, q->properties.type);
> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> index ab83b0de6b22..c38eebc9db4d 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>> @@ -576,6 +576,8 @@ struct qcm_process_device {
> >>>        struct list_head priv_queue_list;
> >>>          unsigned int queue_count;
> >>> +    unsigned long sdma_engine_bitmap;
> >>> +    unsigned long xgmi_sdma_engine_bitmap;
> >>>        unsigned int vmid;
> >>>        bool is_debug;
> >>>        unsigned int evicted; /* eviction counter, 0=active */
> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >>> index 21ec8a18cad2..13c85624bf7d 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >>> @@ -1422,6 +1422,7 @@ struct kfd_process_device
> >>> *kfd_create_process_device_data(struct kfd_dev *dev,
> >>>                                struct kfd_process *p)
> >>>    {
> >>>        struct kfd_process_device *pdd = NULL;
> >>> +    const struct kfd_device_info *dev_info =
> >>> dev->dqm->dev->device_info;
> >>>          if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
> >>>            return NULL;
> >>> @@ -1446,6 +1447,8 @@ struct kfd_process_device
> >>> *kfd_create_process_device_data(struct kfd_dev *dev,
> >>>        pdd->qpd.pqm = &p->pqm;
> >>>        pdd->qpd.evicted = 0;
> >>>        pdd->qpd.mapped_gws_queue = false;
> >>> +    pdd->qpd.sdma_engine_bitmap =
> >>> BIT_ULL(dev_info->num_sdma_engines) - 1;
> >>> +    pdd->qpd.xgmi_sdma_engine_bitmap =
> >>> BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
> >>>        pdd->process = p;
> >>>        pdd->bound = PDD_UNBOUND;
> >>>        pdd->already_dequeued = false;
> >>>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
  2021-08-24  6:49       ` Greathouse, Joseph
@ 2021-08-24  7:23         ` Lazar, Lijo
  2021-08-24  7:56           ` Greathouse, Joseph
  0 siblings, 1 reply; 13+ messages in thread
From: Lazar, Lijo @ 2021-08-24  7:23 UTC (permalink / raw)
  To: Greathouse, Joseph, Kuehling, Felix, amd-gfx



On 8/24/2021 12:19 PM, Greathouse, Joseph wrote:
>> -----Original Message-----
>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>> Sent: Monday, August 23, 2021 11:37 PM
>> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph <Joseph.Greathouse@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
>>
>> On 8/23/2021 10:34 PM, Felix Kuehling wrote:
>>> Am 2021-08-23 um 3:08 a.m. schrieb Lazar, Lijo:
>>>>
>>>>
>>>> On 8/20/2021 11:02 AM, Joseph Greathouse wrote:
>>>>> Give every process at most one queue from each SDMA engine.
>>>>> Previously, we allocated all SDMA engines and queues on a first-
>>>>> come-first-serve basis. This meant that it was possible for two
>>>>> processes racing on their allocation requests to each end up with
>>>>> two queues on the same SDMA engine. That could serialize the
>>>>> performance of commands to different queues.
>>>>>
>>>>> This new mechanism allows each process at most a single queue
>>>>> on each SDMA engine. Processes will check for queue availability on
>>>>> SDMA engine 0, then 1, then 2, etc. and only allocate on that
>>>>> engine if there is an available queue slot. If there are no
>>>>> queue slots available (or if this process has already allocated
>>>>> a queue on this engine), it moves on and tries the next engine.
>>>>>
>>>>> The Aldebaran chip has a small quirk that SDMA0 should only be
>>>>> used by the very first allocation from each process. It is OK to
>>>>> use any other SDMA engine at any time, but after the first SDMA
>>>>> allocation request from a process, the resulting engine must
>>>>> be from SDMA1 or above. This patch handles this case as well.
>>>>>
>>>>> Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
>>>>> ---
>>>>>     .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
>>>>>     3 files changed, 107 insertions(+), 33 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> index f8fce9d05f50..86bdb765f350 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct
>>>>> device_queue_manager *dqm,
>>>>>     static int map_queues_cpsch(struct device_queue_manager *dqm);
>>>>>       static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>>>> +                struct qcm_process_device *qpd,
>>>>>                     struct queue *q);
>>>>>       static inline void deallocate_hqd(struct device_queue_manager *dqm,
>>>>>                     struct queue *q);
>>>>>     static int allocate_hqd(struct device_queue_manager *dqm, struct
>>>>> queue *q);
>>>>>     static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>>>> +                struct qcm_process_device *qpd,
>>>>>                     struct queue *q);
>>>>>     static void kfd_process_hw_exception(struct work_struct *work);
>>>>>     @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct
>>>>> device_queue_manager *dqm,
>>>>>                 q->pipe, q->queue);
>>>>>         } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>>>             q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>>> -        retval = allocate_sdma_queue(dqm, q);
>>>>> +        retval = allocate_sdma_queue(dqm, qpd, q);
>>>>>             if (retval)
>>>>>                 goto deallocate_vmid;
>>>>>             dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>>>>> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct
>>>>> device_queue_manager *dqm,
>>>>>             deallocate_hqd(dqm, q);
>>>>>         else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>>>             q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>>>> -        deallocate_sdma_queue(dqm, q);
>>>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>>>     deallocate_vmid:
>>>>>         if (list_empty(&qpd->queues_list))
>>>>>             deallocate_vmid(dqm, qpd, q);
>>>>> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct
>>>>> device_queue_manager *dqm,
>>>>>         if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
>>>>>             deallocate_hqd(dqm, q);
>>>>>         else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
>>>>> -        deallocate_sdma_queue(dqm, q);
>>>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>>>         else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>>>> -        deallocate_sdma_queue(dqm, q);
>>>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>>>         else {
>>>>>             pr_debug("q->properties.type %d is invalid\n",
>>>>>                     q->properties.type);
>>>>> @@ -1039,42 +1041,93 @@ static void pre_reset(struct
>>>>> device_queue_manager *dqm)
>>>>>         dqm_unlock(dqm);
>>>>>     }
>>>>>     +static uint64_t sdma_engine_mask(int engine, int num_engines)
>>>>
>>>> Looks more like the queue mask for an engine, the name doesn't make it
>>>> clear.
>>>>
>>>>> +{
>>>>> +    uint64_t mask = 0;
>>>>> +
>>>>> +    engine %= num_engines;
>>>>> +    while (engine < (sizeof(uint64_t)*8)) {
>>>>> +        mask |= 1ULL << engine;
>>>>> +        engine += num_engines;
>>>>> +    }
>>>>> +    return mask;
>>>>> +}
>>>>> +
>>>>>     static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>>>> +                struct qcm_process_device *qpd,
>>>>>                     struct queue *q)
>>>>>     {
>>>>> -    int bit;
>>>>> +    uint64_t available_queue_bitmap;
>>>>> +    unsigned int bit, engine, num_engines;
>>>>> +    bool found_sdma = false;
>>>>>           if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>>>> -        if (dqm->sdma_bitmap == 0) {
>>>>
>>>> This is still a valid optimization and no need to loop through if
>>>> nothing is available anyway. Valid for XGMI loop also.
>>>>
>>>>> +        num_engines = get_num_sdma_engines(dqm);
>>>>> +        for_each_set_bit(engine, &(qpd->sdma_engine_bitmap),
>>>>> num_engines) {
>>>>
>>>> Probably a naive question -
>>>>
>>>> Theoretically there are 8 queues per engine as per the mask code. In
>>>> the below code, there is an assumption that a process will use at best
>>>> number of queues=max num of sdma engines or xgmi engines
>>>> simultaneously. Is that true always? For ex: there are 2 sdma engines
>>>> and 4 queues per engine. Can't multiple threads of a process initiate
>>>> multiple sdma transactions > number of sdma engines available? This
>>>> code limits that, but I don't know if that is a possible case.
>>>
>>> When you use multiple SDMA queues on the same engine, the work from the
>>> queues gets serialized. You can either let the SDMA engine serialize
>>> work from multiple queues, or let ROCr serialize work from multiple
>>> threads on the same SDMA queue. There is no obvious benefit to let the
>>> SDMA engine do it. But there is a drawback: Fewer processes get to use SDMA.
>>>
>>
>> The fact that there exists multiple queues and there is no such
>> assumption made in existing logic (prior to the patch) tells that there
>> is indeed some advantage of making use of hardware queues. For ex:
>> switching to different ring buffer may not need context save/resubmission.
>>
>> Just like different processes, different threads of same process may
>> take advantage of multiple queues. More interested to see the impact in
>> cases where the focus is on single process ex: impact on running some
>> benchmarks or in HPC where entire hardware is dedicated for a specific
>> compute purpose.
>   
> Each engine has multiple user-mode queues because we expect to see multiple processes sharing the device at the same time. The internal datasheets from the hardware team strongly suggest using one user-mode queue per engine per process.
> 
> With that in mind, the ROCr user-mode runtime attempts to create, at most, one queue per SDMA engine within each process. For instance, ROCr creates one queue for H->D transfers and one queue for D->H transfers. We want these queues to be on different engines so that we can get full-duplex PCIe transfers. If both queues are on the same engine, the engine will serialize the transfers (because it can only work on one command at a time), and we will get half-duplex.
> 
> The queue allocation algorithm that this patch replaces could accidentally yield two queues on the same engine. If Process A allocates its first queue, it would receive sdma0_queue0. Process B could then allocate its first queue, and get sdma1_queue0. Then Process A could allocate its second queue and get sdma0_queue1. This would yield undesirable and unexpected serialization between H->D and D->H transfers in Process A. In addition, I believe the old algorithm would allow an uncooperative process to allocate all SDMA queues to itself. Looking at the git history, I believe the old allocator is basically the same as the initial KFD SDMA allocator built for Kaveri. I'm confident we've learned more about how multiple processes should share our GPUs since then. :)
> 
> To your last point: even if we have different threads in the same process submitting transfer requests, the ROCr runtime within the process will submit all like transfers (e.g. all H->D transfers) into the same queue. Submitting those same commands into different queues on the same engine will not yield any clear performance benefit since the engine will serialize the commands.  And it will likely yield some performance loss due to queue-switching overheads in the engine. I think this patch yields a more fair allocation mechanism (e.g., no process can starve out other processes by taking all the queues on all engines), and it also more closely match what user-space expects (e.g., different SDMA queues can run commands in parallel).
> 

Thanks Joe for providing more info related to ROCr.

One other thing: it seems the process will always keep waiting for the
second queue (assuming the first is still in use) to be allotted on a
separate engine. In the example cited above, do you expect a case where
sdma1's queues are all full and Process A has to wait until a sdma1 queue
is free, even though there is a free queue available on sdma0 (by virtue
of long/short jobs submitted earlier by other processes)? Is that
situation managed by ROCr? Or, as a worst case, allocate on sdma0 itself
if nothing is available on the other engines? Rather than going in any
fixed order, maybe another way is to order the engines by how many free
queues they have and give preference to the first engine not actively
used by the process; a rough sketch of that idea is below.
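
Something along those lines, as a rough user-space sketch (made-up names,
not a proposed patch; it assumes the per-process engine bitmap keeps a
set bit for every engine the process has not used yet, as in
qpd->sdma_engine_bitmap):

#include <stdio.h>
#include <stdint.h>

/* Count the free queue slots that belong to one engine, with queue bit i
 * owned by engine i % num_engines.
 */
static int engine_free_queues(uint64_t queue_bitmap, int engine,
                              int num_engines, int num_queues)
{
        int bit, count = 0;

        for (bit = engine; bit < num_queues; bit += num_engines)
                if (queue_bitmap & (1ULL << bit))
                        count++;
        return count;
}

/* Pick the engine with the most free queue slots, preferring engines
 * this process has not used yet. Returns -1 if nothing is free.
 */
static int pick_engine(uint64_t queue_bitmap, uint64_t proc_engine_bitmap,
                       int num_engines, int num_queues)
{
        int engine, best = -1, best_free = 0;
        int unused_best = -1, unused_best_free = 0;

        for (engine = 0; engine < num_engines; engine++) {
                int free = engine_free_queues(queue_bitmap, engine,
                                              num_engines, num_queues);

                if (free > best_free) {
                        best_free = free;
                        best = engine;
                }
                /* Set bit means the process has not touched this engine. */
                if ((proc_engine_bitmap & (1ULL << engine)) &&
                    free > unused_best_free) {
                        unused_best_free = free;
                        unused_best = engine;
                }
        }
        return unused_best >= 0 ? unused_best : best;
}

int main(void)
{
        /* 8 queue slots interleaved across 2 engines: engine 0 has one
         * free slot (bit 2), engine 1 has two (bits 1 and 3). The process
         * has only engine 0 still unused, so engine 0 is chosen.
         */
        printf("chosen engine: %d\n", pick_engine(0x0E, 0x1, 2, 8));
        return 0;
}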

Thanks,
Lijo

> And thanks for the code suggestions up above, I will make those changes as requested.
> 
> Thanks!
> -Joe
> 
>> Thanks,
>> Lijo
>>
>>> Regards,
>>>     Felix
>>>
>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>> +            /* Do not reuse SDMA0 for any subsequent SDMA queue
>>>>> +             * requests on Aldebaran. If SDMA0's queues are all
>>>>> +             * full, then this process should never use SDMA0
>>>>> +             * for any further requests
>>>>> +             */
>>>>> +            if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
>>>>> +                    engine == 0)
>>>>> +                qpd->sdma_engine_bitmap &= ~(1ULL << engine);
>>>>> +
>>>>> +            available_queue_bitmap = sdma_engine_mask(engine,
>>>>> num_engines);
>>>>> +            available_queue_bitmap &= dqm->sdma_bitmap;
>>>>> +
>>>>> +            if (!available_queue_bitmap)
>>>>> +                continue;
>>>>> +            /* Take the selected engine off the list so we will not
>>>>> +             * allocate two queues onto the same engine
>>>>> +             */
>>>>> +            qpd->sdma_engine_bitmap &= ~(1ULL << engine);
>>>>> +            found_sdma = true;
>>>>> +
>>>>> +            bit = __ffs64(available_queue_bitmap);
>>>>> +            dqm->sdma_bitmap &= ~(1ULL << bit);
>>>>> +            q->sdma_id = bit;
>>>>> +            q->properties.sdma_engine_id = q->sdma_id % num_engines;
>>>>> +            q->properties.sdma_queue_id = q->sdma_id / num_engines;
>>>>> +            break;
>>>>> +        }
>>>>> +        if (!found_sdma) {
>>>>>                 pr_err("No more SDMA queue to allocate\n");
>>>>>                 return -ENOMEM;
>>>>>             }
>>>>> -
>>>>> -        bit = __ffs64(dqm->sdma_bitmap);
>>>>> -        dqm->sdma_bitmap &= ~(1ULL << bit);
>>>>> -        q->sdma_id = bit;
>>>>> -        q->properties.sdma_engine_id = q->sdma_id %
>>>>> -                get_num_sdma_engines(dqm);
>>>>> -        q->properties.sdma_queue_id = q->sdma_id /
>>>>> -                get_num_sdma_engines(dqm);
>>>>>         } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>>> -        if (dqm->xgmi_sdma_bitmap == 0) {
>>>>> +        num_engines = get_num_xgmi_sdma_engines(dqm);
>>>>> +        for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap),
>>>>> num_engines) {
>>>>> +            available_queue_bitmap = sdma_engine_mask(engine,
>>>>> num_engines);
>>>>> +            available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
>>>>> +
>>>>> +            if (!available_queue_bitmap)
>>>>> +                continue;
>>>>> +            /* Take the selected engine off the list so we will not
>>>>> +             * allocate two queues onto the same engine
>>>>> +             */
>>>>> +            qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
>>>>> +            found_sdma = true;
>>>>> +
>>>>> +            bit = __ffs64(available_queue_bitmap);
>>>>> +            dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>>>>> +            q->sdma_id = bit;
>>>>> +            /* sdma_engine_id is sdma id including
>>>>> +             * both PCIe-optimized SDMAs and XGMI-
>>>>> +             * optimized SDMAs. The calculation below
>>>>> +             * assumes the first N engines are always
>>>>> +             * PCIe-optimized ones
>>>>> +             */
>>>>> +            q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
>>>>> +                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
>>>>> +            q->properties.sdma_queue_id = q->sdma_id /
>>>>> +                get_num_xgmi_sdma_engines(dqm);
>>>>> +            break;
>>>>> +        }
>>>>> +        if (!found_sdma) {
>>>>>                 pr_err("No more XGMI SDMA queue to allocate\n");
>>>>>                 return -ENOMEM;
>>>>>             }
>>>>> -        bit = __ffs64(dqm->xgmi_sdma_bitmap);
>>>>> -        dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>>>>> -        q->sdma_id = bit;
>>>>> -        /* sdma_engine_id is sdma id including
>>>>> -         * both PCIe-optimized SDMAs and XGMI-
>>>>> -         * optimized SDMAs. The calculation below
>>>>> -         * assumes the first N engines are always
>>>>> -         * PCIe-optimized ones
>>>>> -         */
>>>>> -        q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
>>>>> -                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
>>>>> -        q->properties.sdma_queue_id = q->sdma_id /
>>>>> -                get_num_xgmi_sdma_engines(dqm);
>>>>>         }
>>>>>           pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
>>>>> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct
>>>>> device_queue_manager *dqm,
>>>>>     }
>>>>>       static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>>>> +                struct qcm_process_device *qpd,
>>>>>                     struct queue *q)
>>>>>     {
>>>>> +    uint32_t engine = q->properties.sdma_engine_id;
>>>>> +
>>>>>         if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>>>>             if (q->sdma_id >= get_num_sdma_queues(dqm))
>>>>>                 return;
>>>>>             dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>>>>> +        /* Don't give SDMA0 back to be reallocated on Aldebaran.
>>>>> +         * It is only OK to use this engine from the first allocation
>>>>> +         * within a process.
>>>>> +         */
>>>>> +        if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
>>>>> +                    engine == 0))
>>>>> +            qpd->sdma_engine_bitmap |= (1ULL << engine);
>>>>> +
>>>>>         } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>>>             if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>>>>>                 return;
>>>>>             dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>>>>> +        /* engine ID in the queue properties is the global engine ID.
>>>>> +         * The XGMI engine bitmap ignores the PCIe-optimized engines.
>>>>> +         */
>>>>> +        engine -= get_num_sdma_engines(dqm);
>>>>> +        qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
>>>>>         }
>>>>>     }
>>>>>     @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct
>>>>> device_queue_manager *dqm, struct queue *q,
>>>>>         if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>>>             q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>>>             dqm_lock(dqm);
>>>>> -        retval = allocate_sdma_queue(dqm, q);
>>>>> +        retval = allocate_sdma_queue(dqm, qpd, q);
>>>>>             dqm_unlock(dqm);
>>>>>             if (retval)
>>>>>                 goto out;
>>>>> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct
>>>>> device_queue_manager *dqm, struct queue *q,
>>>>>         if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>>>             q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>>>             dqm_lock(dqm);
>>>>> -        deallocate_sdma_queue(dqm, q);
>>>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>>>             dqm_unlock(dqm);
>>>>>         }
>>>>>     out:
>>>>> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct
>>>>> device_queue_manager *dqm,
>>>>>           if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>>>>>             (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>>>>> -        deallocate_sdma_queue(dqm, q);
>>>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>>>             pdd->sdma_past_activity_counter += sdma_val;
>>>>>         }
>>>>>     @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct
>>>>> device_queue_manager *dqm,
>>>>>         /* Clear all user mode queues */
>>>>>         list_for_each_entry(q, &qpd->queues_list, list) {
>>>>>             if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
>>>>> -            deallocate_sdma_queue(dqm, q);
>>>>> +            deallocate_sdma_queue(dqm, qpd, q);
>>>>>             else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>>>> -            deallocate_sdma_queue(dqm, q);
>>>>> +            deallocate_sdma_queue(dqm, qpd, q);
>>>>>               if (q->properties.is_active) {
>>>>>                 decrement_queue_count(dqm, q->properties.type);
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> index ab83b0de6b22..c38eebc9db4d 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> @@ -576,6 +576,8 @@ struct qcm_process_device {
>>>>>         struct list_head priv_queue_list;
>>>>>           unsigned int queue_count;
>>>>> +    unsigned long sdma_engine_bitmap;
>>>>> +    unsigned long xgmi_sdma_engine_bitmap;
>>>>>         unsigned int vmid;
>>>>>         bool is_debug;
>>>>>         unsigned int evicted; /* eviction counter, 0=active */
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> index 21ec8a18cad2..13c85624bf7d 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> @@ -1422,6 +1422,7 @@ struct kfd_process_device
>>>>> *kfd_create_process_device_data(struct kfd_dev *dev,
>>>>>                                 struct kfd_process *p)
>>>>>     {
>>>>>         struct kfd_process_device *pdd = NULL;
>>>>> +    const struct kfd_device_info *dev_info =
>>>>> dev->dqm->dev->device_info;
>>>>>           if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
>>>>>             return NULL;
>>>>> @@ -1446,6 +1447,8 @@ struct kfd_process_device
>>>>> *kfd_create_process_device_data(struct kfd_dev *dev,
>>>>>         pdd->qpd.pqm = &p->pqm;
>>>>>         pdd->qpd.evicted = 0;
>>>>>         pdd->qpd.mapped_gws_queue = false;
>>>>> +    pdd->qpd.sdma_engine_bitmap =
>>>>> BIT_ULL(dev_info->num_sdma_engines) - 1;
>>>>> +    pdd->qpd.xgmi_sdma_engine_bitmap =
>>>>> BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
>>>>>         pdd->process = p;
>>>>>         pdd->bound = PDD_UNBOUND;
>>>>>         pdd->already_dequeued = false;
>>>>>


* RE: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
  2021-08-24  7:23         ` Lazar, Lijo
@ 2021-08-24  7:56           ` Greathouse, Joseph
  2021-08-24 12:33             ` Lazar, Lijo
  0 siblings, 1 reply; 13+ messages in thread
From: Greathouse, Joseph @ 2021-08-24  7:56 UTC (permalink / raw)
  To: Lazar, Lijo, Kuehling, Felix, amd-gfx

> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar@amd.com>
> Sent: Tuesday, August 24, 2021 2:24 AM
> To: Greathouse, Joseph <Joseph.Greathouse@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; amd-
> gfx@lists.freedesktop.org
> Subject: Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
> 
> 
> 
> On 8/24/2021 12:19 PM, Greathouse, Joseph wrote:
> >> -----Original Message-----
> >> From: Lazar, Lijo <Lijo.Lazar@amd.com>
> >> Sent: Monday, August 23, 2021 11:37 PM
> >> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph <Joseph.Greathouse@amd.com>; amd-
> >> gfx@lists.freedesktop.org
> >> Subject: Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
> >>
> >> On 8/23/2021 10:34 PM, Felix Kuehling wrote:
> >>> Am 2021-08-23 um 3:08 a.m. schrieb Lazar, Lijo:
> >>>>
> >>>>
> >>>> On 8/20/2021 11:02 AM, Joseph Greathouse wrote:
> >>>>> Give every process at most one queue from each SDMA engine.
> >>>>> Previously, we allocated all SDMA engines and queues on a first-
> >>>>> come-first-serve basis. This meant that it was possible for two
> >>>>> processes racing on their allocation requests to each end up with
> >>>>> two queues on the same SDMA engine. That could serialize the
> >>>>> performance of commands to different queues.
> >>>>>
> >>>>> This new mechanism allows each process at most a single queue
> >>>>> on each SDMA engine. Processes will check for queue availability on
> >>>>> SDMA engine 0, then 1, then 2, etc. and only allocate on that
> >>>>> engine if there is an available queue slot. If there are no
> >>>>> queue slots available (or if this process has already allocated
> >>>>> a queue on this engine), it moves on and tries the next engine.
> >>>>>
> >>>>> The Aldebaran chip has a small quirk that SDMA0 should only be
> >>>>> used by the very first allocation from each process. It is OK to
> >>>>> use any other SDMA engine at any time, but after the first SDMA
> >>>>> allocation request from a process, the resulting engine must
> >>>>> be from SDMA1 or above. This patch handles this case as well.
> >>>>>
> >>>>> Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
> >>>>> ---
> >>>>>     .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
> >>>>>     drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
> >>>>>     drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
> >>>>>     3 files changed, 107 insertions(+), 33 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>>>> index f8fce9d05f50..86bdb765f350 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> >>>>> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct
> >>>>> device_queue_manager *dqm,
> >>>>>     static int map_queues_cpsch(struct device_queue_manager *dqm);
> >>>>>       static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> >>>>> +                struct qcm_process_device *qpd,
> >>>>>                     struct queue *q);
> >>>>>       static inline void deallocate_hqd(struct device_queue_manager *dqm,
> >>>>>                     struct queue *q);
> >>>>>     static int allocate_hqd(struct device_queue_manager *dqm, struct
> >>>>> queue *q);
> >>>>>     static int allocate_sdma_queue(struct device_queue_manager *dqm,
> >>>>> +                struct qcm_process_device *qpd,
> >>>>>                     struct queue *q);
> >>>>>     static void kfd_process_hw_exception(struct work_struct *work);
> >>>>>     @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct
> >>>>> device_queue_manager *dqm,
> >>>>>                 q->pipe, q->queue);
> >>>>>         } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> >>>>>             q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> >>>>> -        retval = allocate_sdma_queue(dqm, q);
> >>>>> +        retval = allocate_sdma_queue(dqm, qpd, q);
> >>>>>             if (retval)
> >>>>>                 goto deallocate_vmid;
> >>>>>             dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> >>>>> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct
> >>>>> device_queue_manager *dqm,
> >>>>>             deallocate_hqd(dqm, q);
> >>>>>         else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> >>>>>             q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> >>>>> -        deallocate_sdma_queue(dqm, q);
> >>>>> +        deallocate_sdma_queue(dqm, qpd, q);
> >>>>>     deallocate_vmid:
> >>>>>         if (list_empty(&qpd->queues_list))
> >>>>>             deallocate_vmid(dqm, qpd, q);
> >>>>> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct
> >>>>> device_queue_manager *dqm,
> >>>>>         if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> >>>>>             deallocate_hqd(dqm, q);
> >>>>>         else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> >>>>> -        deallocate_sdma_queue(dqm, q);
> >>>>> +        deallocate_sdma_queue(dqm, qpd, q);
> >>>>>         else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> >>>>> -        deallocate_sdma_queue(dqm, q);
> >>>>> +        deallocate_sdma_queue(dqm, qpd, q);
> >>>>>         else {
> >>>>>             pr_debug("q->properties.type %d is invalid\n",
> >>>>>                     q->properties.type);
> >>>>> @@ -1039,42 +1041,93 @@ static void pre_reset(struct
> >>>>> device_queue_manager *dqm)
> >>>>>         dqm_unlock(dqm);
> >>>>>     }
> >>>>>     +static uint64_t sdma_engine_mask(int engine, int num_engines)
> >>>>
> >>>> Looks more like the queue mask for an engine, the name doesn't make it
> >>>> clear.
> >>>>
> >>>>> +{
> >>>>> +    uint64_t mask = 0;
> >>>>> +
> >>>>> +    engine %= num_engines;
> >>>>> +    while (engine < (sizeof(uint64_t)*8)) {
> >>>>> +        mask |= 1ULL << engine;
> >>>>> +        engine += num_engines;
> >>>>> +    }
> >>>>> +    return mask;
> >>>>> +}
> >>>>> +
> >>>>>     static int allocate_sdma_queue(struct device_queue_manager *dqm,
> >>>>> +                struct qcm_process_device *qpd,
> >>>>>                     struct queue *q)
> >>>>>     {
> >>>>> -    int bit;
> >>>>> +    uint64_t available_queue_bitmap;
> >>>>> +    unsigned int bit, engine, num_engines;
> >>>>> +    bool found_sdma = false;
> >>>>>           if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> >>>>> -        if (dqm->sdma_bitmap == 0) {
> >>>>
> >>>> This is still a valid optimization and no need to loop through if
> >>>> nothing is available anyway. Valid for XGMI loop also.
> >>>>
> >>>>> +        num_engines = get_num_sdma_engines(dqm);
> >>>>> +        for_each_set_bit(engine, &(qpd->sdma_engine_bitmap),
> >>>>> num_engines) {
> >>>>
> >>>> Probably a naive question -
> >>>>
> >>>> Theoretically there are 8 queues per engine as per the mask code. In
> >>>> the below code, there is an assumption that a process will use at best
> >>>> number of queues=max num of sdma engines or xgmi engines
> >>>> simultaneously. Is that true always? For ex: there are 2 sdma engines
> >>>> and 4 queues per engine. Can't multiple threads of a process initiate
> >>>> multiple sdma transactions > number of sdma engines available? This
> >>>> code limits that, but I don't know if that is a possible case.
> >>>
> >>> When you use multiple SDMA queues on the same engine, the work from the
> >>> queues gets serialized. You can either let the SDMA engine serialize
> >>> work from multiple queues, or let ROCr serialize work from multiple
> >>> threads on the same SDMA queue. There is no obvious benefit to let the
> >>> SDMA engine do it. But there is a drawback: Fewer processes get to use SDMA.
> >>>
> >>
> >> The fact that there exists multiple queues and there is no such
> >> assumption made in existing logic (prior to the patch) tells that there
> >> is indeed some advantage of making use of hardware queues. For ex:
> >> switching to different ring buffer may not need context save/resubmission.
> >>
> >> Just like different processes, different threads of same process may
> >> take advantage of multiple queues. More interested to see the impact in
> >> cases where the focus is on single process ex: impact on running some
> >> benchmarks or in HPC where entire hardware is dedicated for a specific
> >> compute purpose.
> >
> > Each engine has multiple user-mode queues because we expect to see multiple processes sharing the device at the same time. The
> internal datasheets from the hardware team strongly suggest using one user-mode queue per engine per process.
> >
> > With that in mind, the ROCr user-mode runtime attempts to create, at most, one queue per SDMA engine within each process. For
> instance, ROCr creates one queue for H->D transfers and one queue for D->H transfers. We want these queues to be on different
> engines so that we can get full-duplex PCIe transfers. If both queues are on the same engine, the engine will serialize the transfers
> (because it can only work on one command at a time), and we will get half-duplex.
> >
> > The queue allocation algorithm that this patch replaces could accidentally yield two queues on the same engine. If Process A
> allocates its first queue, it would receive sdma0_queue0. Process B could then allocate its first queue, and get sdma1_queue0. Then
> Process A could allocate its second queue and get sdma0_queue1. This would yield undesirable and unexpected serialization between
> H->D and D->H transfers in Process A. In addition, I believe the old algorithm would allow an uncooperative process to allocate all
> SDMA queues to itself. Looking at the git history, I believe the old allocator is basically the same as the initial KFD SDMA allocator built
> for Kaveri. I'm confident we've learned more about how multiple processes should share our GPUs since then. :)
> >
> > To your last point: even if we have different threads in the same process submitting transfer requests, the ROCr runtime within the
> process will submit all like transfers (e.g. all H->D transfers) into the same queue. Submitting those same commands into different
> queues on the same engine will not yield any clear performance benefit since the engine will serialize the commands.  And it will likely
> yield some performance loss due to queue-switching overheads in the engine. I think this patch yields a more fair allocation
> mechanism (e.g., no process can starve out other processes by taking all the queues on all engines), and it also more closely match
> what user-space expects (e.g., different SDMA queues can run commands in parallel).
> >
> 
> Thanks Joe for providing more info related to ROCr.
> 
> One other thing is - it seems the process will always keep waiting
> for the second queue (assuming the first is still in use) to be
> allotted on a separate engine. In the example cited above, do you
> expect a case where sdma1's queues are all full and Process A has to
> wait till an sdma1 queue is free, even though there is a free queue
> available in sdma0 (by virtue of long/short jobs submitted earlier by
> other processes)? Is that situation managed by ROCr? Or, as a worst
> case, should it allocate on sdma0 itself if nothing is available on
> the other engines? Rather than going in index order, maybe another
> way is to order engines based on the number of free queues and give
> preference to the first engine not actively used by the process.

I think there are a few parts to this answer:

User-space has always been responsible for handling an allocation failure and falling back in some capacity, even before this patch. If the process receives a NAK and has no SDMA queues, it may need to fall back to a blit copy-kernel submitted to a compute queue. ROCr does this today.

Another example of when this allocator could NAK: this process tries to allocate a 2nd PCIe-optimized SDMA queue while it already has a queue on SDMA0 and SDMA1 is full. User-space could also use a blit kernel to handle this, if it wants transfers to run in parallel (one direction in SDMA, one with a blit). Or it could decide to serialize all transfer commands into the single queue it already has.
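
Roughly, the user-space decision on a NAK looks like this (illustrative
pseudocode only - these helper names are invented and are not real ROCr
entry points):

        /* Illustrative pseudocode, not real ROCr code */
        if (create_sdma_queue(gpu, &new_queue) != 0) {  /* KFD returned -ENOMEM */
                if (existing_sdma_queue)
                        /* serialize onto the one SDMA queue we already own */
                        enqueue_sdma_copy(existing_sdma_queue, dst, src, size);
                else
                        /* no SDMA at all: fall back to a blit copy-kernel */
                        enqueue_blit_copy(compute_queue, dst, src, size);
        }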

A third situation would be if Process A initially requests a queue while SDMA0 is full, so its first queue is allocated on SDMA1. Later, another process ends and one of the SDMA0 queues is freed. If Process A then requests a second SDMA queue, it will receive it on SDMA0. (Except on Aldebaran, due to the hardware quirk mentioned in the commit message. On Aldebaran, you would end up in the situation from the paragraph above, with one SDMA1 queue.)

In none of these cases would we expect user-space to wait until a queue is free and keep trying to allocate. For all one process knows, such an allocation may never succeed. Imagine, for instance, 8 other processes that run perpetually and never relinquish their queue allocations. These SDMA queue allocations are not time-sliced on today's GPUs.

As for the "rather than going in any order" suggestion, the 3rd patch in this series attempts to address this concern without adding a lot of extra code complexity. I worry that ordering engines based on max free queues: a) would require more complexity than the current bitmap-based free-queue list, and b) would not let us easily handle the Aldebaran quirk mentioned above, which specifically deals with SDMA0.
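
For reference, in the per-process bitmap scheme the whole quirk is the check
already in this patch - clear engine 0's bit the first time it is considered
for allocation, and never hand it back on free:

        /* In allocate_sdma_queue(): SDMA0 is only valid for the very first
         * SDMA allocation of a process on Aldebaran, so drop it from the
         * per-process engine bitmap as soon as it is considered
         * (deallocate_sdma_queue() never restores it).
         */
        if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
                        engine == 0)
                qpd->sdma_engine_bitmap &= ~(1ULL << engine);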

Thanks again,
-Joe

> 
> Thanks,
> Lijo
> 
> > And thanks for the code suggestions up above, I will make those changes as requested.
> >
> > Thanks!
> > -Joe
> >
> >> Thanks,
> >> Lijo
> >>
> >>> Regards,
> >>>     Felix
> >>>
> >>>
> >>>>
> >>>> Thanks,
> >>>> Lijo
> >>>>
> >>>>> +            /* Do not reuse SDMA0 for any subsequent SDMA queue
> >>>>> +             * requests on Aldebaran. If SDMA0's queues are all
> >>>>> +             * full, then this process should never use SDMA0
> >>>>> +             * for any further requests
> >>>>> +             */
> >>>>> +            if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> >>>>> +                    engine == 0)
> >>>>> +                qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> >>>>> +
> >>>>> +            available_queue_bitmap = sdma_engine_mask(engine,
> >>>>> num_engines);
> >>>>> +            available_queue_bitmap &= dqm->sdma_bitmap;
> >>>>> +
> >>>>> +            if (!available_queue_bitmap)
> >>>>> +                continue;
> >>>>> +            /* Take the selected engine off the list so we will not
> >>>>> +             * allocate two queues onto the same engine
> >>>>> +             */
> >>>>> +            qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> >>>>> +            found_sdma = true;
> >>>>> +
> >>>>> +            bit = __ffs64(available_queue_bitmap);
> >>>>> +            dqm->sdma_bitmap &= ~(1ULL << bit);
> >>>>> +            q->sdma_id = bit;
> >>>>> +            q->properties.sdma_engine_id = q->sdma_id % num_engines;
> >>>>> +            q->properties.sdma_queue_id = q->sdma_id / num_engines;
> >>>>> +            break;
> >>>>> +        }
> >>>>> +        if (!found_sdma) {
> >>>>>                 pr_err("No more SDMA queue to allocate\n");
> >>>>>                 return -ENOMEM;
> >>>>>             }
> >>>>> -
> >>>>> -        bit = __ffs64(dqm->sdma_bitmap);
> >>>>> -        dqm->sdma_bitmap &= ~(1ULL << bit);
> >>>>> -        q->sdma_id = bit;
> >>>>> -        q->properties.sdma_engine_id = q->sdma_id %
> >>>>> -                get_num_sdma_engines(dqm);
> >>>>> -        q->properties.sdma_queue_id = q->sdma_id /
> >>>>> -                get_num_sdma_engines(dqm);
> >>>>>         } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> >>>>> -        if (dqm->xgmi_sdma_bitmap == 0) {
> >>>>> +        num_engines = get_num_xgmi_sdma_engines(dqm);
> >>>>> +        for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap),
> >>>>> num_engines) {
> >>>>> +            available_queue_bitmap = sdma_engine_mask(engine,
> >>>>> num_engines);
> >>>>> +            available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
> >>>>> +
> >>>>> +            if (!available_queue_bitmap)
> >>>>> +                continue;
> >>>>> +            /* Take the selected engine off the list so we will not
> >>>>> +             * allocate two queues onto the same engine
> >>>>> +             */
> >>>>> +            qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
> >>>>> +            found_sdma = true;
> >>>>> +
> >>>>> +            bit = __ffs64(available_queue_bitmap);
> >>>>> +            dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> >>>>> +            q->sdma_id = bit;
> >>>>> +            /* sdma_engine_id is sdma id including
> >>>>> +             * both PCIe-optimized SDMAs and XGMI-
> >>>>> +             * optimized SDMAs. The calculation below
> >>>>> +             * assumes the first N engines are always
> >>>>> +             * PCIe-optimized ones
> >>>>> +             */
> >>>>> +            q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> >>>>> +                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> >>>>> +            q->properties.sdma_queue_id = q->sdma_id /
> >>>>> +                get_num_xgmi_sdma_engines(dqm);
> >>>>> +            break;
> >>>>> +        }
> >>>>> +        if (!found_sdma) {
> >>>>>                 pr_err("No more XGMI SDMA queue to allocate\n");
> >>>>>                 return -ENOMEM;
> >>>>>             }
> >>>>> -        bit = __ffs64(dqm->xgmi_sdma_bitmap);
> >>>>> -        dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> >>>>> -        q->sdma_id = bit;
> >>>>> -        /* sdma_engine_id is sdma id including
> >>>>> -         * both PCIe-optimized SDMAs and XGMI-
> >>>>> -         * optimized SDMAs. The calculation below
> >>>>> -         * assumes the first N engines are always
> >>>>> -         * PCIe-optimized ones
> >>>>> -         */
> >>>>> -        q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> >>>>> -                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> >>>>> -        q->properties.sdma_queue_id = q->sdma_id /
> >>>>> -                get_num_xgmi_sdma_engines(dqm);
> >>>>>         }
> >>>>>           pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
> >>>>> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct
> >>>>> device_queue_manager *dqm,
> >>>>>     }
> >>>>>       static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> >>>>> +                struct qcm_process_device *qpd,
> >>>>>                     struct queue *q)
> >>>>>     {
> >>>>> +    uint32_t engine = q->properties.sdma_engine_id;
> >>>>> +
> >>>>>         if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> >>>>>             if (q->sdma_id >= get_num_sdma_queues(dqm))
> >>>>>                 return;
> >>>>>             dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> >>>>> +        /* Don't give SDMA0 back to be reallocated on Aldebaran.
> >>>>> +         * It is only OK to use this engine from the first allocation
> >>>>> +         * within a process.
> >>>>> +         */
> >>>>> +        if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> >>>>> +                    engine == 0))
> >>>>> +            qpd->sdma_engine_bitmap |= (1ULL << engine);
> >>>>> +
> >>>>>         } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> >>>>>             if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
> >>>>>                 return;
> >>>>>             dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> >>>>> +        /* engine ID in the queue properties is the global engine ID.
> >>>>> +         * The XGMI engine bitmap ignores the PCIe-optimized engines.
> >>>>> +         */
> >>>>> +        engine -= get_num_sdma_engines(dqm);
> >>>>> +        qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
> >>>>>         }
> >>>>>     }
> >>>>>     @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct
> >>>>> device_queue_manager *dqm, struct queue *q,
> >>>>>         if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> >>>>>             q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> >>>>>             dqm_lock(dqm);
> >>>>> -        retval = allocate_sdma_queue(dqm, q);
> >>>>> +        retval = allocate_sdma_queue(dqm, qpd, q);
> >>>>>             dqm_unlock(dqm);
> >>>>>             if (retval)
> >>>>>                 goto out;
> >>>>> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct
> >>>>> device_queue_manager *dqm, struct queue *q,
> >>>>>         if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> >>>>>             q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> >>>>>             dqm_lock(dqm);
> >>>>> -        deallocate_sdma_queue(dqm, q);
> >>>>> +        deallocate_sdma_queue(dqm, qpd, q);
> >>>>>             dqm_unlock(dqm);
> >>>>>         }
> >>>>>     out:
> >>>>> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct
> >>>>> device_queue_manager *dqm,
> >>>>>           if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
> >>>>>             (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> >>>>> -        deallocate_sdma_queue(dqm, q);
> >>>>> +        deallocate_sdma_queue(dqm, qpd, q);
> >>>>>             pdd->sdma_past_activity_counter += sdma_val;
> >>>>>         }
> >>>>>     @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct
> >>>>> device_queue_manager *dqm,
> >>>>>         /* Clear all user mode queues */
> >>>>>         list_for_each_entry(q, &qpd->queues_list, list) {
> >>>>>             if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> >>>>> -            deallocate_sdma_queue(dqm, q);
> >>>>> +            deallocate_sdma_queue(dqm, qpd, q);
> >>>>>             else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> >>>>> -            deallocate_sdma_queue(dqm, q);
> >>>>> +            deallocate_sdma_queue(dqm, qpd, q);
> >>>>>               if (q->properties.is_active) {
> >>>>>                 decrement_queue_count(dqm, q->properties.type);
> >>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>>>> index ab83b0de6b22..c38eebc9db4d 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> >>>>> @@ -576,6 +576,8 @@ struct qcm_process_device {
> >>>>>         struct list_head priv_queue_list;
> >>>>>           unsigned int queue_count;
> >>>>> +    unsigned long sdma_engine_bitmap;
> >>>>> +    unsigned long xgmi_sdma_engine_bitmap;
> >>>>>         unsigned int vmid;
> >>>>>         bool is_debug;
> >>>>>         unsigned int evicted; /* eviction counter, 0=active */
> >>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >>>>> index 21ec8a18cad2..13c85624bf7d 100644
> >>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> >>>>> @@ -1422,6 +1422,7 @@ struct kfd_process_device
> >>>>> *kfd_create_process_device_data(struct kfd_dev *dev,
> >>>>>                                 struct kfd_process *p)
> >>>>>     {
> >>>>>         struct kfd_process_device *pdd = NULL;
> >>>>> +    const struct kfd_device_info *dev_info =
> >>>>> dev->dqm->dev->device_info;
> >>>>>           if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
> >>>>>             return NULL;
> >>>>> @@ -1446,6 +1447,8 @@ struct kfd_process_device
> >>>>> *kfd_create_process_device_data(struct kfd_dev *dev,
> >>>>>         pdd->qpd.pqm = &p->pqm;
> >>>>>         pdd->qpd.evicted = 0;
> >>>>>         pdd->qpd.mapped_gws_queue = false;
> >>>>> +    pdd->qpd.sdma_engine_bitmap =
> >>>>> BIT_ULL(dev_info->num_sdma_engines) - 1;
> >>>>> +    pdd->qpd.xgmi_sdma_engine_bitmap =
> >>>>> BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
> >>>>>         pdd->process = p;
> >>>>>         pdd->bound = PDD_UNBOUND;
> >>>>>         pdd->already_dequeued = false;
> >>>>>


* Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
  2021-08-24  7:56           ` Greathouse, Joseph
@ 2021-08-24 12:33             ` Lazar, Lijo
  0 siblings, 0 replies; 13+ messages in thread
From: Lazar, Lijo @ 2021-08-24 12:33 UTC (permalink / raw)
  To: Greathouse, Joseph, Kuehling, Felix, amd-gfx



On 8/24/2021 1:26 PM, Greathouse, Joseph wrote:
>> -----Original Message-----
>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>> Sent: Tuesday, August 24, 2021 2:24 AM
>> To: Greathouse, Joseph <Joseph.Greathouse@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Subject: Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
>>
>>
>>
>> On 8/24/2021 12:19 PM, Greathouse, Joseph wrote:
>>>> -----Original Message-----
>>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>>> Sent: Monday, August 23, 2021 11:37 PM
>>>> To: Kuehling, Felix <Felix.Kuehling@amd.com>; Greathouse, Joseph <Joseph.Greathouse@amd.com>; amd-
>>>> gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
>>>>
>>>> On 8/23/2021 10:34 PM, Felix Kuehling wrote:
>>>>> Am 2021-08-23 um 3:08 a.m. schrieb Lazar, Lijo:
>>>>>>
>>>>>>
>>>>>> On 8/20/2021 11:02 AM, Joseph Greathouse wrote:
>>>>>>> Give every process at most one queue from each SDMA engine.
>>>>>>> Previously, we allocated all SDMA engines and queues on a first-
>>>>>>> come-first-serve basis. This meant that it was possible for two
>>>>>>> processes racing on their allocation requests to each end up with
>>>>>>> two queues on the same SDMA engine. That could serialize the
>>>>>>> performance of commands to different queues.
>>>>>>>
>>>>>>> This new mechanism allows each process at most a single queue
>>>>>>> on each SDMA engine. Processes will check for queue availability on
>>>>>>> SDMA engine 0, then 1, then 2, etc. and only allocate on that
>>>>>>> engine if there is an available queue slot. If there are no
>>>>>>> queue slots available (or if this process has already allocated
>>>>>>> a queue on this engine), it moves on and tries the next engine.
>>>>>>>
>>>>>>> The Aldebaran chip has a small quirk that SDMA0 should only be
>>>>>>> used by the very first allocation from each process. It is OK to
>>>>>>> use any other SDMA engine at any time, but after the first SDMA
>>>>>>> allocation request from a process, the resulting engine must
>>>>>>> be from SDMA1 or above. This patch handles this case as well.
>>>>>>>
>>>>>>> Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
>>>>>>> ---
>>>>>>>      .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
>>>>>>>      drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
>>>>>>>      drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
>>>>>>>      3 files changed, 107 insertions(+), 33 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>>> index f8fce9d05f50..86bdb765f350 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>>> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct
>>>>>>> device_queue_manager *dqm,
>>>>>>>      static int map_queues_cpsch(struct device_queue_manager *dqm);
>>>>>>>        static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>>>>>> +                struct qcm_process_device *qpd,
>>>>>>>                      struct queue *q);
>>>>>>>        static inline void deallocate_hqd(struct device_queue_manager *dqm,
>>>>>>>                      struct queue *q);
>>>>>>>      static int allocate_hqd(struct device_queue_manager *dqm, struct
>>>>>>> queue *q);
>>>>>>>      static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>>>>>> +                struct qcm_process_device *qpd,
>>>>>>>                      struct queue *q);
>>>>>>>      static void kfd_process_hw_exception(struct work_struct *work);
>>>>>>>      @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct
>>>>>>> device_queue_manager *dqm,
>>>>>>>                  q->pipe, q->queue);
>>>>>>>          } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>>>>>              q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>>>>> -        retval = allocate_sdma_queue(dqm, q);
>>>>>>> +        retval = allocate_sdma_queue(dqm, qpd, q);
>>>>>>>              if (retval)
>>>>>>>                  goto deallocate_vmid;
>>>>>>>              dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>>>>>>> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct
>>>>>>> device_queue_manager *dqm,
>>>>>>>              deallocate_hqd(dqm, q);
>>>>>>>          else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>>>>>              q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>>>>>> -        deallocate_sdma_queue(dqm, q);
>>>>>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>>>>>      deallocate_vmid:
>>>>>>>          if (list_empty(&qpd->queues_list))
>>>>>>>              deallocate_vmid(dqm, qpd, q);
>>>>>>> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct
>>>>>>> device_queue_manager *dqm,
>>>>>>>          if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
>>>>>>>              deallocate_hqd(dqm, q);
>>>>>>>          else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
>>>>>>> -        deallocate_sdma_queue(dqm, q);
>>>>>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>>>>>          else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>>>>>> -        deallocate_sdma_queue(dqm, q);
>>>>>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>>>>>          else {
>>>>>>>              pr_debug("q->properties.type %d is invalid\n",
>>>>>>>                      q->properties.type);
>>>>>>> @@ -1039,42 +1041,93 @@ static void pre_reset(struct
>>>>>>> device_queue_manager *dqm)
>>>>>>>          dqm_unlock(dqm);
>>>>>>>      }
>>>>>>>      +static uint64_t sdma_engine_mask(int engine, int num_engines)
>>>>>>
>>>>>> Looks more like the queue mask for an engine, the name doesn't make it
>>>>>> clear.
>>>>>>
>>>>>>> +{
>>>>>>> +    uint64_t mask = 0;
>>>>>>> +
>>>>>>> +    engine %= num_engines;
>>>>>>> +    while (engine < (sizeof(uint64_t)*8)) {
>>>>>>> +        mask |= 1ULL << engine;
>>>>>>> +        engine += num_engines;
>>>>>>> +    }
>>>>>>> +    return mask;
>>>>>>> +}
>>>>>>> +
>>>>>>>      static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>>>>>> +                struct qcm_process_device *qpd,
>>>>>>>                      struct queue *q)
>>>>>>>      {
>>>>>>> -    int bit;
>>>>>>> +    uint64_t available_queue_bitmap;
>>>>>>> +    unsigned int bit, engine, num_engines;
>>>>>>> +    bool found_sdma = false;
>>>>>>>            if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>>>>>> -        if (dqm->sdma_bitmap == 0) {
>>>>>>
>>>>>> This is still a valid optimization and no need to loop through if
>>>>>> nothing is available anyway. Valid for XGMI loop also.
>>>>>>
>>>>>>> +        num_engines = get_num_sdma_engines(dqm);
>>>>>>> +        for_each_set_bit(engine, &(qpd->sdma_engine_bitmap),
>>>>>>> num_engines) {
>>>>>>
>>>>>> Probably a naive question -
>>>>>>
>>>>>> Theoretically there are 8 queues per engine as per the mask code. In
>>>>>> the below code, there is an assumption that a process will use at best
>>>>>> number of queues=max num of sdma engines or xgmi engines
>>>>>> simultaneously. Is that true always? For ex: there are 2 sdma engines
>>>>>> and 4 queues per engine. Can't multiple threads of a process initiate
>>>>>> multiple sdma transactions > number of sdma engines available? This
>>>>>> code limits that, but I don't know if that is a possible case.
>>>>>
>>>>> When you use multiple SDMA queues on the same engine, the work from the
>>>>> queues gets serialized. You can either let the SDMA engine serialize
>>>>> work from multiple queues, or let ROCr serialize work from multiple
>>>>> threads on the same SDMA queue. There is no obvious benefit to let the
>>>>> SDMA engine do it. But there is a drawback: Fewer processes get to use SDMA.
>>>>>
>>>>
>>>> The fact that there exists multiple queues and there is no such
>>>> assumption made in existing logic (prior to the patch) tells that there
>>>> is indeed some advantage of making use of hardware queues. For ex:
>>>> switching to different ring buffer may not need context save/resubmission.
>>>>
>>>> Just like different processes, different threads of same process may
>>>> take advantage of multiple queues. More interested to see the impact in
>>>> cases where the focus is on single process ex: impact on running some
>>>> benchmarks or in HPC where entire hardware is dedicated for a specific
>>>> compute purpose.
>>>
>>> Each engine has multiple user-mode queues because we expect to see multiple processes sharing the device at the same time. The
>> internal datasheets from the hardware team strongly suggest using one user-mode queue per engine per process.
>>>
>>> With that in mind, the ROCr user-mode runtime attempts to create, at most, one queue per SDMA engine within each process. For
>> instance, ROCr creates one queue for H->D transfers and one queue for D->H transfers. We want these queues to be on different
>> engines so that we can get full-duplex PCIe transfers. If both queues are on the same engine, the engine will serialize the transfers
>> (because it can only work on one command at a time), and we will get half-duplex.
>>>
>>> The queue allocation algorithm that this patch replaces could accidentally yield two queues on the same engine. If Process A
>> allocates its first queue, it would receive sdma0_queue0. Process B could then allocate its first queue, and get sdma1_queue0. Then
>> Process A could allocate its second queue and get sdma0_queue1. This would yield undesirable and unexpected serialization between
>> H->D and D->H transfers in Process A. In addition, I believe the old algorithm would allow an uncooperative process to allocate all
>> SDMA queues to itself. Looking at the git history, I believe the old allocator is basically the same as the initial KFD SDMA allocator built
>> for Kaveri. I'm confident we've learned more about how multiple processes should share our GPUs since then. :)
>>>
>>> To your last point: even if we have different threads in the same process submitting transfer requests, the ROCr runtime within the
>> process will submit all like transfers (e.g. all H->D transfers) into the same queue. Submitting those same commands into different
>> queues on the same engine will not yield any clear performance benefit since the engine will serialize the commands.  And it will likely
>> yield some performance loss due to queue-switching overheads in the engine. I think this patch yields a more fair allocation
>> mechanism (e.g., no process can starve out other processes by taking all the queues on all engines), and it also more closely match
>> what user-space expects (e.g., different SDMA queues can run commands in parallel).
>>>
>>
>> Thanks Joe for providing more info related to ROCr.
>>
>> One other thing is - it seems the process will always keep waiting
>> for the second queue (assuming the first is still in use) to be
>> allotted on a separate engine. In the example cited above, do you
>> expect a case where sdma1's queues are all full and Process A has to
>> wait till an sdma1 queue is free, even though there is a free queue
>> available in sdma0 (by virtue of long/short jobs submitted earlier by
>> other processes)? Is that situation managed by ROCr? Or, as a worst
>> case, should it allocate on sdma0 itself if nothing is available on
>> the other engines? Rather than going in index order, maybe another
>> way is to order engines based on the number of free queues and give
>> preference to the first engine not actively used by the process.
> 
> I think there are a few parts to this answer:
> 
> User-space has always been responsible for handling an allocation failure and falling back in some capacity, even before this patch. If the process receives a NAK and has no SDMA queues, it may need to fall back to a blit copy-kernel submitted to a compute queue. ROCr does this today.
> 
> Another example of when this allocator could NAK: this process tries to allocate a 2nd PCIe-optimized SDMA queue while it already has a queue on SDMA0 and SDMA1 is full. User-space could also use a blit kernel to handle this, if it wants transfers to run in parallel (one direction in SDMA, one with a blit). Or it could decide to serialize all transfer commands into the single queue it already has.
> 
> A third situation would be if Process A initially requests a queue while SDMA0 is full, so its first queue is allocated on SDMA1. Later, another process ends and one of the SDMA0 queues is freed. If Process A then requests a second SDMA queue, it will receive it on SDMA0. (Except on Aldebaran, due to the hardware quirk mentioned in the commit message. On Aldebaran, you would end up in the situation from the paragraph above, with one SDMA1 queue.)
> 
> In none of these cases would we expect user-space to wait until a queue is free and keep trying to allocate. For all one process knows, such an allocation may never succeed. Imagine, for instance, 8 other processes that run perpetually and never relinquish their queue allocations. These SDMA queue allocations are not time-sliced on today's GPUs.
> 

Thanks again for the details!

> As for the "rather than going in any order" suggestion, the 3rd patch in this series attempts to address this concern without adding a lot of extra code complexity. I worry that ordering engines based on max free queues: a) would require more complexity than the current bitmap-based free-queue list, and b) would not let us easily handle the Aldebaran quirk mentioned above, which specifically deals with SDMA0.
> 

It may not be so complicated once this bitmap is viewed differently.
I have sent some sample logic separately, please check.

Thanks,
Lijo

> Thanks again,
> -Joe
> 
>>
>> Thanks,
>> Lijo
>>
>>> And thanks for the code suggestions up above, I will make those changes as requested.
>>>
>>> Thanks!
>>> -Joe
>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>> Regards,
>>>>>      Felix
>>>>>
>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>> Lijo
>>>>>>
>>>>>>> +            /* Do not reuse SDMA0 for any subsequent SDMA queue
>>>>>>> +             * requests on Aldebaran. If SDMA0's queues are all
>>>>>>> +             * full, then this process should never use SDMA0
>>>>>>> +             * for any further requests
>>>>>>> +             */
>>>>>>> +            if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
>>>>>>> +                    engine == 0)
>>>>>>> +                qpd->sdma_engine_bitmap &= ~(1ULL << engine);
>>>>>>> +
>>>>>>> +            available_queue_bitmap = sdma_engine_mask(engine,
>>>>>>> num_engines);
>>>>>>> +            available_queue_bitmap &= dqm->sdma_bitmap;
>>>>>>> +
>>>>>>> +            if (!available_queue_bitmap)
>>>>>>> +                continue;
>>>>>>> +            /* Take the selected engine off the list so we will not
>>>>>>> +             * allocate two queues onto the same engine
>>>>>>> +             */
>>>>>>> +            qpd->sdma_engine_bitmap &= ~(1ULL << engine);
>>>>>>> +            found_sdma = true;
>>>>>>> +
>>>>>>> +            bit = __ffs64(available_queue_bitmap);
>>>>>>> +            dqm->sdma_bitmap &= ~(1ULL << bit);
>>>>>>> +            q->sdma_id = bit;
>>>>>>> +            q->properties.sdma_engine_id = q->sdma_id % num_engines;
>>>>>>> +            q->properties.sdma_queue_id = q->sdma_id / num_engines;
>>>>>>> +            break;
>>>>>>> +        }
>>>>>>> +        if (!found_sdma) {
>>>>>>>                  pr_err("No more SDMA queue to allocate\n");
>>>>>>>                  return -ENOMEM;
>>>>>>>              }
>>>>>>> -
>>>>>>> -        bit = __ffs64(dqm->sdma_bitmap);
>>>>>>> -        dqm->sdma_bitmap &= ~(1ULL << bit);
>>>>>>> -        q->sdma_id = bit;
>>>>>>> -        q->properties.sdma_engine_id = q->sdma_id %
>>>>>>> -                get_num_sdma_engines(dqm);
>>>>>>> -        q->properties.sdma_queue_id = q->sdma_id /
>>>>>>> -                get_num_sdma_engines(dqm);
>>>>>>>          } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>>>>> -        if (dqm->xgmi_sdma_bitmap == 0) {
>>>>>>> +        num_engines = get_num_xgmi_sdma_engines(dqm);
>>>>>>> +        for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap),
>>>>>>> num_engines) {
>>>>>>> +            available_queue_bitmap = sdma_engine_mask(engine,
>>>>>>> num_engines);
>>>>>>> +            available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
>>>>>>> +
>>>>>>> +            if (!available_queue_bitmap)
>>>>>>> +                continue;
>>>>>>> +            /* Take the selected engine off the list so we will not
>>>>>>> +             * allocate two queues onto the same engine
>>>>>>> +             */
>>>>>>> +            qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
>>>>>>> +            found_sdma = true;
>>>>>>> +
>>>>>>> +            bit = __ffs64(available_queue_bitmap);
>>>>>>> +            dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>>>>>>> +            q->sdma_id = bit;
>>>>>>> +            /* sdma_engine_id is sdma id including
>>>>>>> +             * both PCIe-optimized SDMAs and XGMI-
>>>>>>> +             * optimized SDMAs. The calculation below
>>>>>>> +             * assumes the first N engines are always
>>>>>>> +             * PCIe-optimized ones
>>>>>>> +             */
>>>>>>> +            q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
>>>>>>> +                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
>>>>>>> +            q->properties.sdma_queue_id = q->sdma_id /
>>>>>>> +                get_num_xgmi_sdma_engines(dqm);
>>>>>>> +            break;
>>>>>>> +        }
>>>>>>> +        if (!found_sdma) {
>>>>>>>                  pr_err("No more XGMI SDMA queue to allocate\n");
>>>>>>>                  return -ENOMEM;
>>>>>>>              }
>>>>>>> -        bit = __ffs64(dqm->xgmi_sdma_bitmap);
>>>>>>> -        dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>>>>>>> -        q->sdma_id = bit;
>>>>>>> -        /* sdma_engine_id is sdma id including
>>>>>>> -         * both PCIe-optimized SDMAs and XGMI-
>>>>>>> -         * optimized SDMAs. The calculation below
>>>>>>> -         * assumes the first N engines are always
>>>>>>> -         * PCIe-optimized ones
>>>>>>> -         */
>>>>>>> -        q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
>>>>>>> -                q->sdma_id % get_num_xgmi_sdma_engines(dqm);
>>>>>>> -        q->properties.sdma_queue_id = q->sdma_id /
>>>>>>> -                get_num_xgmi_sdma_engines(dqm);
>>>>>>>          }
>>>>>>>            pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
>>>>>>> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct
>>>>>>> device_queue_manager *dqm,
>>>>>>>      }
>>>>>>>        static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>>>>>> +                struct qcm_process_device *qpd,
>>>>>>>                      struct queue *q)
>>>>>>>      {
>>>>>>> +    uint32_t engine = q->properties.sdma_engine_id;
>>>>>>> +
>>>>>>>          if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>>>>>>              if (q->sdma_id >= get_num_sdma_queues(dqm))
>>>>>>>                  return;
>>>>>>>              dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>>>>>>> +        /* Don't give SDMA0 back to be reallocated on Aldebaran.
>>>>>>> +         * It is only OK to use this engine from the first allocation
>>>>>>> +         * within a process.
>>>>>>> +         */
>>>>>>> +        if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
>>>>>>> +                    engine == 0))
>>>>>>> +            qpd->sdma_engine_bitmap |= (1ULL << engine);
>>>>>>> +
>>>>>>>          } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>>>>>              if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>>>>>>>                  return;
>>>>>>>              dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>>>>>>> +        /* engine ID in the queue properties is the global engine ID.
>>>>>>> +         * The XGMI engine bitmap ignores the PCIe-optimized engines.
>>>>>>> +         */
>>>>>>> +        engine -= get_num_sdma_engines(dqm);
>>>>>>> +        qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
>>>>>>>          }
>>>>>>>      }
>>>>>>>      @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct
>>>>>>> device_queue_manager *dqm, struct queue *q,
>>>>>>>          if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>>>>>              q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>>>>>              dqm_lock(dqm);
>>>>>>> -        retval = allocate_sdma_queue(dqm, q);
>>>>>>> +        retval = allocate_sdma_queue(dqm, qpd, q);
>>>>>>>              dqm_unlock(dqm);
>>>>>>>              if (retval)
>>>>>>>                  goto out;
>>>>>>> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct
>>>>>>> device_queue_manager *dqm, struct queue *q,
>>>>>>>          if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>>>>>              q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>>>>>>              dqm_lock(dqm);
>>>>>>> -        deallocate_sdma_queue(dqm, q);
>>>>>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>>>>>              dqm_unlock(dqm);
>>>>>>>          }
>>>>>>>      out:
>>>>>>> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct
>>>>>>> device_queue_manager *dqm,
>>>>>>>            if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>>>>>>>              (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>>>>>>> -        deallocate_sdma_queue(dqm, q);
>>>>>>> +        deallocate_sdma_queue(dqm, qpd, q);
>>>>>>>              pdd->sdma_past_activity_counter += sdma_val;
>>>>>>>          }
>>>>>>>      @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct
>>>>>>> device_queue_manager *dqm,
>>>>>>>          /* Clear all user mode queues */
>>>>>>>          list_for_each_entry(q, &qpd->queues_list, list) {
>>>>>>>              if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
>>>>>>> -            deallocate_sdma_queue(dqm, q);
>>>>>>> +            deallocate_sdma_queue(dqm, qpd, q);
>>>>>>>              else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>>>>>> -            deallocate_sdma_queue(dqm, q);
>>>>>>> +            deallocate_sdma_queue(dqm, qpd, q);
>>>>>>>                if (q->properties.is_active) {
>>>>>>>                  decrement_queue_count(dqm, q->properties.type);
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>>> index ab83b0de6b22..c38eebc9db4d 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>>> @@ -576,6 +576,8 @@ struct qcm_process_device {
>>>>>>>          struct list_head priv_queue_list;
>>>>>>>            unsigned int queue_count;
>>>>>>> +    unsigned long sdma_engine_bitmap;
>>>>>>> +    unsigned long xgmi_sdma_engine_bitmap;
>>>>>>>          unsigned int vmid;
>>>>>>>          bool is_debug;
>>>>>>>          unsigned int evicted; /* eviction counter, 0=active */
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>>> index 21ec8a18cad2..13c85624bf7d 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>>> @@ -1422,6 +1422,7 @@ struct kfd_process_device
>>>>>>> *kfd_create_process_device_data(struct kfd_dev *dev,
>>>>>>>                                  struct kfd_process *p)
>>>>>>>      {
>>>>>>>          struct kfd_process_device *pdd = NULL;
>>>>>>> +    const struct kfd_device_info *dev_info =
>>>>>>> dev->dqm->dev->device_info;
>>>>>>>            if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
>>>>>>>              return NULL;
>>>>>>> @@ -1446,6 +1447,8 @@ struct kfd_process_device
>>>>>>> *kfd_create_process_device_data(struct kfd_dev *dev,
>>>>>>>          pdd->qpd.pqm = &p->pqm;
>>>>>>>          pdd->qpd.evicted = 0;
>>>>>>>          pdd->qpd.mapped_gws_queue = false;
>>>>>>> +    pdd->qpd.sdma_engine_bitmap =
>>>>>>> BIT_ULL(dev_info->num_sdma_engines) - 1;
>>>>>>> +    pdd->qpd.xgmi_sdma_engine_bitmap =
>>>>>>> BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
>>>>>>>          pdd->process = p;
>>>>>>>          pdd->bound = PDD_UNBOUND;
>>>>>>>          pdd->already_dequeued = false;
>>>>>>>


end of thread, other threads:[~2021-08-24 12:34 UTC | newest]

Thread overview: 13+ messages
2021-08-20  5:32 [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly Joseph Greathouse
2021-08-20  5:32 ` [PATCH 2/3] drm/amdgpu: Use SDMA1 for buffer movement on Aldebaran Joseph Greathouse
2021-08-20  6:59   ` Christian König
2021-08-20 23:27     ` Greathouse, Joseph
2021-08-20  5:32 ` [PATCH 3/3] drm/amdkfd: Spread out XGMI SDMA allocations Joseph Greathouse
2021-08-20 22:03 ` [PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly Felix Kuehling
2021-08-23  7:08 ` Lazar, Lijo
2021-08-23 17:04   ` Felix Kuehling
2021-08-24  4:37     ` Lazar, Lijo
2021-08-24  6:49       ` Greathouse, Joseph
2021-08-24  7:23         ` Lazar, Lijo
2021-08-24  7:56           ` Greathouse, Joseph
2021-08-24 12:33             ` Lazar, Lijo
