All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 00/11] KFD fixes
@ 2017-09-15 23:42 Felix Kuehling
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Felix Kuehling

Small self-contained KFD fixes that don't introduce any new functionality or
features. These may be suitable to include in drm-fixes for 4.14. Patch 11
may be controversial and I may just drop it if people think this is better
dealt with in udevd configuration files.

Andres Rodriguez (1):
  drm/amdkfd: Set /dev/kfd permissions to 0666 by default

Felix Kuehling (3):
  drm/amdkfd: Adjust dequeue latencies and timeouts
  drm/amdkfd: Fix incorrect destroy_mqd parameter
  drm/amdkfd: Print event limit messages only once per process

Yong Zhao (7):
  drm/amdkfd: Reorganize kfd resume code
  drm/amdkfd: Fix suspend/resume issue on Carrizo
  drm/amdkfd: Rectify the jiffies calculation error with milliseconds
  drm/amdkfd: Use VMID bitmap from KGD
  drm/amdkfd: Reuse CHIP_* from amdgpu
  drm/amdkfd: Drop _nocpsch suffix from shared functions
  drm/amdkfd: Fix kernel-queue wrapping bugs

 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c           |  10 ++
 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c            |   9 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            | 107 ++++++++++++---------
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  62 +++++-------
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h  |   8 +-
 drivers/gpu/drm/amd/amdkfd/kfd_events.c            |   5 +-
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c      |  24 ++++-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c       |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c    |   2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |  37 ++++---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c           |  89 ++++++++++++++---
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |   2 +-
 12 files changed, 228 insertions(+), 129 deletions(-)

-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 01/11] drm/amdkfd: Reorganize kfd resume code
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-15 23:42   ` Felix Kuehling
       [not found]     ` <1505518982-29631-2-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:42   ` [PATCH 02/11] drm/amdkfd: Fix suspend/resume issue on Carrizo Felix Kuehling
                     ` (9 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

The idea is to let kfd init and resume function share the same code path
as much as possible, rather than to have two copies of almost identical
code. That way improves the code readability and maintainability.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 78 +++++++++++++++++----------------
 1 file changed, 40 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 61fff25..cc8af11 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -92,6 +92,8 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
 				unsigned int chunk_size);
 static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
 
+static int kfd_resume(struct kfd_dev *kfd);
+
 static const struct kfd_device_info *lookup_device_info(unsigned short did)
 {
 	size_t i;
@@ -176,15 +178,8 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd)
 				pasid_limit,
 				kfd->doorbell_process_limit - 1);
 
-	err = amd_iommu_init_device(kfd->pdev, pasid_limit);
-	if (err < 0) {
-		dev_err(kfd_device, "error initializing iommu device\n");
-		return false;
-	}
-
 	if (!kfd_set_pasid_limit(pasid_limit)) {
 		dev_err(kfd_device, "error setting pasid limit\n");
-		amd_iommu_free_device(kfd->pdev);
 		return false;
 	}
 
@@ -280,29 +275,22 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		goto kfd_interrupt_error;
 	}
 
-	if (!device_iommu_pasid_init(kfd)) {
-		dev_err(kfd_device,
-			"Error initializing iommuv2 for device %x:%x\n",
-			kfd->pdev->vendor, kfd->pdev->device);
-		goto device_iommu_pasid_error;
-	}
-	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
-						iommu_pasid_shutdown_callback);
-	amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
-
 	kfd->dqm = device_queue_manager_init(kfd);
 	if (!kfd->dqm) {
 		dev_err(kfd_device, "Error initializing queue manager\n");
 		goto device_queue_manager_error;
 	}
 
-	if (kfd->dqm->ops.start(kfd->dqm)) {
+	if (!device_iommu_pasid_init(kfd)) {
 		dev_err(kfd_device,
-			"Error starting queue manager for device %x:%x\n",
+			"Error initializing iommuv2 for device %x:%x\n",
 			kfd->pdev->vendor, kfd->pdev->device);
-		goto dqm_start_error;
+		goto device_iommu_pasid_error;
 	}
 
+	if (kfd_resume(kfd))
+		goto kfd_resume_error;
+
 	kfd->dbgmgr = NULL;
 
 	kfd->init_complete = true;
@@ -314,11 +302,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 
 	goto out;
 
-dqm_start_error:
+kfd_resume_error:
+device_iommu_pasid_error:
 	device_queue_manager_uninit(kfd->dqm);
 device_queue_manager_error:
-	amd_iommu_free_device(kfd->pdev);
-device_iommu_pasid_error:
 	kfd_interrupt_exit(kfd);
 kfd_interrupt_error:
 	kfd_topology_remove_device(kfd);
@@ -338,8 +325,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 void kgd2kfd_device_exit(struct kfd_dev *kfd)
 {
 	if (kfd->init_complete) {
+		kgd2kfd_suspend(kfd);
 		device_queue_manager_uninit(kfd->dqm);
-		amd_iommu_free_device(kfd->pdev);
 		kfd_interrupt_exit(kfd);
 		kfd_topology_remove_device(kfd);
 		kfd_doorbell_fini(kfd);
@@ -362,25 +349,40 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
 
 int kgd2kfd_resume(struct kfd_dev *kfd)
 {
-	unsigned int pasid_limit;
-	int err;
+	if (!kfd->init_complete)
+		return 0;
 
-	pasid_limit = kfd_get_pasid_limit();
+	return kfd_resume(kfd);
 
-	if (kfd->init_complete) {
-		err = amd_iommu_init_device(kfd->pdev, pasid_limit);
-		if (err < 0) {
-			dev_err(kfd_device, "failed to initialize iommu\n");
-			return -ENXIO;
-		}
+}
 
-		amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
-						iommu_pasid_shutdown_callback);
-		amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
-		kfd->dqm->ops.start(kfd->dqm);
+static int kfd_resume(struct kfd_dev *kfd)
+{
+	int err = 0;
+	unsigned int pasid_limit = kfd_get_pasid_limit();
+
+	err = amd_iommu_init_device(kfd->pdev, pasid_limit);
+	if (err)
+		return -ENXIO;
+	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
+					iommu_pasid_shutdown_callback);
+	amd_iommu_set_invalid_ppr_cb(kfd->pdev,
+				     iommu_invalid_ppr_cb);
+
+	err = kfd->dqm->ops.start(kfd->dqm);
+	if (err) {
+		dev_err(kfd_device,
+			"Error starting queue manager for device %x:%x\n",
+			kfd->pdev->vendor, kfd->pdev->device);
+		goto dqm_start_error;
 	}
 
-	return 0;
+	return err;
+
+dqm_start_error:
+	amd_iommu_free_device(kfd->pdev);
+
+	return err;
 }
 
 /* This is called directly from KGD at ISR. */
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 02/11] drm/amdkfd: Fix suspend/resume issue on Carrizo
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:42   ` [PATCH 01/11] drm/amdkfd: Reorganize kfd resume code Felix Kuehling
@ 2017-09-15 23:42   ` Felix Kuehling
       [not found]     ` <1505518982-29631-3-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:42   ` [PATCH 03/11] drm/amdkfd: Rectify the jiffies calculation error with milliseconds Felix Kuehling
                     ` (8 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

When we do suspend/resume through "sudo pm-suspend" while there is
HSA activity running, upon resume we will encounter HWS hanging, which
is caused by memory read/write failures. The root cause is that when
suspend, we neglected to unbind pasid from kfd device.

Another major change is that the bind/unbinding is changed to be
performed on a per process basis, instead of whether there are queues
in dqm.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            | 22 ++++--
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 13 ----
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              | 15 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c           | 89 ++++++++++++++++++----
 4 files changed, 101 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index cc8af11..ff3f97c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -191,7 +191,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
 	struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
 
 	if (dev)
-		kfd_unbind_process_from_device(dev, pasid);
+		kfd_process_iommu_unbind_callback(dev, pasid);
 }
 
 /*
@@ -339,12 +339,16 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
 
 void kgd2kfd_suspend(struct kfd_dev *kfd)
 {
-	if (kfd->init_complete) {
-		kfd->dqm->ops.stop(kfd->dqm);
-		amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
-		amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
-		amd_iommu_free_device(kfd->pdev);
-	}
+	if (!kfd->init_complete)
+		return;
+
+	kfd->dqm->ops.stop(kfd->dqm);
+
+	kfd_unbind_processes_from_device(kfd);
+
+	amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+	amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
+	amd_iommu_free_device(kfd->pdev);
 }
 
 int kgd2kfd_resume(struct kfd_dev *kfd)
@@ -369,6 +373,10 @@ static int kfd_resume(struct kfd_dev *kfd)
 	amd_iommu_set_invalid_ppr_cb(kfd->pdev,
 				     iommu_invalid_ppr_cb);
 
+	err = kfd_bind_processes_to_device(kfd);
+	if (err)
+		return -ENXIO;
+
 	err = kfd->dqm->ops.start(kfd->dqm);
 	if (err) {
 		dev_err(kfd_device,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 53a66e8..5db82b8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -670,7 +670,6 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 
 static int start_cpsch(struct device_queue_manager *dqm)
 {
-	struct device_process_node *node;
 	int retval;
 
 	retval = 0;
@@ -697,11 +696,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 	init_interrupts(dqm);
 
-	list_for_each_entry(node, &dqm->queues, list)
-		if (node->qpd->pqm->process && dqm->dev)
-			kfd_bind_process_to_device(dqm->dev,
-						node->qpd->pqm->process);
-
 	execute_queues_cpsch(dqm, true);
 
 	return 0;
@@ -714,15 +708,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 static int stop_cpsch(struct device_queue_manager *dqm)
 {
-	struct device_process_node *node;
-	struct kfd_process_device *pdd;
-
 	destroy_queues_cpsch(dqm, true, true);
 
-	list_for_each_entry(node, &dqm->queues, list) {
-		pdd = qpd_to_pdd(node->qpd);
-		pdd->bound = false;
-	}
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
 	pm_uninit(&dqm->packets);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index b397ec7..ef582cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -435,6 +435,13 @@ struct qcm_process_device {
 	uint32_t sh_hidden_private_base;
 };
 
+
+enum kfd_pdd_bound {
+	PDD_UNBOUND = 0,
+	PDD_BOUND,
+	PDD_BOUND_SUSPENDED,
+};
+
 /* Data that is per-process-per device. */
 struct kfd_process_device {
 	/*
@@ -459,7 +466,7 @@ struct kfd_process_device {
 	uint64_t scratch_limit;
 
 	/* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
-	bool bound;
+	enum kfd_pdd_bound bound;
 
 	/* This flag tells if we should reset all
 	 * wavefronts on process termination
@@ -548,8 +555,10 @@ struct kfd_process *kfd_get_process(const struct task_struct *);
 struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
 
 struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
-							struct kfd_process *p);
-void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid);
+						struct kfd_process *p);
+int kfd_bind_processes_to_device(struct kfd_dev *dev);
+void kfd_unbind_processes_from_device(struct kfd_dev *dev);
+void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid);
 struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 							struct kfd_process *p);
 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index c74cf22..3ffaac3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -174,7 +174,10 @@ static void kfd_process_wq_release(struct work_struct *work)
 		if (pdd->reset_wavefronts)
 			dbgdev_wave_reset_wavefronts(pdd->dev, p);
 
-		amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
+		if (pdd->bound == PDD_BOUND) {
+			amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
+			pdd->bound = PDD_UNBOUND;
+		}
 		list_del(&pdd->per_device_list);
 
 		kfree(pdd);
@@ -345,9 +348,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 
 	list_for_each_entry(pdd, &p->per_device_data, per_device_list)
 		if (pdd->dev == dev)
-			break;
+			return pdd;
 
-	return pdd;
+	return NULL;
 }
 
 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
@@ -362,6 +365,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 		INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
 		pdd->qpd.dqm = dev->dqm;
 		pdd->reset_wavefronts = false;
+		pdd->bound = PDD_UNBOUND;
 		list_add(&pdd->per_device_list, &p->per_device_data);
 	}
 
@@ -387,19 +391,83 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (pdd->bound)
+	if (pdd->bound == PDD_BOUND)
 		return pdd;
 
+	if (pdd->bound == PDD_BOUND_SUSPENDED) {
+		pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n");
+		return ERR_PTR(-EINVAL);
+	}
+
 	err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread);
 	if (err < 0)
 		return ERR_PTR(err);
 
-	pdd->bound = true;
+	pdd->bound = PDD_BOUND;
 
 	return pdd;
 }
 
-void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
+int kfd_bind_processes_to_device(struct kfd_dev *dev)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p;
+	unsigned int temp;
+	int err = 0;
+
+	int idx = srcu_read_lock(&kfd_processes_srcu);
+
+	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+		mutex_lock(&p->mutex);
+		pdd = kfd_get_process_device_data(dev, p);
+		if (pdd->bound != PDD_BOUND_SUSPENDED) {
+			mutex_unlock(&p->mutex);
+			continue;
+		}
+
+		err = amd_iommu_bind_pasid(dev->pdev, p->pasid,
+				p->lead_thread);
+		if (err < 0) {
+			pr_err("unexpected pasid %d binding failure\n",
+					p->pasid);
+			mutex_unlock(&p->mutex);
+			break;
+		}
+
+		pdd->bound = PDD_BOUND;
+		mutex_unlock(&p->mutex);
+	}
+
+	srcu_read_unlock(&kfd_processes_srcu, idx);
+
+	return err;
+}
+
+void kfd_unbind_processes_from_device(struct kfd_dev *dev)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p;
+	unsigned int temp, temp_bound, temp_pasid;
+
+	int idx = srcu_read_lock(&kfd_processes_srcu);
+
+	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+		mutex_lock(&p->mutex);
+		pdd = kfd_get_process_device_data(dev, p);
+		temp_bound = pdd->bound;
+		temp_pasid = p->pasid;
+		if (pdd->bound == PDD_BOUND)
+			pdd->bound = PDD_BOUND_SUSPENDED;
+		mutex_unlock(&p->mutex);
+
+		if (temp_bound == PDD_BOUND)
+			amd_iommu_unbind_pasid(dev->pdev, temp_pasid);
+	}
+
+	srcu_read_unlock(&kfd_processes_srcu, idx);
+}
+
+void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
 {
 	struct kfd_process *p;
 	struct kfd_process_device *pdd;
@@ -432,15 +500,6 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
 		pdd->reset_wavefronts = false;
 	}
 
-	/*
-	 * Just mark pdd as unbound, because we still need it
-	 * to call amd_iommu_unbind_pasid() in when the
-	 * process exits.
-	 * We don't call amd_iommu_unbind_pasid() here
-	 * because the IOMMU called us.
-	 */
-	pdd->bound = false;
-
 	mutex_unlock(&p->mutex);
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 03/11] drm/amdkfd: Rectify the jiffies calculation error with milliseconds
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:42   ` [PATCH 01/11] drm/amdkfd: Reorganize kfd resume code Felix Kuehling
  2017-09-15 23:42   ` [PATCH 02/11] drm/amdkfd: Fix suspend/resume issue on Carrizo Felix Kuehling
@ 2017-09-15 23:42   ` Felix Kuehling
       [not found]     ` <1505518982-29631-4-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:42   ` [PATCH 04/11] drm/amdkfd: Adjust dequeue latencies and timeouts Felix Kuehling
                     ` (7 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

The timeout in milliseconds should not be regarded as jiffies. This
commit fixed that.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 8 +++++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 5db82b8..3db6a31 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -835,12 +835,14 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 
 int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 				unsigned int fence_value,
-				unsigned long timeout)
+				unsigned long timeout_ms)
 {
-	timeout += jiffies;
+	unsigned long end_jiffies;
+
+	end_jiffies = (timeout_ms * HZ / 1000) + jiffies;
 
 	while (*fence_addr != fence_value) {
-		if (time_after(jiffies, timeout)) {
+		if (time_after(jiffies, end_jiffies)) {
 			pr_err("qcm fence wait loop timeout expired\n");
 			return -ETIME;
 		}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ef582cc..f8d6a8e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -669,7 +669,7 @@ struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
 
 int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 				unsigned int fence_value,
-				unsigned long timeout);
+				unsigned long timeout_ms);
 
 /* Packet Manager */
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 04/11] drm/amdkfd: Adjust dequeue latencies and timeouts
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2017-09-15 23:42   ` [PATCH 03/11] drm/amdkfd: Rectify the jiffies calculation error with milliseconds Felix Kuehling
@ 2017-09-15 23:42   ` Felix Kuehling
       [not found]     ` <1505518982-29631-5-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:42   ` [PATCH 05/11] drm/amdkfd: Fix incorrect destroy_mqd parameter Felix Kuehling
                     ` (6 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yong Zhao, Felix Kuehling

Adjust latencies and timeouts for dequeueing with HWS and consolidate
them in one place. Make them longer to allow long running waves to
complete without causing a timeout. The timeout is twice as long as the
latency plus some buffer to make sure we don't detect a timeout
prematurely.

Change timeouts for dequeueing HQDs without HWS. KFD_UNMAP_LATENCY is
more consistent with what the HWS does for user queues.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 4 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c         | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c       | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 | 3 ---
 5 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 3db6a31..5da7ef4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -323,7 +323,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
 
 	retval = mqd->destroy_mqd(mqd, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
-				QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
+				KFD_UNMAP_LATENCY_MS,
 				q->pipe, q->queue);
 
 	if (retval)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index faf820a..99e2305 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -29,7 +29,9 @@
 #include "kfd_priv.h"
 #include "kfd_mqd_manager.h"
 
-#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS	(500)
+#define KFD_UNMAP_LATENCY_MS			(4000)
+#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
+
 #define CIK_VMID_NUM				(8)
 #define KFD_VMID_START_OFFSET			(8)
 #define VMID_PER_DEVICE				CIK_VMID_NUM
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 681b639..0c82446 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -185,7 +185,7 @@ static void uninitialize(struct kernel_queue *kq)
 		kq->mqd->destroy_mqd(kq->mqd,
 					NULL,
 					false,
-					QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
+					KFD_UNMAP_LATENCY_MS,
 					kq->queue->pipe,
 					kq->queue->queue);
 	else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 1d31260..9eda884 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -376,7 +376,7 @@ int pm_send_set_resources(struct packet_manager *pm,
 	packet->bitfields2.queue_type =
 			queue_type__mes_set_resources__hsa_interface_queue_hiq;
 	packet->bitfields2.vmid_mask = res->vmid_mask;
-	packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY;
+	packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
 	packet->bitfields7.oac_mask = res->oac_mask;
 	packet->bitfields8.gds_heap_base = res->gds_heap_base;
 	packet->bitfields8.gds_heap_size = res->gds_heap_size;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f8d6a8e..099dc33 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -673,11 +673,8 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 
 /* Packet Manager */
 
-#define KFD_HIQ_TIMEOUT (500)
-
 #define KFD_FENCE_COMPLETED (100)
 #define KFD_FENCE_INIT   (10)
-#define KFD_UNMAP_LATENCY (150)
 
 struct packet_manager {
 	struct device_queue_manager *dqm;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 05/11] drm/amdkfd: Fix incorrect destroy_mqd parameter
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (3 preceding siblings ...)
  2017-09-15 23:42   ` [PATCH 04/11] drm/amdkfd: Adjust dequeue latencies and timeouts Felix Kuehling
@ 2017-09-15 23:42   ` Felix Kuehling
       [not found]     ` <1505518982-29631-6-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:42   ` [PATCH 06/11] drm/amdkfd: Use VMID bitmap from KGD Felix Kuehling
                     ` (5 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yong Zhao, Felix Kuehling

When uninitializing a kernel queue.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 0c82446..09356d0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -184,7 +184,7 @@ static void uninitialize(struct kernel_queue *kq)
 	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
 		kq->mqd->destroy_mqd(kq->mqd,
 					NULL,
-					false,
+					KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
 					KFD_UNMAP_LATENCY_MS,
 					kq->queue->pipe,
 					kq->queue->queue);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 06/11] drm/amdkfd: Use VMID bitmap from KGD
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (4 preceding siblings ...)
  2017-09-15 23:42   ` [PATCH 05/11] drm/amdkfd: Fix incorrect destroy_mqd parameter Felix Kuehling
@ 2017-09-15 23:42   ` Felix Kuehling
       [not found]     ` <1505518982-29631-7-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:42   ` [PATCH 07/11] drm/amdkfd: Reuse CHIP_* from amdgpu Felix Kuehling
                     ` (4 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

The hard-coded values related to VMID were removed in KFD, as those
values can be calculated in the KFD initialization function.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c                |  9 ++-------
 drivers/gpu/drm/amd/amdkfd/kfd_device.c                |  7 +++++++
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 13 ++++++-------
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h  |  4 ----
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                  |  7 +++++++
 drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  2 +-
 6 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
index 0aa021a..7d5635f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
@@ -769,13 +769,8 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
 	union GRBM_GFX_INDEX_BITS reg_gfx_index;
 	struct kfd_process_device *pdd;
 	struct dbg_wave_control_info wac_info;
-	int temp;
-	int first_vmid_to_scan = 8;
-	int last_vmid_to_scan = 15;
-
-	first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1;
-	temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan;
-	last_vmid_to_scan = first_vmid_to_scan + ffz(temp);
+	int first_vmid_to_scan = dev->vm_info.first_vmid_kfd;
+	int last_vmid_to_scan = dev->vm_info.last_vmid_kfd;
 
 	reg_sq_cmd.u32All = 0;
 	status = 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index ff3f97c..abf91b0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -223,9 +223,16 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 			 const struct kgd2kfd_shared_resources *gpu_resources)
 {
 	unsigned int size;
+	unsigned int vmid_bitmap_kfd;
 
 	kfd->shared_resources = *gpu_resources;
 
+	vmid_bitmap_kfd = kfd->shared_resources.compute_vmid_bitmap;
+	kfd->vm_info.first_vmid_kfd = ffs(vmid_bitmap_kfd) - 1;
+	kfd->vm_info.last_vmid_kfd = fls(vmid_bitmap_kfd) - 1;
+	kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd
+			- kfd->vm_info.first_vmid_kfd + 1;
+
 	/* calculate max size of mqds needed for queues */
 	size = max_num_of_queues_per_device *
 			kfd->device_info->mqd_size_aligned;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 5da7ef4..897ff083 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -113,11 +113,11 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 	if (dqm->vmid_bitmap == 0)
 		return -ENOMEM;
 
-	bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM);
+	bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap,
+				dqm->dev->vm_info.vmid_num_kfd);
 	clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap);
 
-	/* Kaveri kfd vmid's starts from vmid 8 */
-	allocated_vmid = bit + KFD_VMID_START_OFFSET;
+	allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd;
 	pr_debug("vmid allocation %d\n", allocated_vmid);
 	qpd->vmid = allocated_vmid;
 	q->properties.vmid = allocated_vmid;
@@ -132,7 +132,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
 				struct queue *q)
 {
-	int bit = qpd->vmid - KFD_VMID_START_OFFSET;
+	int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
 
 	/* Release the vmid mapping */
 	set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
@@ -507,7 +507,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
 				dqm->allocated_queues[pipe] |= 1 << queue;
 	}
 
-	dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1;
+	dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
 	dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
 
 	return 0;
@@ -613,8 +613,7 @@ static int set_sched_resources(struct device_queue_manager *dqm)
 	int i, mec;
 	struct scheduling_resources res;
 
-	res.vmid_mask = (1 << VMID_PER_DEVICE) - 1;
-	res.vmid_mask <<= KFD_VMID_START_OFFSET;
+	res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
 
 	res.queue_mask = 0;
 	for (i = 0; i < KGD_MAX_QUEUES; ++i) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 99e2305..60d46ce 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -32,10 +32,6 @@
 #define KFD_UNMAP_LATENCY_MS			(4000)
 #define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
 
-#define CIK_VMID_NUM				(8)
-#define KFD_VMID_START_OFFSET			(8)
-#define VMID_PER_DEVICE				CIK_VMID_NUM
-#define KFD_DQM_FIRST_PIPE			(0)
 #define CIK_SDMA_QUEUES				(4)
 #define CIK_SDMA_QUEUES_PER_ENGINE		(2)
 #define CIK_SDMA_ENGINE_NUM			(2)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 099dc33..7bed4ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -141,6 +141,12 @@ struct kfd_mem_obj {
 	uint32_t *cpu_ptr;
 };
 
+struct kfd_vmid_info {
+	uint32_t first_vmid_kfd;
+	uint32_t last_vmid_kfd;
+	uint32_t vmid_num_kfd;
+};
+
 struct kfd_dev {
 	struct kgd_dev *kgd;
 
@@ -165,6 +171,7 @@ struct kfd_dev {
 					   */
 
 	struct kgd2kfd_shared_resources shared_resources;
+	struct kfd_vmid_info vm_info;
 
 	const struct kfd2kgd_calls *kfd2kgd;
 	struct mutex doorbell_mutex;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 1cae95e..c047534 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -188,7 +188,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 	case KFD_QUEUE_TYPE_COMPUTE:
 		/* check if there is over subscription */
 		if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) &&
-		((dev->dqm->processes_count >= VMID_PER_DEVICE) ||
+		((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) ||
 		(dev->dqm->queue_count >= get_queues_num(dev->dqm)))) {
 			pr_err("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n");
 			retval = -EPERM;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 07/11] drm/amdkfd: Reuse CHIP_* from amdgpu
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (5 preceding siblings ...)
  2017-09-15 23:42   ` [PATCH 06/11] drm/amdkfd: Use VMID bitmap from KGD Felix Kuehling
@ 2017-09-15 23:42   ` Felix Kuehling
       [not found]     ` <1505518982-29631-8-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:42   ` [PATCH 08/11] drm/amdkfd: Drop _nocpsch suffix from shared functions Felix Kuehling
                     ` (3 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yong Zhao, Felix Kuehling

From: Yong Zhao <Yong.Zhao@amd.com>

There are already CHIP_* definitions under amd_shared.h file on amdgpu
side, so KFD should reuse them rather than defining new ones.

Using enum for asic type requires default cases on switch statements
to prevent compiler warnings. BUG on unsupported ASICs. It should never
get there because KFD should not be initialized on unsupported devices.

Signed-off-by: Yong Zhao <Yong.Zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c         | 2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c          | 2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 | 9 +++------
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 897ff083..0ecea67 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1132,6 +1132,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 	case CHIP_KAVERI:
 		device_queue_manager_init_cik(&dqm->ops_asic_specific);
 		break;
+	default:
+		BUG();
 	}
 
 	if (!dqm->ops.initialize(dqm))
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 09356d0..9ebb4c1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -291,6 +291,8 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 	case CHIP_KAVERI:
 		kernel_queue_init_cik(&kq->ops_asic_specific);
 		break;
+	default:
+		BUG();
 	}
 
 	if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index b1ef136..b5a87ba 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -31,6 +31,8 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
 		return mqd_manager_init_cik(type, dev);
 	case CHIP_CARRIZO:
 		return mqd_manager_init_vi(type, dev);
+	default:
+		BUG();
 	}
 
 	return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 7bed4ef..bb71697 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -33,6 +33,8 @@
 #include <linux/kfd_ioctl.h>
 #include <kgd_kfd_interface.h>
 
+#include "amd_shared.h"
+
 #define KFD_SYSFS_FILE_MODE 0444
 
 #define KFD_MMAP_DOORBELL_MASK 0x8000000000000
@@ -112,11 +114,6 @@ enum cache_policy {
 	cache_policy_noncoherent
 };
 
-enum asic_family_type {
-	CHIP_KAVERI = 0,
-	CHIP_CARRIZO
-};
-
 struct kfd_event_interrupt_class {
 	bool (*interrupt_isr)(struct kfd_dev *dev,
 				const uint32_t *ih_ring_entry);
@@ -125,7 +122,7 @@ struct kfd_event_interrupt_class {
 };
 
 struct kfd_device_info {
-	unsigned int asic_family;
+	enum amd_asic_type asic_family;
 	const struct kfd_event_interrupt_class *event_interrupt_class;
 	unsigned int max_pasid_bits;
 	unsigned int max_no_of_hqd;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 08/11] drm/amdkfd: Drop _nocpsch suffix from shared functions
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (6 preceding siblings ...)
  2017-09-15 23:42   ` [PATCH 07/11] drm/amdkfd: Reuse CHIP_* from amdgpu Felix Kuehling
@ 2017-09-15 23:42   ` Felix Kuehling
       [not found]     ` <1505518982-29631-9-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:43   ` [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs Felix Kuehling
                     ` (2 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

Several functions in DQM are shared between cpsch and nocpsch code.
Remove the misleading _nocpsch suffix from their names.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 24 +++++++++++-----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 0ecea67..169e061 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -386,7 +386,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 	return retval;
 }
 
-static struct mqd_manager *get_mqd_manager_nocpsch(
+static struct mqd_manager *get_mqd_manager(
 		struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
 {
 	struct mqd_manager *mqd;
@@ -407,7 +407,7 @@ static struct mqd_manager *get_mqd_manager_nocpsch(
 	return mqd;
 }
 
-static int register_process_nocpsch(struct device_queue_manager *dqm,
+static int register_process(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
 	struct device_process_node *n;
@@ -431,7 +431,7 @@ static int register_process_nocpsch(struct device_queue_manager *dqm,
 	return retval;
 }
 
-static int unregister_process_nocpsch(struct device_queue_manager *dqm,
+static int unregister_process(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
 	int retval;
@@ -513,7 +513,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
 	return 0;
 }
 
-static void uninitialize_nocpsch(struct device_queue_manager *dqm)
+static void uninitialize(struct device_queue_manager *dqm)
 {
 	int i;
 
@@ -1097,10 +1097,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.stop = stop_cpsch;
 		dqm->ops.destroy_queue = destroy_queue_cpsch;
 		dqm->ops.update_queue = update_queue;
-		dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
-		dqm->ops.register_process = register_process_nocpsch;
-		dqm->ops.unregister_process = unregister_process_nocpsch;
-		dqm->ops.uninitialize = uninitialize_nocpsch;
+		dqm->ops.get_mqd_manager = get_mqd_manager;
+		dqm->ops.register_process = register_process;
+		dqm->ops.unregister_process = unregister_process;
+		dqm->ops.uninitialize = uninitialize;
 		dqm->ops.create_kernel_queue = create_kernel_queue_cpsch;
 		dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch;
 		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
@@ -1112,11 +1112,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.create_queue = create_queue_nocpsch;
 		dqm->ops.destroy_queue = destroy_queue_nocpsch;
 		dqm->ops.update_queue = update_queue;
-		dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
-		dqm->ops.register_process = register_process_nocpsch;
-		dqm->ops.unregister_process = unregister_process_nocpsch;
+		dqm->ops.get_mqd_manager = get_mqd_manager;
+		dqm->ops.register_process = register_process;
+		dqm->ops.unregister_process = unregister_process;
 		dqm->ops.initialize = initialize_nocpsch;
-		dqm->ops.uninitialize = uninitialize_nocpsch;
+		dqm->ops.uninitialize = uninitialize;
 		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
 		break;
 	default:
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (7 preceding siblings ...)
  2017-09-15 23:42   ` [PATCH 08/11] drm/amdkfd: Drop _nocpsch suffix from shared functions Felix Kuehling
@ 2017-09-15 23:43   ` Felix Kuehling
       [not found]     ` <1505518982-29631-10-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:43   ` [PATCH 10/11] drm/amdkfd: Print event limit messages only once per process Felix Kuehling
  2017-09-15 23:43   ` [PATCH 11/11] drm/amdkfd: Set /dev/kfd permissions to 0666 by default Felix Kuehling
  10 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:43 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

Avoid intermediate negative numbers when doing calculations with a mix
of signed and unsigned variables where implicit conversions can lead
to unexpected results.

When kernel queue buffer wraps around to 0, we need to check that rptr
won't be overwritten by the new packet.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 9ebb4c1..1c66334 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
 	uint32_t wptr, rptr;
 	unsigned int *queue_address;
 
+	/* When rptr == wptr, the buffer is empty.
+	 * When rptr == wptr + 1, the buffer is full.
+	 * It is always rptr that advances to the position of wptr, rather than
+	 * the opposite. So we can only use up to queue_size_dwords - 1 dwords.
+	 */
 	rptr = *kq->rptr_kernel;
 	wptr = *kq->wptr_kernel;
 	queue_address = (unsigned int *)kq->pq_kernel_addr;
@@ -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
 	pr_debug("wptr: %d\n", wptr);
 	pr_debug("queue_address 0x%p\n", queue_address);
 
-	available_size = (rptr - 1 - wptr + queue_size_dwords) %
+	available_size = (rptr + queue_size_dwords - 1 - wptr) %
 							queue_size_dwords;
 
-	if (packet_size_in_dwords >= queue_size_dwords ||
-			packet_size_in_dwords >= available_size) {
+	if (packet_size_in_dwords > available_size) {
 		/*
 		 * make sure calling functions know
 		 * acquire_packet_buffer() failed
@@ -233,6 +237,14 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
 	}
 
 	if (wptr + packet_size_in_dwords >= queue_size_dwords) {
+		/* make sure after rolling back to position 0, there is
+		 * still enough space.
+		 */
+		if (packet_size_in_dwords >= rptr) {
+			*buffer_ptr = NULL;
+			return -ENOMEM;
+		}
+		/* fill nops, roll back and start at position 0 */
 		while (wptr > 0) {
 			queue_address[wptr] = kq->nop_packet;
 			wptr = (wptr + 1) % queue_size_dwords;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 10/11] drm/amdkfd: Print event limit messages only once per process
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (8 preceding siblings ...)
  2017-09-15 23:43   ` [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs Felix Kuehling
@ 2017-09-15 23:43   ` Felix Kuehling
       [not found]     ` <1505518982-29631-11-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-15 23:43   ` [PATCH 11/11] drm/amdkfd: Set /dev/kfd permissions to 0666 by default Felix Kuehling
  10 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:43 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Felix Kuehling

To avoid spamming the log.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_events.c | 5 ++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 5979158..944abfa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -292,7 +292,10 @@ static int create_signal_event(struct file *devkfd,
 				struct kfd_event *ev)
 {
 	if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) {
-		pr_warn("Signal event wasn't created because limit was reached\n");
+		if (!p->signal_event_limit_reached) {
+			pr_warn("Signal event wasn't created because limit was reached\n");
+			p->signal_event_limit_reached = true;
+		}
 		return -ENOMEM;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index bb71697..a546d01 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -532,6 +532,7 @@ struct kfd_process {
 	struct list_head signal_event_pages;
 	u32 next_nonsignal_event_id;
 	size_t signal_event_count;
+	bool signal_event_limit_reached;
 };
 
 /**
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 11/11] drm/amdkfd: Set /dev/kfd permissions to 0666 by default
       [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (9 preceding siblings ...)
  2017-09-15 23:43   ` [PATCH 10/11] drm/amdkfd: Print event limit messages only once per process Felix Kuehling
@ 2017-09-15 23:43   ` Felix Kuehling
       [not found]     ` <1505518982-29631-12-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  10 siblings, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-15 23:43 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Andres Rodriguez

From: Andres Rodriguez <andres.rodriguez@amd.com>

Set the default permissions of /dev/kfd to be more than just root
accessible 600.

Signed-off-by: Andres Rodriguez <andres.rodriguez@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e4a8c2e..1ad9901 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -55,6 +55,14 @@ static int kfd_char_dev_major = -1;
 static struct class *kfd_class;
 struct device *kfd_device;
 
+static char *kfd_devnode(struct device *dev, umode_t *mode)
+{
+	if (mode && dev->devt == MKDEV(kfd_char_dev_major, 0))
+		*mode = 0666;
+
+	return NULL;
+}
+
 int kfd_chardev_init(void)
 {
 	int err = 0;
@@ -69,6 +77,8 @@ int kfd_chardev_init(void)
 	if (IS_ERR(kfd_class))
 		goto err_class_create;
 
+	kfd_class->devnode = kfd_devnode;
+
 	kfd_device = device_create(kfd_class, NULL,
 					MKDEV(kfd_char_dev_major, 0),
 					NULL, kfd_dev_name);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 03/11] drm/amdkfd: Rectify the jiffies calculation error with milliseconds
       [not found]     ` <1505518982-29631-4-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-16  0:09       ` Jan Vesely
  0 siblings, 0 replies; 27+ messages in thread
From: Jan Vesely @ 2017-09-16  0:09 UTC (permalink / raw)
  To: Felix Kuehling, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yong Zhao


[-- Attachment #1.1: Type: text/plain, Size: 2030 bytes --]

On Fri, 2017-09-15 at 19:42 -0400, Felix Kuehling wrote:
> From: Yong Zhao <yong.zhao-5C7GfCeVMHo@public.gmane.org>
> 
> The timeout in milliseconds should not be regarded as jiffies. This
> commit fixed that.
> 
> Signed-off-by: Yong Zhao <yong.zhao-5C7GfCeVMHo@public.gmane.org>
> Signed-off-by: Felix Kuehling <Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 8 +++++---
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 | 2 +-
>  2 files changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 5db82b8..3db6a31 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -835,12 +835,14 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>  
>  int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
>  				unsigned int fence_value,
> -				unsigned long timeout)
> +				unsigned long timeout_ms)
>  {
> -	timeout += jiffies;
> +	unsigned long end_jiffies;
> +
> +	end_jiffies = (timeout_ms * HZ / 1000) + jiffies;

msecs_to_jiffies? (jiffies.h)

Jan

>  
>  	while (*fence_addr != fence_value) {
> -		if (time_after(jiffies, timeout)) {
> +		if (time_after(jiffies, end_jiffies)) {
>  			pr_err("qcm fence wait loop timeout expired\n");
>  			return -ETIME;
>  		}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ef582cc..f8d6a8e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -669,7 +669,7 @@ struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
>  
>  int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
>  				unsigned int fence_value,
> -				unsigned long timeout);
> +				unsigned long timeout_ms);
>  
>  /* Packet Manager */
>  

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 11/11] drm/amdkfd: Set /dev/kfd permissions to 0666 by default
       [not found]     ` <1505518982-29631-12-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-17  7:38       ` Oded Gabbay
  0 siblings, 0 replies; 27+ messages in thread
From: Oded Gabbay @ 2017-09-17  7:38 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Andres Rodriguez, amd-gfx list

On Sat, Sep 16, 2017 at 2:43 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> From: Andres Rodriguez <andres.rodriguez@amd.com>
>
> Set the default permissions of /dev/kfd to be more than just root
> accessible 600.
>
I don't think that's acceptable.
You need to use udev rules file for that.

Oded

> Signed-off-by: Andres Rodriguez <andres.rodriguez@amd.com>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 10 ++++++++++
>  1 file changed, 10 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index e4a8c2e..1ad9901 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -55,6 +55,14 @@ static int kfd_char_dev_major = -1;
>  static struct class *kfd_class;
>  struct device *kfd_device;
>
> +static char *kfd_devnode(struct device *dev, umode_t *mode)
> +{
> +       if (mode && dev->devt == MKDEV(kfd_char_dev_major, 0))
> +               *mode = 0666;
> +
> +       return NULL;
> +}
> +
>  int kfd_chardev_init(void)
>  {
>         int err = 0;
> @@ -69,6 +77,8 @@ int kfd_chardev_init(void)
>         if (IS_ERR(kfd_class))
>                 goto err_class_create;
>
> +       kfd_class->devnode = kfd_devnode;
> +
>         kfd_device = device_create(kfd_class, NULL,
>                                         MKDEV(kfd_char_dev_major, 0),
>                                         NULL, kfd_dev_name);
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 07/11] drm/amdkfd: Reuse CHIP_* from amdgpu
       [not found]     ` <1505518982-29631-8-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-17  7:54       ` Oded Gabbay
  0 siblings, 0 replies; 27+ messages in thread
From: Oded Gabbay @ 2017-09-17  7:54 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Yong Zhao, amd-gfx list

On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> From: Yong Zhao <Yong.Zhao@amd.com>
>
> There are already CHIP_* definitions under amd_shared.h file on amdgpu
> side, so KFD should reuse them rather than defining new ones.
>
> Using enum for asic type requires default cases on switch statements
> to prevent compiler warnings. BUG on unsupported ASICs. It should never
> get there because KFD should not be initialized on unsupported devices.

We did an effort to remove all BUG statements from the driver so
please don't introduce new ones.
Even if the code should never reach there, it is not a reason to crash
the entire kernel as it doesn't effect the rest of the system's
functionality.
Oded.

>
> Signed-off-by: Yong Zhao <Yong.Zhao@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c         | 2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c          | 2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 | 9 +++------
>  4 files changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 897ff083..0ecea67 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1132,6 +1132,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>         case CHIP_KAVERI:
>                 device_queue_manager_init_cik(&dqm->ops_asic_specific);
>                 break;
> +       default:
> +               BUG();
Replace this with some error printing.

>         }
>
>         if (!dqm->ops.initialize(dqm))
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 09356d0..9ebb4c1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -291,6 +291,8 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>         case CHIP_KAVERI:
>                 kernel_queue_init_cik(&kq->ops_asic_specific);
>                 break;
> +       default:
> +               BUG();
Replace this with some error printing.

>         }
>
>         if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> index b1ef136..b5a87ba 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> @@ -31,6 +31,8 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
>                 return mqd_manager_init_cik(type, dev);
>         case CHIP_CARRIZO:
>                 return mqd_manager_init_vi(type, dev);
> +       default:
> +               BUG();
Replace this with some error printing.

>         }
>
>         return NULL;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 7bed4ef..bb71697 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -33,6 +33,8 @@
>  #include <linux/kfd_ioctl.h>
>  #include <kgd_kfd_interface.h>
>
> +#include "amd_shared.h"
> +
>  #define KFD_SYSFS_FILE_MODE 0444
>
>  #define KFD_MMAP_DOORBELL_MASK 0x8000000000000
> @@ -112,11 +114,6 @@ enum cache_policy {
>         cache_policy_noncoherent
>  };
>
> -enum asic_family_type {
> -       CHIP_KAVERI = 0,
> -       CHIP_CARRIZO
> -};
> -
>  struct kfd_event_interrupt_class {
>         bool (*interrupt_isr)(struct kfd_dev *dev,
>                                 const uint32_t *ih_ring_entry);
> @@ -125,7 +122,7 @@ struct kfd_event_interrupt_class {
>  };
>
>  struct kfd_device_info {
> -       unsigned int asic_family;
> +       enum amd_asic_type asic_family;
>         const struct kfd_event_interrupt_class *event_interrupt_class;
>         unsigned int max_pasid_bits;
>         unsigned int max_no_of_hqd;
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 02/11] drm/amdkfd: Fix suspend/resume issue on Carrizo
       [not found]     ` <1505518982-29631-3-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-17 11:17       ` Oded Gabbay
  0 siblings, 0 replies; 27+ messages in thread
From: Oded Gabbay @ 2017-09-17 11:17 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Yong Zhao, amd-gfx list

On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> From: Yong Zhao <yong.zhao@amd.com>
>
> When we do suspend/resume through "sudo pm-suspend" while there is
> HSA activity running, upon resume we will encounter HWS hanging, which
> is caused by memory read/write failures. The root cause is that when
> suspend, we neglected to unbind pasid from kfd device.
>
> Another major change is that the bind/unbinding is changed to be
> performed on a per process basis, instead of whether there are queues
> in dqm.
>
> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c            | 22 ++++--
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 13 ----
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h              | 15 +++-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c           | 89 ++++++++++++++++++----
>  4 files changed, 101 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index cc8af11..ff3f97c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -191,7 +191,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
>         struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
>
>         if (dev)
> -               kfd_unbind_process_from_device(dev, pasid);
> +               kfd_process_iommu_unbind_callback(dev, pasid);
>  }
>
>  /*
> @@ -339,12 +339,16 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
>
>  void kgd2kfd_suspend(struct kfd_dev *kfd)
>  {
> -       if (kfd->init_complete) {
> -               kfd->dqm->ops.stop(kfd->dqm);
> -               amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
> -               amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
> -               amd_iommu_free_device(kfd->pdev);
> -       }
> +       if (!kfd->init_complete)
> +               return;
> +
> +       kfd->dqm->ops.stop(kfd->dqm);
> +
> +       kfd_unbind_processes_from_device(kfd);
> +
> +       amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
> +       amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
> +       amd_iommu_free_device(kfd->pdev);
>  }
>
>  int kgd2kfd_resume(struct kfd_dev *kfd)
> @@ -369,6 +373,10 @@ static int kfd_resume(struct kfd_dev *kfd)
>         amd_iommu_set_invalid_ppr_cb(kfd->pdev,
>                                      iommu_invalid_ppr_cb);
>
> +       err = kfd_bind_processes_to_device(kfd);
> +       if (err)
> +               return -ENXIO;

You need to undo previous initialization in case
kfd_bind_processes_to_device fails, i.e. call amd_iommu_free_device()

> +
>         err = kfd->dqm->ops.start(kfd->dqm);
>         if (err) {
>                 dev_err(kfd_device,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 53a66e8..5db82b8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -670,7 +670,6 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
>
>  static int start_cpsch(struct device_queue_manager *dqm)
>  {
> -       struct device_process_node *node;
>         int retval;
>
>         retval = 0;
> @@ -697,11 +696,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
>
>         init_interrupts(dqm);
>
> -       list_for_each_entry(node, &dqm->queues, list)
> -               if (node->qpd->pqm->process && dqm->dev)
> -                       kfd_bind_process_to_device(dqm->dev,
> -                                               node->qpd->pqm->process);
> -
>         execute_queues_cpsch(dqm, true);
>
>         return 0;
> @@ -714,15 +708,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
>
>  static int stop_cpsch(struct device_queue_manager *dqm)
>  {
> -       struct device_process_node *node;
> -       struct kfd_process_device *pdd;
> -
>         destroy_queues_cpsch(dqm, true, true);
>
> -       list_for_each_entry(node, &dqm->queues, list) {
> -               pdd = qpd_to_pdd(node->qpd);
> -               pdd->bound = false;
> -       }
>         kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>         pm_uninit(&dqm->packets);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index b397ec7..ef582cc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -435,6 +435,13 @@ struct qcm_process_device {
>         uint32_t sh_hidden_private_base;
>  };
>
> +
> +enum kfd_pdd_bound {
> +       PDD_UNBOUND = 0,
> +       PDD_BOUND,
> +       PDD_BOUND_SUSPENDED,
> +};
> +
>  /* Data that is per-process-per device. */
>  struct kfd_process_device {
>         /*
> @@ -459,7 +466,7 @@ struct kfd_process_device {
>         uint64_t scratch_limit;
>
>         /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
> -       bool bound;
> +       enum kfd_pdd_bound bound;
>
>         /* This flag tells if we should reset all
>          * wavefronts on process termination
> @@ -548,8 +555,10 @@ struct kfd_process *kfd_get_process(const struct task_struct *);
>  struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
>
>  struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
> -                                                       struct kfd_process *p);
> -void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid);
> +                                               struct kfd_process *p);
> +int kfd_bind_processes_to_device(struct kfd_dev *dev);
> +void kfd_unbind_processes_from_device(struct kfd_dev *dev);
> +void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid);
>  struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
>                                                         struct kfd_process *p);
>  struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index c74cf22..3ffaac3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -174,7 +174,10 @@ static void kfd_process_wq_release(struct work_struct *work)
>                 if (pdd->reset_wavefronts)
>                         dbgdev_wave_reset_wavefronts(pdd->dev, p);
>
> -               amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
> +               if (pdd->bound == PDD_BOUND) {
> +                       amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
> +                       pdd->bound = PDD_UNBOUND;
I think setting the bound property here is redundant because we do
kfree(pdd) in two more statements.

> +               }
>                 list_del(&pdd->per_device_list);
>
>                 kfree(pdd);
> @@ -345,9 +348,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
>
>         list_for_each_entry(pdd, &p->per_device_data, per_device_list)
>                 if (pdd->dev == dev)
> -                       break;
> +                       return pdd;
>
> -       return pdd;
> +       return NULL;
>  }
>
>  struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
> @@ -362,6 +365,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>                 INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
>                 pdd->qpd.dqm = dev->dqm;
>                 pdd->reset_wavefronts = false;
> +               pdd->bound = PDD_UNBOUND;
>                 list_add(&pdd->per_device_list, &p->per_device_data);
>         }
>
> @@ -387,19 +391,83 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
>                 return ERR_PTR(-ENOMEM);
>         }
>
> -       if (pdd->bound)
> +       if (pdd->bound == PDD_BOUND)
>                 return pdd;
>
> +       if (pdd->bound == PDD_BOUND_SUSPENDED) {
because we check the same variable, we should do it as "else if"

> +               pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n");
> +               return ERR_PTR(-EINVAL);
> +       }
> +
>         err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread);
>         if (err < 0)
>                 return ERR_PTR(err);
>
> -       pdd->bound = true;
> +       pdd->bound = PDD_BOUND;
>
>         return pdd;
>  }
>
> -void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
> +int kfd_bind_processes_to_device(struct kfd_dev *dev)
> +{
> +       struct kfd_process_device *pdd;
> +       struct kfd_process *p;
> +       unsigned int temp;
> +       int err = 0;
> +
> +       int idx = srcu_read_lock(&kfd_processes_srcu);
> +
> +       hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
> +               mutex_lock(&p->mutex);
> +               pdd = kfd_get_process_device_data(dev, p);
> +               if (pdd->bound != PDD_BOUND_SUSPENDED) {
> +                       mutex_unlock(&p->mutex);
> +                       continue;
> +               }
> +
> +               err = amd_iommu_bind_pasid(dev->pdev, p->pasid,
> +                               p->lead_thread);
> +               if (err < 0) {
> +                       pr_err("unexpected pasid %d binding failure\n",
> +                                       p->pasid);
> +                       mutex_unlock(&p->mutex);
> +                       break;
> +               }
> +
> +               pdd->bound = PDD_BOUND;
> +               mutex_unlock(&p->mutex);
> +       }
> +
> +       srcu_read_unlock(&kfd_processes_srcu, idx);
> +
> +       return err;
> +}
> +
> +void kfd_unbind_processes_from_device(struct kfd_dev *dev)
> +{
> +       struct kfd_process_device *pdd;
> +       struct kfd_process *p;
> +       unsigned int temp, temp_bound, temp_pasid;
> +
> +       int idx = srcu_read_lock(&kfd_processes_srcu);
> +
> +       hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
> +               mutex_lock(&p->mutex);
> +               pdd = kfd_get_process_device_data(dev, p);
> +               temp_bound = pdd->bound;
> +               temp_pasid = p->pasid;
> +               if (pdd->bound == PDD_BOUND)
> +                       pdd->bound = PDD_BOUND_SUSPENDED;
I would add a comment here explaining why we set pdd->bound to
PDD_BOUND_SUSPENDED and not to PDD_UNBOUND. It took me a couple of
minutes to traces all the paths in the code in order to understand it.


> +               mutex_unlock(&p->mutex);
> +
> +               if (temp_bound == PDD_BOUND)
> +                       amd_iommu_unbind_pasid(dev->pdev, temp_pasid);
> +       }
> +
> +       srcu_read_unlock(&kfd_processes_srcu, idx);
> +}
> +
> +void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
>  {
>         struct kfd_process *p;
>         struct kfd_process_device *pdd;
> @@ -432,15 +500,6 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
>                 pdd->reset_wavefronts = false;
>         }
>
> -       /*
> -        * Just mark pdd as unbound, because we still need it
> -        * to call amd_iommu_unbind_pasid() in when the
> -        * process exits.
> -        * We don't call amd_iommu_unbind_pasid() here
> -        * because the IOMMU called us.
> -        */
> -       pdd->bound = false;
> -
>         mutex_unlock(&p->mutex);
>  }
>
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

With the above minor comments fixed, this patch is:
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 01/11] drm/amdkfd: Reorganize kfd resume code
       [not found]     ` <1505518982-29631-2-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-17 11:18       ` Oded Gabbay
  0 siblings, 0 replies; 27+ messages in thread
From: Oded Gabbay @ 2017-09-17 11:18 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Yong Zhao, amd-gfx list

On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> From: Yong Zhao <yong.zhao@amd.com>
>
> The idea is to let kfd init and resume function share the same code path
> as much as possible, rather than to have two copies of almost identical
> code. That way improves the code readability and maintainability.
>
> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 78 +++++++++++++++++----------------
>  1 file changed, 40 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 61fff25..cc8af11 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -92,6 +92,8 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
>                                 unsigned int chunk_size);
>  static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
>
> +static int kfd_resume(struct kfd_dev *kfd);
> +
>  static const struct kfd_device_info *lookup_device_info(unsigned short did)
>  {
>         size_t i;
> @@ -176,15 +178,8 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd)
>                                 pasid_limit,
>                                 kfd->doorbell_process_limit - 1);
>
> -       err = amd_iommu_init_device(kfd->pdev, pasid_limit);
> -       if (err < 0) {
> -               dev_err(kfd_device, "error initializing iommu device\n");
> -               return false;
> -       }
> -
>         if (!kfd_set_pasid_limit(pasid_limit)) {
>                 dev_err(kfd_device, "error setting pasid limit\n");
> -               amd_iommu_free_device(kfd->pdev);
>                 return false;
>         }
>
> @@ -280,29 +275,22 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>                 goto kfd_interrupt_error;
>         }
>
> -       if (!device_iommu_pasid_init(kfd)) {
> -               dev_err(kfd_device,
> -                       "Error initializing iommuv2 for device %x:%x\n",
> -                       kfd->pdev->vendor, kfd->pdev->device);
> -               goto device_iommu_pasid_error;
> -       }
> -       amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
> -                                               iommu_pasid_shutdown_callback);
> -       amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
> -
>         kfd->dqm = device_queue_manager_init(kfd);
>         if (!kfd->dqm) {
>                 dev_err(kfd_device, "Error initializing queue manager\n");
>                 goto device_queue_manager_error;
>         }
>
> -       if (kfd->dqm->ops.start(kfd->dqm)) {
> +       if (!device_iommu_pasid_init(kfd)) {
>                 dev_err(kfd_device,
> -                       "Error starting queue manager for device %x:%x\n",
> +                       "Error initializing iommuv2 for device %x:%x\n",
>                         kfd->pdev->vendor, kfd->pdev->device);
> -               goto dqm_start_error;
> +               goto device_iommu_pasid_error;
>         }
>
> +       if (kfd_resume(kfd))
> +               goto kfd_resume_error;
> +
>         kfd->dbgmgr = NULL;
>
>         kfd->init_complete = true;
> @@ -314,11 +302,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>
>         goto out;
>
> -dqm_start_error:
> +kfd_resume_error:
> +device_iommu_pasid_error:
>         device_queue_manager_uninit(kfd->dqm);
>  device_queue_manager_error:
> -       amd_iommu_free_device(kfd->pdev);
> -device_iommu_pasid_error:
>         kfd_interrupt_exit(kfd);
>  kfd_interrupt_error:
>         kfd_topology_remove_device(kfd);
> @@ -338,8 +325,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>  void kgd2kfd_device_exit(struct kfd_dev *kfd)
>  {
>         if (kfd->init_complete) {
> +               kgd2kfd_suspend(kfd);
>                 device_queue_manager_uninit(kfd->dqm);
> -               amd_iommu_free_device(kfd->pdev);
>                 kfd_interrupt_exit(kfd);
>                 kfd_topology_remove_device(kfd);
>                 kfd_doorbell_fini(kfd);
> @@ -362,25 +349,40 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
>
>  int kgd2kfd_resume(struct kfd_dev *kfd)
>  {
> -       unsigned int pasid_limit;
> -       int err;
> +       if (!kfd->init_complete)
> +               return 0;
>
> -       pasid_limit = kfd_get_pasid_limit();
> +       return kfd_resume(kfd);
>
> -       if (kfd->init_complete) {
> -               err = amd_iommu_init_device(kfd->pdev, pasid_limit);
> -               if (err < 0) {
> -                       dev_err(kfd_device, "failed to initialize iommu\n");
> -                       return -ENXIO;
> -               }
> +}
>
> -               amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
> -                                               iommu_pasid_shutdown_callback);
> -               amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
> -               kfd->dqm->ops.start(kfd->dqm);
> +static int kfd_resume(struct kfd_dev *kfd)
> +{
> +       int err = 0;
> +       unsigned int pasid_limit = kfd_get_pasid_limit();
> +
> +       err = amd_iommu_init_device(kfd->pdev, pasid_limit);
> +       if (err)
> +               return -ENXIO;
> +       amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
> +                                       iommu_pasid_shutdown_callback);
> +       amd_iommu_set_invalid_ppr_cb(kfd->pdev,
> +                                    iommu_invalid_ppr_cb);
> +
> +       err = kfd->dqm->ops.start(kfd->dqm);
> +       if (err) {
> +               dev_err(kfd_device,
> +                       "Error starting queue manager for device %x:%x\n",
> +                       kfd->pdev->vendor, kfd->pdev->device);
> +               goto dqm_start_error;
>         }
>
> -       return 0;
> +       return err;
> +
> +dqm_start_error:
> +       amd_iommu_free_device(kfd->pdev);
> +
> +       return err;
>  }
>
>  /* This is called directly from KGD at ISR. */
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

This patch is:
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 04/11] drm/amdkfd: Adjust dequeue latencies and timeouts
       [not found]     ` <1505518982-29631-5-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-17 11:23       ` Oded Gabbay
  0 siblings, 0 replies; 27+ messages in thread
From: Oded Gabbay @ 2017-09-17 11:23 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Yong Zhao, amd-gfx list

On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> Adjust latencies and timeouts for dequeueing with HWS and consolidate
> them in one place. Make them longer to allow long running waves to
> complete without causing a timeout. The timeout is twice as long as the
> latency plus some buffer to make sure we don't detect a timeout
> prematurely.
>
> Change timeouts for dequeueing HQDs without HWS. KFD_UNMAP_LATENCY is
> more consistent with what the HWS does for user queues.
>
> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 4 +++-
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c         | 2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c       | 2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 | 3 ---
>  5 files changed, 6 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 3db6a31..5da7ef4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -323,7 +323,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
>
>         retval = mqd->destroy_mqd(mqd, q->mqd,
>                                 KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
> -                               QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
> +                               KFD_UNMAP_LATENCY_MS,
>                                 q->pipe, q->queue);
>
>         if (retval)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index faf820a..99e2305 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -29,7 +29,9 @@
>  #include "kfd_priv.h"
>  #include "kfd_mqd_manager.h"
>
> -#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS       (500)
> +#define KFD_UNMAP_LATENCY_MS                   (4000)
> +#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
> +
>  #define CIK_VMID_NUM                           (8)
>  #define KFD_VMID_START_OFFSET                  (8)
>  #define VMID_PER_DEVICE                                CIK_VMID_NUM
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 681b639..0c82446 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -185,7 +185,7 @@ static void uninitialize(struct kernel_queue *kq)
>                 kq->mqd->destroy_mqd(kq->mqd,
>                                         NULL,
>                                         false,
> -                                       QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
> +                                       KFD_UNMAP_LATENCY_MS,
>                                         kq->queue->pipe,
>                                         kq->queue->queue);
>         else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 1d31260..9eda884 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -376,7 +376,7 @@ int pm_send_set_resources(struct packet_manager *pm,
>         packet->bitfields2.queue_type =
>                         queue_type__mes_set_resources__hsa_interface_queue_hiq;
>         packet->bitfields2.vmid_mask = res->vmid_mask;
> -       packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY;
> +       packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
>         packet->bitfields7.oac_mask = res->oac_mask;
>         packet->bitfields8.gds_heap_base = res->gds_heap_base;
>         packet->bitfields8.gds_heap_size = res->gds_heap_size;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f8d6a8e..099dc33 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -673,11 +673,8 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
>
>  /* Packet Manager */
>
> -#define KFD_HIQ_TIMEOUT (500)
> -
>  #define KFD_FENCE_COMPLETED (100)
>  #define KFD_FENCE_INIT   (10)
> -#define KFD_UNMAP_LATENCY (150)
>
>  struct packet_manager {
>         struct device_queue_manager *dqm;
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

This patch is:
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 05/11] drm/amdkfd: Fix incorrect destroy_mqd parameter
       [not found]     ` <1505518982-29631-6-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-17 11:35       ` Oded Gabbay
  0 siblings, 0 replies; 27+ messages in thread
From: Oded Gabbay @ 2017-09-17 11:35 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Yong Zhao, amd-gfx list

On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> When uninitializing a kernel queue.
>
> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 0c82446..09356d0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -184,7 +184,7 @@ static void uninitialize(struct kernel_queue *kq)
>         if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>                 kq->mqd->destroy_mqd(kq->mqd,
>                                         NULL,
> -                                       false,
> +                                       KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>                                         KFD_UNMAP_LATENCY_MS,
>                                         kq->queue->pipe,
>                                         kq->queue->queue);
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
This patch is:
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 08/11] drm/amdkfd: Drop _nocpsch suffix from shared functions
       [not found]     ` <1505518982-29631-9-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-17 11:36       ` Oded Gabbay
  0 siblings, 0 replies; 27+ messages in thread
From: Oded Gabbay @ 2017-09-17 11:36 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Yong Zhao, amd-gfx list

On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> From: Yong Zhao <yong.zhao@amd.com>
>
> Several functions in DQM are shared between cpsch and nocpsch code.
> Remove the misleading _nocpsch suffix from their names.
>
> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 24 +++++++++++-----------
>  1 file changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 0ecea67..169e061 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -386,7 +386,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
>         return retval;
>  }
>
> -static struct mqd_manager *get_mqd_manager_nocpsch(
> +static struct mqd_manager *get_mqd_manager(
>                 struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
>  {
>         struct mqd_manager *mqd;
> @@ -407,7 +407,7 @@ static struct mqd_manager *get_mqd_manager_nocpsch(
>         return mqd;
>  }
>
> -static int register_process_nocpsch(struct device_queue_manager *dqm,
> +static int register_process(struct device_queue_manager *dqm,
>                                         struct qcm_process_device *qpd)
>  {
>         struct device_process_node *n;
> @@ -431,7 +431,7 @@ static int register_process_nocpsch(struct device_queue_manager *dqm,
>         return retval;
>  }
>
> -static int unregister_process_nocpsch(struct device_queue_manager *dqm,
> +static int unregister_process(struct device_queue_manager *dqm,
>                                         struct qcm_process_device *qpd)
>  {
>         int retval;
> @@ -513,7 +513,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
>         return 0;
>  }
>
> -static void uninitialize_nocpsch(struct device_queue_manager *dqm)
> +static void uninitialize(struct device_queue_manager *dqm)
>  {
>         int i;
>
> @@ -1097,10 +1097,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>                 dqm->ops.stop = stop_cpsch;
>                 dqm->ops.destroy_queue = destroy_queue_cpsch;
>                 dqm->ops.update_queue = update_queue;
> -               dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
> -               dqm->ops.register_process = register_process_nocpsch;
> -               dqm->ops.unregister_process = unregister_process_nocpsch;
> -               dqm->ops.uninitialize = uninitialize_nocpsch;
> +               dqm->ops.get_mqd_manager = get_mqd_manager;
> +               dqm->ops.register_process = register_process;
> +               dqm->ops.unregister_process = unregister_process;
> +               dqm->ops.uninitialize = uninitialize;
>                 dqm->ops.create_kernel_queue = create_kernel_queue_cpsch;
>                 dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch;
>                 dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
> @@ -1112,11 +1112,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>                 dqm->ops.create_queue = create_queue_nocpsch;
>                 dqm->ops.destroy_queue = destroy_queue_nocpsch;
>                 dqm->ops.update_queue = update_queue;
> -               dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
> -               dqm->ops.register_process = register_process_nocpsch;
> -               dqm->ops.unregister_process = unregister_process_nocpsch;
> +               dqm->ops.get_mqd_manager = get_mqd_manager;
> +               dqm->ops.register_process = register_process;
> +               dqm->ops.unregister_process = unregister_process;
>                 dqm->ops.initialize = initialize_nocpsch;
> -               dqm->ops.uninitialize = uninitialize_nocpsch;
> +               dqm->ops.uninitialize = uninitialize;
>                 dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
>                 break;
>         default:
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

This patch is:
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 06/11] drm/amdkfd: Use VMID bitmap from KGD
       [not found]     ` <1505518982-29631-7-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-17 11:47       ` Oded Gabbay
  0 siblings, 0 replies; 27+ messages in thread
From: Oded Gabbay @ 2017-09-17 11:47 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Yong Zhao, amd-gfx list

On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> From: Yong Zhao <yong.zhao@amd.com>
>
> The hard-coded values related to VMID were removed in KFD, as those
> values can be calculated in the KFD initialization function.
>
> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c                |  9 ++-------
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c                |  7 +++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 13 ++++++-------
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h  |  4 ----
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h                  |  7 +++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  2 +-
>  6 files changed, 23 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
> index 0aa021a..7d5635f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
> @@ -769,13 +769,8 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
>         union GRBM_GFX_INDEX_BITS reg_gfx_index;
>         struct kfd_process_device *pdd;
>         struct dbg_wave_control_info wac_info;
> -       int temp;
> -       int first_vmid_to_scan = 8;
> -       int last_vmid_to_scan = 15;
> -
> -       first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1;
> -       temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan;
> -       last_vmid_to_scan = first_vmid_to_scan + ffz(temp);
> +       int first_vmid_to_scan = dev->vm_info.first_vmid_kfd;
> +       int last_vmid_to_scan = dev->vm_info.last_vmid_kfd;
>
>         reg_sq_cmd.u32All = 0;
>         status = 0;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index ff3f97c..abf91b0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -223,9 +223,16 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>                          const struct kgd2kfd_shared_resources *gpu_resources)
>  {
>         unsigned int size;
> +       unsigned int vmid_bitmap_kfd;
>
>         kfd->shared_resources = *gpu_resources;
>
> +       vmid_bitmap_kfd = kfd->shared_resources.compute_vmid_bitmap;
Unnecessary copy, just use kfd->shared_resources.compute_vmid_bitmap
in the below lines. If you want a shorter name, use a pointer.

> +       kfd->vm_info.first_vmid_kfd = ffs(vmid_bitmap_kfd) - 1;
> +       kfd->vm_info.last_vmid_kfd = fls(vmid_bitmap_kfd) - 1;
> +       kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd
> +                       - kfd->vm_info.first_vmid_kfd + 1;
> +
>         /* calculate max size of mqds needed for queues */
>         size = max_num_of_queues_per_device *
>                         kfd->device_info->mqd_size_aligned;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 5da7ef4..897ff083 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -113,11 +113,11 @@ static int allocate_vmid(struct device_queue_manager *dqm,
>         if (dqm->vmid_bitmap == 0)
>                 return -ENOMEM;
>
> -       bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM);
> +       bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap,
> +                               dqm->dev->vm_info.vmid_num_kfd);
>         clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap);
>
> -       /* Kaveri kfd vmid's starts from vmid 8 */
> -       allocated_vmid = bit + KFD_VMID_START_OFFSET;
> +       allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd;
>         pr_debug("vmid allocation %d\n", allocated_vmid);
>         qpd->vmid = allocated_vmid;
>         q->properties.vmid = allocated_vmid;
> @@ -132,7 +132,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
>                                 struct qcm_process_device *qpd,
>                                 struct queue *q)
>  {
> -       int bit = qpd->vmid - KFD_VMID_START_OFFSET;
> +       int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
>
>         /* Release the vmid mapping */
>         set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
> @@ -507,7 +507,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
>                                 dqm->allocated_queues[pipe] |= 1 << queue;
>         }
>
> -       dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1;
> +       dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
>         dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
>
>         return 0;
> @@ -613,8 +613,7 @@ static int set_sched_resources(struct device_queue_manager *dqm)
>         int i, mec;
>         struct scheduling_resources res;
>
> -       res.vmid_mask = (1 << VMID_PER_DEVICE) - 1;
> -       res.vmid_mask <<= KFD_VMID_START_OFFSET;
> +       res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
>
>         res.queue_mask = 0;
>         for (i = 0; i < KGD_MAX_QUEUES; ++i) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 99e2305..60d46ce 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -32,10 +32,6 @@
>  #define KFD_UNMAP_LATENCY_MS                   (4000)
>  #define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
>
> -#define CIK_VMID_NUM                           (8)
> -#define KFD_VMID_START_OFFSET                  (8)
> -#define VMID_PER_DEVICE                                CIK_VMID_NUM
> -#define KFD_DQM_FIRST_PIPE                     (0)
>  #define CIK_SDMA_QUEUES                                (4)
>  #define CIK_SDMA_QUEUES_PER_ENGINE             (2)
>  #define CIK_SDMA_ENGINE_NUM                    (2)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 099dc33..7bed4ef 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -141,6 +141,12 @@ struct kfd_mem_obj {
>         uint32_t *cpu_ptr;
>  };
>
> +struct kfd_vmid_info {
> +       uint32_t first_vmid_kfd;
> +       uint32_t last_vmid_kfd;
> +       uint32_t vmid_num_kfd;
> +};
> +
>  struct kfd_dev {
>         struct kgd_dev *kgd;
>
> @@ -165,6 +171,7 @@ struct kfd_dev {
>                                            */
>
>         struct kgd2kfd_shared_resources shared_resources;
> +       struct kfd_vmid_info vm_info;
>
>         const struct kfd2kgd_calls *kfd2kgd;
>         struct mutex doorbell_mutex;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 1cae95e..c047534 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -188,7 +188,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>         case KFD_QUEUE_TYPE_COMPUTE:
>                 /* check if there is over subscription */
>                 if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) &&
> -               ((dev->dqm->processes_count >= VMID_PER_DEVICE) ||
> +               ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) ||
>                 (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) {
>                         pr_err("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n");
>                         retval = -EPERM;
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

With the above comment fixed, this patch is:
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs
       [not found]     ` <1505518982-29631-10-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-17 12:03       ` Oded Gabbay
       [not found]         ` <CAFCwf127-0M8oBFG4mefuRmjFFGY+NLrW-SVCaU=Dqwi0SKpBA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Oded Gabbay @ 2017-09-17 12:03 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Yong Zhao, amd-gfx list

On Sat, Sep 16, 2017 at 2:43 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> From: Yong Zhao <yong.zhao@amd.com>
>
> Avoid intermediate negative numbers when doing calculations with a mix
> of signed and unsigned variables where implicit conversions can lead
> to unexpected results.
>
> When kernel queue buffer wraps around to 0, we need to check that rptr
> won't be overwritten by the new packet.
>
> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18 +++++++++++++++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 9ebb4c1..1c66334 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>         uint32_t wptr, rptr;
>         unsigned int *queue_address;
>
> +       /* When rptr == wptr, the buffer is empty.
Start comment text in a new line. First line should be just /*

> +        * When rptr == wptr + 1, the buffer is full.
> +        * It is always rptr that advances to the position of wptr, rather than
> +        * the opposite. So we can only use up to queue_size_dwords - 1 dwords.
> +        */
>         rptr = *kq->rptr_kernel;
>         wptr = *kq->wptr_kernel;
>         queue_address = (unsigned int *)kq->pq_kernel_addr;
> @@ -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>         pr_debug("wptr: %d\n", wptr);
>         pr_debug("queue_address 0x%p\n", queue_address);
>
> -       available_size = (rptr - 1 - wptr + queue_size_dwords) %
> +       available_size = (rptr + queue_size_dwords - 1 - wptr) %
>                                                         queue_size_dwords;
>
> -       if (packet_size_in_dwords >= queue_size_dwords ||
> -                       packet_size_in_dwords >= available_size) {
> +       if (packet_size_in_dwords > available_size) {
>                 /*
>                  * make sure calling functions know
>                  * acquire_packet_buffer() failed
> @@ -233,6 +237,14 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>         }
>
>         if (wptr + packet_size_in_dwords >= queue_size_dwords) {
> +               /* make sure after rolling back to position 0, there is
> +                * still enough space.
> +                */
> +               if (packet_size_in_dwords >= rptr) {
> +                       *buffer_ptr = NULL;
> +                       return -ENOMEM;
> +               }

I don't think the condition is correct.
Suppose, queue_size_dwords == 100, wptr == rptr == 50 (queue is empty)
and we have a new packet with size of 70.
Now, wptr + size is 120, which is >= 100
However, 70 >= rptr (50) which will give us -ENOMEM, but this is not
correct condition, because the packet *does* have enough room in the
queue.

I think the condition should be:
if (packet_size_in_dwords - (queue_size_dwords - wptr) >= rptr)
but please check this.

> +               /* fill nops, roll back and start at position 0 */
>                 while (wptr > 0) {
>                         queue_address[wptr] = kq->nop_packet;
>                         wptr = (wptr + 1) % queue_size_dwords;
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 10/11] drm/amdkfd: Print event limit messages only once per process
       [not found]     ` <1505518982-29631-11-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-17 12:04       ` Oded Gabbay
  0 siblings, 0 replies; 27+ messages in thread
From: Oded Gabbay @ 2017-09-17 12:04 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: amd-gfx list

On Sat, Sep 16, 2017 at 2:43 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> To avoid spamming the log.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_events.c | 5 ++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 1 +
>  2 files changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index 5979158..944abfa 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -292,7 +292,10 @@ static int create_signal_event(struct file *devkfd,
>                                 struct kfd_event *ev)
>  {
>         if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) {
> -               pr_warn("Signal event wasn't created because limit was reached\n");
> +               if (!p->signal_event_limit_reached) {
> +                       pr_warn("Signal event wasn't created because limit was reached\n");
> +                       p->signal_event_limit_reached = true;
> +               }
>                 return -ENOMEM;
>         }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index bb71697..a546d01 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -532,6 +532,7 @@ struct kfd_process {
>         struct list_head signal_event_pages;
>         u32 next_nonsignal_event_id;
>         size_t signal_event_count;
> +       bool signal_event_limit_reached;
>  };
>
>  /**
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
This patch is:
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs
       [not found]         ` <CAFCwf127-0M8oBFG4mefuRmjFFGY+NLrW-SVCaU=Dqwi0SKpBA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-09-18 15:08           ` Felix Kuehling
  2017-09-18 15:21           ` Felix Kuehling
  1 sibling, 0 replies; 27+ messages in thread
From: Felix Kuehling @ 2017-09-18 15:08 UTC (permalink / raw)
  To: Oded Gabbay; +Cc: Yong Zhao, amd-gfx list

On 2017-09-17 08:03 AM, Oded Gabbay wrote:
> On Sat, Sep 16, 2017 at 2:43 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
>> From: Yong Zhao <yong.zhao@amd.com>
>>
>> Avoid intermediate negative numbers when doing calculations with a mix
>> of signed and unsigned variables where implicit conversions can lead
>> to unexpected results.
>>
>> When kernel queue buffer wraps around to 0, we need to check that rptr
>> won't be overwritten by the new packet.
>>
>> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18 +++++++++++++++---
>>  1 file changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> index 9ebb4c1..1c66334 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> @@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         uint32_t wptr, rptr;
>>         unsigned int *queue_address;
>>
>> +       /* When rptr == wptr, the buffer is empty.
> Start comment text in a new line. First line should be just /*
>
>> +        * When rptr == wptr + 1, the buffer is full.
>> +        * It is always rptr that advances to the position of wptr, rather than
>> +        * the opposite. So we can only use up to queue_size_dwords - 1 dwords.
>> +        */
>>         rptr = *kq->rptr_kernel;
>>         wptr = *kq->wptr_kernel;
>>         queue_address = (unsigned int *)kq->pq_kernel_addr;
>> @@ -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         pr_debug("wptr: %d\n", wptr);
>>         pr_debug("queue_address 0x%p\n", queue_address);
>>
>> -       available_size = (rptr - 1 - wptr + queue_size_dwords) %
>> +       available_size = (rptr + queue_size_dwords - 1 - wptr) %
>>                                                         queue_size_dwords;
>>
>> -       if (packet_size_in_dwords >= queue_size_dwords ||
>> -                       packet_size_in_dwords >= available_size) {
>> +       if (packet_size_in_dwords > available_size) {
>>                 /*
>>                  * make sure calling functions know
>>                  * acquire_packet_buffer() failed
>> @@ -233,6 +237,14 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         }
>>
>>         if (wptr + packet_size_in_dwords >= queue_size_dwords) {
>> +               /* make sure after rolling back to position 0, there is
>> +                * still enough space.
>> +                */
>> +               if (packet_size_in_dwords >= rptr) {
>> +                       *buffer_ptr = NULL;
>> +                       return -ENOMEM;
>> +               }
> I don't think the condition is correct.
> Suppose, queue_size_dwords == 100, wptr == rptr == 50 (queue is empty)
> and we have a new packet with size of 70.
> Now, wptr + size is 120, which is >= 100
> However, 70 >= rptr (50) which will give us -ENOMEM, but this is not
> correct condition, because the packet *does* have enough room in the
> queue.

Not really. We need 70 consecutive dwords. So the last 50 dwords of the
queue are not enough. Therefore we decide to wrap around to the
beginning of the queue and fill the last 50 dwords with NOPs. That means
we really need 70+50 dwords, including those NOPs. Now we only have 50
dwords left in the queue for the actual packet, which is not enough. So
we have no choice but to fail with ENOMEM.

I think the consequence is, that we can't guarantee any successful
allocations from the queue that need more than half of the queue.

Regards,
  Felix

>
> I think the condition should be:
> if (packet_size_in_dwords - (queue_size_dwords - wptr) >= rptr)
> but please check this.
>
>> +               /* fill nops, roll back and start at position 0 */
>>                 while (wptr > 0) {
>>                         queue_address[wptr] = kq->nop_packet;
>>                         wptr = (wptr + 1) % queue_size_dwords;
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs
       [not found]         ` <CAFCwf127-0M8oBFG4mefuRmjFFGY+NLrW-SVCaU=Dqwi0SKpBA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-09-18 15:08           ` Felix Kuehling
@ 2017-09-18 15:21           ` Felix Kuehling
       [not found]             ` <a3e6c81d-4b3f-a58b-284c-dc466e09ca2c-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 27+ messages in thread
From: Felix Kuehling @ 2017-09-18 15:21 UTC (permalink / raw)
  To: Oded Gabbay; +Cc: Yong Zhao, amd-gfx list

On 2017-09-17 08:03 AM, Oded Gabbay wrote:
> On Sat, Sep 16, 2017 at 2:43 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
>> From: Yong Zhao <yong.zhao@amd.com>
>>
>> Avoid intermediate negative numbers when doing calculations with a mix
>> of signed and unsigned variables where implicit conversions can lead
>> to unexpected results.
>>
>> When kernel queue buffer wraps around to 0, we need to check that rptr
>> won't be overwritten by the new packet.
>>
>> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18 +++++++++++++++---
>>  1 file changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> index 9ebb4c1..1c66334 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> @@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         uint32_t wptr, rptr;
>>         unsigned int *queue_address;
>>
>> +       /* When rptr == wptr, the buffer is empty.
> Start comment text in a new line. First line should be just /*

That's a new rule to me, and it seems a bit arbitrary. checkpatch.pl
doesn't complain about this. checkpatch.pl does complain about the
closing */ not being on it's own line, but checkpatch has always been OK
with multiline comments starting on the same line as the opening /*.

There are also plenty of examples of multiline comments starting on the
same line as /*. For example run this grep on the include/linux:

    grep -A3 '/\*[^*]\+[^/]$' *.h

Regards,
  Felix

>
>> +        * When rptr == wptr + 1, the buffer is full.
>> +        * It is always rptr that advances to the position of wptr, rather than
>> +        * the opposite. So we can only use up to queue_size_dwords - 1 dwords.
>> +        */
>>         rptr = *kq->rptr_kernel;
>>         wptr = *kq->wptr_kernel;
>>         queue_address = (unsigned int *)kq->pq_kernel_addr;
>> @@ -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         pr_debug("wptr: %d\n", wptr);
>>         pr_debug("queue_address 0x%p\n", queue_address);
>>
>> -       available_size = (rptr - 1 - wptr + queue_size_dwords) %
>> +       available_size = (rptr + queue_size_dwords - 1 - wptr) %
>>                                                         queue_size_dwords;
>>
>> -       if (packet_size_in_dwords >= queue_size_dwords ||
>> -                       packet_size_in_dwords >= available_size) {
>> +       if (packet_size_in_dwords > available_size) {
>>                 /*
>>                  * make sure calling functions know
>>                  * acquire_packet_buffer() failed
>> @@ -233,6 +237,14 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         }
>>
>>         if (wptr + packet_size_in_dwords >= queue_size_dwords) {
>> +               /* make sure after rolling back to position 0, there is
>> +                * still enough space.
>> +                */
>> +               if (packet_size_in_dwords >= rptr) {
>> +                       *buffer_ptr = NULL;
>> +                       return -ENOMEM;
>> +               }
> I don't think the condition is correct.
> Suppose, queue_size_dwords == 100, wptr == rptr == 50 (queue is empty)
> and we have a new packet with size of 70.
> Now, wptr + size is 120, which is >= 100
> However, 70 >= rptr (50) which will give us -ENOMEM, but this is not
> correct condition, because the packet *does* have enough room in the
> queue.
>
> I think the condition should be:
> if (packet_size_in_dwords - (queue_size_dwords - wptr) >= rptr)
> but please check this.
>
>> +               /* fill nops, roll back and start at position 0 */
>>                 while (wptr > 0) {
>>                         queue_address[wptr] = kq->nop_packet;
>>                         wptr = (wptr + 1) % queue_size_dwords;
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs
       [not found]             ` <a3e6c81d-4b3f-a58b-284c-dc466e09ca2c-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-18 15:28               ` Russell, Kent
       [not found]                 ` <BN6PR1201MB01801FB1444BC499708CC11185630-6iU6OBHu2P/H0AMcJMwsYmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 27+ messages in thread
From: Russell, Kent @ 2017-09-18 15:28 UTC (permalink / raw)
  To: Kuehling, Felix, Oded Gabbay; +Cc: Zhao, Yong, amd-gfx list

Correct (coming from the guy who did all of the checkpatch cleanup for KFD). For multi-line comments, /* Can be on its own, or on the same line as the comment. */ has to be on its own. https://github.com/torvalds/linux/blob/master/scripts/checkpatch.pl#L3042

 Kent

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Felix Kuehling
Sent: Monday, September 18, 2017 11:22 AM
To: Oded Gabbay
Cc: Zhao, Yong; amd-gfx list
Subject: Re: [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs

On 2017-09-17 08:03 AM, Oded Gabbay wrote:
> On Sat, Sep 16, 2017 at 2:43 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
>> From: Yong Zhao <yong.zhao@amd.com>
>>
>> Avoid intermediate negative numbers when doing calculations with a 
>> mix of signed and unsigned variables where implicit conversions can 
>> lead to unexpected results.
>>
>> When kernel queue buffer wraps around to 0, we need to check that 
>> rptr won't be overwritten by the new packet.
>>
>> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18 
>> +++++++++++++++---
>>  1 file changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> index 9ebb4c1..1c66334 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> @@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         uint32_t wptr, rptr;
>>         unsigned int *queue_address;
>>
>> +       /* When rptr == wptr, the buffer is empty.
> Start comment text in a new line. First line should be just /*

That's a new rule to me, and it seems a bit arbitrary. checkpatch.pl doesn't complain about this. checkpatch.pl does complain about the closing */ not being on it's own line, but checkpatch has always been OK with multiline comments starting on the same line as the opening /*.

There are also plenty of examples of multiline comments starting on the same line as /*. For example run this grep on the include/linux:

    grep -A3 '/\*[^*]\+[^/]$' *.h

Regards,
  Felix

>
>> +        * When rptr == wptr + 1, the buffer is full.
>> +        * It is always rptr that advances to the position of wptr, rather than
>> +        * the opposite. So we can only use up to queue_size_dwords - 1 dwords.
>> +        */
>>         rptr = *kq->rptr_kernel;
>>         wptr = *kq->wptr_kernel;
>>         queue_address = (unsigned int *)kq->pq_kernel_addr; @@ 
>> -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         pr_debug("wptr: %d\n", wptr);
>>         pr_debug("queue_address 0x%p\n", queue_address);
>>
>> -       available_size = (rptr - 1 - wptr + queue_size_dwords) %
>> +       available_size = (rptr + queue_size_dwords - 1 - wptr) %
>>                                                         
>> queue_size_dwords;
>>
>> -       if (packet_size_in_dwords >= queue_size_dwords ||
>> -                       packet_size_in_dwords >= available_size) {
>> +       if (packet_size_in_dwords > available_size) {
>>                 /*
>>                  * make sure calling functions know
>>                  * acquire_packet_buffer() failed @@ -233,6 +237,14 
>> @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         }
>>
>>         if (wptr + packet_size_in_dwords >= queue_size_dwords) {
>> +               /* make sure after rolling back to position 0, there is
>> +                * still enough space.
>> +                */
>> +               if (packet_size_in_dwords >= rptr) {
>> +                       *buffer_ptr = NULL;
>> +                       return -ENOMEM;
>> +               }
> I don't think the condition is correct.
> Suppose, queue_size_dwords == 100, wptr == rptr == 50 (queue is empty) 
> and we have a new packet with size of 70.
> Now, wptr + size is 120, which is >= 100 However, 70 >= rptr (50) 
> which will give us -ENOMEM, but this is not correct condition, because 
> the packet *does* have enough room in the queue.
>
> I think the condition should be:
> if (packet_size_in_dwords - (queue_size_dwords - wptr) >= rptr) but 
> please check this.
>
>> +               /* fill nops, roll back and start at position 0 */
>>                 while (wptr > 0) {
>>                         queue_address[wptr] = kq->nop_packet;
>>                         wptr = (wptr + 1) % queue_size_dwords;
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs
       [not found]                 ` <BN6PR1201MB01801FB1444BC499708CC11185630-6iU6OBHu2P/H0AMcJMwsYmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2017-09-18 15:33                   ` Russell, Kent
  0 siblings, 0 replies; 27+ messages in thread
From: Russell, Kent @ 2017-09-18 15:33 UTC (permalink / raw)
  To: Russell, Kent, Kuehling, Felix, Oded Gabbay; +Cc: Zhao, Yong, amd-gfx list

Though to be fair, we should probably consolidate the comment style so that it's actually consistent through the KFD.

 Kent

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Russell, Kent
Sent: Monday, September 18, 2017 11:28 AM
To: Kuehling, Felix; Oded Gabbay
Cc: Zhao, Yong; amd-gfx list
Subject: RE: [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs

Correct (coming from the guy who did all of the checkpatch cleanup for KFD). For multi-line comments, /* Can be on its own, or on the same line as the comment. */ has to be on its own. https://github.com/torvalds/linux/blob/master/scripts/checkpatch.pl#L3042

 Kent

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Felix Kuehling
Sent: Monday, September 18, 2017 11:22 AM
To: Oded Gabbay
Cc: Zhao, Yong; amd-gfx list
Subject: Re: [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs

On 2017-09-17 08:03 AM, Oded Gabbay wrote:
> On Sat, Sep 16, 2017 at 2:43 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
>> From: Yong Zhao <yong.zhao@amd.com>
>>
>> Avoid intermediate negative numbers when doing calculations with a 
>> mix of signed and unsigned variables where implicit conversions can 
>> lead to unexpected results.
>>
>> When kernel queue buffer wraps around to 0, we need to check that 
>> rptr won't be overwritten by the new packet.
>>
>> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18
>> +++++++++++++++---
>>  1 file changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> index 9ebb4c1..1c66334 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> @@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         uint32_t wptr, rptr;
>>         unsigned int *queue_address;
>>
>> +       /* When rptr == wptr, the buffer is empty.
> Start comment text in a new line. First line should be just /*

That's a new rule to me, and it seems a bit arbitrary. checkpatch.pl doesn't complain about this. checkpatch.pl does complain about the closing */ not being on it's own line, but checkpatch has always been OK with multiline comments starting on the same line as the opening /*.

There are also plenty of examples of multiline comments starting on the same line as /*. For example run this grep on the include/linux:

    grep -A3 '/\*[^*]\+[^/]$' *.h

Regards,
  Felix

>
>> +        * When rptr == wptr + 1, the buffer is full.
>> +        * It is always rptr that advances to the position of wptr, rather than
>> +        * the opposite. So we can only use up to queue_size_dwords - 1 dwords.
>> +        */
>>         rptr = *kq->rptr_kernel;
>>         wptr = *kq->wptr_kernel;
>>         queue_address = (unsigned int *)kq->pq_kernel_addr; @@
>> -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         pr_debug("wptr: %d\n", wptr);
>>         pr_debug("queue_address 0x%p\n", queue_address);
>>
>> -       available_size = (rptr - 1 - wptr + queue_size_dwords) %
>> +       available_size = (rptr + queue_size_dwords - 1 - wptr) %
>>                                                         
>> queue_size_dwords;
>>
>> -       if (packet_size_in_dwords >= queue_size_dwords ||
>> -                       packet_size_in_dwords >= available_size) {
>> +       if (packet_size_in_dwords > available_size) {
>>                 /*
>>                  * make sure calling functions know
>>                  * acquire_packet_buffer() failed @@ -233,6 +237,14 
>> @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>>         }
>>
>>         if (wptr + packet_size_in_dwords >= queue_size_dwords) {
>> +               /* make sure after rolling back to position 0, there is
>> +                * still enough space.
>> +                */
>> +               if (packet_size_in_dwords >= rptr) {
>> +                       *buffer_ptr = NULL;
>> +                       return -ENOMEM;
>> +               }
> I don't think the condition is correct.
> Suppose, queue_size_dwords == 100, wptr == rptr == 50 (queue is empty) 
> and we have a new packet with size of 70.
> Now, wptr + size is 120, which is >= 100 However, 70 >= rptr (50) 
> which will give us -ENOMEM, but this is not correct condition, because 
> the packet *does* have enough room in the queue.
>
> I think the condition should be:
> if (packet_size_in_dwords - (queue_size_dwords - wptr) >= rptr) but 
> please check this.
>
>> +               /* fill nops, roll back and start at position 0 */
>>                 while (wptr > 0) {
>>                         queue_address[wptr] = kq->nop_packet;
>>                         wptr = (wptr + 1) % queue_size_dwords;
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2017-09-18 15:33 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-15 23:42 [PATCH 00/11] KFD fixes Felix Kuehling
     [not found] ` <1505518982-29631-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-15 23:42   ` [PATCH 01/11] drm/amdkfd: Reorganize kfd resume code Felix Kuehling
     [not found]     ` <1505518982-29631-2-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-17 11:18       ` Oded Gabbay
2017-09-15 23:42   ` [PATCH 02/11] drm/amdkfd: Fix suspend/resume issue on Carrizo Felix Kuehling
     [not found]     ` <1505518982-29631-3-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-17 11:17       ` Oded Gabbay
2017-09-15 23:42   ` [PATCH 03/11] drm/amdkfd: Rectify the jiffies calculation error with milliseconds Felix Kuehling
     [not found]     ` <1505518982-29631-4-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-16  0:09       ` Jan Vesely
2017-09-15 23:42   ` [PATCH 04/11] drm/amdkfd: Adjust dequeue latencies and timeouts Felix Kuehling
     [not found]     ` <1505518982-29631-5-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-17 11:23       ` Oded Gabbay
2017-09-15 23:42   ` [PATCH 05/11] drm/amdkfd: Fix incorrect destroy_mqd parameter Felix Kuehling
     [not found]     ` <1505518982-29631-6-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-17 11:35       ` Oded Gabbay
2017-09-15 23:42   ` [PATCH 06/11] drm/amdkfd: Use VMID bitmap from KGD Felix Kuehling
     [not found]     ` <1505518982-29631-7-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-17 11:47       ` Oded Gabbay
2017-09-15 23:42   ` [PATCH 07/11] drm/amdkfd: Reuse CHIP_* from amdgpu Felix Kuehling
     [not found]     ` <1505518982-29631-8-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-17  7:54       ` Oded Gabbay
2017-09-15 23:42   ` [PATCH 08/11] drm/amdkfd: Drop _nocpsch suffix from shared functions Felix Kuehling
     [not found]     ` <1505518982-29631-9-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-17 11:36       ` Oded Gabbay
2017-09-15 23:43   ` [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs Felix Kuehling
     [not found]     ` <1505518982-29631-10-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-17 12:03       ` Oded Gabbay
     [not found]         ` <CAFCwf127-0M8oBFG4mefuRmjFFGY+NLrW-SVCaU=Dqwi0SKpBA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-09-18 15:08           ` Felix Kuehling
2017-09-18 15:21           ` Felix Kuehling
     [not found]             ` <a3e6c81d-4b3f-a58b-284c-dc466e09ca2c-5C7GfCeVMHo@public.gmane.org>
2017-09-18 15:28               ` Russell, Kent
     [not found]                 ` <BN6PR1201MB01801FB1444BC499708CC11185630-6iU6OBHu2P/H0AMcJMwsYmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2017-09-18 15:33                   ` Russell, Kent
2017-09-15 23:43   ` [PATCH 10/11] drm/amdkfd: Print event limit messages only once per process Felix Kuehling
     [not found]     ` <1505518982-29631-11-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-17 12:04       ` Oded Gabbay
2017-09-15 23:43   ` [PATCH 11/11] drm/amdkfd: Set /dev/kfd permissions to 0666 by default Felix Kuehling
     [not found]     ` <1505518982-29631-12-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-17  7:38       ` Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.