All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 00/10] KFD fixes v2
@ 2017-09-20 22:10 Felix Kuehling
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Felix Kuehling @ 2017-09-20 22:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oded.gabbay-Re5JQEeQqe8AvxtiuMwx3w, Felix Kuehling

Small self-contained KFD fixes that don't introduce any new
functionality or features. These may be suitable to include in
drm-fixes for 4.14.

v2:
- Rebased on gabbayo/amdkfd-next
- Addressed code review feedback
- Dropped "Set /dev/kfd permissions to 0666 by default"

Felix Kuehling (3):
  drm/amdkfd: Adjust dequeue latencies and timeouts
  drm/amdkfd: Fix incorrect destroy_mqd parameter
  drm/amdkfd: Print event limit messages only once per process

Yong Zhao (7):
  drm/amdkfd: Reorganize kfd resume code
  drm/amdkfd: Fix suspend/resume issue on Carrizo v2
  drm/amdkfd: Rectify the jiffies calculation error with milliseconds v2
  drm/amdkfd: Use VMID bitmap from KGD v2
  drm/amdkfd: Reuse CHIP_* from amdgpu v2
  drm/amdkfd: Drop _nocpsch suffix from shared functions
  drm/amdkfd: Fix kernel-queue wrapping bugs

 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c            |   9 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            | 106 ++++++++++++---------
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  62 +++++-------
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h  |   8 +-
 drivers/gpu/drm/amd/amdkfd/kfd_events.c            |   5 +-
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c      |  40 +++++---
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c       |   3 +
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c    |   2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |  37 ++++---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c           |  97 +++++++++++++++----
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |   2 +-
 11 files changed, 235 insertions(+), 136 deletions(-)

-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 01/10] drm/amdkfd: Reorganize kfd resume code
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-20 22:10   ` Felix Kuehling
  2017-09-20 22:10   ` [PATCH 02/10] drm/amdkfd: Fix suspend/resume issue on Carrizo v2 Felix Kuehling
                     ` (9 subsequent siblings)
  10 siblings, 0 replies; 15+ messages in thread
From: Felix Kuehling @ 2017-09-20 22:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oded.gabbay-Re5JQEeQqe8AvxtiuMwx3w, Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

The idea is to let kfd init and resume function share the same code path
as much as possible, rather than to have two copies of almost identical
code. That way improves the code readability and maintainability.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 78 +++++++++++++++++----------------
 1 file changed, 40 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 61fff25..cc8af11 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -92,6 +92,8 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
 				unsigned int chunk_size);
 static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
 
+static int kfd_resume(struct kfd_dev *kfd);
+
 static const struct kfd_device_info *lookup_device_info(unsigned short did)
 {
 	size_t i;
@@ -176,15 +178,8 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd)
 				pasid_limit,
 				kfd->doorbell_process_limit - 1);
 
-	err = amd_iommu_init_device(kfd->pdev, pasid_limit);
-	if (err < 0) {
-		dev_err(kfd_device, "error initializing iommu device\n");
-		return false;
-	}
-
 	if (!kfd_set_pasid_limit(pasid_limit)) {
 		dev_err(kfd_device, "error setting pasid limit\n");
-		amd_iommu_free_device(kfd->pdev);
 		return false;
 	}
 
@@ -280,29 +275,22 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		goto kfd_interrupt_error;
 	}
 
-	if (!device_iommu_pasid_init(kfd)) {
-		dev_err(kfd_device,
-			"Error initializing iommuv2 for device %x:%x\n",
-			kfd->pdev->vendor, kfd->pdev->device);
-		goto device_iommu_pasid_error;
-	}
-	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
-						iommu_pasid_shutdown_callback);
-	amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
-
 	kfd->dqm = device_queue_manager_init(kfd);
 	if (!kfd->dqm) {
 		dev_err(kfd_device, "Error initializing queue manager\n");
 		goto device_queue_manager_error;
 	}
 
-	if (kfd->dqm->ops.start(kfd->dqm)) {
+	if (!device_iommu_pasid_init(kfd)) {
 		dev_err(kfd_device,
-			"Error starting queue manager for device %x:%x\n",
+			"Error initializing iommuv2 for device %x:%x\n",
 			kfd->pdev->vendor, kfd->pdev->device);
-		goto dqm_start_error;
+		goto device_iommu_pasid_error;
 	}
 
+	if (kfd_resume(kfd))
+		goto kfd_resume_error;
+
 	kfd->dbgmgr = NULL;
 
 	kfd->init_complete = true;
@@ -314,11 +302,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 
 	goto out;
 
-dqm_start_error:
+kfd_resume_error:
+device_iommu_pasid_error:
 	device_queue_manager_uninit(kfd->dqm);
 device_queue_manager_error:
-	amd_iommu_free_device(kfd->pdev);
-device_iommu_pasid_error:
 	kfd_interrupt_exit(kfd);
 kfd_interrupt_error:
 	kfd_topology_remove_device(kfd);
@@ -338,8 +325,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 void kgd2kfd_device_exit(struct kfd_dev *kfd)
 {
 	if (kfd->init_complete) {
+		kgd2kfd_suspend(kfd);
 		device_queue_manager_uninit(kfd->dqm);
-		amd_iommu_free_device(kfd->pdev);
 		kfd_interrupt_exit(kfd);
 		kfd_topology_remove_device(kfd);
 		kfd_doorbell_fini(kfd);
@@ -362,25 +349,40 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
 
 int kgd2kfd_resume(struct kfd_dev *kfd)
 {
-	unsigned int pasid_limit;
-	int err;
+	if (!kfd->init_complete)
+		return 0;
 
-	pasid_limit = kfd_get_pasid_limit();
+	return kfd_resume(kfd);
 
-	if (kfd->init_complete) {
-		err = amd_iommu_init_device(kfd->pdev, pasid_limit);
-		if (err < 0) {
-			dev_err(kfd_device, "failed to initialize iommu\n");
-			return -ENXIO;
-		}
+}
 
-		amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
-						iommu_pasid_shutdown_callback);
-		amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
-		kfd->dqm->ops.start(kfd->dqm);
+static int kfd_resume(struct kfd_dev *kfd)
+{
+	int err = 0;
+	unsigned int pasid_limit = kfd_get_pasid_limit();
+
+	err = amd_iommu_init_device(kfd->pdev, pasid_limit);
+	if (err)
+		return -ENXIO;
+	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
+					iommu_pasid_shutdown_callback);
+	amd_iommu_set_invalid_ppr_cb(kfd->pdev,
+				     iommu_invalid_ppr_cb);
+
+	err = kfd->dqm->ops.start(kfd->dqm);
+	if (err) {
+		dev_err(kfd_device,
+			"Error starting queue manager for device %x:%x\n",
+			kfd->pdev->vendor, kfd->pdev->device);
+		goto dqm_start_error;
 	}
 
-	return 0;
+	return err;
+
+dqm_start_error:
+	amd_iommu_free_device(kfd->pdev);
+
+	return err;
 }
 
 /* This is called directly from KGD at ISR. */
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 02/10] drm/amdkfd: Fix suspend/resume issue on Carrizo v2
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-20 22:10   ` [PATCH 01/10] drm/amdkfd: Reorganize kfd resume code Felix Kuehling
@ 2017-09-20 22:10   ` Felix Kuehling
  2017-09-20 22:10   ` [PATCH 03/10] drm/amdkfd: Rectify the jiffies calculation error with milliseconds v2 Felix Kuehling
                     ` (8 subsequent siblings)
  10 siblings, 0 replies; 15+ messages in thread
From: Felix Kuehling @ 2017-09-20 22:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oded.gabbay-Re5JQEeQqe8AvxtiuMwx3w, Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

When we do suspend/resume through "sudo pm-suspend" while there is
HSA activity running, upon resume we will encounter HWS hanging, which
is caused by memory read/write failures. The root cause is that when
suspend, we neglected to unbind pasid from kfd device.

Another major change is that the bind/unbinding is changed to be
performed on a per process basis, instead of whether there are queues
in dqm.

v2:
- free IOMMU device if kfd_bind_processes_to_device fails in kfd_resume
- add comments to kfd_bind/unbind_processes_to/from_device
- minor cleanups

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            | 23 +++--
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 13 ---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              | 15 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c           | 97 ++++++++++++++++++----
 4 files changed, 109 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index cc8af11..092db3a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -191,7 +191,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
 	struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
 
 	if (dev)
-		kfd_unbind_process_from_device(dev, pasid);
+		kfd_process_iommu_unbind_callback(dev, pasid);
 }
 
 /*
@@ -339,12 +339,16 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
 
 void kgd2kfd_suspend(struct kfd_dev *kfd)
 {
-	if (kfd->init_complete) {
-		kfd->dqm->ops.stop(kfd->dqm);
-		amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
-		amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
-		amd_iommu_free_device(kfd->pdev);
-	}
+	if (!kfd->init_complete)
+		return;
+
+	kfd->dqm->ops.stop(kfd->dqm);
+
+	kfd_unbind_processes_from_device(kfd);
+
+	amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+	amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
+	amd_iommu_free_device(kfd->pdev);
 }
 
 int kgd2kfd_resume(struct kfd_dev *kfd)
@@ -369,6 +373,10 @@ static int kfd_resume(struct kfd_dev *kfd)
 	amd_iommu_set_invalid_ppr_cb(kfd->pdev,
 				     iommu_invalid_ppr_cb);
 
+	err = kfd_bind_processes_to_device(kfd);
+	if (err)
+		goto processes_bind_error;
+
 	err = kfd->dqm->ops.start(kfd->dqm);
 	if (err) {
 		dev_err(kfd_device,
@@ -380,6 +388,7 @@ static int kfd_resume(struct kfd_dev *kfd)
 	return err;
 
 dqm_start_error:
+processes_bind_error:
 	amd_iommu_free_device(kfd->pdev);
 
 	return err;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 53a66e8..5db82b8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -670,7 +670,6 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 
 static int start_cpsch(struct device_queue_manager *dqm)
 {
-	struct device_process_node *node;
 	int retval;
 
 	retval = 0;
@@ -697,11 +696,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 	init_interrupts(dqm);
 
-	list_for_each_entry(node, &dqm->queues, list)
-		if (node->qpd->pqm->process && dqm->dev)
-			kfd_bind_process_to_device(dqm->dev,
-						node->qpd->pqm->process);
-
 	execute_queues_cpsch(dqm, true);
 
 	return 0;
@@ -714,15 +708,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 static int stop_cpsch(struct device_queue_manager *dqm)
 {
-	struct device_process_node *node;
-	struct kfd_process_device *pdd;
-
 	destroy_queues_cpsch(dqm, true, true);
 
-	list_for_each_entry(node, &dqm->queues, list) {
-		pdd = qpd_to_pdd(node->qpd);
-		pdd->bound = false;
-	}
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
 	pm_uninit(&dqm->packets);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index b397ec7..ef582cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -435,6 +435,13 @@ struct qcm_process_device {
 	uint32_t sh_hidden_private_base;
 };
 
+
+enum kfd_pdd_bound {
+	PDD_UNBOUND = 0,
+	PDD_BOUND,
+	PDD_BOUND_SUSPENDED,
+};
+
 /* Data that is per-process-per device. */
 struct kfd_process_device {
 	/*
@@ -459,7 +466,7 @@ struct kfd_process_device {
 	uint64_t scratch_limit;
 
 	/* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
-	bool bound;
+	enum kfd_pdd_bound bound;
 
 	/* This flag tells if we should reset all
 	 * wavefronts on process termination
@@ -548,8 +555,10 @@ struct kfd_process *kfd_get_process(const struct task_struct *);
 struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
 
 struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
-							struct kfd_process *p);
-void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid);
+						struct kfd_process *p);
+int kfd_bind_processes_to_device(struct kfd_dev *dev);
+void kfd_unbind_processes_from_device(struct kfd_dev *dev);
+void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid);
 struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 							struct kfd_process *p);
 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index c74cf22..0464c31 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -174,9 +174,10 @@ static void kfd_process_wq_release(struct work_struct *work)
 		if (pdd->reset_wavefronts)
 			dbgdev_wave_reset_wavefronts(pdd->dev, p);
 
-		amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
-		list_del(&pdd->per_device_list);
+		if (pdd->bound == PDD_BOUND)
+			amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
 
+		list_del(&pdd->per_device_list);
 		kfree(pdd);
 	}
 
@@ -345,9 +346,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 
 	list_for_each_entry(pdd, &p->per_device_data, per_device_list)
 		if (pdd->dev == dev)
-			break;
+			return pdd;
 
-	return pdd;
+	return NULL;
 }
 
 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
@@ -362,6 +363,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 		INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
 		pdd->qpd.dqm = dev->dqm;
 		pdd->reset_wavefronts = false;
+		pdd->bound = PDD_UNBOUND;
 		list_add(&pdd->per_device_list, &p->per_device_data);
 	}
 
@@ -387,19 +389,91 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (pdd->bound)
+	if (pdd->bound == PDD_BOUND) {
 		return pdd;
+	} else if (unlikely(pdd->bound == PDD_BOUND_SUSPENDED)) {
+		pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n");
+		return ERR_PTR(-EINVAL);
+	}
 
 	err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread);
 	if (err < 0)
 		return ERR_PTR(err);
 
-	pdd->bound = true;
+	pdd->bound = PDD_BOUND;
 
 	return pdd;
 }
 
-void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
+/*
+ * Bind processes do the device that have been temporarily unbound
+ * (PDD_BOUND_SUSPENDED) in kfd_unbind_processes_from_device.
+ */
+int kfd_bind_processes_to_device(struct kfd_dev *dev)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p;
+	unsigned int temp;
+	int err = 0;
+
+	int idx = srcu_read_lock(&kfd_processes_srcu);
+
+	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+		mutex_lock(&p->mutex);
+		pdd = kfd_get_process_device_data(dev, p);
+		if (pdd->bound != PDD_BOUND_SUSPENDED) {
+			mutex_unlock(&p->mutex);
+			continue;
+		}
+
+		err = amd_iommu_bind_pasid(dev->pdev, p->pasid,
+				p->lead_thread);
+		if (err < 0) {
+			pr_err("unexpected pasid %d binding failure\n",
+					p->pasid);
+			mutex_unlock(&p->mutex);
+			break;
+		}
+
+		pdd->bound = PDD_BOUND;
+		mutex_unlock(&p->mutex);
+	}
+
+	srcu_read_unlock(&kfd_processes_srcu, idx);
+
+	return err;
+}
+
+/*
+ * Temporarily unbind currently bound processes from the device and
+ * mark them as PDD_BOUND_SUSPENDED. These processes will be restored
+ * to PDD_BOUND state in kfd_bind_processes_to_device.
+ */
+void kfd_unbind_processes_from_device(struct kfd_dev *dev)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p;
+	unsigned int temp, temp_bound, temp_pasid;
+
+	int idx = srcu_read_lock(&kfd_processes_srcu);
+
+	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+		mutex_lock(&p->mutex);
+		pdd = kfd_get_process_device_data(dev, p);
+		temp_bound = pdd->bound;
+		temp_pasid = p->pasid;
+		if (pdd->bound == PDD_BOUND)
+			pdd->bound = PDD_BOUND_SUSPENDED;
+		mutex_unlock(&p->mutex);
+
+		if (temp_bound == PDD_BOUND)
+			amd_iommu_unbind_pasid(dev->pdev, temp_pasid);
+	}
+
+	srcu_read_unlock(&kfd_processes_srcu, idx);
+}
+
+void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
 {
 	struct kfd_process *p;
 	struct kfd_process_device *pdd;
@@ -432,15 +506,6 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
 		pdd->reset_wavefronts = false;
 	}
 
-	/*
-	 * Just mark pdd as unbound, because we still need it
-	 * to call amd_iommu_unbind_pasid() in when the
-	 * process exits.
-	 * We don't call amd_iommu_unbind_pasid() here
-	 * because the IOMMU called us.
-	 */
-	pdd->bound = false;
-
 	mutex_unlock(&p->mutex);
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 03/10] drm/amdkfd: Rectify the jiffies calculation error with milliseconds v2
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-20 22:10   ` [PATCH 01/10] drm/amdkfd: Reorganize kfd resume code Felix Kuehling
  2017-09-20 22:10   ` [PATCH 02/10] drm/amdkfd: Fix suspend/resume issue on Carrizo v2 Felix Kuehling
@ 2017-09-20 22:10   ` Felix Kuehling
       [not found]     ` <1505945422-8851-4-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-20 22:10   ` [PATCH 04/10] drm/amdkfd: Adjust dequeue latencies and timeouts Felix Kuehling
                     ` (7 subsequent siblings)
  10 siblings, 1 reply; 15+ messages in thread
From: Felix Kuehling @ 2017-09-20 22:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oded.gabbay-Re5JQEeQqe8AvxtiuMwx3w, Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

The timeout in milliseconds should not be regarded as jiffies. This
commit fixed that.

v2:
- use msecs_to_jiffies
- change timeout_ms parameter to unsigned int to match msecs_to_jiffies

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 6 +++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 5db82b8..87961fe 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -835,12 +835,12 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 
 int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 				unsigned int fence_value,
-				unsigned long timeout)
+				unsigned int timeout_ms)
 {
-	timeout += jiffies;
+	unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies;
 
 	while (*fence_addr != fence_value) {
-		if (time_after(jiffies, timeout)) {
+		if (time_after(jiffies, end_jiffies)) {
 			pr_err("qcm fence wait loop timeout expired\n");
 			return -ETIME;
 		}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ef582cc..1c341b3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -669,7 +669,7 @@ struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
 
 int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 				unsigned int fence_value,
-				unsigned long timeout);
+				unsigned int timeout_ms);
 
 /* Packet Manager */
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 04/10] drm/amdkfd: Adjust dequeue latencies and timeouts
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2017-09-20 22:10   ` [PATCH 03/10] drm/amdkfd: Rectify the jiffies calculation error with milliseconds v2 Felix Kuehling
@ 2017-09-20 22:10   ` Felix Kuehling
  2017-09-20 22:10   ` [PATCH 05/10] drm/amdkfd: Fix incorrect destroy_mqd parameter Felix Kuehling
                     ` (6 subsequent siblings)
  10 siblings, 0 replies; 15+ messages in thread
From: Felix Kuehling @ 2017-09-20 22:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oded.gabbay-Re5JQEeQqe8AvxtiuMwx3w, Yong Zhao, Felix Kuehling

Adjust latencies and timeouts for dequeueing with HWS and consolidate
them in one place. Make them longer to allow long running waves to
complete without causing a timeout. The timeout is twice as long as the
latency plus some buffer to make sure we don't detect a timeout
prematurely.

Change timeouts for dequeueing HQDs without HWS. KFD_UNMAP_LATENCY is
more consistent with what the HWS does for user queues.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 4 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c         | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c       | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 | 3 ---
 5 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 87961fe..dd60c6e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -323,7 +323,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
 
 	retval = mqd->destroy_mqd(mqd, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
-				QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
+				KFD_UNMAP_LATENCY_MS,
 				q->pipe, q->queue);
 
 	if (retval)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index faf820a..99e2305 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -29,7 +29,9 @@
 #include "kfd_priv.h"
 #include "kfd_mqd_manager.h"
 
-#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS	(500)
+#define KFD_UNMAP_LATENCY_MS			(4000)
+#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
+
 #define CIK_VMID_NUM				(8)
 #define KFD_VMID_START_OFFSET			(8)
 #define VMID_PER_DEVICE				CIK_VMID_NUM
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 0649dd4..04379a6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -185,7 +185,7 @@ static void uninitialize(struct kernel_queue *kq)
 		kq->mqd->destroy_mqd(kq->mqd,
 					kq->queue->mqd,
 					false,
-					QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
+					KFD_UNMAP_LATENCY_MS,
 					kq->queue->pipe,
 					kq->queue->queue);
 	else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 1d31260..9eda884 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -376,7 +376,7 @@ int pm_send_set_resources(struct packet_manager *pm,
 	packet->bitfields2.queue_type =
 			queue_type__mes_set_resources__hsa_interface_queue_hiq;
 	packet->bitfields2.vmid_mask = res->vmid_mask;
-	packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY;
+	packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
 	packet->bitfields7.oac_mask = res->oac_mask;
 	packet->bitfields8.gds_heap_base = res->gds_heap_base;
 	packet->bitfields8.gds_heap_size = res->gds_heap_size;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 1c341b3..7adb890 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -673,11 +673,8 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 
 /* Packet Manager */
 
-#define KFD_HIQ_TIMEOUT (500)
-
 #define KFD_FENCE_COMPLETED (100)
 #define KFD_FENCE_INIT   (10)
-#define KFD_UNMAP_LATENCY (150)
 
 struct packet_manager {
 	struct device_queue_manager *dqm;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 05/10] drm/amdkfd: Fix incorrect destroy_mqd parameter
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (3 preceding siblings ...)
  2017-09-20 22:10   ` [PATCH 04/10] drm/amdkfd: Adjust dequeue latencies and timeouts Felix Kuehling
@ 2017-09-20 22:10   ` Felix Kuehling
  2017-09-20 22:10   ` [PATCH 06/10] drm/amdkfd: Use VMID bitmap from KGD v2 Felix Kuehling
                     ` (5 subsequent siblings)
  10 siblings, 0 replies; 15+ messages in thread
From: Felix Kuehling @ 2017-09-20 22:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oded.gabbay-Re5JQEeQqe8AvxtiuMwx3w, Yong Zhao, Felix Kuehling

When uninitializing a kernel queue.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 04379a6..a47ca3c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -184,7 +184,7 @@ static void uninitialize(struct kernel_queue *kq)
 	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
 		kq->mqd->destroy_mqd(kq->mqd,
 					kq->queue->mqd,
-					false,
+					KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
 					KFD_UNMAP_LATENCY_MS,
 					kq->queue->pipe,
 					kq->queue->queue);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 06/10] drm/amdkfd: Use VMID bitmap from KGD v2
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (4 preceding siblings ...)
  2017-09-20 22:10   ` [PATCH 05/10] drm/amdkfd: Fix incorrect destroy_mqd parameter Felix Kuehling
@ 2017-09-20 22:10   ` Felix Kuehling
  2017-09-20 22:10   ` [PATCH 07/10] drm/amdkfd: Reuse CHIP_* from amdgpu v2 Felix Kuehling
                     ` (4 subsequent siblings)
  10 siblings, 0 replies; 15+ messages in thread
From: Felix Kuehling @ 2017-09-20 22:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oded.gabbay-Re5JQEeQqe8AvxtiuMwx3w, Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

The hard-coded values related to VMID were removed in KFD, as those
values can be calculated in the KFD initialization function.

v2: remove unnecessary local variable

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c                |  9 ++-------
 drivers/gpu/drm/amd/amdkfd/kfd_device.c                |  5 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 13 ++++++-------
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h  |  4 ----
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                  |  7 +++++++
 drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  2 +-
 6 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
index 0aa021a..7d5635f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
@@ -769,13 +769,8 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
 	union GRBM_GFX_INDEX_BITS reg_gfx_index;
 	struct kfd_process_device *pdd;
 	struct dbg_wave_control_info wac_info;
-	int temp;
-	int first_vmid_to_scan = 8;
-	int last_vmid_to_scan = 15;
-
-	first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1;
-	temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan;
-	last_vmid_to_scan = first_vmid_to_scan + ffz(temp);
+	int first_vmid_to_scan = dev->vm_info.first_vmid_kfd;
+	int last_vmid_to_scan = dev->vm_info.last_vmid_kfd;
 
 	reg_sq_cmd.u32All = 0;
 	status = 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 092db3a..9d23362 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -226,6 +226,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 
 	kfd->shared_resources = *gpu_resources;
 
+	kfd->vm_info.first_vmid_kfd = ffs(gpu_resources->compute_vmid_bitmap)-1;
+	kfd->vm_info.last_vmid_kfd = fls(gpu_resources->compute_vmid_bitmap)-1;
+	kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd
+			- kfd->vm_info.first_vmid_kfd + 1;
+
 	/* calculate max size of mqds needed for queues */
 	size = max_num_of_queues_per_device *
 			kfd->device_info->mqd_size_aligned;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index dd60c6e..87f8742 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -113,11 +113,11 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 	if (dqm->vmid_bitmap == 0)
 		return -ENOMEM;
 
-	bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM);
+	bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap,
+				dqm->dev->vm_info.vmid_num_kfd);
 	clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap);
 
-	/* Kaveri kfd vmid's starts from vmid 8 */
-	allocated_vmid = bit + KFD_VMID_START_OFFSET;
+	allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd;
 	pr_debug("vmid allocation %d\n", allocated_vmid);
 	qpd->vmid = allocated_vmid;
 	q->properties.vmid = allocated_vmid;
@@ -132,7 +132,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
 				struct queue *q)
 {
-	int bit = qpd->vmid - KFD_VMID_START_OFFSET;
+	int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
 
 	/* Release the vmid mapping */
 	set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
@@ -507,7 +507,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
 				dqm->allocated_queues[pipe] |= 1 << queue;
 	}
 
-	dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1;
+	dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
 	dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
 
 	return 0;
@@ -613,8 +613,7 @@ static int set_sched_resources(struct device_queue_manager *dqm)
 	int i, mec;
 	struct scheduling_resources res;
 
-	res.vmid_mask = (1 << VMID_PER_DEVICE) - 1;
-	res.vmid_mask <<= KFD_VMID_START_OFFSET;
+	res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
 
 	res.queue_mask = 0;
 	for (i = 0; i < KGD_MAX_QUEUES; ++i) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 99e2305..60d46ce 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -32,10 +32,6 @@
 #define KFD_UNMAP_LATENCY_MS			(4000)
 #define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
 
-#define CIK_VMID_NUM				(8)
-#define KFD_VMID_START_OFFSET			(8)
-#define VMID_PER_DEVICE				CIK_VMID_NUM
-#define KFD_DQM_FIRST_PIPE			(0)
 #define CIK_SDMA_QUEUES				(4)
 #define CIK_SDMA_QUEUES_PER_ENGINE		(2)
 #define CIK_SDMA_ENGINE_NUM			(2)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 7adb890..4d989b9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -141,6 +141,12 @@ struct kfd_mem_obj {
 	uint32_t *cpu_ptr;
 };
 
+struct kfd_vmid_info {
+	uint32_t first_vmid_kfd;
+	uint32_t last_vmid_kfd;
+	uint32_t vmid_num_kfd;
+};
+
 struct kfd_dev {
 	struct kgd_dev *kgd;
 
@@ -165,6 +171,7 @@ struct kfd_dev {
 					   */
 
 	struct kgd2kfd_shared_resources shared_resources;
+	struct kfd_vmid_info vm_info;
 
 	const struct kfd2kgd_calls *kfd2kgd;
 	struct mutex doorbell_mutex;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 03bec76..68fe0d9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -187,7 +187,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 	case KFD_QUEUE_TYPE_COMPUTE:
 		/* check if there is over subscription */
 		if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) &&
-		((dev->dqm->processes_count >= VMID_PER_DEVICE) ||
+		((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) ||
 		(dev->dqm->queue_count >= get_queues_num(dev->dqm)))) {
 			pr_err("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n");
 			retval = -EPERM;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 07/10] drm/amdkfd: Reuse CHIP_* from amdgpu v2
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (5 preceding siblings ...)
  2017-09-20 22:10   ` [PATCH 06/10] drm/amdkfd: Use VMID bitmap from KGD v2 Felix Kuehling
@ 2017-09-20 22:10   ` Felix Kuehling
       [not found]     ` <1505945422-8851-8-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-20 22:10   ` [PATCH 08/10] drm/amdkfd: Drop _nocpsch suffix from shared functions Felix Kuehling
                     ` (3 subsequent siblings)
  10 siblings, 1 reply; 15+ messages in thread
From: Felix Kuehling @ 2017-09-20 22:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oded.gabbay-Re5JQEeQqe8AvxtiuMwx3w, Yong Zhao, Felix Kuehling

From: Yong Zhao <Yong.Zhao@amd.com>

There are already CHIP_* definitions under amd_shared.h file on amdgpu
side, so KFD should reuse them rather than defining new ones.

Using enum for asic type requires default cases on switch statements
to prevent compiler warnings. WARN on unsupported ASICs. It should never
get there because KFD should not be initialized on unsupported devices.

v2: Replace BUG() with WARN and error return

Signed-off-by: Yong Zhao <Yong.Zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c |  4 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c         | 18 ++++++++++++------
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c          |  3 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 |  9 +++------
 4 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 87f8742..fe0f0de 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1130,6 +1130,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 	case CHIP_KAVERI:
 		device_queue_manager_init_cik(&dqm->ops_asic_specific);
 		break;
+	default:
+		WARN(1, "Unexpected ASIC family %u",
+		     dev->device_info->asic_family);
+		goto out_free;
 	}
 
 	if (!dqm->ops.initialize(dqm))
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index a47ca3c..d7ed10e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -291,14 +291,20 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 	case CHIP_KAVERI:
 		kernel_queue_init_cik(&kq->ops_asic_specific);
 		break;
+	default:
+		WARN(1, "Unexpected ASIC family %u",
+		     dev->device_info->asic_family);
+		goto out_free;
 	}
 
-	if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) {
-		pr_err("Failed to init kernel queue\n");
-		kfree(kq);
-		return NULL;
-	}
-	return kq;
+	if (kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE))
+		return kq;
+
+	pr_err("Failed to init kernel queue\n");
+
+out_free:
+	kfree(kq);
+	return NULL;
 }
 
 void kernel_queue_uninit(struct kernel_queue *kq)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index b1ef136..dfd260e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -31,6 +31,9 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
 		return mqd_manager_init_cik(type, dev);
 	case CHIP_CARRIZO:
 		return mqd_manager_init_vi(type, dev);
+	default:
+		WARN(1, "Unexpected ASIC family %u",
+		     dev->device_info->asic_family);
 	}
 
 	return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 4d989b9..47eee77 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -33,6 +33,8 @@
 #include <linux/kfd_ioctl.h>
 #include <kgd_kfd_interface.h>
 
+#include "amd_shared.h"
+
 #define KFD_SYSFS_FILE_MODE 0444
 
 #define KFD_MMAP_DOORBELL_MASK 0x8000000000000
@@ -112,11 +114,6 @@ enum cache_policy {
 	cache_policy_noncoherent
 };
 
-enum asic_family_type {
-	CHIP_KAVERI = 0,
-	CHIP_CARRIZO
-};
-
 struct kfd_event_interrupt_class {
 	bool (*interrupt_isr)(struct kfd_dev *dev,
 				const uint32_t *ih_ring_entry);
@@ -125,7 +122,7 @@ struct kfd_event_interrupt_class {
 };
 
 struct kfd_device_info {
-	unsigned int asic_family;
+	enum amd_asic_type asic_family;
 	const struct kfd_event_interrupt_class *event_interrupt_class;
 	unsigned int max_pasid_bits;
 	unsigned int max_no_of_hqd;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 08/10] drm/amdkfd: Drop _nocpsch suffix from shared functions
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (6 preceding siblings ...)
  2017-09-20 22:10   ` [PATCH 07/10] drm/amdkfd: Reuse CHIP_* from amdgpu v2 Felix Kuehling
@ 2017-09-20 22:10   ` Felix Kuehling
  2017-09-20 22:10   ` [PATCH 09/10] drm/amdkfd: Fix kernel-queue wrapping bugs Felix Kuehling
                     ` (2 subsequent siblings)
  10 siblings, 0 replies; 15+ messages in thread
From: Felix Kuehling @ 2017-09-20 22:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oded.gabbay-Re5JQEeQqe8AvxtiuMwx3w, Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

Several functions in DQM are shared between cpsch and nocpsch code.
Remove the misleading _nocpsch suffix from their names.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 24 +++++++++++-----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index fe0f0de..471b34e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -386,7 +386,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 	return retval;
 }
 
-static struct mqd_manager *get_mqd_manager_nocpsch(
+static struct mqd_manager *get_mqd_manager(
 		struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
 {
 	struct mqd_manager *mqd;
@@ -407,7 +407,7 @@ static struct mqd_manager *get_mqd_manager_nocpsch(
 	return mqd;
 }
 
-static int register_process_nocpsch(struct device_queue_manager *dqm,
+static int register_process(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
 	struct device_process_node *n;
@@ -431,7 +431,7 @@ static int register_process_nocpsch(struct device_queue_manager *dqm,
 	return retval;
 }
 
-static int unregister_process_nocpsch(struct device_queue_manager *dqm,
+static int unregister_process(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
 	int retval;
@@ -513,7 +513,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
 	return 0;
 }
 
-static void uninitialize_nocpsch(struct device_queue_manager *dqm)
+static void uninitialize(struct device_queue_manager *dqm)
 {
 	int i;
 
@@ -1095,10 +1095,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.stop = stop_cpsch;
 		dqm->ops.destroy_queue = destroy_queue_cpsch;
 		dqm->ops.update_queue = update_queue;
-		dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
-		dqm->ops.register_process = register_process_nocpsch;
-		dqm->ops.unregister_process = unregister_process_nocpsch;
-		dqm->ops.uninitialize = uninitialize_nocpsch;
+		dqm->ops.get_mqd_manager = get_mqd_manager;
+		dqm->ops.register_process = register_process;
+		dqm->ops.unregister_process = unregister_process;
+		dqm->ops.uninitialize = uninitialize;
 		dqm->ops.create_kernel_queue = create_kernel_queue_cpsch;
 		dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch;
 		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
@@ -1110,11 +1110,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.create_queue = create_queue_nocpsch;
 		dqm->ops.destroy_queue = destroy_queue_nocpsch;
 		dqm->ops.update_queue = update_queue;
-		dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
-		dqm->ops.register_process = register_process_nocpsch;
-		dqm->ops.unregister_process = unregister_process_nocpsch;
+		dqm->ops.get_mqd_manager = get_mqd_manager;
+		dqm->ops.register_process = register_process;
+		dqm->ops.unregister_process = unregister_process;
 		dqm->ops.initialize = initialize_nocpsch;
-		dqm->ops.uninitialize = uninitialize_nocpsch;
+		dqm->ops.uninitialize = uninitialize;
 		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
 		break;
 	default:
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 09/10] drm/amdkfd: Fix kernel-queue wrapping bugs
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (7 preceding siblings ...)
  2017-09-20 22:10   ` [PATCH 08/10] drm/amdkfd: Drop _nocpsch suffix from shared functions Felix Kuehling
@ 2017-09-20 22:10   ` Felix Kuehling
       [not found]     ` <1505945422-8851-10-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2017-09-20 22:10   ` [PATCH 10/10] drm/amdkfd: Print event limit messages only once per process Felix Kuehling
  2017-09-24 11:13   ` [PATCH 00/10] KFD fixes v2 Oded Gabbay
  10 siblings, 1 reply; 15+ messages in thread
From: Felix Kuehling @ 2017-09-20 22:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oded.gabbay-Re5JQEeQqe8AvxtiuMwx3w, Yong Zhao, Felix Kuehling

From: Yong Zhao <yong.zhao@amd.com>

Avoid intermediate negative numbers when doing calculations with a mix
of signed and unsigned variables where implicit conversions can lead
to unexpected results.

When kernel queue buffer wraps around to 0, we need to check that rptr
won't be overwritten by the new packet.

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index d7ed10e..8b0c064 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
 	uint32_t wptr, rptr;
 	unsigned int *queue_address;
 
+	/* When rptr == wptr, the buffer is empty.
+	 * When rptr == wptr + 1, the buffer is full.
+	 * It is always rptr that advances to the position of wptr, rather than
+	 * the opposite. So we can only use up to queue_size_dwords - 1 dwords.
+	 */
 	rptr = *kq->rptr_kernel;
 	wptr = *kq->wptr_kernel;
 	queue_address = (unsigned int *)kq->pq_kernel_addr;
@@ -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
 	pr_debug("wptr: %d\n", wptr);
 	pr_debug("queue_address 0x%p\n", queue_address);
 
-	available_size = (rptr - 1 - wptr + queue_size_dwords) %
+	available_size = (rptr + queue_size_dwords - 1 - wptr) %
 							queue_size_dwords;
 
-	if (packet_size_in_dwords >= queue_size_dwords ||
-			packet_size_in_dwords >= available_size) {
+	if (packet_size_in_dwords > available_size) {
 		/*
 		 * make sure calling functions know
 		 * acquire_packet_buffer() failed
@@ -233,6 +237,14 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
 	}
 
 	if (wptr + packet_size_in_dwords >= queue_size_dwords) {
+		/* make sure after rolling back to position 0, there is
+		 * still enough space.
+		 */
+		if (packet_size_in_dwords >= rptr) {
+			*buffer_ptr = NULL;
+			return -ENOMEM;
+		}
+		/* fill nops, roll back and start at position 0 */
 		while (wptr > 0) {
 			queue_address[wptr] = kq->nop_packet;
 			wptr = (wptr + 1) % queue_size_dwords;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 10/10] drm/amdkfd: Print event limit messages only once per process
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (8 preceding siblings ...)
  2017-09-20 22:10   ` [PATCH 09/10] drm/amdkfd: Fix kernel-queue wrapping bugs Felix Kuehling
@ 2017-09-20 22:10   ` Felix Kuehling
  2017-09-24 11:13   ` [PATCH 00/10] KFD fixes v2 Oded Gabbay
  10 siblings, 0 replies; 15+ messages in thread
From: Felix Kuehling @ 2017-09-20 22:10 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: oded.gabbay-Re5JQEeQqe8AvxtiuMwx3w, Felix Kuehling

To avoid spamming the log.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_events.c | 5 ++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 5979158..944abfa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -292,7 +292,10 @@ static int create_signal_event(struct file *devkfd,
 				struct kfd_event *ev)
 {
 	if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) {
-		pr_warn("Signal event wasn't created because limit was reached\n");
+		if (!p->signal_event_limit_reached) {
+			pr_warn("Signal event wasn't created because limit was reached\n");
+			p->signal_event_limit_reached = true;
+		}
 		return -ENOMEM;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 47eee77..10c0626 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -532,6 +532,7 @@ struct kfd_process {
 	struct list_head signal_event_pages;
 	u32 next_nonsignal_event_id;
 	size_t signal_event_count;
+	bool signal_event_limit_reached;
 };
 
 /**
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 03/10] drm/amdkfd: Rectify the jiffies calculation error with milliseconds v2
       [not found]     ` <1505945422-8851-4-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-24 10:32       ` Oded Gabbay
  0 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2017-09-24 10:32 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Yong Zhao, amd-gfx list

On Thu, Sep 21, 2017 at 1:10 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> From: Yong Zhao <yong.zhao@amd.com>
>
> The timeout in milliseconds should not be regarded as jiffies. This
> commit fixed that.
>
> v2:
> - use msecs_to_jiffies
> - change timeout_ms parameter to unsigned int to match msecs_to_jiffies
>
> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 6 +++---
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 | 2 +-
>  2 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 5db82b8..87961fe 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -835,12 +835,12 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>
>  int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
>                                 unsigned int fence_value,
> -                               unsigned long timeout)
> +                               unsigned int timeout_ms)
>  {
> -       timeout += jiffies;
> +       unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies;
>
>         while (*fence_addr != fence_value) {
> -               if (time_after(jiffies, timeout)) {
> +               if (time_after(jiffies, end_jiffies)) {
>                         pr_err("qcm fence wait loop timeout expired\n");
>                         return -ETIME;
>                 }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ef582cc..1c341b3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -669,7 +669,7 @@ struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
>
>  int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
>                                 unsigned int fence_value,
> -                               unsigned long timeout);
> +                               unsigned int timeout_ms);
>
>  /* Packet Manager */
>
> --
> 2.7.4
>

This patch is:
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 07/10] drm/amdkfd: Reuse CHIP_* from amdgpu v2
       [not found]     ` <1505945422-8851-8-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-24 11:06       ` Oded Gabbay
  0 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2017-09-24 11:06 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Yong Zhao, amd-gfx list

On Thu, Sep 21, 2017 at 1:10 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> From: Yong Zhao <Yong.Zhao@amd.com>
>
> There are already CHIP_* definitions under amd_shared.h file on amdgpu
> side, so KFD should reuse them rather than defining new ones.
>
> Using enum for asic type requires default cases on switch statements
> to prevent compiler warnings. WARN on unsupported ASICs. It should never
> get there because KFD should not be initialized on unsupported devices.
>
> v2: Replace BUG() with WARN and error return
>
> Signed-off-by: Yong Zhao <Yong.Zhao@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c |  4 ++++
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c         | 18 ++++++++++++------
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c          |  3 +++
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 |  9 +++------
>  4 files changed, 22 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 87f8742..fe0f0de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1130,6 +1130,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>         case CHIP_KAVERI:
>                 device_queue_manager_init_cik(&dqm->ops_asic_specific);
>                 break;
> +       default:
> +               WARN(1, "Unexpected ASIC family %u",
> +                    dev->device_info->asic_family);
> +               goto out_free;
>         }
>
>         if (!dqm->ops.initialize(dqm))
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index a47ca3c..d7ed10e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -291,14 +291,20 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>         case CHIP_KAVERI:
>                 kernel_queue_init_cik(&kq->ops_asic_specific);
>                 break;
> +       default:
> +               WARN(1, "Unexpected ASIC family %u",
> +                    dev->device_info->asic_family);
> +               goto out_free;
>         }
>
> -       if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) {
> -               pr_err("Failed to init kernel queue\n");
> -               kfree(kq);
> -               return NULL;
> -       }
> -       return kq;
> +       if (kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE))
> +               return kq;
> +
> +       pr_err("Failed to init kernel queue\n");
> +
> +out_free:
> +       kfree(kq);
> +       return NULL;
>  }
>
>  void kernel_queue_uninit(struct kernel_queue *kq)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> index b1ef136..dfd260e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> @@ -31,6 +31,9 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
>                 return mqd_manager_init_cik(type, dev);
>         case CHIP_CARRIZO:
>                 return mqd_manager_init_vi(type, dev);
> +       default:
> +               WARN(1, "Unexpected ASIC family %u",
> +                    dev->device_info->asic_family);
>         }
>
>         return NULL;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 4d989b9..47eee77 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -33,6 +33,8 @@
>  #include <linux/kfd_ioctl.h>
>  #include <kgd_kfd_interface.h>
>
> +#include "amd_shared.h"
> +
>  #define KFD_SYSFS_FILE_MODE 0444
>
>  #define KFD_MMAP_DOORBELL_MASK 0x8000000000000
> @@ -112,11 +114,6 @@ enum cache_policy {
>         cache_policy_noncoherent
>  };
>
> -enum asic_family_type {
> -       CHIP_KAVERI = 0,
> -       CHIP_CARRIZO
> -};
> -
>  struct kfd_event_interrupt_class {
>         bool (*interrupt_isr)(struct kfd_dev *dev,
>                                 const uint32_t *ih_ring_entry);
> @@ -125,7 +122,7 @@ struct kfd_event_interrupt_class {
>  };
>
>  struct kfd_device_info {
> -       unsigned int asic_family;
> +       enum amd_asic_type asic_family;
>         const struct kfd_event_interrupt_class *event_interrupt_class;
>         unsigned int max_pasid_bits;
>         unsigned int max_no_of_hqd;
> --
> 2.7.4
>

This patch is:
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 09/10] drm/amdkfd: Fix kernel-queue wrapping bugs
       [not found]     ` <1505945422-8851-10-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-24 11:09       ` Oded Gabbay
  0 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2017-09-24 11:09 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: Yong Zhao, amd-gfx list

On Thu, Sep 21, 2017 at 1:10 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> From: Yong Zhao <yong.zhao@amd.com>
>
> Avoid intermediate negative numbers when doing calculations with a mix
> of signed and unsigned variables where implicit conversions can lead
> to unexpected results.
>
> When kernel queue buffer wraps around to 0, we need to check that rptr
> won't be overwritten by the new packet.
>
> Signed-off-by: Yong Zhao <yong.zhao@amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18 +++++++++++++++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index d7ed10e..8b0c064 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>         uint32_t wptr, rptr;
>         unsigned int *queue_address;
>
> +       /* When rptr == wptr, the buffer is empty.
> +        * When rptr == wptr + 1, the buffer is full.
> +        * It is always rptr that advances to the position of wptr, rather than
> +        * the opposite. So we can only use up to queue_size_dwords - 1 dwords.
> +        */
>         rptr = *kq->rptr_kernel;
>         wptr = *kq->wptr_kernel;
>         queue_address = (unsigned int *)kq->pq_kernel_addr;
> @@ -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>         pr_debug("wptr: %d\n", wptr);
>         pr_debug("queue_address 0x%p\n", queue_address);
>
> -       available_size = (rptr - 1 - wptr + queue_size_dwords) %
> +       available_size = (rptr + queue_size_dwords - 1 - wptr) %
>                                                         queue_size_dwords;
>
> -       if (packet_size_in_dwords >= queue_size_dwords ||
> -                       packet_size_in_dwords >= available_size) {
> +       if (packet_size_in_dwords > available_size) {
>                 /*
>                  * make sure calling functions know
>                  * acquire_packet_buffer() failed
> @@ -233,6 +237,14 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
>         }
>
>         if (wptr + packet_size_in_dwords >= queue_size_dwords) {
> +               /* make sure after rolling back to position 0, there is
> +                * still enough space.
> +                */
> +               if (packet_size_in_dwords >= rptr) {
> +                       *buffer_ptr = NULL;
> +                       return -ENOMEM;
> +               }
> +               /* fill nops, roll back and start at position 0 */
>                 while (wptr > 0) {
>                         queue_address[wptr] = kq->nop_packet;
>                         wptr = (wptr + 1) % queue_size_dwords;
> --
> 2.7.4
>
This patch is:
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 00/10] KFD fixes v2
       [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
                     ` (9 preceding siblings ...)
  2017-09-20 22:10   ` [PATCH 10/10] drm/amdkfd: Print event limit messages only once per process Felix Kuehling
@ 2017-09-24 11:13   ` Oded Gabbay
  10 siblings, 0 replies; 15+ messages in thread
From: Oded Gabbay @ 2017-09-24 11:13 UTC (permalink / raw)
  To: Felix Kuehling; +Cc: amd-gfx list

On Thu, Sep 21, 2017 at 1:10 AM, Felix Kuehling <Felix.Kuehling@amd.com> wrote:
> Small self-contained KFD fixes that don't introduce any new
> functionality or features. These may be suitable to include in
> drm-fixes for 4.14.
>
Thanks!

I added patches 5, 9 and 10 to my -fixes tree and I will send them
shortly to Dave (together with another fix).
I don't feel comfortable adding the other patches in rc stages. I will
send them to 4.15 merge window.

Thanks,
Oded


> v2:
> - Rebased on gabbayo/amdkfd-next
> - Addressed code review feedback
> - Dropped "Set /dev/kfd permissions to 0666 by default"
>
> Felix Kuehling (3):
>   drm/amdkfd: Adjust dequeue latencies and timeouts
>   drm/amdkfd: Fix incorrect destroy_mqd parameter
>   drm/amdkfd: Print event limit messages only once per process
>
> Yong Zhao (7):
>   drm/amdkfd: Reorganize kfd resume code
>   drm/amdkfd: Fix suspend/resume issue on Carrizo v2
>   drm/amdkfd: Rectify the jiffies calculation error with milliseconds v2
>   drm/amdkfd: Use VMID bitmap from KGD v2
>   drm/amdkfd: Reuse CHIP_* from amdgpu v2
>   drm/amdkfd: Drop _nocpsch suffix from shared functions
>   drm/amdkfd: Fix kernel-queue wrapping bugs
>
>  drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c            |   9 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c            | 106 ++++++++++++---------
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  62 +++++-------
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h  |   8 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_events.c            |   5 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c      |  40 +++++---
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c       |   3 +
>  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c    |   2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |  37 ++++---
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c           |  97 +++++++++++++++----
>  .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |   2 +-
>  11 files changed, 235 insertions(+), 136 deletions(-)
>
> --
> 2.7.4
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2017-09-24 11:13 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-20 22:10 [PATCH 00/10] KFD fixes v2 Felix Kuehling
     [not found] ` <1505945422-8851-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-20 22:10   ` [PATCH 01/10] drm/amdkfd: Reorganize kfd resume code Felix Kuehling
2017-09-20 22:10   ` [PATCH 02/10] drm/amdkfd: Fix suspend/resume issue on Carrizo v2 Felix Kuehling
2017-09-20 22:10   ` [PATCH 03/10] drm/amdkfd: Rectify the jiffies calculation error with milliseconds v2 Felix Kuehling
     [not found]     ` <1505945422-8851-4-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-24 10:32       ` Oded Gabbay
2017-09-20 22:10   ` [PATCH 04/10] drm/amdkfd: Adjust dequeue latencies and timeouts Felix Kuehling
2017-09-20 22:10   ` [PATCH 05/10] drm/amdkfd: Fix incorrect destroy_mqd parameter Felix Kuehling
2017-09-20 22:10   ` [PATCH 06/10] drm/amdkfd: Use VMID bitmap from KGD v2 Felix Kuehling
2017-09-20 22:10   ` [PATCH 07/10] drm/amdkfd: Reuse CHIP_* from amdgpu v2 Felix Kuehling
     [not found]     ` <1505945422-8851-8-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-24 11:06       ` Oded Gabbay
2017-09-20 22:10   ` [PATCH 08/10] drm/amdkfd: Drop _nocpsch suffix from shared functions Felix Kuehling
2017-09-20 22:10   ` [PATCH 09/10] drm/amdkfd: Fix kernel-queue wrapping bugs Felix Kuehling
     [not found]     ` <1505945422-8851-10-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-09-24 11:09       ` Oded Gabbay
2017-09-20 22:10   ` [PATCH 10/10] drm/amdkfd: Print event limit messages only once per process Felix Kuehling
2017-09-24 11:13   ` [PATCH 00/10] KFD fixes v2 Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.