All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/amdkfd: add helper function for kfd sysfs create
@ 2021-06-22 13:32 Philip Yang
  2021-06-22 13:32 ` [PATCH 2/4] drm/amdkfd: fix sysfs kobj leak Philip Yang
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: Philip Yang @ 2021-06-22 13:32 UTC (permalink / raw)
  To: amd-gfx; +Cc: Philip Yang

No functionality change. Modify kfd_sysfs_create_file to use kobject as
parameter, so it becomes common helper function to remove duplicate code
and will simplify new kfd sysfs file create in future.

Move pr_warn to helper function if sysfs file create failed. Set helper
function as void return because caller doesn't use the helper function
return value.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 119 ++++++++---------------
 1 file changed, 39 insertions(+), 80 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 09b98a83f670..3147dc8bb051 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -484,34 +484,31 @@ int kfd_procfs_add_queue(struct queue *q)
 	return 0;
 }
 
-static int kfd_sysfs_create_file(struct kfd_process *p, struct attribute *attr,
+static void kfd_sysfs_create_file(struct kobject *kobj, struct attribute *attr,
 				 char *name)
 {
-	int ret = 0;
+	int ret;
 
-	if (!p || !attr || !name)
-		return -EINVAL;
+	if (!kobj || !attr || !name)
+		return;
 
 	attr->name = name;
 	attr->mode = KFD_SYSFS_FILE_MODE;
 	sysfs_attr_init(attr);
 
-	ret = sysfs_create_file(p->kobj, attr);
-
-	return ret;
+	ret = sysfs_create_file(kobj, attr);
+	if (ret)
+		pr_warn("Create sysfs %s/%s failed %d", kobj->name, name, ret);
 }
 
-static int kfd_procfs_add_sysfs_stats(struct kfd_process *p)
+static void kfd_procfs_add_sysfs_stats(struct kfd_process *p)
 {
-	int ret = 0;
+	int ret;
 	int i;
 	char stats_dir_filename[MAX_SYSFS_FILENAME_LEN];
 
-	if (!p)
-		return -EINVAL;
-
-	if (!p->kobj)
-		return -EFAULT;
+	if (!p || !p->kobj)
+		return;
 
 	/*
 	 * Create sysfs files for each GPU:
@@ -521,63 +518,43 @@ static int kfd_procfs_add_sysfs_stats(struct kfd_process *p)
 	 */
 	for (i = 0; i < p->n_pdds; i++) {
 		struct kfd_process_device *pdd = p->pdds[i];
-		struct kobject *kobj_stats;
 
 		snprintf(stats_dir_filename, MAX_SYSFS_FILENAME_LEN,
 				"stats_%u", pdd->dev->id);
-		kobj_stats = kfd_alloc_struct(kobj_stats);
-		if (!kobj_stats)
-			return -ENOMEM;
+		pdd->kobj_stats = kfd_alloc_struct(pdd->kobj_stats);
+		if (!pdd->kobj_stats)
+			return;
 
-		ret = kobject_init_and_add(kobj_stats,
-						&procfs_stats_type,
-						p->kobj,
-						stats_dir_filename);
+		ret = kobject_init_and_add(pdd->kobj_stats,
+					   &procfs_stats_type,
+					   p->kobj,
+					   stats_dir_filename);
 
 		if (ret) {
 			pr_warn("Creating KFD proc/stats_%s folder failed",
-					stats_dir_filename);
-			kobject_put(kobj_stats);
-			goto err;
+				stats_dir_filename);
+			kobject_put(pdd->kobj_stats);
+			pdd->kobj_stats = NULL;
+			return;
 		}
 
-		pdd->kobj_stats = kobj_stats;
-		pdd->attr_evict.name = "evicted_ms";
-		pdd->attr_evict.mode = KFD_SYSFS_FILE_MODE;
-		sysfs_attr_init(&pdd->attr_evict);
-		ret = sysfs_create_file(kobj_stats, &pdd->attr_evict);
-		if (ret)
-			pr_warn("Creating eviction stats for gpuid %d failed",
-					(int)pdd->dev->id);
-
+		kfd_sysfs_create_file(pdd->kobj_stats, &pdd->attr_evict,
+				      "evicted_ms");
 		/* Add sysfs file to report compute unit occupancy */
-		if (pdd->dev->kfd2kgd->get_cu_occupancy != NULL) {
-			pdd->attr_cu_occupancy.name = "cu_occupancy";
-			pdd->attr_cu_occupancy.mode = KFD_SYSFS_FILE_MODE;
-			sysfs_attr_init(&pdd->attr_cu_occupancy);
-			ret = sysfs_create_file(kobj_stats,
-						&pdd->attr_cu_occupancy);
-			if (ret)
-				pr_warn("Creating %s failed for gpuid: %d",
-					pdd->attr_cu_occupancy.name,
-					(int)pdd->dev->id);
-		}
+		if (pdd->dev->kfd2kgd->get_cu_occupancy)
+			kfd_sysfs_create_file(pdd->kobj_stats,
+					      &pdd->attr_cu_occupancy,
+					      "cu_occupancy");
 	}
-err:
-	return ret;
 }
 
 
-static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
+static void kfd_procfs_add_sysfs_files(struct kfd_process *p)
 {
-	int ret = 0;
 	int i;
 
-	if (!p)
-		return -EINVAL;
-
-	if (!p->kobj)
-		return -EFAULT;
+	if (!p || !p->kobj)
+		return;
 
 	/*
 	 * Create sysfs files for each GPU:
@@ -589,20 +566,14 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
 
 		snprintf(pdd->vram_filename, MAX_SYSFS_FILENAME_LEN, "vram_%u",
 			 pdd->dev->id);
-		ret = kfd_sysfs_create_file(p, &pdd->attr_vram, pdd->vram_filename);
-		if (ret)
-			pr_warn("Creating vram usage for gpu id %d failed",
-				(int)pdd->dev->id);
+		kfd_sysfs_create_file(p->kobj, &pdd->attr_vram,
+				      pdd->vram_filename);
 
 		snprintf(pdd->sdma_filename, MAX_SYSFS_FILENAME_LEN, "sdma_%u",
 			 pdd->dev->id);
-		ret = kfd_sysfs_create_file(p, &pdd->attr_sdma, pdd->sdma_filename);
-		if (ret)
-			pr_warn("Creating sdma usage for gpu id %d failed",
-				(int)pdd->dev->id);
+		kfd_sysfs_create_file(p->kobj, &pdd->attr_sdma,
+					    pdd->sdma_filename);
 	}
-
-	return ret;
 }
 
 void kfd_procfs_del_queue(struct queue *q)
@@ -800,28 +771,16 @@ struct kfd_process *kfd_create_process(struct file *filep)
 			goto out;
 		}
 
-		process->attr_pasid.name = "pasid";
-		process->attr_pasid.mode = KFD_SYSFS_FILE_MODE;
-		sysfs_attr_init(&process->attr_pasid);
-		ret = sysfs_create_file(process->kobj, &process->attr_pasid);
-		if (ret)
-			pr_warn("Creating pasid for pid %d failed",
-					(int)process->lead_thread->pid);
+		kfd_sysfs_create_file(process->kobj, &process->attr_pasid,
+				      "pasid");
 
 		process->kobj_queues = kobject_create_and_add("queues",
 							process->kobj);
 		if (!process->kobj_queues)
 			pr_warn("Creating KFD proc/queues folder failed");
 
-		ret = kfd_procfs_add_sysfs_stats(process);
-		if (ret)
-			pr_warn("Creating sysfs stats dir for pid %d failed",
-				(int)process->lead_thread->pid);
-
-		ret = kfd_procfs_add_sysfs_files(process);
-		if (ret)
-			pr_warn("Creating sysfs usage file for pid %d failed",
-				(int)process->lead_thread->pid);
+		kfd_procfs_add_sysfs_stats(process);
+		kfd_procfs_add_sysfs_files(process);
 	}
 out:
 	if (!IS_ERR(process))
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/4] drm/amdkfd: fix sysfs kobj leak
  2021-06-22 13:32 [PATCH 1/4] drm/amdkfd: add helper function for kfd sysfs create Philip Yang
@ 2021-06-22 13:32 ` Philip Yang
  2021-06-22 13:32 ` [PATCH 3/4] drm/amdkfd: add sysfs counters for vm fault and migration Philip Yang
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 8+ messages in thread
From: Philip Yang @ 2021-06-22 13:32 UTC (permalink / raw)
  To: amd-gfx; +Cc: Philip Yang

3 cases of kobj leak, which causes memory leak:

kobj_type must have release() method to free memory from release
callback. Don't need NULL default_attrs to init kobj.

sysfs files created under kobj_status should be removed with kobj_status
as parent kobject.

Remove queue sysfs files when releasing queue from process MMU notifier
release callback.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c           | 14 ++++++--------
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  1 +
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 3147dc8bb051..cfc36fceac8a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -451,13 +451,9 @@ static const struct sysfs_ops procfs_stats_ops = {
 	.show = kfd_procfs_stats_show,
 };
 
-static struct attribute *procfs_stats_attrs[] = {
-	NULL
-};
-
 static struct kobj_type procfs_stats_type = {
 	.sysfs_ops = &procfs_stats_ops,
-	.default_attrs = procfs_stats_attrs,
+	.release = kfd_procfs_kobj_release,
 };
 
 int kfd_procfs_add_queue(struct queue *q)
@@ -946,9 +942,11 @@ static void kfd_process_wq_release(struct work_struct *work)
 
 			sysfs_remove_file(p->kobj, &pdd->attr_vram);
 			sysfs_remove_file(p->kobj, &pdd->attr_sdma);
-			sysfs_remove_file(p->kobj, &pdd->attr_evict);
-			if (pdd->dev->kfd2kgd->get_cu_occupancy != NULL)
-				sysfs_remove_file(p->kobj, &pdd->attr_cu_occupancy);
+
+			sysfs_remove_file(pdd->kobj_stats, &pdd->attr_evict);
+			if (pdd->dev->kfd2kgd->get_cu_occupancy)
+				sysfs_remove_file(pdd->kobj_stats,
+						  &pdd->attr_cu_occupancy);
 			kobject_del(pdd->kobj_stats);
 			kobject_put(pdd->kobj_stats);
 			pdd->kobj_stats = NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 95a6c36cea4c..243dd1efcdbf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -153,6 +153,7 @@ void pqm_uninit(struct process_queue_manager *pqm)
 		if (pqn->q && pqn->q->gws)
 			amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info,
 				pqn->q->gws);
+		kfd_procfs_del_queue(pqn->q);
 		uninit_queue(pqn->q);
 		list_del(&pqn->process_queue_list);
 		kfree(pqn);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/4] drm/amdkfd: add sysfs counters for vm fault and migration
  2021-06-22 13:32 [PATCH 1/4] drm/amdkfd: add helper function for kfd sysfs create Philip Yang
  2021-06-22 13:32 ` [PATCH 2/4] drm/amdkfd: fix sysfs kobj leak Philip Yang
@ 2021-06-22 13:32 ` Philip Yang
  2021-06-22 13:32 ` [PATCH 4/4] drm/amdkfd: implement " Philip Yang
  2021-06-23 15:02 ` [PATCH v2 " Philip Yang
  3 siblings, 0 replies; 8+ messages in thread
From: Philip Yang @ 2021-06-22 13:32 UTC (permalink / raw)
  To: amd-gfx; +Cc: Philip Yang

This is part of SVM profiling API, export sysfs counters for
per-process, per-GPU vm retry fault, pages migrated in and out of GPU vram.

counters will not be updated in parallel in GPU retry fault handler and
migration to vram/ram path, use READ_ONCE to avoid compiler
optimization.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |   9 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 151 ++++++++++++++++++-----
 2 files changed, 131 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6dc22fa1e555..3426743ed228 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -730,6 +730,15 @@ struct kfd_process_device {
 	 *  number of CU's a device has along with number of other competing processes
 	 */
 	struct attribute attr_cu_occupancy;
+
+	/* sysfs counters for GPU retry fault and page migration tracking */
+	struct kobject *kobj_counters;
+	struct attribute attr_faults;
+	struct attribute attr_page_in;
+	struct attribute attr_page_out;
+	uint64_t faults;
+	uint64_t page_in;
+	uint64_t page_out;
 };
 
 #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index cfc36fceac8a..21ec8a18cad2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -416,6 +416,29 @@ static ssize_t kfd_procfs_stats_show(struct kobject *kobj,
 	return 0;
 }
 
+static ssize_t kfd_sysfs_counters_show(struct kobject *kobj,
+				       struct attribute *attr, char *buf)
+{
+	struct kfd_process_device *pdd;
+
+	if (!strcmp(attr->name, "faults")) {
+		pdd = container_of(attr, struct kfd_process_device,
+				   attr_faults);
+		return sysfs_emit(buf, "%llu\n", READ_ONCE(pdd->faults));
+	}
+	if (!strcmp(attr->name, "page_in")) {
+		pdd = container_of(attr, struct kfd_process_device,
+				   attr_page_in);
+		return sysfs_emit(buf, "%llu\n", READ_ONCE(pdd->page_in));
+	}
+	if (!strcmp(attr->name, "page_out")) {
+		pdd = container_of(attr, struct kfd_process_device,
+				   attr_page_out);
+		return sysfs_emit(buf, "%llu\n", READ_ONCE(pdd->page_out));
+	}
+	return 0;
+}
+
 static struct attribute attr_queue_size = {
 	.name = "size",
 	.mode = KFD_SYSFS_FILE_MODE
@@ -456,6 +479,15 @@ static struct kobj_type procfs_stats_type = {
 	.release = kfd_procfs_kobj_release,
 };
 
+static const struct sysfs_ops sysfs_counters_ops = {
+	.show = kfd_sysfs_counters_show,
+};
+
+static struct kobj_type sysfs_counters_type = {
+	.sysfs_ops = &sysfs_counters_ops,
+	.release = kfd_procfs_kobj_release,
+};
+
 int kfd_procfs_add_queue(struct queue *q)
 {
 	struct kfd_process *proc;
@@ -544,6 +576,50 @@ static void kfd_procfs_add_sysfs_stats(struct kfd_process *p)
 	}
 }
 
+static void kfd_procfs_add_sysfs_counters(struct kfd_process *p)
+{
+	int ret = 0;
+	int i;
+	char counters_dir_filename[MAX_SYSFS_FILENAME_LEN];
+
+	if (!p || !p->kobj)
+		return;
+
+	/*
+	 * Create sysfs files for each GPU which supports SVM
+	 * - proc/<pid>/counters_<gpuid>/
+	 * - proc/<pid>/counters_<gpuid>/faults
+	 * - proc/<pid>/counters_<gpuid>/page_in
+	 * - proc/<pid>/counters_<gpuid>/page_out
+	 */
+	for_each_set_bit(i, p->svms.bitmap_supported, p->n_pdds) {
+		struct kfd_process_device *pdd = p->pdds[i];
+		struct kobject *kobj_counters;
+
+		snprintf(counters_dir_filename, MAX_SYSFS_FILENAME_LEN,
+			"counters_%u", pdd->dev->id);
+		kobj_counters = kfd_alloc_struct(kobj_counters);
+		if (!kobj_counters)
+			return;
+
+		ret = kobject_init_and_add(kobj_counters, &sysfs_counters_type,
+					   p->kobj, counters_dir_filename);
+		if (ret) {
+			pr_warn("Creating KFD proc/%s folder failed",
+				counters_dir_filename);
+			kobject_put(kobj_counters);
+			return;
+		}
+
+		pdd->kobj_counters = kobj_counters;
+		kfd_sysfs_create_file(kobj_counters, &pdd->attr_faults,
+				      "faults");
+		kfd_sysfs_create_file(kobj_counters, &pdd->attr_page_in,
+				      "page_in");
+		kfd_sysfs_create_file(kobj_counters, &pdd->attr_page_out,
+				      "page_out");
+	}
+}
 
 static void kfd_procfs_add_sysfs_files(struct kfd_process *p)
 {
@@ -777,6 +853,7 @@ struct kfd_process *kfd_create_process(struct file *filep)
 
 		kfd_procfs_add_sysfs_stats(process);
 		kfd_procfs_add_sysfs_files(process);
+		kfd_procfs_add_sysfs_counters(process);
 	}
 out:
 	if (!IS_ERR(process))
@@ -919,44 +996,60 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
 	p->n_pdds = 0;
 }
 
-/* No process locking is needed in this function, because the process
- * is not findable any more. We must assume that no other thread is
- * using it any more, otherwise we couldn't safely free the process
- * structure in the end.
- */
-static void kfd_process_wq_release(struct work_struct *work)
+static void kfd_process_remove_sysfs(struct kfd_process *p)
 {
-	struct kfd_process *p = container_of(work, struct kfd_process,
-					     release_work);
+	struct kfd_process_device *pdd;
 	int i;
 
-	/* Remove the procfs files */
-	if (p->kobj) {
-		sysfs_remove_file(p->kobj, &p->attr_pasid);
-		kobject_del(p->kobj_queues);
-		kobject_put(p->kobj_queues);
-		p->kobj_queues = NULL;
+	if (!p->kobj)
+		return;
 
-		for (i = 0; i < p->n_pdds; i++) {
-			struct kfd_process_device *pdd = p->pdds[i];
+	sysfs_remove_file(p->kobj, &p->attr_pasid);
+	kobject_del(p->kobj_queues);
+	kobject_put(p->kobj_queues);
+	p->kobj_queues = NULL;
 
-			sysfs_remove_file(p->kobj, &pdd->attr_vram);
-			sysfs_remove_file(p->kobj, &pdd->attr_sdma);
+	for (i = 0; i < p->n_pdds; i++) {
+		pdd = p->pdds[i];
 
-			sysfs_remove_file(pdd->kobj_stats, &pdd->attr_evict);
-			if (pdd->dev->kfd2kgd->get_cu_occupancy)
-				sysfs_remove_file(pdd->kobj_stats,
-						  &pdd->attr_cu_occupancy);
-			kobject_del(pdd->kobj_stats);
-			kobject_put(pdd->kobj_stats);
-			pdd->kobj_stats = NULL;
-		}
+		sysfs_remove_file(p->kobj, &pdd->attr_vram);
+		sysfs_remove_file(p->kobj, &pdd->attr_sdma);
 
-		kobject_del(p->kobj);
-		kobject_put(p->kobj);
-		p->kobj = NULL;
+		sysfs_remove_file(pdd->kobj_stats, &pdd->attr_evict);
+		if (pdd->dev->kfd2kgd->get_cu_occupancy)
+			sysfs_remove_file(pdd->kobj_stats,
+					  &pdd->attr_cu_occupancy);
+		kobject_del(pdd->kobj_stats);
+		kobject_put(pdd->kobj_stats);
+		pdd->kobj_stats = NULL;
+	}
+
+	for_each_set_bit(i, p->svms.bitmap_supported, p->n_pdds) {
+		pdd = p->pdds[i];
+
+		sysfs_remove_file(pdd->kobj_counters, &pdd->attr_faults);
+		sysfs_remove_file(pdd->kobj_counters, &pdd->attr_page_in);
+		sysfs_remove_file(pdd->kobj_counters, &pdd->attr_page_out);
+		kobject_del(pdd->kobj_counters);
+		kobject_put(pdd->kobj_counters);
+		pdd->kobj_counters = NULL;
 	}
 
+	kobject_del(p->kobj);
+	kobject_put(p->kobj);
+	p->kobj = NULL;
+}
+
+/* No process locking is needed in this function, because the process
+ * is not findable any more. We must assume that no other thread is
+ * using it any more, otherwise we couldn't safely free the process
+ * structure in the end.
+ */
+static void kfd_process_wq_release(struct work_struct *work)
+{
+	struct kfd_process *p = container_of(work, struct kfd_process,
+					     release_work);
+	kfd_process_remove_sysfs(p);
 	kfd_iommu_unbind_process(p);
 
 	kfd_process_free_outstanding_kfd_bos(p);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 4/4] drm/amdkfd: implement counters for vm fault and migration
  2021-06-22 13:32 [PATCH 1/4] drm/amdkfd: add helper function for kfd sysfs create Philip Yang
  2021-06-22 13:32 ` [PATCH 2/4] drm/amdkfd: fix sysfs kobj leak Philip Yang
  2021-06-22 13:32 ` [PATCH 3/4] drm/amdkfd: add sysfs counters for vm fault and migration Philip Yang
@ 2021-06-22 13:32 ` Philip Yang
  2021-06-22 23:31   ` Felix Kuehling
  2021-06-23 15:02 ` [PATCH v2 " Philip Yang
  3 siblings, 1 reply; 8+ messages in thread
From: Philip Yang @ 2021-06-22 13:32 UTC (permalink / raw)
  To: amd-gfx; +Cc: Philip Yang

Add helper function to get process device data structure from adev to
update counters.

Update vm faults, page_in, page_out counters will no be executed in
parallel, use WRITE_ONCE to avoid any form of compiler optimizations.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 14 ++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 24 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h     |  2 ++
 3 files changed, 40 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index fd8f544f0de2..45b5349283af 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -413,6 +413,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
 			uint64_t end)
 {
 	uint64_t npages = (end - start) >> PAGE_SHIFT;
+	struct kfd_process_device *pdd;
 	struct dma_fence *mfence = NULL;
 	struct migrate_vma migrate;
 	dma_addr_t *scratch;
@@ -473,6 +474,12 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
 out_free:
 	kvfree(buf);
 out:
+	if (!r) {
+		pdd = svm_range_get_pdd_by_adev(prange, adev);
+		if (pdd)
+			WRITE_ONCE(pdd->page_in, pdd->page_in + migrate.cpages);
+	}
+
 	return r;
 }
 
@@ -629,6 +636,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
 		       struct vm_area_struct *vma, uint64_t start, uint64_t end)
 {
 	uint64_t npages = (end - start) >> PAGE_SHIFT;
+	struct kfd_process_device *pdd;
 	struct dma_fence *mfence = NULL;
 	struct migrate_vma migrate;
 	dma_addr_t *scratch;
@@ -678,6 +686,12 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
 out_free:
 	kvfree(buf);
 out:
+	if (!r) {
+		pdd = svm_range_get_pdd_by_adev(prange, adev);
+		if (pdd)
+			WRITE_ONCE(pdd->page_out,
+				   pdd->page_out + migrate.cpages);
+	}
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 5468ea4264c6..f3323328f01f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -564,6 +564,24 @@ svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
 	return (struct amdgpu_device *)pdd->dev->kgd;
 }
 
+struct kfd_process_device *
+svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev)
+{
+	struct kfd_process *p;
+	int32_t gpu_idx, gpuid;
+	int r;
+
+	p = container_of(prange->svms, struct kfd_process, svms);
+
+	r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpu_idx);
+	if (r) {
+		pr_debug("failed to get device id by adev %p\n", adev);
+		return NULL;
+	}
+
+	return kfd_process_device_from_gpuidx(p, gpu_idx);
+}
+
 static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
 {
 	struct ttm_operation_ctx ctx = { false, false };
@@ -2315,6 +2333,7 @@ int
 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 			uint64_t addr)
 {
+	struct kfd_process_device *pdd;
 	struct mm_struct *mm = NULL;
 	struct svm_range_list *svms;
 	struct svm_range *prange;
@@ -2440,6 +2459,11 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 out_unlock_svms:
 	mutex_unlock(&svms->lock);
 	mmap_read_unlock(mm);
+
+	pdd = svm_range_get_pdd_by_adev(prange, adev);
+	if (pdd)
+		WRITE_ONCE(pdd->faults, pdd->faults + 1);
+
 	mmput(mm);
 out:
 	kfd_unref_process(p);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 0c0fc399395e..a9af03994d1a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -174,6 +174,8 @@ void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
 			 unsigned long offset, unsigned long npages);
 void svm_range_free_dma_mappings(struct svm_range *prange);
 void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm);
+struct kfd_process_device *
+svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev);
 
 /* SVM API and HMM page migration work together, device memory type
  * is initialized to not 0 when page migration register device memory.
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 4/4] drm/amdkfd: implement counters for vm fault and migration
  2021-06-22 13:32 ` [PATCH 4/4] drm/amdkfd: implement " Philip Yang
@ 2021-06-22 23:31   ` Felix Kuehling
  2021-06-23 14:54     ` philip yang
  0 siblings, 1 reply; 8+ messages in thread
From: Felix Kuehling @ 2021-06-22 23:31 UTC (permalink / raw)
  To: Philip Yang, amd-gfx

Am 2021-06-22 um 9:32 a.m. schrieb Philip Yang:
> Add helper function to get process device data structure from adev to
> update counters.
>
> Update vm faults, page_in, page_out counters will no be executed in
> parallel, use WRITE_ONCE to avoid any form of compiler optimizations.
>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 14 ++++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 24 ++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.h     |  2 ++
>  3 files changed, 40 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> index fd8f544f0de2..45b5349283af 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> @@ -413,6 +413,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
>  			uint64_t end)
>  {
>  	uint64_t npages = (end - start) >> PAGE_SHIFT;
> +	struct kfd_process_device *pdd;
>  	struct dma_fence *mfence = NULL;
>  	struct migrate_vma migrate;
>  	dma_addr_t *scratch;
> @@ -473,6 +474,12 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
>  out_free:
>  	kvfree(buf);
>  out:
> +	if (!r) {
> +		pdd = svm_range_get_pdd_by_adev(prange, adev);
> +		if (pdd)
> +			WRITE_ONCE(pdd->page_in, pdd->page_in + migrate.cpages);
> +	}
> +
>  	return r;
>  }
>  
> @@ -629,6 +636,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>  		       struct vm_area_struct *vma, uint64_t start, uint64_t end)
>  {
>  	uint64_t npages = (end - start) >> PAGE_SHIFT;
> +	struct kfd_process_device *pdd;
>  	struct dma_fence *mfence = NULL;
>  	struct migrate_vma migrate;
>  	dma_addr_t *scratch;
> @@ -678,6 +686,12 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>  out_free:
>  	kvfree(buf);
>  out:
> +	if (!r) {
> +		pdd = svm_range_get_pdd_by_adev(prange, adev);
> +		if (pdd)
> +			WRITE_ONCE(pdd->page_out,
> +				   pdd->page_out + migrate.cpages);
> +	}
>  	return r;
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 5468ea4264c6..f3323328f01f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -564,6 +564,24 @@ svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
>  	return (struct amdgpu_device *)pdd->dev->kgd;
>  }
>  
> +struct kfd_process_device *
> +svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev)
> +{
> +	struct kfd_process *p;
> +	int32_t gpu_idx, gpuid;
> +	int r;
> +
> +	p = container_of(prange->svms, struct kfd_process, svms);
> +
> +	r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpu_idx);
> +	if (r) {
> +		pr_debug("failed to get device id by adev %p\n", adev);
> +		return NULL;
> +	}
> +
> +	return kfd_process_device_from_gpuidx(p, gpu_idx);
> +}
> +
>  static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
>  {
>  	struct ttm_operation_ctx ctx = { false, false };
> @@ -2315,6 +2333,7 @@ int
>  svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>  			uint64_t addr)
>  {
> +	struct kfd_process_device *pdd;
>  	struct mm_struct *mm = NULL;
>  	struct svm_range_list *svms;
>  	struct svm_range *prange;
> @@ -2440,6 +2459,11 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>  out_unlock_svms:
>  	mutex_unlock(&svms->lock);
>  	mmap_read_unlock(mm);
> +
> +	pdd = svm_range_get_pdd_by_adev(prange, adev);

svm_range_get_pdd_by_adev needs to do a linear search. You don't need
this here because you already know the gpuidx. I think you can just call
kfd_process_device_from_gpuidx(p, gpu_idx) here.

With that fixed, the series is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


P.S.: Thanks for catching and fixing those memory leaks in patch 2.


> +	if (pdd)
> +		WRITE_ONCE(pdd->faults, pdd->faults + 1);
> +
>  	mmput(mm);
>  out:
>  	kfd_unref_process(p);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 0c0fc399395e..a9af03994d1a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -174,6 +174,8 @@ void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
>  			 unsigned long offset, unsigned long npages);
>  void svm_range_free_dma_mappings(struct svm_range *prange);
>  void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm);
> +struct kfd_process_device *
> +svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev);
>  
>  /* SVM API and HMM page migration work together, device memory type
>   * is initialized to not 0 when page migration register device memory.
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 4/4] drm/amdkfd: implement counters for vm fault and migration
  2021-06-22 23:31   ` Felix Kuehling
@ 2021-06-23 14:54     ` philip yang
  0 siblings, 0 replies; 8+ messages in thread
From: philip yang @ 2021-06-23 14:54 UTC (permalink / raw)
  To: Felix Kuehling, Philip Yang, amd-gfx

[-- Attachment #1: Type: text/html, Size: 6165 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2 4/4] drm/amdkfd: implement counters for vm fault and migration
  2021-06-22 13:32 [PATCH 1/4] drm/amdkfd: add helper function for kfd sysfs create Philip Yang
                   ` (2 preceding siblings ...)
  2021-06-22 13:32 ` [PATCH 4/4] drm/amdkfd: implement " Philip Yang
@ 2021-06-23 15:02 ` Philip Yang
  2021-06-23 18:52   ` Felix Kuehling
  3 siblings, 1 reply; 8+ messages in thread
From: Philip Yang @ 2021-06-23 15:02 UTC (permalink / raw)
  To: amd-gfx; +Cc: Philip Yang

Add helper function to get process device data structure from adev to
update counters.

Update vm faults, page_in, page_out counters will no be executed in
parallel, use WRITE_ONCE to avoid any form of compiler optimizations.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 14 ++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 45 +++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h     |  2 ++
 3 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index fd8f544f0de2..45b5349283af 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -413,6 +413,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
 			uint64_t end)
 {
 	uint64_t npages = (end - start) >> PAGE_SHIFT;
+	struct kfd_process_device *pdd;
 	struct dma_fence *mfence = NULL;
 	struct migrate_vma migrate;
 	dma_addr_t *scratch;
@@ -473,6 +474,12 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
 out_free:
 	kvfree(buf);
 out:
+	if (!r) {
+		pdd = svm_range_get_pdd_by_adev(prange, adev);
+		if (pdd)
+			WRITE_ONCE(pdd->page_in, pdd->page_in + migrate.cpages);
+	}
+
 	return r;
 }
 
@@ -629,6 +636,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
 		       struct vm_area_struct *vma, uint64_t start, uint64_t end)
 {
 	uint64_t npages = (end - start) >> PAGE_SHIFT;
+	struct kfd_process_device *pdd;
 	struct dma_fence *mfence = NULL;
 	struct migrate_vma migrate;
 	dma_addr_t *scratch;
@@ -678,6 +686,12 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
 out_free:
 	kvfree(buf);
 out:
+	if (!r) {
+		pdd = svm_range_get_pdd_by_adev(prange, adev);
+		if (pdd)
+			WRITE_ONCE(pdd->page_out,
+				   pdd->page_out + migrate.cpages);
+	}
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 7f5ebc00b349..00b46846a5e0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -564,6 +564,24 @@ svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
 	return (struct amdgpu_device *)pdd->dev->kgd;
 }
 
+struct kfd_process_device *
+svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev)
+{
+	struct kfd_process *p;
+	int32_t gpu_idx, gpuid;
+	int r;
+
+	p = container_of(prange->svms, struct kfd_process, svms);
+
+	r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpu_idx);
+	if (r) {
+		pr_debug("failed to get device id by adev %p\n", adev);
+		return NULL;
+	}
+
+	return kfd_process_device_from_gpuidx(p, gpu_idx);
+}
+
 static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
 {
 	struct ttm_operation_ctx ctx = { false, false };
@@ -2311,6 +2329,27 @@ static bool svm_range_skip_recover(struct svm_range *prange)
 	return false;
 }
 
+static void
+svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
+		      struct svm_range *prange, int32_t gpuidx)
+{
+	struct kfd_process_device *pdd;
+
+	if (gpuidx == MAX_GPU_INSTANCE)
+		/* fault is on different page of same range
+		 * or fault is skipped to recover later
+		 */
+		pdd = svm_range_get_pdd_by_adev(prange, adev);
+	else
+		/* fault recovered
+		 * or fault cannot recover because GPU no access on the range
+		 */
+		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
+
+	if (pdd)
+		WRITE_ONCE(pdd->faults, pdd->faults + 1);
+}
+
 int
 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 			uint64_t addr)
@@ -2320,7 +2359,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 	struct svm_range *prange;
 	struct kfd_process *p;
 	uint64_t timestamp;
-	int32_t best_loc, gpuidx;
+	int32_t best_loc;
+	int32_t gpuidx = MAX_GPU_INSTANCE;
 	bool write_locked = false;
 	int r = 0;
 
@@ -2440,6 +2480,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 out_unlock_svms:
 	mutex_unlock(&svms->lock);
 	mmap_read_unlock(mm);
+
+	svm_range_count_fault(adev, p, prange, gpuidx);
+
 	mmput(mm);
 out:
 	kfd_unref_process(p);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 0c0fc399395e..a9af03994d1a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -174,6 +174,8 @@ void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
 			 unsigned long offset, unsigned long npages);
 void svm_range_free_dma_mappings(struct svm_range *prange);
 void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm);
+struct kfd_process_device *
+svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev);
 
 /* SVM API and HMM page migration work together, device memory type
  * is initialized to not 0 when page migration register device memory.
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 4/4] drm/amdkfd: implement counters for vm fault and migration
  2021-06-23 15:02 ` [PATCH v2 " Philip Yang
@ 2021-06-23 18:52   ` Felix Kuehling
  0 siblings, 0 replies; 8+ messages in thread
From: Felix Kuehling @ 2021-06-23 18:52 UTC (permalink / raw)
  To: Philip Yang, amd-gfx

On 2021-06-23 11:02 a.m., Philip Yang wrote:
> Add helper function to get process device data structure from adev to
> update counters.
>
> Update vm faults, page_in, page_out counters will no be executed in
> parallel, use WRITE_ONCE to avoid any form of compiler optimizations.
>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 14 ++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 45 +++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.h     |  2 ++
>   3 files changed, 60 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> index fd8f544f0de2..45b5349283af 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> @@ -413,6 +413,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
>   			uint64_t end)
>   {
>   	uint64_t npages = (end - start) >> PAGE_SHIFT;
> +	struct kfd_process_device *pdd;
>   	struct dma_fence *mfence = NULL;
>   	struct migrate_vma migrate;
>   	dma_addr_t *scratch;
> @@ -473,6 +474,12 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
>   out_free:
>   	kvfree(buf);
>   out:
> +	if (!r) {
> +		pdd = svm_range_get_pdd_by_adev(prange, adev);
> +		if (pdd)
> +			WRITE_ONCE(pdd->page_in, pdd->page_in + migrate.cpages);
> +	}
> +
>   	return r;
>   }
>   
> @@ -629,6 +636,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>   		       struct vm_area_struct *vma, uint64_t start, uint64_t end)
>   {
>   	uint64_t npages = (end - start) >> PAGE_SHIFT;
> +	struct kfd_process_device *pdd;
>   	struct dma_fence *mfence = NULL;
>   	struct migrate_vma migrate;
>   	dma_addr_t *scratch;
> @@ -678,6 +686,12 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>   out_free:
>   	kvfree(buf);
>   out:
> +	if (!r) {
> +		pdd = svm_range_get_pdd_by_adev(prange, adev);
> +		if (pdd)
> +			WRITE_ONCE(pdd->page_out,
> +				   pdd->page_out + migrate.cpages);
> +	}
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 7f5ebc00b349..00b46846a5e0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -564,6 +564,24 @@ svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
>   	return (struct amdgpu_device *)pdd->dev->kgd;
>   }
>   
> +struct kfd_process_device *
> +svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev)
> +{
> +	struct kfd_process *p;
> +	int32_t gpu_idx, gpuid;
> +	int r;
> +
> +	p = container_of(prange->svms, struct kfd_process, svms);
> +
> +	r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpu_idx);
> +	if (r) {
> +		pr_debug("failed to get device id by adev %p\n", adev);
> +		return NULL;
> +	}
> +
> +	return kfd_process_device_from_gpuidx(p, gpu_idx);
> +}
> +
>   static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
>   {
>   	struct ttm_operation_ctx ctx = { false, false };
> @@ -2311,6 +2329,27 @@ static bool svm_range_skip_recover(struct svm_range *prange)
>   	return false;
>   }
>   
> +static void
> +svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
> +		      struct svm_range *prange, int32_t gpuidx)
> +{
> +	struct kfd_process_device *pdd;
> +
> +	if (gpuidx == MAX_GPU_INSTANCE)
> +		/* fault is on different page of same range
> +		 * or fault is skipped to recover later
> +		 */
> +		pdd = svm_range_get_pdd_by_adev(prange, adev);
> +	else
> +		/* fault recovered
> +		 * or fault cannot recover because GPU no access on the range
> +		 */
> +		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
> +
> +	if (pdd)
> +		WRITE_ONCE(pdd->faults, pdd->faults + 1);
> +}
> +
>   int
>   svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>   			uint64_t addr)
> @@ -2320,7 +2359,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>   	struct svm_range *prange;
>   	struct kfd_process *p;
>   	uint64_t timestamp;
> -	int32_t best_loc, gpuidx;
> +	int32_t best_loc;
> +	int32_t gpuidx = MAX_GPU_INSTANCE;
>   	bool write_locked = false;
>   	int r = 0;
>   
> @@ -2440,6 +2480,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>   out_unlock_svms:
>   	mutex_unlock(&svms->lock);
>   	mmap_read_unlock(mm);
> +
> +	svm_range_count_fault(adev, p, prange, gpuidx);
> +
>   	mmput(mm);
>   out:
>   	kfd_unref_process(p);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 0c0fc399395e..a9af03994d1a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -174,6 +174,8 @@ void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
>   			 unsigned long offset, unsigned long npages);
>   void svm_range_free_dma_mappings(struct svm_range *prange);
>   void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm);
> +struct kfd_process_device *
> +svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev);
>   
>   /* SVM API and HMM page migration work together, device memory type
>    * is initialized to not 0 when page migration register device memory.
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2021-06-23 18:52 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-22 13:32 [PATCH 1/4] drm/amdkfd: add helper function for kfd sysfs create Philip Yang
2021-06-22 13:32 ` [PATCH 2/4] drm/amdkfd: fix sysfs kobj leak Philip Yang
2021-06-22 13:32 ` [PATCH 3/4] drm/amdkfd: add sysfs counters for vm fault and migration Philip Yang
2021-06-22 13:32 ` [PATCH 4/4] drm/amdkfd: implement " Philip Yang
2021-06-22 23:31   ` Felix Kuehling
2021-06-23 14:54     ` philip yang
2021-06-23 15:02 ` [PATCH v2 " Philip Yang
2021-06-23 18:52   ` Felix Kuehling

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.