All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdkfd: Add some eveiction debugging code
@ 2020-09-10 18:54 Philip Cox
  2020-09-10 18:54 ` [PATCH 2/3] drm/amdkfd: Add process eviction counters to sysfs Philip Cox
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Philip Cox @ 2020-09-10 18:54 UTC (permalink / raw)
  To: amd-gfx
  Cc: Philip Cox, Felix.Kuehling, Tony.Tye, Laurent.Morichetti, Jonathan.Kim

Extending the module parameter debug_evictions to also print a stack
trace when the eviction code path is called.

Signed-off-by: Philip Cox <Philip.Cox@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c              | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 20ef048d6a03..cafbc3aa980a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1966,6 +1966,7 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm,
 
 	if (!p)
 		return -EINVAL;
+	WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
 	pdd = kfd_get_process_device_data(dqm->dev, p);
 	if (pdd)
 		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a0e12a79ab7d..1e15aa7d8ae8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1488,6 +1488,7 @@ void kfd_suspend_all_processes(void)
 	unsigned int temp;
 	int idx = srcu_read_lock(&kfd_processes_srcu);
 
+	WARN(debug_evictions, "Evicting all processes");
 	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
 		cancel_delayed_work_sync(&p->eviction_work);
 		cancel_delayed_work_sync(&p->restore_work);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/3] drm/amdkfd: Add process eviction counters to sysfs
  2020-09-10 18:54 [PATCH 1/3] drm/amdkfd: Add some eveiction debugging code Philip Cox
@ 2020-09-10 18:54 ` Philip Cox
  2020-09-10 22:43   ` Felix Kuehling
  2020-09-10 18:54 ` [PATCH 3/3] drm/amdkfd: Reduce eviction/restore message levels Philip Cox
  2020-09-10 22:31 ` [PATCH 1/3] drm/amdkfd: Add some eveiction debugging code Felix Kuehling
  2 siblings, 1 reply; 6+ messages in thread
From: Philip Cox @ 2020-09-10 18:54 UTC (permalink / raw)
  To: amd-gfx
  Cc: Philip Cox, Felix.Kuehling, Tony.Tye, Laurent.Morichetti, Jonathan.Kim

Add per-process eviction counters to sysfs to keep track of
how many eviction events have happened for each process.

Signed-off-by: Philip Cox <Philip.Cox@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  15 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 117 +++++++++++++++++++++++
 2 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 023629f28495..f6902e9c6077 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -631,7 +631,7 @@ enum kfd_pdd_bound {
 	PDD_BOUND_SUSPENDED,
 };
 
-#define MAX_SYSFS_FILENAME_LEN 11
+#define MAX_SYSFS_FILENAME_LEN 15
 
 /*
  * SDMA counter runs at 100MHz frequency.
@@ -692,10 +692,19 @@ struct kfd_process_device {
 	uint64_t sdma_past_activity_counter;
 	struct attribute attr_sdma;
 	char sdma_filename[MAX_SYSFS_FILENAME_LEN];
+
+	/* Eviction activity tracking */
+	atomic64_t evict_duration_counter;
+	struct attribute attr_evict;
+	char evict_filename[MAX_SYSFS_FILENAME_LEN];
 };
 
 #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
 
+struct kobj_counters_list {
+	struct list_head counters_list;
+	struct kobject *kobj;
+};
 /* Process data */
 struct kfd_process {
 	/*
@@ -766,13 +775,15 @@ struct kfd_process {
 	/* seqno of the last scheduled eviction */
 	unsigned int last_eviction_seqno;
 	/* Approx. the last timestamp (in jiffies) when the process was
-	 * restored after an eviction
+	 * restored or evicted.
 	 */
 	unsigned long last_restore_timestamp;
+	unsigned long last_evict_timestamp;
 
 	/* Kobj for our procfs */
 	struct kobject *kobj;
 	struct kobject *kobj_queues;
+	struct kobj_counters_list counters;
 	struct attribute attr_pasid;
 };
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 1e15aa7d8ae8..2a81d5a790a0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -344,6 +344,26 @@ static ssize_t kfd_procfs_queue_show(struct kobject *kobj,
 
 	return 0;
 }
+static ssize_t kfd_procfs_counters_show(struct kobject *kobj,
+				     struct attribute *attr, char *buffer)
+{
+	if (strcmp(attr->name, "evictions") == 0) {
+		struct kfd_process_device *pdd = container_of(attr,
+				struct kfd_process_device,
+				attr_evict);
+		uint64_t evict_jiffies;
+
+		evict_jiffies = atomic64_read(&pdd->evict_duration_counter);
+
+		return snprintf(buffer,
+				PAGE_SIZE,
+				"%llu\n",
+				jiffies64_to_msecs(evict_jiffies));
+	} else
+		pr_err("Invalid attribute");
+
+	return 0;
+}
 
 static struct attribute attr_queue_size = {
 	.name = "size",
@@ -376,6 +396,19 @@ static struct kobj_type procfs_queue_type = {
 	.default_attrs = procfs_queue_attrs,
 };
 
+static const struct sysfs_ops procfs_counters_ops = {
+	.show = kfd_procfs_counters_show,
+};
+
+static struct attribute *procfs_counters_attrs[] = {
+	NULL
+};
+
+static struct kobj_type procfs_counters_type = {
+	.sysfs_ops = &procfs_counters_ops,
+	.default_attrs = procfs_counters_attrs,
+};
+
 int kfd_procfs_add_queue(struct queue *q)
 {
 	struct kfd_process *proc;
@@ -417,6 +450,70 @@ static int kfd_sysfs_create_file(struct kfd_process *p, struct attribute *attr,
 	return ret;
 }
 
+static int kfd_procfs_add_sysfs_counters(struct kfd_process *p)
+{
+	int ret = 0;
+	struct kfd_process_device *pdd;
+	char counter_dir_filename[MAX_SYSFS_FILENAME_LEN];
+
+	if (!p)
+		return -EINVAL;
+
+	if (!p->kobj)
+		return -EFAULT;
+
+	INIT_LIST_HEAD(&p->counters.counters_list);
+	/*
+	 * Create sysfs files for each GPU:
+	 * - proc/<pid>/counters_<gpuid>/
+	 * - proc/<pid>/counters_<gpuid>/evictions
+	 */
+	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+		struct kobj_counters_list *kobj_counter;
+
+		kobj_counter = kzalloc(sizeof(*kobj_counter),
+				GFP_KERNEL);
+		if (!kobj_counter)
+			return -ENOMEM;
+
+		snprintf(counter_dir_filename, MAX_SYSFS_FILENAME_LEN,
+				"counters_%u", pdd->dev->id);
+		kobj_counter->kobj = kfd_alloc_struct(kobj_counter->kobj);
+		if (!kobj_counter->kobj) {
+			kfree(kobj_counter);
+			return -ENOMEM;
+		}
+
+		ret = kobject_init_and_add(kobj_counter->kobj,
+						&procfs_counters_type,
+						p->kobj,
+						counter_dir_filename);
+
+		if (ret) {
+			pr_warn("Creating KFD proc/counters_%s folder failed",
+					counter_dir_filename);
+			kobject_put(kobj_counter->kobj);
+			kfree(kobj_counter);
+			goto err;
+		}
+
+		list_add(&kobj_counter->counters_list,
+				&p->counters.counters_list);
+		snprintf(pdd->evict_filename,
+				MAX_SYSFS_FILENAME_LEN,
+				"evictions");
+		pdd->attr_evict.name = pdd->evict_filename;
+		pdd->attr_evict.mode = KFD_SYSFS_FILE_MODE;
+		sysfs_attr_init(&pdd->attr_evict);
+		ret = sysfs_create_file(kobj_counter->kobj, &pdd->attr_evict);
+		if (ret)
+			pr_warn("Creating eviction counter for gpuid %d failed",
+				(int)pdd->dev->id);
+	}
+err:
+	return ret;
+}
+
 static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
 {
 	int ret = 0;
@@ -660,6 +757,11 @@ struct kfd_process *kfd_create_process(struct file *filep)
 		if (!process->kobj_queues)
 			pr_warn("Creating KFD proc/queues folder failed");
 
+		ret = kfd_procfs_add_sysfs_counters(process);
+		if (ret)
+			pr_warn("Creating sysfs counter dir for pid %d failed",
+				(int)process->lead_thread->pid);
+
 		ret = kfd_procfs_add_sysfs_files(process);
 		if (ret)
 			pr_warn("Creating sysfs usage file for pid %d failed",
@@ -806,6 +908,7 @@ static void kfd_process_wq_release(struct work_struct *work)
 					     release_work);
 	struct kfd_process_device *pdd;
 
+	struct kobj_counters_list *counters;
 	/* Remove the procfs files */
 	if (p->kobj) {
 		sysfs_remove_file(p->kobj, &p->attr_pasid);
@@ -818,6 +921,14 @@ static void kfd_process_wq_release(struct work_struct *work)
 			sysfs_remove_file(p->kobj, &pdd->attr_sdma);
 		}
 
+		list_for_each_entry(counters,
+				&p->counters.counters_list,
+				counters_list) {
+			sysfs_remove_file(p->kobj, &pdd->attr_evict);
+			kobject_del(counters->kobj);
+			kobject_put(counters->kobj);
+		}
+
 		kobject_del(p->kobj);
 		kobject_put(p->kobj);
 		p->kobj = NULL;
@@ -1125,6 +1236,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 	pdd->runtime_inuse = false;
 	pdd->vram_usage = 0;
 	pdd->sdma_past_activity_counter = 0;
+	atomic64_set(&pdd->evict_duration_counter, 0);
 	list_add(&pdd->per_device_list, &p->per_device_data);
 
 	/* Init idr used for memory handle translation */
@@ -1388,7 +1500,9 @@ int kfd_process_restore_queues(struct kfd_process *p)
 {
 	struct kfd_process_device *pdd;
 	int r, ret = 0;
+	uint64_t eviction_duration;
 
+	eviction_duration = p->last_restore_timestamp - p->last_evict_timestamp;
 	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
 		r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
 							      &pdd->qpd);
@@ -1397,6 +1511,7 @@ int kfd_process_restore_queues(struct kfd_process *p)
 			if (!ret)
 				ret = r;
 		}
+		atomic64_add(eviction_duration, &pdd->evict_duration_counter);
 	}
 
 	return ret;
@@ -1425,6 +1540,8 @@ static void evict_process_worker(struct work_struct *work)
 	 */
 	flush_delayed_work(&p->restore_work);
 
+	p->last_evict_timestamp = get_jiffies_64();
+
 	pr_debug("Started evicting pasid 0x%x\n", p->pasid);
 	ret = kfd_process_evict_queues(p);
 	if (!ret) {
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/3] drm/amdkfd: Reduce eviction/restore message levels
  2020-09-10 18:54 [PATCH 1/3] drm/amdkfd: Add some eveiction debugging code Philip Cox
  2020-09-10 18:54 ` [PATCH 2/3] drm/amdkfd: Add process eviction counters to sysfs Philip Cox
@ 2020-09-10 18:54 ` Philip Cox
  2020-09-10 22:43   ` Felix Kuehling
  2020-09-10 22:31 ` [PATCH 1/3] drm/amdkfd: Add some eveiction debugging code Felix Kuehling
  2 siblings, 1 reply; 6+ messages in thread
From: Philip Cox @ 2020-09-10 18:54 UTC (permalink / raw)
  To: amd-gfx
  Cc: Philip Cox, Felix.Kuehling, Tony.Tye, Laurent.Morichetti, Jonathan.Kim

Reduce the eviction and restore messages from INFO level to DEBUG level.

Signed-off-by: Philip Cox <Philip.Cox@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index cafbc3aa980a..0d2bb20b49b7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -650,7 +650,7 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 		goto out;
 
 	pdd = qpd_to_pdd(qpd);
-	pr_info_ratelimited("Evicting PASID 0x%x queues\n",
+	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Mark all queues as evicted. Deactivate all active queues on
@@ -700,7 +700,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 
 	pdd = qpd_to_pdd(qpd);
-	pr_info_ratelimited("Evicting PASID 0x%x queues\n",
+	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Mark all queues as evicted. Deactivate all active queues on
@@ -746,7 +746,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 		goto out;
 	}
 
-	pr_info_ratelimited("Restoring PASID 0x%x queues\n",
+	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Update PD Base in QPD */
@@ -826,7 +826,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 	}
 
-	pr_info_ratelimited("Restoring PASID 0x%x queues\n",
+	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Update PD Base in QPD */
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/3] drm/amdkfd: Add some eveiction debugging code
  2020-09-10 18:54 [PATCH 1/3] drm/amdkfd: Add some eveiction debugging code Philip Cox
  2020-09-10 18:54 ` [PATCH 2/3] drm/amdkfd: Add process eviction counters to sysfs Philip Cox
  2020-09-10 18:54 ` [PATCH 3/3] drm/amdkfd: Reduce eviction/restore message levels Philip Cox
@ 2020-09-10 22:31 ` Felix Kuehling
  2 siblings, 0 replies; 6+ messages in thread
From: Felix Kuehling @ 2020-09-10 22:31 UTC (permalink / raw)
  To: Philip Cox, amd-gfx; +Cc: Tony.Tye, Laurent.Morichetti, Jonathan.Kim


Am 2020-09-10 um 2:54 p.m. schrieb Philip Cox:
> Extending the module parameter debug_evictions to also print a stack
> trace when the eviction code path is called.
>
> Signed-off-by: Philip Cox <Philip.Cox@amd.com>

This patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c              | 1 +
>  2 files changed, 2 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 20ef048d6a03..cafbc3aa980a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1966,6 +1966,7 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm,
>  
>  	if (!p)
>  		return -EINVAL;
> +	WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
>  	pdd = kfd_get_process_device_data(dqm->dev, p);
>  	if (pdd)
>  		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index a0e12a79ab7d..1e15aa7d8ae8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1488,6 +1488,7 @@ void kfd_suspend_all_processes(void)
>  	unsigned int temp;
>  	int idx = srcu_read_lock(&kfd_processes_srcu);
>  
> +	WARN(debug_evictions, "Evicting all processes");
>  	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>  		cancel_delayed_work_sync(&p->eviction_work);
>  		cancel_delayed_work_sync(&p->restore_work);
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2/3] drm/amdkfd: Add process eviction counters to sysfs
  2020-09-10 18:54 ` [PATCH 2/3] drm/amdkfd: Add process eviction counters to sysfs Philip Cox
@ 2020-09-10 22:43   ` Felix Kuehling
  0 siblings, 0 replies; 6+ messages in thread
From: Felix Kuehling @ 2020-09-10 22:43 UTC (permalink / raw)
  To: Philip Cox, amd-gfx; +Cc: Tony.Tye, Laurent.Morichetti, Jonathan.Kim


Am 2020-09-10 um 2:54 p.m. schrieb Philip Cox:
> Add per-process eviction counters to sysfs to keep track of
> how many eviction events have happened for each process.
>
> Signed-off-by: Philip Cox <Philip.Cox@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  15 ++-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 117 +++++++++++++++++++++++
>  2 files changed, 130 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 023629f28495..f6902e9c6077 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -631,7 +631,7 @@ enum kfd_pdd_bound {
>  	PDD_BOUND_SUSPENDED,
>  };
>  
> -#define MAX_SYSFS_FILENAME_LEN 11
> +#define MAX_SYSFS_FILENAME_LEN 15
>  
>  /*
>   * SDMA counter runs at 100MHz frequency.
> @@ -692,10 +692,19 @@ struct kfd_process_device {
>  	uint64_t sdma_past_activity_counter;
>  	struct attribute attr_sdma;
>  	char sdma_filename[MAX_SYSFS_FILENAME_LEN];
> +
> +	/* Eviction activity tracking */
> +	atomic64_t evict_duration_counter;
> +	struct attribute attr_evict;
> +	char evict_filename[MAX_SYSFS_FILENAME_LEN];

I don't think this name needs to be in a per-pdd variable, because it's
the same for all pdds. see below.


>  };
>  
>  #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
>  
> +struct kobj_counters_list {
> +	struct list_head counters_list;
> +	struct kobject *kobj;
> +};
>  /* Process data */
>  struct kfd_process {
>  	/*
> @@ -766,13 +775,15 @@ struct kfd_process {
>  	/* seqno of the last scheduled eviction */
>  	unsigned int last_eviction_seqno;
>  	/* Approx. the last timestamp (in jiffies) when the process was
> -	 * restored after an eviction
> +	 * restored or evicted.
>  	 */
>  	unsigned long last_restore_timestamp;
> +	unsigned long last_evict_timestamp;
>  
>  	/* Kobj for our procfs */
>  	struct kobject *kobj;
>  	struct kobject *kobj_queues;
> +	struct kobj_counters_list counters;
>  	struct attribute attr_pasid;
>  };
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 1e15aa7d8ae8..2a81d5a790a0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -344,6 +344,26 @@ static ssize_t kfd_procfs_queue_show(struct kobject *kobj,
>  
>  	return 0;
>  }
> +static ssize_t kfd_procfs_counters_show(struct kobject *kobj,
> +				     struct attribute *attr, char *buffer)
> +{
> +	if (strcmp(attr->name, "evictions") == 0) {
> +		struct kfd_process_device *pdd = container_of(attr,
> +				struct kfd_process_device,
> +				attr_evict);
> +		uint64_t evict_jiffies;
> +
> +		evict_jiffies = atomic64_read(&pdd->evict_duration_counter);
> +
> +		return snprintf(buffer,
> +				PAGE_SIZE,
> +				"%llu\n",
> +				jiffies64_to_msecs(evict_jiffies));
> +	} else
> +		pr_err("Invalid attribute");
> +
> +	return 0;
> +}
>  
>  static struct attribute attr_queue_size = {
>  	.name = "size",
> @@ -376,6 +396,19 @@ static struct kobj_type procfs_queue_type = {
>  	.default_attrs = procfs_queue_attrs,
>  };
>  
> +static const struct sysfs_ops procfs_counters_ops = {
> +	.show = kfd_procfs_counters_show,
> +};
> +
> +static struct attribute *procfs_counters_attrs[] = {
> +	NULL
> +};
> +
> +static struct kobj_type procfs_counters_type = {
> +	.sysfs_ops = &procfs_counters_ops,
> +	.default_attrs = procfs_counters_attrs,
> +};
> +
>  int kfd_procfs_add_queue(struct queue *q)
>  {
>  	struct kfd_process *proc;
> @@ -417,6 +450,70 @@ static int kfd_sysfs_create_file(struct kfd_process *p, struct attribute *attr,
>  	return ret;
>  }
>  
> +static int kfd_procfs_add_sysfs_counters(struct kfd_process *p)
> +{
> +	int ret = 0;
> +	struct kfd_process_device *pdd;
> +	char counter_dir_filename[MAX_SYSFS_FILENAME_LEN];
> +
> +	if (!p)
> +		return -EINVAL;
> +
> +	if (!p->kobj)
> +		return -EFAULT;
> +
> +	INIT_LIST_HEAD(&p->counters.counters_list);
> +	/*
> +	 * Create sysfs files for each GPU:
> +	 * - proc/<pid>/counters_<gpuid>/
> +	 * - proc/<pid>/counters_<gpuid>/evictions
> +	 */
> +	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
> +		struct kobj_counters_list *kobj_counter;
> +
> +		kobj_counter = kzalloc(sizeof(*kobj_counter),
> +				GFP_KERNEL);
> +		if (!kobj_counter)
> +			return -ENOMEM;
> +
> +		snprintf(counter_dir_filename, MAX_SYSFS_FILENAME_LEN,
> +				"counters_%u", pdd->dev->id);

As discussed on another email thread, let's rename this to stats_%u, as
the eviction file isn't technically a counter and we're thinking of
adding other stats that aren't counters either (e.g. CU occupancy).


> +		kobj_counter->kobj = kfd_alloc_struct(kobj_counter->kobj);
> +		if (!kobj_counter->kobj) {
> +			kfree(kobj_counter);
> +			return -ENOMEM;
> +		}
> +
> +		ret = kobject_init_and_add(kobj_counter->kobj,
> +						&procfs_counters_type,
> +						p->kobj,
> +						counter_dir_filename);
> +
> +		if (ret) {
> +			pr_warn("Creating KFD proc/counters_%s folder failed",
> +					counter_dir_filename);
> +			kobject_put(kobj_counter->kobj);
> +			kfree(kobj_counter);
> +			goto err;
> +		}
> +
> +		list_add(&kobj_counter->counters_list,
> +				&p->counters.counters_list);
> +		snprintf(pdd->evict_filename,
> +				MAX_SYSFS_FILENAME_LEN,
> +				"evictions");
> +		pdd->attr_evict.name = pdd->evict_filename;

Let's rename this to "evicted_ms" to better describe what is counted
here. Also, you don't need this in a per-pdd variable, because the name
is the same for every pdd. I think you can just assign the string
literal directly:

		pdd->attr_evict.name = "evicted_ms";


> +		pdd->attr_evict.mode = KFD_SYSFS_FILE_MODE;
> +		sysfs_attr_init(&pdd->attr_evict);
> +		ret = sysfs_create_file(kobj_counter->kobj, &pdd->attr_evict);
> +		if (ret)
> +			pr_warn("Creating eviction counter for gpuid %d failed",
> +				(int)pdd->dev->id);
> +	}
> +err:
> +	return ret;
> +}
> +
>  static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
>  {
>  	int ret = 0;
> @@ -660,6 +757,11 @@ struct kfd_process *kfd_create_process(struct file *filep)
>  		if (!process->kobj_queues)
>  			pr_warn("Creating KFD proc/queues folder failed");
>  
> +		ret = kfd_procfs_add_sysfs_counters(process);
> +		if (ret)
> +			pr_warn("Creating sysfs counter dir for pid %d failed",
> +				(int)process->lead_thread->pid);
> +
>  		ret = kfd_procfs_add_sysfs_files(process);
>  		if (ret)
>  			pr_warn("Creating sysfs usage file for pid %d failed",
> @@ -806,6 +908,7 @@ static void kfd_process_wq_release(struct work_struct *work)
>  					     release_work);
>  	struct kfd_process_device *pdd;
>  
> +	struct kobj_counters_list *counters;
>  	/* Remove the procfs files */
>  	if (p->kobj) {
>  		sysfs_remove_file(p->kobj, &p->attr_pasid);
> @@ -818,6 +921,14 @@ static void kfd_process_wq_release(struct work_struct *work)
>  			sysfs_remove_file(p->kobj, &pdd->attr_sdma);
>  		}
>  
> +		list_for_each_entry(counters,
> +				&p->counters.counters_list,
> +				counters_list) {
> +			sysfs_remove_file(p->kobj, &pdd->attr_evict);
> +			kobject_del(counters->kobj);
> +			kobject_put(counters->kobj);
> +		}
> +
>  		kobject_del(p->kobj);
>  		kobject_put(p->kobj);
>  		p->kobj = NULL;
> @@ -1125,6 +1236,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>  	pdd->runtime_inuse = false;
>  	pdd->vram_usage = 0;
>  	pdd->sdma_past_activity_counter = 0;
> +	atomic64_set(&pdd->evict_duration_counter, 0);
>  	list_add(&pdd->per_device_list, &p->per_device_data);
>  
>  	/* Init idr used for memory handle translation */
> @@ -1388,7 +1500,9 @@ int kfd_process_restore_queues(struct kfd_process *p)
>  {
>  	struct kfd_process_device *pdd;
>  	int r, ret = 0;
> +	uint64_t eviction_duration;
>  
> +	eviction_duration = p->last_restore_timestamp - p->last_evict_timestamp;

These timestamps only count the time that a process was evicted due to a
TTM buffer eviction. They don't include evicted time from
userptr-related MMU notifiers.

To get a reliable measure of the time that a process spends evicted, you
should get timestamps in evict_process_queues_nocpsch and
evict_process_queues_cpsch and the corresponding restore functions in
kfd_device_queue_manager.c. That's where it does the reference counting
for all the possible eviction triggers and does the actual per-pdd
eviction. You'll need to maintain a last_evicted timestamp and a
evicted_duration counter in the pdd structure, separate from the per
process eviction timestamps.

Regards,
  Felix


>  	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
>  		r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
>  							      &pdd->qpd);
> @@ -1397,6 +1511,7 @@ int kfd_process_restore_queues(struct kfd_process *p)
>  			if (!ret)
>  				ret = r;
>  		}
> +		atomic64_add(eviction_duration, &pdd->evict_duration_counter);
>  	}
>  
>  	return ret;
> @@ -1425,6 +1540,8 @@ static void evict_process_worker(struct work_struct *work)
>  	 */
>  	flush_delayed_work(&p->restore_work);
>  
> +	p->last_evict_timestamp = get_jiffies_64();
> +
>  	pr_debug("Started evicting pasid 0x%x\n", p->pasid);
>  	ret = kfd_process_evict_queues(p);
>  	if (!ret) {
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 3/3] drm/amdkfd: Reduce eviction/restore message levels
  2020-09-10 18:54 ` [PATCH 3/3] drm/amdkfd: Reduce eviction/restore message levels Philip Cox
@ 2020-09-10 22:43   ` Felix Kuehling
  0 siblings, 0 replies; 6+ messages in thread
From: Felix Kuehling @ 2020-09-10 22:43 UTC (permalink / raw)
  To: Philip Cox, amd-gfx; +Cc: Tony.Tye, Laurent.Morichetti, Jonathan.Kim

Am 2020-09-10 um 2:54 p.m. schrieb Philip Cox:
> Reduce the eviction and restore messages from INFO level to DEBUG level.
>
> Signed-off-by: Philip Cox <Philip.Cox@amd.com>

This patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index cafbc3aa980a..0d2bb20b49b7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -650,7 +650,7 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>  		goto out;
>  
>  	pdd = qpd_to_pdd(qpd);
> -	pr_info_ratelimited("Evicting PASID 0x%x queues\n",
> +	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
>  			    pdd->process->pasid);
>  
>  	/* Mark all queues as evicted. Deactivate all active queues on
> @@ -700,7 +700,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>  		goto out;
>  
>  	pdd = qpd_to_pdd(qpd);
> -	pr_info_ratelimited("Evicting PASID 0x%x queues\n",
> +	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
>  			    pdd->process->pasid);
>  
>  	/* Mark all queues as evicted. Deactivate all active queues on
> @@ -746,7 +746,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>  		goto out;
>  	}
>  
> -	pr_info_ratelimited("Restoring PASID 0x%x queues\n",
> +	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
>  			    pdd->process->pasid);
>  
>  	/* Update PD Base in QPD */
> @@ -826,7 +826,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>  		goto out;
>  	}
>  
> -	pr_info_ratelimited("Restoring PASID 0x%x queues\n",
> +	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
>  			    pdd->process->pasid);
>  
>  	/* Update PD Base in QPD */
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-09-10 22:44 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-10 18:54 [PATCH 1/3] drm/amdkfd: Add some eveiction debugging code Philip Cox
2020-09-10 18:54 ` [PATCH 2/3] drm/amdkfd: Add process eviction counters to sysfs Philip Cox
2020-09-10 22:43   ` Felix Kuehling
2020-09-10 18:54 ` [PATCH 3/3] drm/amdkfd: Reduce eviction/restore message levels Philip Cox
2020-09-10 22:43   ` Felix Kuehling
2020-09-10 22:31 ` [PATCH 1/3] drm/amdkfd: Add some eveiction debugging code Felix Kuehling

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.