All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] drm/amdkfd: Fixed kfd_process cleanup on module exit.
@ 2023-03-13 18:35 David Belanger
  2023-03-13 18:51 ` Felix Kuehling
  0 siblings, 1 reply; 2+ messages in thread
From: David Belanger @ 2023-03-13 18:35 UTC (permalink / raw)
  To: amd-gfx; +Cc: David Belanger

Handle case when module is unloaded (kfd_exit) before a process space
(mm_struct) is released.

v2: Fixed potential race conditions by removing all kfd_process from
the process table first, then working on releasing the resources.

v3: Fixed loop element access / synchronization.  Fixed extra empty lines.

Signed-off-by: David Belanger <david.belanger@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_module.c  |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 75 +++++++++++++++++++++---
 3 files changed, 70 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index 09b966dc3768..aee2212e52f6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -77,6 +77,7 @@ static int kfd_init(void)
 
 static void kfd_exit(void)
 {
+	kfd_cleanup_processes();
 	kfd_debugfs_fini();
 	kfd_process_destroy_wq();
 	kfd_procfs_shutdown();
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index bfa30d12406b..7e4d992e48b3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -928,6 +928,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
 
 int kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
+void kfd_cleanup_processes(void);
 struct kfd_process *kfd_create_process(struct file *filep);
 struct kfd_process *kfd_get_process(const struct task_struct *task);
 struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index ebabe92f7edb..5614ef2ac49e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1167,6 +1167,17 @@ static void kfd_process_free_notifier(struct mmu_notifier *mn)
 	kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier));
 }
 
+static void kfd_process_notifier_release_internal(struct kfd_process *p)
+{
+	cancel_delayed_work_sync(&p->eviction_work);
+	cancel_delayed_work_sync(&p->restore_work);
+
+	/* Indicate to other users that MM is no longer valid */
+	p->mm = NULL;
+
+	mmu_notifier_put(&p->mmu_notifier);
+}
+
 static void kfd_process_notifier_release(struct mmu_notifier *mn,
 					struct mm_struct *mm)
 {
@@ -1181,17 +1192,22 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 		return;
 
 	mutex_lock(&kfd_processes_mutex);
+	/*
+	 * Do early return if table is empty.
+	 *
+	 * This could potentially happen if this function is called concurrently
+	 * by mmu_notifier and by kfd_cleanup_processes.
+	 *
+	 */
+	if (hash_empty(kfd_processes_table)) {
+		mutex_unlock(&kfd_processes_mutex);
+		return;
+	}
 	hash_del_rcu(&p->kfd_processes);
 	mutex_unlock(&kfd_processes_mutex);
 	synchronize_srcu(&kfd_processes_srcu);
 
-	cancel_delayed_work_sync(&p->eviction_work);
-	cancel_delayed_work_sync(&p->restore_work);
-
-	/* Indicate to other users that MM is no longer valid */
-	p->mm = NULL;
-
-	mmu_notifier_put(&p->mmu_notifier);
+	kfd_process_notifier_release_internal(p);
 }
 
 static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
@@ -1200,6 +1216,51 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
 	.free_notifier = kfd_process_free_notifier,
 };
 
+void kfd_cleanup_processes(void)
+{
+	/*
+	 * This code handles the case when driver is being unloaded before all
+	 * mm_struct are released.  We need to safely free the kfd_process and
+	 * avoid race conditions with mmu_notifier that might try to free them.
+	 *
+	 */
+
+	struct kfd_process *p;
+	struct hlist_node *p_temp;
+	unsigned int temp;
+	HLIST_HEAD(cleanup_list);
+
+	/*
+	 * Move all remaining kfd_process from the process table to a
+	 * temp list for processing.   Once done, callback from mmu_notifier
+	 * release will not see the kfd_process in the table and do early return,
+	 * avoiding double free issues.
+	 */
+	mutex_lock(&kfd_processes_mutex);
+	hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) {
+		hash_del_rcu(&p->kfd_processes);
+		synchronize_srcu(&kfd_processes_srcu);
+		hlist_add_head(&p->kfd_processes, &cleanup_list);
+	}
+	mutex_unlock(&kfd_processes_mutex);
+
+
+	/*
+	 * Release resources for all outstanding kfd_process collected.
+	 */
+	hlist_for_each_entry_safe(p, p_temp, &cleanup_list, kfd_processes)
+		kfd_process_notifier_release_internal(p);
+
+	/*
+	 * Must be called after all mmu_notifier_put are done and before
+	 * kfd_process_wq is released.
+	 *
+	 * Ensures that all outstanding free_notifier get called, triggering
+	 * the release of the kfd_process struct.
+	 */
+	mmu_notifier_synchronize();
+}
+
 static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
 {
 	unsigned long  offset;
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH v3] drm/amdkfd: Fixed kfd_process cleanup on module exit.
  2023-03-13 18:35 [PATCH v3] drm/amdkfd: Fixed kfd_process cleanup on module exit David Belanger
@ 2023-03-13 18:51 ` Felix Kuehling
  0 siblings, 0 replies; 2+ messages in thread
From: Felix Kuehling @ 2023-03-13 18:51 UTC (permalink / raw)
  To: amd-gfx, Belanger, David


Am 2023-03-13 um 14:35 schrieb David Belanger:
> Handle case when module is unloaded (kfd_exit) before a process space
> (mm_struct) is released.
>
> v2: Fixed potential race conditions by removing all kfd_process from
> the process table first, then working on releasing the resources.
>
> v3: Fixed loop element access / synchronization.  Fixed extra empty lines.
>
> Signed-off-by: David Belanger <david.belanger@amd.com>

This looks good. I'd make the comments slightly less verbose. See 
inline. With that fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_module.c  |  1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 75 +++++++++++++++++++++---
>   3 files changed, 70 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
> index 09b966dc3768..aee2212e52f6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
> @@ -77,6 +77,7 @@ static int kfd_init(void)
>   
>   static void kfd_exit(void)
>   {
> +	kfd_cleanup_processes();
>   	kfd_debugfs_fini();
>   	kfd_process_destroy_wq();
>   	kfd_procfs_shutdown();
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index bfa30d12406b..7e4d992e48b3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -928,6 +928,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
>   
>   int kfd_process_create_wq(void);
>   void kfd_process_destroy_wq(void);
> +void kfd_cleanup_processes(void);
>   struct kfd_process *kfd_create_process(struct file *filep);
>   struct kfd_process *kfd_get_process(const struct task_struct *task);
>   struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index ebabe92f7edb..5614ef2ac49e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1167,6 +1167,17 @@ static void kfd_process_free_notifier(struct mmu_notifier *mn)
>   	kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier));
>   }
>   
> +static void kfd_process_notifier_release_internal(struct kfd_process *p)
> +{
> +	cancel_delayed_work_sync(&p->eviction_work);
> +	cancel_delayed_work_sync(&p->restore_work);
> +
> +	/* Indicate to other users that MM is no longer valid */
> +	p->mm = NULL;
> +
> +	mmu_notifier_put(&p->mmu_notifier);
> +}
> +
>   static void kfd_process_notifier_release(struct mmu_notifier *mn,
>   					struct mm_struct *mm)
>   {
> @@ -1181,17 +1192,22 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
>   		return;
>   
>   	mutex_lock(&kfd_processes_mutex);
> +	/*
> +	 * Do early return if table is empty.
> +	 *
> +	 * This could potentially happen if this function is called concurrently
> +	 * by mmu_notifier and by kfd_cleanup_processes.
> +	 *
> +	 */
> +	if (hash_empty(kfd_processes_table)) {
> +		mutex_unlock(&kfd_processes_mutex);
> +		return;
> +	}
>   	hash_del_rcu(&p->kfd_processes);
>   	mutex_unlock(&kfd_processes_mutex);
>   	synchronize_srcu(&kfd_processes_srcu);
>   
> -	cancel_delayed_work_sync(&p->eviction_work);
> -	cancel_delayed_work_sync(&p->restore_work);
> -
> -	/* Indicate to other users that MM is no longer valid */
> -	p->mm = NULL;
> -
> -	mmu_notifier_put(&p->mmu_notifier);
> +	kfd_process_notifier_release_internal(p);
>   }
>   
>   static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
> @@ -1200,6 +1216,51 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
>   	.free_notifier = kfd_process_free_notifier,
>   };
>   
> +void kfd_cleanup_processes(void)
> +{
> +	/*
> +	 * This code handles the case when driver is being unloaded before all
> +	 * mm_struct are released.  We need to safely free the kfd_process and
> +	 * avoid race conditions with mmu_notifier that might try to free them.
> +	 *
> +	 */

Comments describing a function's overall purpose usually go before the 
function.


> +
> +	struct kfd_process *p;
> +	struct hlist_node *p_temp;
> +	unsigned int temp;
> +	HLIST_HEAD(cleanup_list);
> +
> +	/*
> +	 * Move all remaining kfd_process from the process table to a
> +	 * temp list for processing.   Once done, callback from mmu_notifier
> +	 * release will not see the kfd_process in the table and do early return,
> +	 * avoiding double free issues.
> +	 */
> +	mutex_lock(&kfd_processes_mutex);
> +	hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) {
> +		hash_del_rcu(&p->kfd_processes);
> +		synchronize_srcu(&kfd_processes_srcu);
> +		hlist_add_head(&p->kfd_processes, &cleanup_list);
> +	}
> +	mutex_unlock(&kfd_processes_mutex);
> +
> +
> +	/*
> +	 * Release resources for all outstanding kfd_process collected.
> +	 */

This comment is redundant. The processing of the cleanup list is already 
explained above.


> +	hlist_for_each_entry_safe(p, p_temp, &cleanup_list, kfd_processes)
> +		kfd_process_notifier_release_internal(p);
> +
> +	/*
> +	 * Must be called after all mmu_notifier_put are done and before
> +	 * kfd_process_wq is released.
> +	 *
> +	 * Ensures that all outstanding free_notifier get called, triggering
> +	 * the release of the kfd_process struct.

One of these sentences is redundant. I'd keep just the second one.

Regards,
   Felix


> +	 */
> +	mmu_notifier_synchronize();
> +}
> +
>   static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
>   {
>   	unsigned long  offset;

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-03-13 18:51 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-13 18:35 [PATCH v3] drm/amdkfd: Fixed kfd_process cleanup on module exit David Belanger
2023-03-13 18:51 ` Felix Kuehling

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.