* [PATCH 1/2] drm/amdkfd: svm deferred_list work continue cleanup after mm gone
@ 2022-01-19 16:22 Philip Yang
From: Philip Yang @ 2022-01-19 16:22 UTC
  To: amd-gfx; +Cc: ruili.ji, felix.kuehling, Philip Yang

After the mm is removed from task->mm, the deferred_list work should
continue to process deferred_range_list, whose entries may have been
split into child ranges, so that child ranges are not leaked, and it
should remove the ranges' mmu interval notifiers so that mm_count
references on the mm are not leaked. It must, however, skip updating
notifiers and inserting new notifiers, since that requires a live mm.
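
In outline, the worker then behaves like this (a condensed sketch of
the resulting code, not the exact hunks; see the diff below):

	mm = get_task_mm(p->lead_thread);
	if (!mm)	/* process exiting: the mm can no longer be pinned */
		pr_debug("svms 0x%p process mm gone\n", svms);

	/* Walk the deferred list regardless, so child ranges and mmu
	 * interval notifiers still get cleaned up; take mmap_write_lock
	 * and touch notifiers only while mm is non-NULL.
	 */
	if (mm)
		mmap_write_lock(mm);
	...
	svm_range_handle_list_op(svms, prange, mm);	/* mm may be NULL */
	...
	if (mm) {
		mmap_write_unlock(mm);
		mmput(mm);
	}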

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reported-by: Ruili Ji <ruili.ji@amd.com>
Tested-by: Ruili Ji <ruili.ji@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 41 ++++++++++++++++------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index f2805ba74c80..9ec195e1ef23 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1985,10 +1985,9 @@ svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
 }
 
 static void
-svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
+svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange,
+			 struct mm_struct *mm)
 {
-	struct mm_struct *mm = prange->work_item.mm;
-
 	switch (prange->work_item.op) {
 	case SVM_OP_NULL:
 		pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
@@ -2004,25 +2003,29 @@ svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
 	case SVM_OP_UPDATE_RANGE_NOTIFIER:
 		pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
 			 svms, prange, prange->start, prange->last);
-		svm_range_update_notifier_and_interval_tree(mm, prange);
+		if (mm)
+			svm_range_update_notifier_and_interval_tree(mm, prange);
 		break;
 	case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP:
 		pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n",
 			 svms, prange, prange->start, prange->last);
-		svm_range_update_notifier_and_interval_tree(mm, prange);
+		if (mm)
+			svm_range_update_notifier_and_interval_tree(mm, prange);
 		/* TODO: implement deferred validation and mapping */
 		break;
 	case SVM_OP_ADD_RANGE:
 		pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange,
 			 prange->start, prange->last);
 		svm_range_add_to_svms(prange);
-		svm_range_add_notifier_locked(mm, prange);
+		if (mm)
+			svm_range_add_notifier_locked(mm, prange);
 		break;
 	case SVM_OP_ADD_RANGE_AND_MAP:
 		pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms,
 			 prange, prange->start, prange->last);
 		svm_range_add_to_svms(prange);
-		svm_range_add_notifier_locked(mm, prange);
+		if (mm)
+			svm_range_add_notifier_locked(mm, prange);
 		/* TODO: implement deferred validation and mapping */
 		break;
 	default:
@@ -2071,20 +2074,22 @@ static void svm_range_deferred_list_work(struct work_struct *work)
 	pr_debug("enter svms 0x%p\n", svms);
 
 	p = container_of(svms, struct kfd_process, svms);
-	/* Avoid mm is gone when inserting mmu notifier */
+
+	/* If mm is gone, continue to clean up the deferred_range_list */
 	mm = get_task_mm(p->lead_thread);
-	if (!mm) {
+	if (!mm)
 		pr_debug("svms 0x%p process mm gone\n", svms);
-		return;
-	}
+
 retry:
-	mmap_write_lock(mm);
+	if (mm)
+		mmap_write_lock(mm);
 
 	/* Checking for the need to drain retry faults must be inside
 	 * mmap write lock to serialize with munmap notifiers.
 	 */
 	if (unlikely(atomic_read(&svms->drain_pagefaults))) {
-		mmap_write_unlock(mm);
+		if (mm)
+			mmap_write_unlock(mm);
 		svm_range_drain_retry_fault(svms);
 		goto retry;
 	}
@@ -2109,19 +2114,21 @@ static void svm_range_deferred_list_work(struct work_struct *work)
 			pr_debug("child prange 0x%p op %d\n", pchild,
 				 pchild->work_item.op);
 			list_del_init(&pchild->child_list);
-			svm_range_handle_list_op(svms, pchild);
+			svm_range_handle_list_op(svms, pchild, mm);
 		}
 		mutex_unlock(&prange->migrate_mutex);
 
-		svm_range_handle_list_op(svms, prange);
+		svm_range_handle_list_op(svms, prange, mm);
 		mutex_unlock(&svms->lock);
 
 		spin_lock(&svms->deferred_list_lock);
 	}
 	spin_unlock(&svms->deferred_list_lock);
 
-	mmap_write_unlock(mm);
-	mmput(mm);
+	if (mm) {
+		mmap_write_unlock(mm);
+		mmput(mm);
+	}
 	pr_debug("exit svms 0x%p\n", svms);
 }
 
-- 
2.17.1



* [PATCH 2/2] drm/amdkfd: svm range restore work deadlock when process exit
From: Philip Yang @ 2022-01-19 16:22 UTC
  To: amd-gfx; +Cc: ruili.ji, felix.kuehling, Philip Yang

kfd_process_notifier_release flushes svm_range_restore_work, which
calls svm_range_list_lock_and_flush_work to flush the deferred_list
work. But if the deferred_list work's mmput releases the last mm user,
it calls exit_mmap -> notifier_release, which deadlocks with the
backtrace below.

Move the flush of svm_range_restore_work to svm_range_list_fini, in
the kfd_process_wq_release path, to avoid the deadlock. Then have
svm_range_restore_work take a reference on the task mm so the mm
cannot go away while ranges are validated and mapped to the GPU.

Workqueue: events svm_range_deferred_list_work [amdgpu]
Call Trace:
 wait_for_completion+0x94/0x100
 __flush_work+0x12a/0x1e0
 __cancel_work_timer+0x10e/0x190
 cancel_delayed_work_sync+0x13/0x20
 kfd_process_notifier_release+0x98/0x2a0 [amdgpu]
 __mmu_notifier_release+0x74/0x1f0
 exit_mmap+0x170/0x200
 mmput+0x5d/0x130
 svm_range_deferred_list_work+0x104/0x230 [amdgpu]
 process_one_work+0x220/0x3c0
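
With the mm reference held, the restore worker in outline becomes (a
condensed sketch of the resulting code; the exact hunks are in the
diff below):

	p = container_of(svms, struct kfd_process, svms);

	/* pin the mm; get_task_mm returns NULL once the process exited */
	mm = get_task_mm(p->lead_thread);
	if (!mm)
		return;
	...
	mmap_write_unlock(mm);
	mmput(mm);	/* drop the reference taken above */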

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reported-by: Ruili Ji <ruili.ji@amd.com>
Tested-by: Ruili Ji <ruili.ji@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  1 -
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 15 +++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d1145da5348f..74f162887d3b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1150,7 +1150,6 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 
 	cancel_delayed_work_sync(&p->eviction_work);
 	cancel_delayed_work_sync(&p->restore_work);
-	cancel_delayed_work_sync(&p->svms.restore_work);
 
 	mutex_lock(&p->mutex);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 9ec195e1ef23..2d2cae05dbea 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1643,13 +1643,14 @@ static void svm_range_restore_work(struct work_struct *work)
 
 	pr_debug("restore svm ranges\n");
 
-	/* kfd_process_notifier_release destroys this worker thread. So during
-	 * the lifetime of this thread, kfd_process and mm will be valid.
-	 */
 	p = container_of(svms, struct kfd_process, svms);
-	mm = p->mm;
-	if (!mm)
+
+	/* Take an mm reference to keep mm valid while svm_range_validate_and_map runs */
+	mm = get_task_mm(p->lead_thread);
+	if (!mm) {
+		pr_debug("svms 0x%p process mm gone\n", svms);
 		return;
+	}
 
 	svm_range_list_lock_and_flush_work(svms, mm);
 	mutex_lock(&svms->lock);
@@ -1703,6 +1704,7 @@ static void svm_range_restore_work(struct work_struct *work)
 out_reschedule:
 	mutex_unlock(&svms->lock);
 	mmap_write_unlock(mm);
+	mmput(mm);
 
 	/* If validation failed, reschedule another attempt */
 	if (evicted_ranges) {
@@ -2837,6 +2839,8 @@ void svm_range_list_fini(struct kfd_process *p)
 
 	pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
 
+	cancel_delayed_work_sync(&p->svms.restore_work);
+
 	/* Ensure list work is finished before process is destroyed */
 	flush_work(&p->svms.deferred_list_work);
 
@@ -2847,7 +2851,6 @@ void svm_range_list_fini(struct kfd_process *p)
 	atomic_inc(&p->svms.drain_pagefaults);
 	svm_range_drain_retry_fault(&p->svms);
 
-
 	list_for_each_entry_safe(prange, next, &p->svms.list, list) {
 		svm_range_unlink(prange);
 		svm_range_remove_notifier(prange);
-- 
2.17.1



* RE: [PATCH 1/2] drm/amdkfd: svm deferred_list work continue cleanup after mm gone
From: Ji, Ruili @ 2022-01-20  9:47 UTC
  To: Yang, Philip, amd-gfx; +Cc: Kuehling, Felix

sudo ./kfdtest --gtest_filter=KFDSVM*
sudo ./kfdtest
All tests pass.
Tested-by: Ruili Ji <ruili.ji@amd.com>


* RE: [PATCH 2/2] drm/amdkfd: svm range restore work deadlock when process exit
From: Ji, Ruili @ 2022-01-20  9:47 UTC
  To: Yang, Philip, amd-gfx; +Cc: Kuehling, Felix

sudo ./kfdtest --gtest_filter=KFDSVM*
sudo ./kfdtest
All tests pass.
Tested-by: Ruili Ji <ruili.ji@amd.com>


* Re: [PATCH 1/2] drm/amdkfd: svm deferred_list work continue cleanup after mm gone
From: Felix Kuehling @ 2022-01-20 16:29 UTC
  To: Philip Yang, amd-gfx; +Cc: ruili.ji

Can we instead take a proper reference to the mm in
svm_range_add_list_work? That way the mm would remain valid as long as
the work is scheduled.

So instead of calling get_task_mm in svm_range_deferred_list_work, do it
in svm_range_add_list_work.
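
Roughly like this (an untested sketch of the idea, not a patch; the
mmget/mmput placement is the new part, the function and work_item.mm
already exist):

	void svm_range_add_list_work(struct svm_range_list *svms,
				     struct svm_range *prange,
				     struct mm_struct *mm,
				     enum svm_work_list_ops op)
	{
		...
		/* pin mm so it stays valid until the work item is handled */
		mmget(mm);
		prange->work_item.mm = mm;
		...
	}

with svm_range_deferred_list_work calling mmput(prange->work_item.mm)
after handling each work item, instead of calling get_task_mm itself.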

Regards,
  Felix


