On 2021-04-20 9:25 p.m., Felix Kuehling wrote:
Am 2021-04-20 um 8:45 p.m. schrieb Felix Kuehling:
Am 2021-04-19 um 9:52 p.m. schrieb Alex Sierra:
SVM ranges are created for unregistered memory, triggered
by page faults. These ranges are migrated/mapped to
GPU VRAM memory.

Signed-off-by: Alex Sierra <alex.sierra@amd.com>
This looks generally good to me. One more nit-pick inline in addition to
Philip's comments. And one question.
I found another potential deadlock. See inline. [+Philip]



---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 103 ++++++++++++++++++++++++++-
 1 file changed, 101 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 45dd055118eb..a8a92c533cf7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2179,6 +2179,84 @@ svm_range_best_restore_location(struct svm_range *prange,
 
 	return -1;
 }
+static int
+svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
+				unsigned long *start, unsigned long *last)
+{
+	struct vm_area_struct *vma;
+	struct interval_tree_node *node;
+	unsigned long start_limit, end_limit;
+
+	vma = find_vma(p->mm, addr);
+	if (!vma || addr < vma->vm_start) {
+		pr_debug("VMA does not exist in address [0x%llx]\n", addr);
+		return -EFAULT;
+	}
+	start_limit = max(vma->vm_start,
+			(unsigned long)ALIGN_DOWN(addr, 2UL << 20)) >> PAGE_SHIFT;
+	end_limit = min(vma->vm_end,
+			(unsigned long)ALIGN(addr + 1, 2UL << 20)) >> PAGE_SHIFT;
+	/* First range that starts after the fault address */
+	node = interval_tree_iter_first(&p->svms.objects, (addr >> PAGE_SHIFT) + 1, ULONG_MAX);
+	if (node) {
+		end_limit = min(end_limit, node->start);
+		/* Last range that ends before the fault address */
+		node = container_of(rb_prev(&node->rb), struct interval_tree_node, rb);
+	} else {
+		/* Last range must end before addr because there was no range after addr */
+		node = container_of(rb_last(&p->svms.objects.rb_root),
+				    struct interval_tree_node, rb);
+	}
+	if (node)
+		start_limit = max(start_limit, node->last + 1);
+
+	*start = start_limit;
+	*last = end_limit - 1;
+
+	pr_debug("vma start: %lx start: %lx vma end: %lx last: %lx\n",
+		  vma->vm_start >> PAGE_SHIFT, *start,
+		  vma->vm_end >> PAGE_SHIFT, *last);
+
+	return 0;
+
+}
+static struct
+svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
+						struct kfd_process *p,
+						struct mm_struct *mm,
+						int64_t addr)
+{
+	struct svm_range *prange = NULL;
+	struct svm_range_list *svms;
+	unsigned long start, last;
+	uint32_t gpuid, gpuidx;
+
+	if (svm_range_get_range_boundaries(p, addr << PAGE_SHIFT,
+					   &start, &last))
+		return NULL;
+
+	svms = &p->svms;
+	prange = svm_range_new(&p->svms, start, last);
+	if (!prange) {
+		pr_debug("Failed to create prange in address [0x%llx]\\n", addr);
+		goto out;
You can just return here, since you're not doing any cleanup at the out:
label.


+	}
+	if (kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpuidx)) {
+		pr_debug("failed to get gpuid from kgd\n");
+		svm_range_free(prange);
+		prange = NULL;
+		goto out;
Just return.


+	}
+	prange->preferred_loc = gpuid;
+	prange->actual_loc = 0;
+	/* Gurantee prange is migrate it */
+	prange->validate_timestamp -= AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING;
Is this really specific to svm_range_create_unregistered_range? Or
should we always do this in svm_range_new to guarantee that new ranges
can get validated?
It's good idea to set prange->validate_timestamp to 0 in svm_range_new, then we don't need the special handle here, and restore_page will recover range to update page table without waiting for AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING for new range, AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING is used to skip duplicate retry fault on different pages of same range.
Regards,
  Felix


+	svm_range_add_to_svms(prange);
+	svm_range_add_notifier_locked(mm, prange);
+
+out:
+	return prange;
+}
 
 /* svm_range_skip_recover - decide if prange can be recovered
  * @prange: svm range structure
@@ -2228,6 +2306,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 	struct kfd_process *p;
 	uint64_t timestamp;
 	int32_t best_loc, gpuidx;
+	bool write_locked = false;
 	int r = 0;
 
 	p = kfd_lookup_process_by_pasid(pasid);
@@ -2251,14 +2330,34 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 	}
 
 	mmap_read_lock(mm);
+retry_write_locked:
 	mutex_lock(&svms->lock);
 	prange = svm_range_from_addr(svms, addr, NULL);
 	if (!prange) {
 		pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
 			 svms, addr);
-		r = -EFAULT;
-		goto out_unlock_svms;
+		if (!write_locked) {
+			/* Need the write lock to create new range with MMU notifier.
+			 * Also flush pending deferred work to make sure the interval
+			 * tree is up to date before we add a new range
+			 */
+			mutex_unlock(&svms->lock);
+			mmap_read_unlock(mm);
+			svm_range_list_lock_and_flush_work(svms, mm);
I think this can deadlock with a deferred worker trying to drain
interrupts (Philip's patch series). If we cannot flush deferred work
here, we need to be more careful creating new ranges to make sure they
don't conflict with added deferred or child ranges.

It's impossible to have deadlock with deferred worker to drain interrupts, because drain interrupt wait for restore_pages without taking any lock, and restore_pages flush deferred work without taking any lock too.

Regards,

Philip


Regards,
  Felix


+			write_locked = true;
+			goto retry_write_locked;
+		}
+		prange = svm_range_create_unregistered_range(adev, p, mm, addr);
+		if (!prange) {
+			pr_debug("failed to create unregisterd range svms 0x%p address [0x%llx]\n",
+			svms, addr);
+			mmap_write_downgrade(mm);
+			r = -EFAULT;
+			goto out_unlock_svms;
+		}
 	}
+	if (write_locked)
+		mmap_write_downgrade(mm);
 
 	mutex_lock(&prange->migrate_mutex);
 
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx