* [PATCH v7 1/2] drm/amdgpu: implement TLB flush fence
@ 2024-03-18 12:08 Shashank Sharma
  2024-03-18 12:08 ` [PATCH v7 2/2] drm/amdgpu: sync page table freeing with tlb flush Shashank Sharma
  2024-03-18 14:58 ` [PATCH v7 1/2] drm/amdgpu: implement TLB flush fence Christian König
  0 siblings, 2 replies; 12+ messages in thread
From: Shashank Sharma @ 2024-03-18 12:08 UTC (permalink / raw)
  To: amd-gfx
  Cc: Christian Koenig, Felix Kuehling, Rajneesh Bhardwaj,
	Alex Deucher, Shashank Sharma

From: Christian Koenig <christian.koenig@amd.com>

The problem is that when (for example) 4K pages are replaced
with a single 2M page, we need to wait for the change to be flushed
out by invalidating the TLB before the PT can be freed.

Solve this by moving the TLB flush into a DMA-fence object which
can be used to delay the freeing of the PT BOs until it is signaled.
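
In short, the flush is wrapped in a dma_fence and that fence is added
to the page-table reservation object, so the PT/PD BOs cannot be
released before the invalidation has completed. Roughly (condensed
from the diff below, not a complete function):

    /* wrap the TLB flush into its own fence ... */
    amdgpu_vm_tlb_fence_create(params->adev, vm, fence);

    /* ... and make every PD/PT BO wait for it before it can be freed */
    dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
                       DMA_RESV_USAGE_BOOKKEEP);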

V2: (Shashank)
    - rebase
    - set dma_fence_error only in case of error
    - add tlb_flush fence only when PT/PD BO is locked (Felix)
    - use vm->pasid when f is NULL (Mukul)

V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
    - move the misplaced fence_create call to the end (Philip)

V5: - free the f->dependency properly

V6: (Shashank)
    - light code movement, moved all the clean-up in previous patch
    - introduce params.needs_flush and its usage in this patch
    - rebase without TLB HW sequence patch

V7:
   - Keep the vm->last_update_fence and tlb_cb code until
     we can fix the HW sequencing (Christian)
   - Move all the tlb_fence related code into a separate function so that
     it is easier to read and review

Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Reviewed-by: Shashank Sharma <shashank.sharma@amd.com>
Signed-off-by: Christian Koenig <christian.koenig@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/Makefile           |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  68 +++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |   8 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c    |   4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c     |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c   |   4 +
 .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 112 ++++++++++++++++++
 7 files changed, 171 insertions(+), 30 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 4536c8ad0e11..f24f11ac3e92 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
 	amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
 	atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
 	atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
-	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
+	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
+	amdgpu_ib.o amdgpu_pll.o \
 	amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
 	amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
 	amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 81fb3465e197..26f1c3359642 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -885,6 +885,40 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
 	kfree(tlb_cb);
 }
 
+static int
+amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params, struct dma_fence **fence)
+{
+	struct amdgpu_vm_tlb_seq_struct *tlb_cb;
+	struct amdgpu_vm *vm = params->vm;
+
+	if (!fence || !*fence)
+		return 0;
+
+	tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
+	if (!tlb_cb)
+		return -ENOMEM;
+
+	tlb_cb->vm = vm;
+	if (!dma_fence_add_callback(*fence, &tlb_cb->cb,
+				    amdgpu_vm_tlb_seq_cb)) {
+		dma_fence_put(vm->last_tlb_flush);
+		vm->last_tlb_flush = dma_fence_get(*fence);
+	} else {
+		amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
+	}
+
+	/* Prepare a TLB flush fence to be attached to PTs */
+	if (!params->unlocked && vm->is_compute_context) {
+		amdgpu_vm_tlb_fence_create(params->adev, vm, fence);
+
+		/* Makes sure no PD/PT is freed before the flush */
+		dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
+				   DMA_RESV_USAGE_BOOKKEEP);
+	}
+
+	return 0;
+}
+
 /**
  * amdgpu_vm_update_range - update a range in the vm page table
  *
@@ -917,7 +951,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 			   struct dma_fence **fence)
 {
 	struct amdgpu_vm_update_params params;
-	struct amdgpu_vm_tlb_seq_struct *tlb_cb;
 	struct amdgpu_res_cursor cursor;
 	enum amdgpu_sync_mode sync_mode;
 	int r, idx;
@@ -925,12 +958,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	if (!drm_dev_enter(adev_to_drm(adev), &idx))
 		return -ENODEV;
 
-	tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
-	if (!tlb_cb) {
-		r = -ENOMEM;
-		goto error_unlock;
-	}
-
 	/* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache,
 	 * heavy-weight flush TLB unconditionally.
 	 */
@@ -948,6 +975,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	params.immediate = immediate;
 	params.pages_addr = pages_addr;
 	params.unlocked = unlocked;
+	params.needs_flush = flush_tlb;
 	params.allow_override = allow_override;
 
 	/* Implicitly sync to command submissions in the same VM before
@@ -961,7 +989,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	amdgpu_vm_eviction_lock(vm);
 	if (vm->evicting) {
 		r = -EBUSY;
-		goto error_free;
+		goto error_unlock;
 	}
 
 	if (!unlocked && !dma_fence_is_signaled(vm->last_unlocked)) {
@@ -974,7 +1002,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
 	r = vm->update_funcs->prepare(&params, resv, sync_mode);
 	if (r)
-		goto error_free;
+		goto error_unlock;
 
 	amdgpu_res_first(pages_addr ? NULL : res, offset,
 			 (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, &cursor);
@@ -1024,29 +1052,18 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 		tmp = start + num_entries;
 		r = amdgpu_vm_ptes_update(&params, start, tmp, addr, flags);
 		if (r)
-			goto error_free;
+			goto error_unlock;
 
 		amdgpu_res_next(&cursor, num_entries * AMDGPU_GPU_PAGE_SIZE);
 		start = tmp;
 	}
 
 	r = vm->update_funcs->commit(&params, fence);
+	if (r)
+		goto error_unlock;
 
-	if (flush_tlb || params.table_freed) {
-		tlb_cb->vm = vm;
-		if (fence && *fence &&
-		    !dma_fence_add_callback(*fence, &tlb_cb->cb,
-					   amdgpu_vm_tlb_seq_cb)) {
-			dma_fence_put(vm->last_tlb_flush);
-			vm->last_tlb_flush = dma_fence_get(*fence);
-		} else {
-			amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
-		}
-		tlb_cb = NULL;
-	}
-
-error_free:
-	kfree(tlb_cb);
+	if (params.needs_flush)
+		r = amdgpu_vm_tlb_flush(&params, fence);
 
 error_unlock:
 	amdgpu_vm_eviction_unlock(vm);
@@ -2391,6 +2408,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
 	mutex_init(&vm->eviction_lock);
 	vm->evicting = false;
+	vm->tlb_fence_context = dma_fence_context_alloc(1);
 
 	r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
 				false, &root, xcp_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 8efa8422f4f7..b0a4fe683352 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -257,9 +257,9 @@ struct amdgpu_vm_update_params {
 	unsigned int num_dw_left;
 
 	/**
-	 * @table_freed: return true if page table is freed when updating
+	 * @needs_flush: true whenever we need to invalidate the TLB
 	 */
-	bool table_freed;
+	bool needs_flush;
 
 	/**
 	 * @allow_override: true for memory that is not uncached: allows MTYPE
@@ -342,6 +342,7 @@ struct amdgpu_vm {
 	atomic64_t		tlb_seq;
 	struct dma_fence	*last_tlb_flush;
 	atomic64_t		kfd_last_flushed_seq;
+	uint64_t		tlb_fence_context;
 
 	/* How many times we had to re-generate the page tables */
 	uint64_t		generation;
@@ -611,5 +612,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
 				  uint64_t addr,
 				  uint32_t status,
 				  unsigned int vmhub);
+void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
+				 struct amdgpu_vm *vm,
+				 struct dma_fence **fence);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
index 6e31621452de..3895bd7d176a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
@@ -108,7 +108,9 @@ static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p,
 static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
 				struct dma_fence **fence)
 {
-	/* Flush HDP */
+	if (p->needs_flush)
+		atomic64_inc(&p->vm->tlb_seq);
+
 	mb();
 	amdgpu_device_flush_hdp(p->adev, NULL);
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 124389a6bf48..601df0ce8290 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -972,7 +972,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
 			while (cursor.pfn < frag_start) {
 				/* Make sure previous mapping is freed */
 				if (cursor.entry->bo) {
-					params->table_freed = true;
+					params->needs_flush = true;
 					amdgpu_vm_pt_free_dfs(adev, params->vm,
 							      &cursor,
 							      params->unlocked);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
index 349416e176a1..66e8a016126b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
@@ -126,6 +126,10 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
 
 	WARN_ON(ib->length_dw == 0);
 	amdgpu_ring_pad_ib(ring, ib);
+
+	if (p->needs_flush)
+		atomic64_inc(&p->vm->tlb_seq);
+
 	WARN_ON(ib->length_dw > p->num_dw_left);
 	f = amdgpu_job_submit(p->job);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
new file mode 100644
index 000000000000..51cddfa3f1e8
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/dma-fence.h>
+#include <linux/workqueue.h>
+
+#include "amdgpu.h"
+#include "amdgpu_vm.h"
+#include "amdgpu_gmc.h"
+
+struct amdgpu_tlb_fence {
+	struct dma_fence	base;
+	struct amdgpu_device	*adev;
+	struct dma_fence	*dependency;
+	struct work_struct	work;
+	spinlock_t		lock;
+	uint16_t		pasid;
+
+};
+
+static const char *amdgpu_tlb_fence_get_driver_name(struct dma_fence *fence)
+{
+	return "amdgpu tlb fence";
+}
+
+static const char *amdgpu_tlb_fence_get_timeline_name(struct dma_fence *f)
+{
+	return "amdgpu tlb timeline";
+}
+
+static void amdgpu_tlb_fence_work(struct work_struct *work)
+{
+	struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
+	int r;
+
+	if (f->dependency) {
+		dma_fence_wait(f->dependency, false);
+		dma_fence_put(f->dependency);
+		f->dependency = NULL;
+	}
+
+	r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
+	if (r) {
+		dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n",
+			f->pasid);
+		dma_fence_set_error(&f->base, r);
+	}
+
+	dma_fence_signal(&f->base);
+	dma_fence_put(&f->base);
+}
+
+static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
+	.use_64bit_seqno = true,
+	.get_driver_name = amdgpu_tlb_fence_get_driver_name,
+	.get_timeline_name = amdgpu_tlb_fence_get_timeline_name
+};
+
+void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+				struct dma_fence **fence)
+{
+	struct amdgpu_tlb_fence *f;
+
+	f = kmalloc(sizeof(*f), GFP_KERNEL);
+	if (!f) {
+		/*
+		 * We can't fail since the PDEs and PTEs are already updated, so
+		 * just block for the dependency and execute the TLB flush
+		 */
+		if (*fence)
+			dma_fence_wait(*fence, false);
+
+		amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, 2, true, 0);
+		*fence = dma_fence_get_stub();
+		return;
+	}
+
+	f->adev = adev;
+	f->dependency = *fence;
+	f->pasid = vm->pasid;
+	INIT_WORK(&f->work, amdgpu_tlb_fence_work);
+	spin_lock_init(&f->lock);
+
+	dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock,
+		       vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));
+
+	/* TODO: We probably need a separate wq here */
+	dma_fence_get(&f->base);
+	schedule_work(&f->work);
+
+	*fence = &f->base;
+}
-- 
2.43.2



* [PATCH v7 2/2] drm/amdgpu: sync page table freeing with tlb flush
  2024-03-18 12:08 [PATCH v7 1/2] drm/amdgpu: implement TLB flush fence Shashank Sharma
@ 2024-03-18 12:08 ` Shashank Sharma
  2024-03-18 14:04   ` Bhardwaj, Rajneesh
  2024-03-18 14:58 ` [PATCH v7 1/2] drm/amdgpu: implement TLB flush fence Christian König
  1 sibling, 1 reply; 12+ messages in thread
From: Shashank Sharma @ 2024-03-18 12:08 UTC (permalink / raw)
  To: amd-gfx
  Cc: Shashank Sharma, Christian König, Alex Deucher,
	Felix Kuehling, Rajneesh Bhardwaj

The idea behind this patch is to delay the freeing of PT entry objects
until the TLB flush is done.

This patch:
- Adds a tlb_flush_waitlist in amdgpu_vm_update_params which will keep the
  objects that need to be freed after tlb_flush.
- Adds PT entries in this list in amdgpu_vm_ptes_update after finding
  the PT entry.
- Changes the functionality of amdgpu_vm_pt_free_dfs from (DFS search + free)
  to simply freeing the BOs, and renames it to
  amdgpu_vm_pt_free_list to reflect the change.
- Exports function amdgpu_vm_pt_free_list to be called directly.
- Calls amdgpu_vm_pt_free_list directly from amdgpu_vm_update_range (see
  the sketch below).
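
Condensed, the new flow in amdgpu_vm_update_range() looks roughly like
this (taken from the diff below, context trimmed):

    INIT_LIST_HEAD(&params.tlb_flush_waitlist);
    ...
    r = vm->update_funcs->commit(&params, fence);
    if (params.needs_flush) {
            r = amdgpu_vm_tlb_flush(&params, fence);
            /* only now is it safe to release the collected PT BOs */
            amdgpu_vm_pt_free_list(adev, &params);
    }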

V2: rebase
V4: Addressed review comments from Christian
    - add only locked PTEs entries in TLB flush waitlist.
    - do not create a separate function for list flush.
    - do not create a new lock for TLB flush.
    - there is no need to wait on tlb_flush_fence exclusively.

V5: Addressed review comments from Christian
    - change the amdgpu_vm_pt_free_dfs's functionality to simple freeing
      of the objects and rename it.
    - add all the PTE objects in params->tlb_flush_waitlist
    - let amdgpu_vm_pt_free_root handle the freeing of BOs independently
    - call amdgpu_vm_pt_free_list directly

V6: Rebase
V7: Rebase

Cc: Christian König <Christian.Koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Felix Kuehling <felix.kuehling@amd.com>
Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Acked-by: Felix Kuehling <felix.kuehling@amd.com>
Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  5 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  7 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 53 +++++++++++++----------
 3 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 26f1c3359642..eaa402f99fe0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -977,6 +977,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	params.unlocked = unlocked;
 	params.needs_flush = flush_tlb;
 	params.allow_override = allow_override;
+	INIT_LIST_HEAD(&params.tlb_flush_waitlist);
 
 	/* Implicitly sync to command submissions in the same VM before
 	 * unmapping. Sync to moving fences before mapping.
@@ -1062,8 +1063,10 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	if (r)
 		goto error_unlock;
 
-	if (params.needs_flush)
+	if (params.needs_flush) {
 		r = amdgpu_vm_tlb_flush(&params, fence);
+		amdgpu_vm_pt_free_list(adev, &params);
+	}
 
 error_unlock:
 	amdgpu_vm_eviction_unlock(vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index b0a4fe683352..54d7da396de0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -266,6 +266,11 @@ struct amdgpu_vm_update_params {
 	 * to be overridden for NUMA local memory.
 	 */
 	bool allow_override;
+
+	/**
+	 * @tlb_flush_waitlist: temporary storage for BOs until tlb_flush
+	 */
+	struct list_head tlb_flush_waitlist;
 };
 
 struct amdgpu_vm_update_funcs {
@@ -547,6 +552,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
 			  uint64_t start, uint64_t end,
 			  uint64_t dst, uint64_t flags);
 void amdgpu_vm_pt_free_work(struct work_struct *work);
+void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
+			    struct amdgpu_vm_update_params *params);
 
 #if defined(CONFIG_DEBUG_FS)
 void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 601df0ce8290..440dc8c581fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -622,40 +622,30 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)
 }
 
 /**
- * amdgpu_vm_pt_free_dfs - free PD/PT levels
+ * amdgpu_vm_pt_free_list - free PD/PT levels
  *
  * @adev: amdgpu device structure
- * @vm: amdgpu vm structure
- * @start: optional cursor where to start freeing PDs/PTs
- * @unlocked: vm resv unlock status
+ * @params: see amdgpu_vm_update_params definition
  *
- * Free the page directory or page table level and all sub levels.
+ * Free the page directory objects saved in the flush list
  */
-static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
-				  struct amdgpu_vm *vm,
-				  struct amdgpu_vm_pt_cursor *start,
-				  bool unlocked)
+void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
+			    struct amdgpu_vm_update_params *params)
 {
-	struct amdgpu_vm_pt_cursor cursor;
-	struct amdgpu_vm_bo_base *entry;
+	struct amdgpu_vm_bo_base *entry, *next;
+	struct amdgpu_vm *vm = params->vm;
+	bool unlocked = params->unlocked;
 
 	if (unlocked) {
 		spin_lock(&vm->status_lock);
-		for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
-			list_move(&entry->vm_status, &vm->pt_freed);
-
-		if (start)
-			list_move(&start->entry->vm_status, &vm->pt_freed);
+		list_splice_init(&vm->pt_freed, &params->tlb_flush_waitlist);
 		spin_unlock(&vm->status_lock);
 		schedule_work(&vm->pt_free_work);
 		return;
 	}
 
-	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
+	list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist, vm_status)
 		amdgpu_vm_pt_free(entry);
-
-	if (start)
-		amdgpu_vm_pt_free(start->entry);
 }
 
 /**
@@ -667,7 +657,11 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
  */
 void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 {
-	amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
+	struct amdgpu_vm_pt_cursor cursor;
+	struct amdgpu_vm_bo_base *entry;
+
+	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry)
+		amdgpu_vm_pt_free(entry);
 }
 
 /**
@@ -972,10 +966,21 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
 			while (cursor.pfn < frag_start) {
 				/* Make sure previous mapping is freed */
 				if (cursor.entry->bo) {
+					struct amdgpu_vm_pt_cursor seek;
+					struct amdgpu_vm_bo_base *entry;
+
 					params->needs_flush = true;
-					amdgpu_vm_pt_free_dfs(adev, params->vm,
-							      &cursor,
-							      params->unlocked);
+					spin_lock(&params->vm->status_lock);
+					for_each_amdgpu_vm_pt_dfs_safe(adev, params->vm, &cursor,
+								       seek, entry) {
+						list_move(&entry->vm_status,
+							  &params->tlb_flush_waitlist);
+					}
+
+					/* enter start node now */
+					list_move(&cursor.entry->vm_status,
+						  &params->tlb_flush_waitlist);
+					spin_unlock(&params->vm->status_lock);
 				}
 				amdgpu_vm_pt_next(adev, &cursor);
 			}
-- 
2.43.2



* Re: [PATCH v7 2/2] drm/amdgpu: sync page table freeing with tlb flush
  2024-03-18 12:08 ` [PATCH v7 2/2] drm/amdgpu: sync page table freeing with tlb flush Shashank Sharma
@ 2024-03-18 14:04   ` Bhardwaj, Rajneesh
  2024-03-18 14:06     ` Sharma, Shashank
  0 siblings, 1 reply; 12+ messages in thread
From: Bhardwaj, Rajneesh @ 2024-03-18 14:04 UTC (permalink / raw)
  To: Shashank Sharma, amd-gfx
  Cc: Christian König, Alex Deucher, Felix Kuehling


Hi Shashank,

We'll probably need a v8 with the NULL pointer crash fixed, i.e. before
freeing the PT entries, check for a valid entry before calling
amdgpu_vm_pt_free. The crash is seen with device memory allocators, but
the system memory allocators are looking fine.
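
Something along these lines, just to illustrate what I mean (the actual
fix may of course look different):

	/* skip entries that no longer have a valid page table BO */
	list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist,
				 vm_status) {
		if (!entry->bo)
			continue;
		amdgpu_vm_pt_free(entry);
	}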

[  127.255863] [drm] Using MTYPE_RW for local memory
[  333.606136] hugetlbfs: test_with_MPI.e (25268): Using mlock ulimits 
for SHM_HUGETLB is obsolete
[  415.351447] BUG: kernel NULL pointer dereference, address: 
0000000000000008
[  415.359245] #PF: supervisor write access in kernel mode
[  415.365081] #PF: error_code(0x0002) - not-present page
[  415.370817] PGD 101259067 P4D 101259067 PUD 10125a067 PMD 0
[  415.377140] Oops: 0002 [#1] PREEMPT SMP NOPTI
[  415.382004] CPU: 0 PID: 25481 Comm: test_with_MPI.e Tainted: 
G           OE 5.18.2-mi300-build-140423-ubuntu-22.04+ #24
[  415.394437] Hardware name: AMD Corporation Sh51p/Sh51p, BIOS 
RMO1001AS 02/21/2024
[  415.402797] RIP: 0010:amdgpu_vm_ptes_update+0x6fd/0xa10 [amdgpu]
[  415.409648] Code: 4c 89 ff 4d 8d 66 30 e8 f1 ed ff ff 48 85 db 74 42 
48 39 5d a0 74 40 48 8b 53 20 48 8b 4b 18 48 8d 43 18 48 8d 75 b0 4c 89 
ff <48
 > 89 51 08 48 89 0a 49 8b 56 30 48 89 42 08 48 89 53 18 4c 89 63
[  415.430621] RSP: 0018:ffffc9000401f990 EFLAGS: 00010287
[  415.436456] RAX: ffff888147bb82f0 RBX: ffff888147bb82d8 RCX: 
0000000000000000
[  415.444426] RDX: 0000000000000000 RSI: ffffc9000401fa30 RDI: 
ffff888161f80000
[  415.452397] RBP: ffffc9000401fa80 R08: 0000000000000000 R09: 
ffffc9000401fa00
[  415.460368] R10: 00000007f0cc0000 R11: 00000007f0c85000 R12: 
ffffc9000401fb20
[  415.468340] R13: 00000007f0d00000 R14: ffffc9000401faf0 R15: 
ffff888161f80000
[  415.476312] FS:  00007f132ff89840(0000) GS:ffff889f87c00000(0000) 
knlGS:0000000000000000
[  415.485350] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  415.491767] CR2: 0000000000000008 CR3: 0000000161d46003 CR4: 
0000000000770ef0
[  415.499738] PKRU: 55555554
[  415.502750] Call Trace:
[  415.505482]  <TASK>
[  415.507825]  amdgpu_vm_update_range+0x32a/0x880 [amdgpu]
[  415.513869]  amdgpu_vm_clear_freed+0x117/0x250 [amdgpu]
[  415.519814] amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x18c/0x250 
[amdgpu]
[  415.527729]  kfd_ioctl_unmap_memory_from_gpu+0xed/0x340 [amdgpu]
[  415.534551]  kfd_ioctl+0x3b6/0x510 [amdgpu]
[  415.539324]  ? kfd_ioctl_get_dmabuf_info+0x1d0/0x1d0 [amdgpu]
[  415.545844]  ? __fget_light+0xc5/0x100
[  415.550037]  __x64_sys_ioctl+0x91/0xc0
[  415.554227]  do_syscall_64+0x5c/0x80
[  415.558223]  ? debug_smp_processor_id+0x17/0x20
[  415.563285]  ? fpregs_assert_state_consistent+0x23/0x60
[  415.569124]  ? exit_to_user_mode_prepare+0x45/0x190
[  415.574572]  ? ksys_write+0xce/0xe0



On 3/18/2024 8:08 AM, Shashank Sharma wrote:

> The idea behind this patch is to delay the freeing of PT entry objects
> until the TLB flush is done.
>
> This patch:
> - Adds a tlb_flush_waitlist in amdgpu_vm_update_params which will keep the
>    objects that need to be freed after tlb_flush.
> - Adds PT entries in this list in amdgpu_vm_ptes_update after finding
>    the PT entry.
> - Changes functionality of amdgpu_vm_pt_free_dfs from (df_search + free)
>    to simply freeing of the BOs, also renames it to
>    amdgpu_vm_pt_free_list to reflect this same.
> - Exports function amdgpu_vm_pt_free_list to be called directly.
> - Calls amdgpu_vm_pt_free_list directly from amdgpu_vm_update_range.
>
> V2: rebase
> V4: Addressed review comments from Christian
>      - add only locked PTEs entries in TLB flush waitlist.
>      - do not create a separate function for list flush.
>      - do not create a new lock for TLB flush.
>      - there is no need to wait on tlb_flush_fence exclusively.
>
> V5: Addressed review comments from Christian
>      - change the amdgpu_vm_pt_free_dfs's functionality to simple freeing
>        of the objects and rename it.
>      - add all the PTE objects in params->tlb_flush_waitlist
>      - let amdgpu_vm_pt_free_root handle the freeing of BOs independently
>      - call amdgpu_vm_pt_free_list directly
>
> V6: Rebase
> V7: Rebase
>
> Cc: Christian König<Christian.Koenig@amd.com>
> Cc: Alex Deucher<alexander.deucher@amd.com>
> Cc: Felix Kuehling<felix.kuehling@amd.com>
> Cc: Rajneesh Bhardwaj<rajneesh.bhardwaj@amd.com>
> Acked-by: Felix Kuehling<felix.kuehling@amd.com>
> Acked-by: Rajneesh Bhardwaj<rajneesh.bhardwaj@amd.com>
> Tested-by: Rajneesh Bhardwaj<rajneesh.bhardwaj@amd.com>
> Signed-off-by: Shashank Sharma<shashank.sharma@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  5 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  7 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 53 +++++++++++++----------
>   3 files changed, 40 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 26f1c3359642..eaa402f99fe0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -977,6 +977,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	params.unlocked = unlocked;
>   	params.needs_flush = flush_tlb;
>   	params.allow_override = allow_override;
> +	INIT_LIST_HEAD(&params.tlb_flush_waitlist);
>   
>   	/* Implicitly sync to command submissions in the same VM before
>   	 * unmapping. Sync to moving fences before mapping.
> @@ -1062,8 +1063,10 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	if (r)
>   		goto error_unlock;
>   
> -	if (params.needs_flush)
> +	if (params.needs_flush) {
>   		r = amdgpu_vm_tlb_flush(&params, fence);
> +		amdgpu_vm_pt_free_list(adev, &params);
> +	}
>   
>   error_unlock:
>   	amdgpu_vm_eviction_unlock(vm);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index b0a4fe683352..54d7da396de0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -266,6 +266,11 @@ struct amdgpu_vm_update_params {
>   	 * to be overridden for NUMA local memory.
>   	 */
>   	bool allow_override;
> +
> +	/**
> +	 * @tlb_flush_waitlist: temporary storage for BOs until tlb_flush
> +	 */
> +	struct list_head tlb_flush_waitlist;
>   };
>   
>   struct amdgpu_vm_update_funcs {
> @@ -547,6 +552,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
>   			  uint64_t start, uint64_t end,
>   			  uint64_t dst, uint64_t flags);
>   void amdgpu_vm_pt_free_work(struct work_struct *work);
> +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
> +			    struct amdgpu_vm_update_params *params);
>   
>   #if defined(CONFIG_DEBUG_FS)
>   void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index 601df0ce8290..440dc8c581fc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -622,40 +622,30 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)
>   }
>   
>   /**
> - * amdgpu_vm_pt_free_dfs - free PD/PT levels
> + * amdgpu_vm_pt_free_list - free PD/PT levels
>    *
>    * @adev: amdgpu device structure
> - * @vm: amdgpu vm structure
> - * @start: optional cursor where to start freeing PDs/PTs
> - * @unlocked: vm resv unlock status
> + * @params: see amdgpu_vm_update_params definition
>    *
> - * Free the page directory or page table level and all sub levels.
> + * Free the page directory objects saved in the flush list
>    */
> -static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
> -				  struct amdgpu_vm *vm,
> -				  struct amdgpu_vm_pt_cursor *start,
> -				  bool unlocked)
> +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
> +			    struct amdgpu_vm_update_params *params)
>   {
> -	struct amdgpu_vm_pt_cursor cursor;
> -	struct amdgpu_vm_bo_base *entry;
> +	struct amdgpu_vm_bo_base *entry, *next;
> +	struct amdgpu_vm *vm = params->vm;
> +	bool unlocked = params->unlocked;
>   
>   	if (unlocked) {
>   		spin_lock(&vm->status_lock);
> -		for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
> -			list_move(&entry->vm_status, &vm->pt_freed);
> -
> -		if (start)
> -			list_move(&start->entry->vm_status, &vm->pt_freed);
> +		list_splice_init(&vm->pt_freed, &params->tlb_flush_waitlist);
>   		spin_unlock(&vm->status_lock);
>   		schedule_work(&vm->pt_free_work);
>   		return;
>   	}
>   
> -	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
> +	list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist, vm_status)
>   		amdgpu_vm_pt_free(entry);
> -
> -	if (start)
> -		amdgpu_vm_pt_free(start->entry);
>   }
>   
>   /**
> @@ -667,7 +657,11 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
>    */
>   void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   {
> -	amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
> +	struct amdgpu_vm_pt_cursor cursor;
> +	struct amdgpu_vm_bo_base *entry;
> +
> +	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry)
> +		amdgpu_vm_pt_free(entry);
>   }
>   
>   /**
> @@ -972,10 +966,21 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
>   			while (cursor.pfn < frag_start) {
>   				/* Make sure previous mapping is freed */
>   				if (cursor.entry->bo) {
> +					struct amdgpu_vm_pt_cursor seek;
> +					struct amdgpu_vm_bo_base *entry;
> +
>   					params->needs_flush = true;
> -					amdgpu_vm_pt_free_dfs(adev, params->vm,
> -							      &cursor,
> -							      params->unlocked);
> +					spin_lock(&params->vm->status_lock);
> +					for_each_amdgpu_vm_pt_dfs_safe(adev, params->vm, &cursor,
> +								       seek, entry) {
> +						list_move(&entry->vm_status,
> +							  &params->tlb_flush_waitlist);
> +					}
> +
> +					/* enter start node now */
> +					list_move(&cursor.entry->vm_status,
> +						  &params->tlb_flush_waitlist);
> +					spin_unlock(&params->vm->status_lock);
>   				}
>   				amdgpu_vm_pt_next(adev, &cursor);
>   			}



* Re: [PATCH v7 2/2] drm/amdgpu: sync page table freeing with tlb flush
  2024-03-18 14:04   ` Bhardwaj, Rajneesh
@ 2024-03-18 14:06     ` Sharma, Shashank
  2024-03-18 14:40       ` Bhardwaj, Rajneesh
  0 siblings, 1 reply; 12+ messages in thread
From: Sharma, Shashank @ 2024-03-18 14:06 UTC (permalink / raw)
  To: Bhardwaj, Rajneesh, amd-gfx
  Cc: Koenig, Christian, Deucher, Alexander, Kuehling, Felix


[AMD Official Use Only - General]

I have already sent a NULL check patch based on this backtrace; I am waiting for Rajneesh's feedback.

Regards
Shashank
________________________________
From: Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>
Sent: Monday, March 18, 2024 3:04 PM
To: Sharma, Shashank <Shashank.Sharma@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Koenig, Christian <Christian.Koenig@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
Subject: Re: [PATCH v7 2/2] drm/amdgpu: sync page table freeing with tlb flush


HI Shashank

We'll probably need a v8 with the null pointer crash fixed i.e. before freeing the PT entries check for a valid entry before calling amdgpu_vm_pt_free. The crash is seen with device memory allocators but the system memory allocators are looking fine.



[  127.255863] [drm] Using MTYPE_RW for local memory
[  333.606136] hugetlbfs: test_with_MPI.e (25268): Using mlock ulimits for SHM_HUGETLB is obsolete
[  415.351447] BUG: kernel NULL pointer dereference, address: 0000000000000008
[  415.359245] #PF: supervisor write access in kernel mode
[  415.365081] #PF: error_code(0x0002) - not-present page
[  415.370817] PGD 101259067 P4D 101259067 PUD 10125a067 PMD 0
[  415.377140] Oops: 0002 [#1] PREEMPT SMP NOPTI
[  415.382004] CPU: 0 PID: 25481 Comm: test_with_MPI.e Tainted: G           OE     5.18.2-mi300-build-140423-ubuntu-22.04+ #24
[  415.394437] Hardware name: AMD Corporation Sh51p/Sh51p, BIOS RMO1001AS 02/21/2024
[  415.402797] RIP: 0010:amdgpu_vm_ptes_update+0x6fd/0xa10 [amdgpu]
[  415.409648] Code: 4c 89 ff 4d 8d 66 30 e8 f1 ed ff ff 48 85 db 74 42 48 39 5d a0 74 40 48 8b 53 20 48 8b 4b 18 48 8d 43 18 48 8d 75 b0 4c 89 ff <48
> 89 51 08 48 89 0a 49 8b 56 30 48 89 42 08 48 89 53 18 4c 89 63
[  415.430621] RSP: 0018:ffffc9000401f990 EFLAGS: 00010287
[  415.436456] RAX: ffff888147bb82f0 RBX: ffff888147bb82d8 RCX: 0000000000000000
[  415.444426] RDX: 0000000000000000 RSI: ffffc9000401fa30 RDI: ffff888161f80000
[  415.452397] RBP: ffffc9000401fa80 R08: 0000000000000000 R09: ffffc9000401fa00
[  415.460368] R10: 00000007f0cc0000 R11: 00000007f0c85000 R12: ffffc9000401fb20
[  415.468340] R13: 00000007f0d00000 R14: ffffc9000401faf0 R15: ffff888161f80000
[  415.476312] FS:  00007f132ff89840(0000) GS:ffff889f87c00000(0000) knlGS:0000000000000000
[  415.485350] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  415.491767] CR2: 0000000000000008 CR3: 0000000161d46003 CR4: 0000000000770ef0
[  415.499738] PKRU: 55555554
[  415.502750] Call Trace:
[  415.505482]  <TASK>
[  415.507825]  amdgpu_vm_update_range+0x32a/0x880 [amdgpu]
[  415.513869]  amdgpu_vm_clear_freed+0x117/0x250 [amdgpu]
[  415.519814]  amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x18c/0x250 [amdgpu]
[  415.527729]  kfd_ioctl_unmap_memory_from_gpu+0xed/0x340 [amdgpu]
[  415.534551]  kfd_ioctl+0x3b6/0x510 [amdgpu]
[  415.539324]  ? kfd_ioctl_get_dmabuf_info+0x1d0/0x1d0 [amdgpu]
[  415.545844]  ? __fget_light+0xc5/0x100
[  415.550037]  __x64_sys_ioctl+0x91/0xc0
[  415.554227]  do_syscall_64+0x5c/0x80
[  415.558223]  ? debug_smp_processor_id+0x17/0x20
[  415.563285]  ? fpregs_assert_state_consistent+0x23/0x60
[  415.569124]  ? exit_to_user_mode_prepare+0x45/0x190
[  415.574572]  ? ksys_write+0xce/0xe0




On 3/18/2024 8:08 AM, Shashank Sharma wrote:

The idea behind this patch is to delay the freeing of PT entry objects
until the TLB flush is done.

This patch:
- Adds a tlb_flush_waitlist in amdgpu_vm_update_params which will keep the
  objects that need to be freed after tlb_flush.
- Adds PT entries in this list in amdgpu_vm_ptes_update after finding
  the PT entry.
- Changes functionality of amdgpu_vm_pt_free_dfs from (df_search + free)
  to simply freeing of the BOs, also renames it to
  amdgpu_vm_pt_free_list to reflect this same.
- Exports function amdgpu_vm_pt_free_list to be called directly.
- Calls amdgpu_vm_pt_free_list directly from amdgpu_vm_update_range.

V2: rebase
V4: Addressed review comments from Christian
    - add only locked PTEs entries in TLB flush waitlist.
    - do not create a separate function for list flush.
    - do not create a new lock for TLB flush.
    - there is no need to wait on tlb_flush_fence exclusively.

V5: Addressed review comments from Christian
    - change the amdgpu_vm_pt_free_dfs's functionality to simple freeing
      of the objects and rename it.
    - add all the PTE objects in params->tlb_flush_waitlist
    - let amdgpu_vm_pt_free_root handle the freeing of BOs independently
    - call amdgpu_vm_pt_free_list directly

V6: Rebase
V7: Rebase

Cc: Christian König <Christian.Koenig@amd.com><mailto:Christian.Koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com><mailto:alexander.deucher@amd.com>
Cc: Felix Kuehling <felix.kuehling@amd.com><mailto:felix.kuehling@amd.com>
Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com><mailto:rajneesh.bhardwaj@amd.com>
Acked-by: Felix Kuehling <felix.kuehling@amd.com><mailto:felix.kuehling@amd.com>
Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com><mailto:rajneesh.bhardwaj@amd.com>
Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com><mailto:rajneesh.bhardwaj@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com><mailto:shashank.sharma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  5 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  7 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 53 +++++++++++++----------
 3 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 26f1c3359642..eaa402f99fe0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -977,6 +977,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
        params.unlocked = unlocked;
        params.needs_flush = flush_tlb;
        params.allow_override = allow_override;
+       INIT_LIST_HEAD(&params.tlb_flush_waitlist);

        /* Implicitly sync to command submissions in the same VM before
         * unmapping. Sync to moving fences before mapping.
@@ -1062,8 +1063,10 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
        if (r)
                goto error_unlock;

-       if (params.needs_flush)
+       if (params.needs_flush) {
                r = amdgpu_vm_tlb_flush(&params, fence);
+               amdgpu_vm_pt_free_list(adev, &params);
+       }

 error_unlock:
        amdgpu_vm_eviction_unlock(vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index b0a4fe683352..54d7da396de0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -266,6 +266,11 @@ struct amdgpu_vm_update_params {
         * to be overridden for NUMA local memory.
         */
        bool allow_override;
+
+       /**
+        * @tlb_flush_waitlist: temporary storage for BOs until tlb_flush
+        */
+       struct list_head tlb_flush_waitlist;
 };

 struct amdgpu_vm_update_funcs {
@@ -547,6 +552,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
                          uint64_t start, uint64_t end,
                          uint64_t dst, uint64_t flags);
 void amdgpu_vm_pt_free_work(struct work_struct *work);
+void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
+                           struct amdgpu_vm_update_params *params);

 #if defined(CONFIG_DEBUG_FS)
 void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 601df0ce8290..440dc8c581fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -622,40 +622,30 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)
 }

 /**
- * amdgpu_vm_pt_free_dfs - free PD/PT levels
+ * amdgpu_vm_pt_free_list - free PD/PT levels
  *
  * @adev: amdgpu device structure
- * @vm: amdgpu vm structure
- * @start: optional cursor where to start freeing PDs/PTs
- * @unlocked: vm resv unlock status
+ * @params: see amdgpu_vm_update_params definition
  *
- * Free the page directory or page table level and all sub levels.
+ * Free the page directory objects saved in the flush list
  */
-static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
-                                 struct amdgpu_vm *vm,
-                                 struct amdgpu_vm_pt_cursor *start,
-                                 bool unlocked)
+void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
+                           struct amdgpu_vm_update_params *params)
 {
-       struct amdgpu_vm_pt_cursor cursor;
-       struct amdgpu_vm_bo_base *entry;
+       struct amdgpu_vm_bo_base *entry, *next;
+       struct amdgpu_vm *vm = params->vm;
+       bool unlocked = params->unlocked;

        if (unlocked) {
                spin_lock(&vm->status_lock);
-               for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
-                       list_move(&entry->vm_status, &vm->pt_freed);
-
-               if (start)
-                       list_move(&start->entry->vm_status, &vm->pt_freed);
+               list_splice_init(&vm->pt_freed, &params->tlb_flush_waitlist);
                spin_unlock(&vm->status_lock);
                schedule_work(&vm->pt_free_work);
                return;
        }

-       for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
+       list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist, vm_status)
                amdgpu_vm_pt_free(entry);
-
-       if (start)
-               amdgpu_vm_pt_free(start->entry);
 }

 /**
@@ -667,7 +657,11 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
  */
 void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 {
-       amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
+       struct amdgpu_vm_pt_cursor cursor;
+       struct amdgpu_vm_bo_base *entry;
+
+       for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry)
+               amdgpu_vm_pt_free(entry);
 }

 /**
@@ -972,10 +966,21 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
                        while (cursor.pfn < frag_start) {
                                /* Make sure previous mapping is freed */
                                if (cursor.entry->bo) {
+                                       struct amdgpu_vm_pt_cursor seek;
+                                       struct amdgpu_vm_bo_base *entry;
+
                                        params->needs_flush = true;
-                                       amdgpu_vm_pt_free_dfs(adev, params->vm,
-                                                             &cursor,
-                                                             params->unlocked);
+                                       spin_lock(&params->vm->status_lock);
+                                       for_each_amdgpu_vm_pt_dfs_safe(adev, params->vm, &cursor,
+                                                                      seek, entry) {
+                                               list_move(&entry->vm_status,
+                                                         &params->tlb_flush_waitlist);
+                                       }
+
+                                       /* enter start node now */
+                                       list_move(&cursor.entry->vm_status,
+                                                 &params->tlb_flush_waitlist);
+                                       spin_unlock(&params->vm->status_lock);
                                }
                                amdgpu_vm_pt_next(adev, &cursor);
                        }




* Re: [PATCH v7 2/2] drm/amdgpu: sync page table freeing with tlb flush
  2024-03-18 14:06     ` Sharma, Shashank
@ 2024-03-18 14:40       ` Bhardwaj, Rajneesh
  2024-03-18 14:44         ` [PATCH v8] " Shashank Sharma
  0 siblings, 1 reply; 12+ messages in thread
From: Bhardwaj, Rajneesh @ 2024-03-18 14:40 UTC (permalink / raw)
  To: Sharma, Shashank, amd-gfx
  Cc: Koenig, Christian, Deucher, Alexander, Kuehling, Felix


The change you shared with me fixes the crash. Please include it in v8.


On 3/18/2024 10:06 AM, Sharma, Shashank wrote:
>
> [AMD Official Use Only - General]
>
>
> Already sent a NULL check patch based on this backtrace, I am waiting 
> for Rajneesh's feedback.
>
> Regards
> Shashank
> ------------------------------------------------------------------------
> *From:* Bhardwaj, Rajneesh <Rajneesh.Bhardwaj@amd.com>
> *Sent:* Monday, March 18, 2024 3:04 PM
> *To:* Sharma, Shashank <Shashank.Sharma@amd.com>; 
> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
> *Cc:* Koenig, Christian <Christian.Koenig@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>
> *Subject:* Re: [PATCH v7 2/2] drm/amdgpu: sync page table freeing with 
> tlb flush
>
> HI Shashank
>
> We'll probably need a v8 with the null pointer crash fixed i.e. before 
> freeing the PT entries check for a valid entry before calling 
> amdgpu_vm_pt_free. The crash is seen with device memory allocators but 
> the system memory allocators are looking fine.
>
> [  127.255863] [drm] Using MTYPE_RW for local memory
> [  333.606136] hugetlbfs: test_with_MPI.e (25268): Using mlock ulimits 
> for SHM_HUGETLB is obsolete
> [  415.351447] BUG: kernel NULL pointer dereference, address: 
> 0000000000000008
> [  415.359245] #PF: supervisor write access in kernel mode
> [  415.365081] #PF: error_code(0x0002) - not-present page
> [  415.370817] PGD 101259067 P4D 101259067 PUD 10125a067 PMD 0
> [  415.377140] Oops: 0002 [#1] PREEMPT SMP NOPTI
> [  415.382004] CPU: 0 PID: 25481 Comm: test_with_MPI.e Tainted: 
> G           OE 5.18.2-mi300-build-140423-ubuntu-22.04+ #24
> [  415.394437] Hardware name: AMD Corporation Sh51p/Sh51p, BIOS 
> RMO1001AS 02/21/2024
> [  415.402797] RIP: 0010:amdgpu_vm_ptes_update+0x6fd/0xa10 [amdgpu]
> [  415.409648] Code: 4c 89 ff 4d 8d 66 30 e8 f1 ed ff ff 48 85 db 74 
> 42 48 39 5d a0 74 40 48 8b 53 20 48 8b 4b 18 48 8d 43 18 48 8d 75 b0 
> 4c 89 ff <48
> > 89 51 08 48 89 0a 49 8b 56 30 48 89 42 08 48 89 53 18 4c 89 63
> [  415.430621] RSP: 0018:ffffc9000401f990 EFLAGS: 00010287
> [  415.436456] RAX: ffff888147bb82f0 RBX: ffff888147bb82d8 RCX: 
> 0000000000000000
> [  415.444426] RDX: 0000000000000000 RSI: ffffc9000401fa30 RDI: 
> ffff888161f80000
> [  415.452397] RBP: ffffc9000401fa80 R08: 0000000000000000 R09: 
> ffffc9000401fa00
> [  415.460368] R10: 00000007f0cc0000 R11: 00000007f0c85000 R12: 
> ffffc9000401fb20
> [  415.468340] R13: 00000007f0d00000 R14: ffffc9000401faf0 R15: 
> ffff888161f80000
> [  415.476312] FS:  00007f132ff89840(0000) GS:ffff889f87c00000(0000) 
> knlGS:0000000000000000
> [  415.485350] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  415.491767] CR2: 0000000000000008 CR3: 0000000161d46003 CR4: 
> 0000000000770ef0
> [  415.499738] PKRU: 55555554
> [  415.502750] Call Trace:
> [  415.505482]  <TASK>
> [  415.507825]  amdgpu_vm_update_range+0x32a/0x880 [amdgpu]
> [  415.513869]  amdgpu_vm_clear_freed+0x117/0x250 [amdgpu]
> [  415.519814] amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x18c/0x250 
> [amdgpu]
> [  415.527729]  kfd_ioctl_unmap_memory_from_gpu+0xed/0x340 [amdgpu]
> [  415.534551]  kfd_ioctl+0x3b6/0x510 [amdgpu]
> [  415.539324]  ? kfd_ioctl_get_dmabuf_info+0x1d0/0x1d0 [amdgpu]
> [  415.545844]  ? __fget_light+0xc5/0x100
> [  415.550037]  __x64_sys_ioctl+0x91/0xc0
> [  415.554227]  do_syscall_64+0x5c/0x80
> [  415.558223]  ? debug_smp_processor_id+0x17/0x20
> [  415.563285]  ? fpregs_assert_state_consistent+0x23/0x60
> [  415.569124]  ? exit_to_user_mode_prepare+0x45/0x190
> [  415.574572]  ? ksys_write+0xce/0xe0
>
>
> On 3/18/2024 8:08 AM, Shashank Sharma wrote:
>
>     The idea behind this patch is to delay the freeing of PT entry
>     objects until the TLB flush is done.
>
>     [...]
>



* [PATCH v8] drm/amdgpu: sync page table freeing with tlb flush
  2024-03-18 14:40       ` Bhardwaj, Rajneesh
@ 2024-03-18 14:44         ` Shashank Sharma
  2024-03-18 15:01           ` Christian König
  0 siblings, 1 reply; 12+ messages in thread
From: Shashank Sharma @ 2024-03-18 14:44 UTC (permalink / raw)
  To: amd-gfx
  Cc: Shashank Sharma, Christian König, Alex Deucher,
	Felix Kuehling, Rajneesh Bhardwaj

The idea behind this patch is to delay the freeing of PT entry objects
until the TLB flush is done.

This patch:
- Adds a tlb_flush_waitlist in amdgpu_vm_update_params which will keep the
  objects that need to be freed after tlb_flush.
- Adds PT entries in this list in amdgpu_vm_ptes_update after finding
  the PT entry.
- Changes the functionality of amdgpu_vm_pt_free_dfs from (DFS search + free)
  to simply freeing the BOs, and renames it to
  amdgpu_vm_pt_free_list to reflect this.
- Exports function amdgpu_vm_pt_free_list to be called directly.
- Calls amdgpu_vm_pt_free_list directly from amdgpu_vm_update_range.
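
A condensed sketch of the resulting flow in amdgpu_vm_update_range()
(illustrative only; locking, the unlocked path and error handling are elided):

	INIT_LIST_HEAD(&params.tlb_flush_waitlist);

	/* amdgpu_vm_ptes_update() now moves replaced PTs onto the
	 * waitlist instead of freeing them right away */
	r = amdgpu_vm_ptes_update(&params, start, tmp, addr, flags);

	r = vm->update_funcs->commit(&params, fence);

	if (params.needs_flush) {
		/* queue the TLB flush first ... */
		r = amdgpu_vm_tlb_flush(&params, fence);
		/* ... then release the PT BOs collected above */
		amdgpu_vm_pt_free_list(adev, &params);
	}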

V2: rebase
V4: Addressed review comments from Christian
    - add only locked PTEs entries in TLB flush waitlist.
    - do not create a separate function for list flush.
    - do not create a new lock for TLB flush.
    - there is no need to wait on tlb_flush_fence exclusively.

V5: Addressed review comments from Christian
    - change the amdgpu_vm_pt_free_dfs's functionality to simple freeing
      of the objects and rename it.
    - add all the PTE objects in params->tlb_flush_waitlist
    - let amdgpu_vm_pt_free_root handle the freeing of BOs independently
    - call amdgpu_vm_pt_free_list directly

V6: Rebase
V7: Rebase
V8: Added a NULL check to fix this backtrace issue:
[  415.351447] BUG: kernel NULL pointer dereference, address: 0000000000000008
[  415.359245] #PF: supervisor write access in kernel mode
[  415.365081] #PF: error_code(0x0002) - not-present page
[  415.370817] PGD 101259067 P4D 101259067 PUD 10125a067 PMD 0
[  415.377140] Oops: 0002 [#1] PREEMPT SMP NOPTI
[  415.382004] CPU: 0 PID: 25481 Comm: test_with_MPI.e Tainted: G           OE     5.18.2-mi300-build-140423-ubuntu-22.04+ #24
[  415.394437] Hardware name: AMD Corporation Sh51p/Sh51p, BIOS RMO1001AS 02/21/2024
[  415.402797] RIP: 0010:amdgpu_vm_ptes_update+0x6fd/0xa10 [amdgpu]
[  415.409648] Code: 4c 89 ff 4d 8d 66 30 e8 f1 ed ff ff 48 85 db 74 42 48 39 5d a0 74 40 48 8b 53 20 48 8b 4b 18 48 8d 43 18 48 8d 75 b0 4c 89 ff <48
> 89 51 08 48 89 0a 49 8b 56 30 48 89 42 08 48 89 53 18 4c 89 63
[  415.430621] RSP: 0018:ffffc9000401f990 EFLAGS: 00010287
[  415.436456] RAX: ffff888147bb82f0 RBX: ffff888147bb82d8 RCX: 0000000000000000
[  415.444426] RDX: 0000000000000000 RSI: ffffc9000401fa30 RDI: ffff888161f80000
[  415.452397] RBP: ffffc9000401fa80 R08: 0000000000000000 R09: ffffc9000401fa00
[  415.460368] R10: 00000007f0cc0000 R11: 00000007f0c85000 R12: ffffc9000401fb20
[  415.468340] R13: 00000007f0d00000 R14: ffffc9000401faf0 R15: ffff888161f80000
[  415.476312] FS:  00007f132ff89840(0000) GS:ffff889f87c00000(0000) knlGS:0000000000000000
[  415.485350] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  415.491767] CR2: 0000000000000008 CR3: 0000000161d46003 CR4: 0000000000770ef0
[  415.499738] PKRU: 55555554
[  415.502750] Call Trace:
[  415.505482]  <TASK>
[  415.507825]  amdgpu_vm_update_range+0x32a/0x880 [amdgpu]
[  415.513869]  amdgpu_vm_clear_freed+0x117/0x250 [amdgpu]
[  415.519814]  amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x18c/0x250 [amdgpu]
[  415.527729]  kfd_ioctl_unmap_memory_from_gpu+0xed/0x340 [amdgpu]
[  415.534551]  kfd_ioctl+0x3b6/0x510 [amdgpu]

Cc: Christian König <Christian.Koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Felix Kuehling <felix.kuehling@amd.com>
Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Acked-by: Felix Kuehling <felix.kuehling@amd.com>
Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  7 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 58 +++++++++++++----------
 3 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 26f1c3359642..eaa402f99fe0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -977,6 +977,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	params.unlocked = unlocked;
 	params.needs_flush = flush_tlb;
 	params.allow_override = allow_override;
+	INIT_LIST_HEAD(&params.tlb_flush_waitlist);
 
 	/* Implicitly sync to command submissions in the same VM before
 	 * unmapping. Sync to moving fences before mapping.
@@ -1062,8 +1063,10 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	if (r)
 		goto error_unlock;
 
-	if (params.needs_flush)
+	if (params.needs_flush) {
 		r = amdgpu_vm_tlb_flush(&params, fence);
+		amdgpu_vm_pt_free_list(adev, &params);
+	}
 
 error_unlock:
 	amdgpu_vm_eviction_unlock(vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index b0a4fe683352..54d7da396de0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -266,6 +266,11 @@ struct amdgpu_vm_update_params {
 	 * to be overridden for NUMA local memory.
 	 */
 	bool allow_override;
+
+	/**
+	 * @tlb_flush_waitlist: temporary storage for BOs until tlb_flush
+	 */
+	struct list_head tlb_flush_waitlist;
 };
 
 struct amdgpu_vm_update_funcs {
@@ -547,6 +552,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
 			  uint64_t start, uint64_t end,
 			  uint64_t dst, uint64_t flags);
 void amdgpu_vm_pt_free_work(struct work_struct *work);
+void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
+			    struct amdgpu_vm_update_params *params);
 
 #if defined(CONFIG_DEBUG_FS)
 void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 601df0ce8290..9231edfb427e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -622,40 +622,30 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)
 }
 
 /**
- * amdgpu_vm_pt_free_dfs - free PD/PT levels
+ * amdgpu_vm_pt_free_list - free PD/PT levels
  *
  * @adev: amdgpu device structure
- * @vm: amdgpu vm structure
- * @start: optional cursor where to start freeing PDs/PTs
- * @unlocked: vm resv unlock status
+ * @params: see amdgpu_vm_update_params definition
  *
- * Free the page directory or page table level and all sub levels.
+ * Free the page directory objects saved in the flush list
  */
-static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
-				  struct amdgpu_vm *vm,
-				  struct amdgpu_vm_pt_cursor *start,
-				  bool unlocked)
+void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
+			    struct amdgpu_vm_update_params *params)
 {
-	struct amdgpu_vm_pt_cursor cursor;
-	struct amdgpu_vm_bo_base *entry;
+	struct amdgpu_vm_bo_base *entry, *next;
+	struct amdgpu_vm *vm = params->vm;
+	bool unlocked = params->unlocked;
 
 	if (unlocked) {
 		spin_lock(&vm->status_lock);
-		for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
-			list_move(&entry->vm_status, &vm->pt_freed);
-
-		if (start)
-			list_move(&start->entry->vm_status, &vm->pt_freed);
+		list_splice_init(&vm->pt_freed, &params->tlb_flush_waitlist);
 		spin_unlock(&vm->status_lock);
 		schedule_work(&vm->pt_free_work);
 		return;
 	}
 
-	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
+	list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist, vm_status)
 		amdgpu_vm_pt_free(entry);
-
-	if (start)
-		amdgpu_vm_pt_free(start->entry);
 }
 
 /**
@@ -667,7 +657,13 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
  */
 void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 {
-	amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
+	struct amdgpu_vm_pt_cursor cursor;
+	struct amdgpu_vm_bo_base *entry;
+
+	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) {
+		if (entry)
+			amdgpu_vm_pt_free(entry);
+	}
 }
 
 /**
@@ -972,10 +968,24 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
 			while (cursor.pfn < frag_start) {
 				/* Make sure previous mapping is freed */
 				if (cursor.entry->bo) {
+					struct amdgpu_vm_pt_cursor seek;
+					struct amdgpu_vm_bo_base *entry;
+
 					params->needs_flush = true;
-					amdgpu_vm_pt_free_dfs(adev, params->vm,
-							      &cursor,
-							      params->unlocked);
+					spin_lock(&params->vm->status_lock);
+					for_each_amdgpu_vm_pt_dfs_safe(adev, params->vm, &cursor,
+								       seek, entry) {
+						if (!entry || !entry->bo)
+							continue;
+
+						list_move(&entry->vm_status,
+							  &params->tlb_flush_waitlist);
+					}
+
+					/* enter start node now */
+					list_move(&cursor.entry->vm_status,
+						  &params->tlb_flush_waitlist);
+					spin_unlock(&params->vm->status_lock);
 				}
 				amdgpu_vm_pt_next(adev, &cursor);
 			}
-- 
2.43.2


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/2] drm/amdgpu: implement TLB flush fence
  2024-03-18 12:08 [PATCH v7 1/2] drm/amdgpu: implement TLB flush fence Shashank Sharma
  2024-03-18 12:08 ` [PATCH v7 2/2] drm/amdgpu: sync page table freeing with tlb flush Shashank Sharma
@ 2024-03-18 14:58 ` Christian König
  2024-03-18 15:19   ` Sharma, Shashank
  1 sibling, 1 reply; 12+ messages in thread
From: Christian König @ 2024-03-18 14:58 UTC (permalink / raw)
  To: Shashank Sharma, amd-gfx; +Cc: Felix Kuehling, Rajneesh Bhardwaj, Alex Deucher

On 18.03.24 13:08, Shashank Sharma wrote:
> From: Christian Koenig <christian.koenig@amd.com>
>
> The problem is that when (for example) 4k pages are replaced
> with a single 2M page we need to wait for change to be flushed
> out by invalidating the TLB before the PT can be freed.
>
> Solve this by moving the TLB flush into a DMA-fence object which
> can be used to delay the freeing of the PT BOs until it is signaled.
>
> V2: (Shashank)
>      - rebase
>      - set dma_fence_error only in case of error
>      - add tlb_flush fence only when PT/PD BO is locked (Felix)
>      - use vm->pasid when f is NULL (Mukul)
>
> V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
>      - move the misplaced fence_create call to the end (Philip)
>
> V5: - free the f->dependency properly
>
> V6: (Shashank)
>      - light code movement, moved all the clean-up in previous patch
>      - introduce params.needs_flush and its usage in this patch
>      - rebase without TLB HW sequence patch
>
> V7:
>     - Keep the vm->last_update_fence and tlb_cb code until
>       we can fix the HW sequencing (Christian)
>     - Move all the tlb_fence related code in a separate function so that
>       its easier to read and review
>
> Cc: Christian Koenig <christian.koenig@amd.com>
> Cc: Felix Kuehling <Felix.Kuehling@amd.com>
> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
> Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
> Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
> Reviewed-by: Shashank Sharma <shashank.sharma@amd.com>
> Signed-off-by: Christian Koenig <christian.koenig@amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/Makefile           |   3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  68 +++++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |   8 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c    |   4 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c     |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c   |   4 +
>   .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 112 ++++++++++++++++++
>   7 files changed, 171 insertions(+), 30 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 4536c8ad0e11..f24f11ac3e92 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
>   	amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
>   	atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
>   	atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
> -	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
> +	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
> +	amdgpu_ib.o amdgpu_pll.o \
>   	amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
>   	amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
>   	amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 81fb3465e197..26f1c3359642 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -885,6 +885,40 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
>   	kfree(tlb_cb);
>   }
>   
> +static int
> +amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params, struct dma_fence **fence)
> +{
> +	struct amdgpu_vm_tlb_seq_struct *tlb_cb;
> +	struct amdgpu_vm *vm = params->vm;
> +
> +	if (!fence || !*fence)
> +		return 0;
> +
> +	tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
> +	if (!tlb_cb)
> +		return -ENOMEM;

That won't work like this. The page tables are already updated, so you 
need to have the callback no matter what.

That's why the code previously allocated the tlb_cb structure before 
updating the page tables.
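
In other words, roughly (untested sketch; handing the pre-allocated cb into
amdgpu_vm_tlb_flush() is just one way to wire it up):

	/* allocate up front, while bailing out is still harmless */
	tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
	if (!tlb_cb)
		return -ENOMEM;	/* nothing has been modified yet */

	/* ... amdgpu_vm_ptes_update() / commit() modify the page tables ... */

	/* from here on we must not fail, consume tlb_cb unconditionally */
	amdgpu_vm_tlb_flush(&params, fence, tlb_cb);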

Regards,
Christian.

> +
> +	tlb_cb->vm = vm;
> +	if (!dma_fence_add_callback(*fence, &tlb_cb->cb,
> +				    amdgpu_vm_tlb_seq_cb)) {
> +		dma_fence_put(vm->last_tlb_flush);
> +		vm->last_tlb_flush = dma_fence_get(*fence);
> +	} else {
> +		amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
> +	}
> +
> +	/* Prepare a TLB flush fence to be attached to PTs */
> +	if (!params->unlocked && vm->is_compute_context) {
> +		amdgpu_vm_tlb_fence_create(params->adev, vm, fence);
> +
> +		/* Makes sure no PD/PT is freed before the flush */
> +		dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
> +				   DMA_RESV_USAGE_BOOKKEEP);
> +	}
> +
> +	return 0;
> +}
> +
>   /**
>    * amdgpu_vm_update_range - update a range in the vm page table
>    *
> @@ -917,7 +951,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   			   struct dma_fence **fence)
>   {
>   	struct amdgpu_vm_update_params params;
> -	struct amdgpu_vm_tlb_seq_struct *tlb_cb;
>   	struct amdgpu_res_cursor cursor;
>   	enum amdgpu_sync_mode sync_mode;
>   	int r, idx;
> @@ -925,12 +958,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	if (!drm_dev_enter(adev_to_drm(adev), &idx))
>   		return -ENODEV;
>   
> -	tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
> -	if (!tlb_cb) {
> -		r = -ENOMEM;
> -		goto error_unlock;
> -	}
> -
>   	/* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache,
>   	 * heavy-weight flush TLB unconditionally.
>   	 */
> @@ -948,6 +975,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	params.immediate = immediate;
>   	params.pages_addr = pages_addr;
>   	params.unlocked = unlocked;
> +	params.needs_flush = flush_tlb;
>   	params.allow_override = allow_override;
>   
>   	/* Implicitly sync to command submissions in the same VM before
> @@ -961,7 +989,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	amdgpu_vm_eviction_lock(vm);
>   	if (vm->evicting) {
>   		r = -EBUSY;
> -		goto error_free;
> +		goto error_unlock;
>   	}
>   
>   	if (!unlocked && !dma_fence_is_signaled(vm->last_unlocked)) {
> @@ -974,7 +1002,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   
>   	r = vm->update_funcs->prepare(&params, resv, sync_mode);
>   	if (r)
> -		goto error_free;
> +		goto error_unlock;
>   
>   	amdgpu_res_first(pages_addr ? NULL : res, offset,
>   			 (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, &cursor);
> @@ -1024,29 +1052,18 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   		tmp = start + num_entries;
>   		r = amdgpu_vm_ptes_update(&params, start, tmp, addr, flags);
>   		if (r)
> -			goto error_free;
> +			goto error_unlock;
>   
>   		amdgpu_res_next(&cursor, num_entries * AMDGPU_GPU_PAGE_SIZE);
>   		start = tmp;
>   	}
>   
>   	r = vm->update_funcs->commit(&params, fence);
> +	if (r)
> +		goto error_unlock;
>   
> -	if (flush_tlb || params.table_freed) {
> -		tlb_cb->vm = vm;
> -		if (fence && *fence &&
> -		    !dma_fence_add_callback(*fence, &tlb_cb->cb,
> -					   amdgpu_vm_tlb_seq_cb)) {
> -			dma_fence_put(vm->last_tlb_flush);
> -			vm->last_tlb_flush = dma_fence_get(*fence);
> -		} else {
> -			amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
> -		}
> -		tlb_cb = NULL;
> -	}
> -
> -error_free:
> -	kfree(tlb_cb);
> +	if (params.needs_flush)
> +		r = amdgpu_vm_tlb_flush(&params, fence);
>   
>   error_unlock:
>   	amdgpu_vm_eviction_unlock(vm);
> @@ -2391,6 +2408,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   
>   	mutex_init(&vm->eviction_lock);
>   	vm->evicting = false;
> +	vm->tlb_fence_context = dma_fence_context_alloc(1);
>   
>   	r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
>   				false, &root, xcp_id);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 8efa8422f4f7..b0a4fe683352 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -257,9 +257,9 @@ struct amdgpu_vm_update_params {
>   	unsigned int num_dw_left;
>   
>   	/**
> -	 * @table_freed: return true if page table is freed when updating
> +	 * @needs_flush: true whenever we need to invalidate the TLB
>   	 */
> -	bool table_freed;
> +	bool needs_flush;
>   
>   	/**
>   	 * @allow_override: true for memory that is not uncached: allows MTYPE
> @@ -342,6 +342,7 @@ struct amdgpu_vm {
>   	atomic64_t		tlb_seq;
>   	struct dma_fence	*last_tlb_flush;
>   	atomic64_t		kfd_last_flushed_seq;
> +	uint64_t		tlb_fence_context;
>   
>   	/* How many times we had to re-generate the page tables */
>   	uint64_t		generation;
> @@ -611,5 +612,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
>   				  uint64_t addr,
>   				  uint32_t status,
>   				  unsigned int vmhub);
> +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
> +				 struct amdgpu_vm *vm,
> +				 struct dma_fence **fence);
>   
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> index 6e31621452de..3895bd7d176a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> @@ -108,7 +108,9 @@ static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p,
>   static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
>   				struct dma_fence **fence)
>   {
> -	/* Flush HDP */
> +	if (p->needs_flush)
> +		atomic64_inc(&p->vm->tlb_seq);
> +
>   	mb();
>   	amdgpu_device_flush_hdp(p->adev, NULL);
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index 124389a6bf48..601df0ce8290 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -972,7 +972,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
>   			while (cursor.pfn < frag_start) {
>   				/* Make sure previous mapping is freed */
>   				if (cursor.entry->bo) {
> -					params->table_freed = true;
> +					params->needs_flush = true;
>   					amdgpu_vm_pt_free_dfs(adev, params->vm,
>   							      &cursor,
>   							      params->unlocked);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> index 349416e176a1..66e8a016126b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> @@ -126,6 +126,10 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
>   
>   	WARN_ON(ib->length_dw == 0);
>   	amdgpu_ring_pad_ib(ring, ib);
> +
> +	if (p->needs_flush)
> +		atomic64_inc(&p->vm->tlb_seq);
> +
>   	WARN_ON(ib->length_dw > p->num_dw_left);
>   	f = amdgpu_job_submit(p->job);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
> new file mode 100644
> index 000000000000..51cddfa3f1e8
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
> @@ -0,0 +1,112 @@
> +// SPDX-License-Identifier: GPL-2.0 OR MIT
> +/*
> + * Copyright 2023 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/dma-fence.h>
> +#include <linux/workqueue.h>
> +
> +#include "amdgpu.h"
> +#include "amdgpu_vm.h"
> +#include "amdgpu_gmc.h"
> +
> +struct amdgpu_tlb_fence {
> +	struct dma_fence	base;
> +	struct amdgpu_device	*adev;
> +	struct dma_fence	*dependency;
> +	struct work_struct	work;
> +	spinlock_t		lock;
> +	uint16_t		pasid;
> +
> +};
> +
> +static const char *amdgpu_tlb_fence_get_driver_name(struct dma_fence *fence)
> +{
> +	return "amdgpu tlb fence";
> +}
> +
> +static const char *amdgpu_tlb_fence_get_timeline_name(struct dma_fence *f)
> +{
> +	return "amdgpu tlb timeline";
> +}
> +
> +static void amdgpu_tlb_fence_work(struct work_struct *work)
> +{
> +	struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
> +	int r;
> +
> +	if (f->dependency) {
> +		dma_fence_wait(f->dependency, false);
> +		dma_fence_put(f->dependency);
> +		f->dependency = NULL;
> +	}
> +
> +	r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
> +	if (r) {
> +		dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n",
> +			f->pasid);
> +		dma_fence_set_error(&f->base, r);
> +	}
> +
> +	dma_fence_signal(&f->base);
> +	dma_fence_put(&f->base);
> +}
> +
> +static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
> +	.use_64bit_seqno = true,
> +	.get_driver_name = amdgpu_tlb_fence_get_driver_name,
> +	.get_timeline_name = amdgpu_tlb_fence_get_timeline_name
> +};
> +
> +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> +				struct dma_fence **fence)
> +{
> +	struct amdgpu_tlb_fence *f;
> +
> +	f = kmalloc(sizeof(*f), GFP_KERNEL);
> +	if (!f) {
> +		/*
> +		 * We can't fail since the PDEs and PTEs are already updated, so
> +		 * just block for the dependency and execute the TLB flush
> +		 */
> +		if (*fence)
> +			dma_fence_wait(*fence, false);
> +
> +		amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, 2, true, 0);
> +		*fence = dma_fence_get_stub();
> +		return;
> +	}
> +
> +	f->adev = adev;
> +	f->dependency = *fence;
> +	f->pasid = vm->pasid;
> +	INIT_WORK(&f->work, amdgpu_tlb_fence_work);
> +	spin_lock_init(&f->lock);
> +
> +	dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock,
> +		       vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));
> +
> +	/* TODO: We probably need a separate wq here */
> +	dma_fence_get(&f->base);
> +	schedule_work(&f->work);
> +
> +	*fence = &f->base;
> +}


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8] drm/amdgpu: sync page table freeing with tlb flush
  2024-03-18 14:44         ` [PATCH v8] " Shashank Sharma
@ 2024-03-18 15:01           ` Christian König
  2024-03-18 15:22             ` Sharma, Shashank
  0 siblings, 1 reply; 12+ messages in thread
From: Christian König @ 2024-03-18 15:01 UTC (permalink / raw)
  To: Shashank Sharma, amd-gfx; +Cc: Alex Deucher, Felix Kuehling, Rajneesh Bhardwaj

On 18.03.24 15:44, Shashank Sharma wrote:
> The idea behind this patch is to delay the freeing of PT entry objects
> until the TLB flush is done.
>
> This patch:
> - Adds a tlb_flush_waitlist in amdgpu_vm_update_params which will keep the
>    objects that need to be freed after tlb_flush.
> - Adds PT entries in this list in amdgpu_vm_ptes_update after finding
>    the PT entry.
> - Changes the functionality of amdgpu_vm_pt_free_dfs from (DFS search + free)
>    to simply freeing the BOs, and renames it to
>    amdgpu_vm_pt_free_list to reflect this.
> - Exports function amdgpu_vm_pt_free_list to be called directly.
> - Calls amdgpu_vm_pt_free_list directly from amdgpu_vm_update_range.
>
> V2: rebase
> V4: Addressed review comments from Christian
>      - add only locked PTEs entries in TLB flush waitlist.
>      - do not create a separate function for list flush.
>      - do not create a new lock for TLB flush.
>      - there is no need to wait on tlb_flush_fence exclusively.
>
> V5: Addressed review comments from Christian
>      - change the amdgpu_vm_pt_free_dfs's functionality to simple freeing
>        of the objects and rename it.
>      - add all the PTE objects in params->tlb_flush_waitlist
>      - let amdgpu_vm_pt_free_root handle the freeing of BOs independently
>      - call amdgpu_vm_pt_free_list directly
>
> V6: Rebase
> V7: Rebase
> V8: Added a NULL check to fix this backtrace issue:
> [  415.351447] BUG: kernel NULL pointer dereference, address: 0000000000000008
> [  415.359245] #PF: supervisor write access in kernel mode
> [  415.365081] #PF: error_code(0x0002) - not-present page
> [  415.370817] PGD 101259067 P4D 101259067 PUD 10125a067 PMD 0
> [  415.377140] Oops: 0002 [#1] PREEMPT SMP NOPTI
> [  415.382004] CPU: 0 PID: 25481 Comm: test_with_MPI.e Tainted: G           OE     5.18.2-mi300-build-140423-ubuntu-22.04+ #24
> [  415.394437] Hardware name: AMD Corporation Sh51p/Sh51p, BIOS RMO1001AS 02/21/2024
> [  415.402797] RIP: 0010:amdgpu_vm_ptes_update+0x6fd/0xa10 [amdgpu]
> [  415.409648] Code: 4c 89 ff 4d 8d 66 30 e8 f1 ed ff ff 48 85 db 74 42 48 39 5d a0 74 40 48 8b 53 20 48 8b 4b 18 48 8d 43 18 48 8d 75 b0 4c 89 ff <48
>> 89 51 08 48 89 0a 49 8b 56 30 48 89 42 08 48 89 53 18 4c 89 63
> [  415.430621] RSP: 0018:ffffc9000401f990 EFLAGS: 00010287
> [  415.436456] RAX: ffff888147bb82f0 RBX: ffff888147bb82d8 RCX: 0000000000000000
> [  415.444426] RDX: 0000000000000000 RSI: ffffc9000401fa30 RDI: ffff888161f80000
> [  415.452397] RBP: ffffc9000401fa80 R08: 0000000000000000 R09: ffffc9000401fa00
> [  415.460368] R10: 00000007f0cc0000 R11: 00000007f0c85000 R12: ffffc9000401fb20
> [  415.468340] R13: 00000007f0d00000 R14: ffffc9000401faf0 R15: ffff888161f80000
> [  415.476312] FS:  00007f132ff89840(0000) GS:ffff889f87c00000(0000) knlGS:0000000000000000
> [  415.485350] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  415.491767] CR2: 0000000000000008 CR3: 0000000161d46003 CR4: 0000000000770ef0
> [  415.499738] PKRU: 55555554
> [  415.502750] Call Trace:
> [  415.505482]  <TASK>
> [  415.507825]  amdgpu_vm_update_range+0x32a/0x880 [amdgpu]
> [  415.513869]  amdgpu_vm_clear_freed+0x117/0x250 [amdgpu]
> [  415.519814]  amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x18c/0x250 [amdgpu]
> [  415.527729]  kfd_ioctl_unmap_memory_from_gpu+0xed/0x340 [amdgpu]
> [  415.534551]  kfd_ioctl+0x3b6/0x510 [amdgpu]
>
> Cc: Christian König <Christian.Koenig@amd.com>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: Felix Kuehling <felix.kuehling@amd.com>
> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
> Acked-by: Felix Kuehling <felix.kuehling@amd.com>
> Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
> Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  5 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  7 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 58 +++++++++++++----------
>   3 files changed, 45 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 26f1c3359642..eaa402f99fe0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -977,6 +977,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	params.unlocked = unlocked;
>   	params.needs_flush = flush_tlb;
>   	params.allow_override = allow_override;
> +	INIT_LIST_HEAD(&params.tlb_flush_waitlist);
>   
>   	/* Implicitly sync to command submissions in the same VM before
>   	 * unmapping. Sync to moving fences before mapping.
> @@ -1062,8 +1063,10 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	if (r)
>   		goto error_unlock;
>   
> -	if (params.needs_flush)
> +	if (params.needs_flush) {
>   		r = amdgpu_vm_tlb_flush(&params, fence);
> +		amdgpu_vm_pt_free_list(adev, &params);

This is completely independent of the VM flush and should always be called.
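
i.e. something like (sketch):

	if (params.needs_flush)
		r = amdgpu_vm_tlb_flush(&params, fence);

	amdgpu_vm_pt_free_list(adev, &params);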

> +	}
>   
>   error_unlock:
>   	amdgpu_vm_eviction_unlock(vm);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index b0a4fe683352..54d7da396de0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -266,6 +266,11 @@ struct amdgpu_vm_update_params {
>   	 * to be overridden for NUMA local memory.
>   	 */
>   	bool allow_override;
> +
> +	/**
> +	 * @tlb_flush_waitlist: temporary storage for BOs until tlb_flush
> +	 */
> +	struct list_head tlb_flush_waitlist;
>   };
>   
>   struct amdgpu_vm_update_funcs {
> @@ -547,6 +552,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
>   			  uint64_t start, uint64_t end,
>   			  uint64_t dst, uint64_t flags);
>   void amdgpu_vm_pt_free_work(struct work_struct *work);
> +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
> +			    struct amdgpu_vm_update_params *params);
>   
>   #if defined(CONFIG_DEBUG_FS)
>   void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index 601df0ce8290..9231edfb427e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -622,40 +622,30 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)
>   }
>   
>   /**
> - * amdgpu_vm_pt_free_dfs - free PD/PT levels
> + * amdgpu_vm_pt_free_list - free PD/PT levels
>    *
>    * @adev: amdgpu device structure
> - * @vm: amdgpu vm structure
> - * @start: optional cursor where to start freeing PDs/PTs
> - * @unlocked: vm resv unlock status
> + * @params: see amdgpu_vm_update_params definition
>    *
> - * Free the page directory or page table level and all sub levels.
> + * Free the page directory objects saved in the flush list
>    */
> -static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
> -				  struct amdgpu_vm *vm,
> -				  struct amdgpu_vm_pt_cursor *start,
> -				  bool unlocked)
> +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
> +			    struct amdgpu_vm_update_params *params)
>   {
> -	struct amdgpu_vm_pt_cursor cursor;
> -	struct amdgpu_vm_bo_base *entry;
> +	struct amdgpu_vm_bo_base *entry, *next;
> +	struct amdgpu_vm *vm = params->vm;
> +	bool unlocked = params->unlocked;
>   
>   	if (unlocked) {
>   		spin_lock(&vm->status_lock);
> -		for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
> -			list_move(&entry->vm_status, &vm->pt_freed);
> -
> -		if (start)
> -			list_move(&start->entry->vm_status, &vm->pt_freed);
> +		list_splice_init(&vm->pt_freed, &params->tlb_flush_waitlist);
>   		spin_unlock(&vm->status_lock);
>   		schedule_work(&vm->pt_free_work);
>   		return;
>   	}
>   
> -	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
> +	list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist, vm_status)
>   		amdgpu_vm_pt_free(entry);
> -
> -	if (start)
> -		amdgpu_vm_pt_free(start->entry);
>   }
>   
>   /**
> @@ -667,7 +657,13 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
>    */
>   void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   {
> -	amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
> +	struct amdgpu_vm_pt_cursor cursor;
> +	struct amdgpu_vm_bo_base *entry;
> +
> +	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) {
> +		if (entry)

Entry can't be NULL here I think.

> +			amdgpu_vm_pt_free(entry);
> +	}
>   }
>   
>   /**
> @@ -972,10 +968,24 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
>   			while (cursor.pfn < frag_start) {
>   				/* Make sure previous mapping is freed */
>   				if (cursor.entry->bo) {
> +					struct amdgpu_vm_pt_cursor seek;
> +					struct amdgpu_vm_bo_base *entry;
> +
>   					params->needs_flush = true;
> -					amdgpu_vm_pt_free_dfs(adev, params->vm,
> -							      &cursor,
> -							      params->unlocked);
> +					spin_lock(&params->vm->status_lock);
> +					for_each_amdgpu_vm_pt_dfs_safe(adev, params->vm, &cursor,
> +								       seek, entry) {
> +						if (!entry || !entry->bo)
> +							continue;
> +
> +						list_move(&entry->vm_status,
> +							  &params->tlb_flush_waitlist);
> +					}
> +
> +					/* enter start node now */
> +					list_move(&cursor.entry->vm_status,
> +						  &params->tlb_flush_waitlist);
> +					spin_unlock(&params->vm->status_lock);

I would put this into a separate function maybe.
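
E.g. a small helper along these lines (name is only a suggestion):

static void amdgpu_vm_pt_add_list(struct amdgpu_vm_update_params *params,
				  struct amdgpu_vm_pt_cursor *cursor)
{
	struct amdgpu_vm_pt_cursor seek;
	struct amdgpu_vm_bo_base *entry;

	spin_lock(&params->vm->status_lock);
	for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm, cursor, seek, entry)
		list_move(&entry->vm_status, &params->tlb_flush_waitlist);

	/* enter start node now */
	list_move(&cursor->entry->vm_status, &params->tlb_flush_waitlist);
	spin_unlock(&params->vm->status_lock);
}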

Apart from those nit-picks looks good to me.

Regards,
Christian.

>   				}
>   				amdgpu_vm_pt_next(adev, &cursor);
>   			}


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/2] drm/amdgpu: implement TLB flush fence
  2024-03-18 14:58 ` [PATCH v7 1/2] drm/amdgpu: implement TLB flush fence Christian König
@ 2024-03-18 15:19   ` Sharma, Shashank
  0 siblings, 0 replies; 12+ messages in thread
From: Sharma, Shashank @ 2024-03-18 15:19 UTC (permalink / raw)
  To: Christian König, amd-gfx
  Cc: Felix Kuehling, Rajneesh Bhardwaj, Alex Deucher


On 18/03/2024 15:58, Christian König wrote:
> On 18.03.24 13:08, Shashank Sharma wrote:
>> From: Christian Koenig <christian.koenig@amd.com>
>>
>> The problem is that when (for example) 4k pages are replaced
>> with a single 2M page we need to wait for change to be flushed
>> out by invalidating the TLB before the PT can be freed.
>>
>> Solve this by moving the TLB flush into a DMA-fence object which
>> can be used to delay the freeing of the PT BOs until it is signaled.
>>
>> V2: (Shashank)
>>      - rebase
>>      - set dma_fence_error only in case of error
>>      - add tlb_flush fence only when PT/PD BO is locked (Felix)
>>      - use vm->pasid when f is NULL (Mukul)
>>
>> V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
>>      - move the misplaced fence_create call to the end (Philip)
>>
>> V5: - free the f->dependency properly
>>
>> V6: (Shashank)
>>      - light code movement, moved all the clean-up in previous patch
>>      - introduce params.needs_flush and its usage in this patch
>>      - rebase without TLB HW sequence patch
>>
>> V7:
>>     - Keep the vm->last_update_fence and tlb_cb code until
>>       we can fix the HW sequencing (Christian)
>>     - Move all the tlb_fence related code in a separate function so that
>>       its easier to read and review
>>
>> Cc: Christian Koenig <christian.koenig@amd.com>
>> Cc: Felix Kuehling <Felix.Kuehling@amd.com>
>> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>> Cc: Alex Deucher <alexander.deucher@amd.com>
>> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>> Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>> Reviewed-by: Shashank Sharma <shashank.sharma@amd.com>
>> Signed-off-by: Christian Koenig <christian.koenig@amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/Makefile           |   3 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  68 +++++++----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |   8 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c    |   4 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c     |   2 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c   |   4 +
>>   .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 112 ++++++++++++++++++
>>   7 files changed, 171 insertions(+), 30 deletions(-)
>>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
>> b/drivers/gpu/drm/amd/amdgpu/Makefile
>> index 4536c8ad0e11..f24f11ac3e92 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
>> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
>> @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o 
>> amdgpu_kms.o \
>>       amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
>>       atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
>>       atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
>> -    amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o 
>> amdgpu_pll.o \
>> +    amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
>> +    amdgpu_ib.o amdgpu_pll.o \
>>       amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
>>       amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o 
>> amdgpu_virt.o \
>>       amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 81fb3465e197..26f1c3359642 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -885,6 +885,40 @@ static void amdgpu_vm_tlb_seq_cb(struct 
>> dma_fence *fence,
>>       kfree(tlb_cb);
>>   }
>>   +static int
>> +amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params, struct 
>> dma_fence **fence)
>> +{
>> +    struct amdgpu_vm_tlb_seq_struct *tlb_cb;
>> +    struct amdgpu_vm *vm = params->vm;
>> +
>> +    if (!fence || !*fence)
>> +        return 0;
>> +
>> +    tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
>> +    if (!tlb_cb)
>> +        return -ENOMEM;
>
> That won't work like this. The page tables are already updated, so you 
> need to have the callback no matter what.
>
> That's why the code previously allocated the tlb_cb structure before 
> updating the page tables.


I could see that the previous code also handled the cb allocation
failing (OOM), but I think your point is 'do not start the PT update
until we have memory for the callback', so I will accommodate that.

- Shashank

>
> Regards,
> Christian.
>
>> +
>> +    tlb_cb->vm = vm;
>> +    if (!dma_fence_add_callback(*fence, &tlb_cb->cb,
>> +                    amdgpu_vm_tlb_seq_cb)) {
>> +        dma_fence_put(vm->last_tlb_flush);
>> +        vm->last_tlb_flush = dma_fence_get(*fence);
>> +    } else {
>> +        amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
>> +    }
>> +
>> +    /* Prepare a TLB flush fence to be attached to PTs */
>> +    if (!params->unlocked && vm->is_compute_context) {
>> +        amdgpu_vm_tlb_fence_create(params->adev, vm, fence);
>> +
>> +        /* Makes sure no PD/PT is freed before the flush */
>> +        dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
>> +                   DMA_RESV_USAGE_BOOKKEEP);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   /**
>>    * amdgpu_vm_update_range - update a range in the vm page table
>>    *
>> @@ -917,7 +951,6 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>                  struct dma_fence **fence)
>>   {
>>       struct amdgpu_vm_update_params params;
>> -    struct amdgpu_vm_tlb_seq_struct *tlb_cb;
>>       struct amdgpu_res_cursor cursor;
>>       enum amdgpu_sync_mode sync_mode;
>>       int r, idx;
>> @@ -925,12 +958,6 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>       if (!drm_dev_enter(adev_to_drm(adev), &idx))
>>           return -ENODEV;
>>   -    tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
>> -    if (!tlb_cb) {
>> -        r = -ENOMEM;
>> -        goto error_unlock;
>> -    }
>> -
>>       /* Vega20+XGMI where PTEs get inadvertently cached in L2 
>> texture cache,
>>        * heavy-weight flush TLB unconditionally.
>>        */
>> @@ -948,6 +975,7 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>       params.immediate = immediate;
>>       params.pages_addr = pages_addr;
>>       params.unlocked = unlocked;
>> +    params.needs_flush = flush_tlb;
>>       params.allow_override = allow_override;
>>         /* Implicitly sync to command submissions in the same VM before
>> @@ -961,7 +989,7 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>       amdgpu_vm_eviction_lock(vm);
>>       if (vm->evicting) {
>>           r = -EBUSY;
>> -        goto error_free;
>> +        goto error_unlock;
>>       }
>>         if (!unlocked && !dma_fence_is_signaled(vm->last_unlocked)) {
>> @@ -974,7 +1002,7 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>         r = vm->update_funcs->prepare(&params, resv, sync_mode);
>>       if (r)
>> -        goto error_free;
>> +        goto error_unlock;
>>         amdgpu_res_first(pages_addr ? NULL : res, offset,
>>                (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, &cursor);
>> @@ -1024,29 +1052,18 @@ int amdgpu_vm_update_range(struct 
>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>           tmp = start + num_entries;
>>           r = amdgpu_vm_ptes_update(&params, start, tmp, addr, flags);
>>           if (r)
>> -            goto error_free;
>> +            goto error_unlock;
>>             amdgpu_res_next(&cursor, num_entries * 
>> AMDGPU_GPU_PAGE_SIZE);
>>           start = tmp;
>>       }
>>         r = vm->update_funcs->commit(&params, fence);
>> +    if (r)
>> +        goto error_unlock;
>>   -    if (flush_tlb || params.table_freed) {
>> -        tlb_cb->vm = vm;
>> -        if (fence && *fence &&
>> -            !dma_fence_add_callback(*fence, &tlb_cb->cb,
>> -                       amdgpu_vm_tlb_seq_cb)) {
>> -            dma_fence_put(vm->last_tlb_flush);
>> -            vm->last_tlb_flush = dma_fence_get(*fence);
>> -        } else {
>> -            amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
>> -        }
>> -        tlb_cb = NULL;
>> -    }
>> -
>> -error_free:
>> -    kfree(tlb_cb);
>> +    if (params.needs_flush)
>> +        r = amdgpu_vm_tlb_flush(&params, fence);
>>     error_unlock:
>>       amdgpu_vm_eviction_unlock(vm);
>> @@ -2391,6 +2408,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, 
>> struct amdgpu_vm *vm,
>>         mutex_init(&vm->eviction_lock);
>>       vm->evicting = false;
>> +    vm->tlb_fence_context = dma_fence_context_alloc(1);
>>         r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
>>                   false, &root, xcp_id);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index 8efa8422f4f7..b0a4fe683352 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -257,9 +257,9 @@ struct amdgpu_vm_update_params {
>>       unsigned int num_dw_left;
>>         /**
>> -     * @table_freed: return true if page table is freed when updating
>> +     * @needs_flush: true whenever we need to invalidate the TLB
>>        */
>> -    bool table_freed;
>> +    bool needs_flush;
>>         /**
>>        * @allow_override: true for memory that is not uncached: 
>> allows MTYPE
>> @@ -342,6 +342,7 @@ struct amdgpu_vm {
>>       atomic64_t        tlb_seq;
>>       struct dma_fence    *last_tlb_flush;
>>       atomic64_t        kfd_last_flushed_seq;
>> +    uint64_t        tlb_fence_context;
>>         /* How many times we had to re-generate the page tables */
>>       uint64_t        generation;
>> @@ -611,5 +612,8 @@ void amdgpu_vm_update_fault_cache(struct 
>> amdgpu_device *adev,
>>                     uint64_t addr,
>>                     uint32_t status,
>>                     unsigned int vmhub);
>> +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
>> +                 struct amdgpu_vm *vm,
>> +                 struct dma_fence **fence);
>>     #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>> index 6e31621452de..3895bd7d176a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>> @@ -108,7 +108,9 @@ static int amdgpu_vm_cpu_update(struct 
>> amdgpu_vm_update_params *p,
>>   static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
>>                   struct dma_fence **fence)
>>   {
>> -    /* Flush HDP */
>> +    if (p->needs_flush)
>> +        atomic64_inc(&p->vm->tlb_seq);
>> +
>>       mb();
>>       amdgpu_device_flush_hdp(p->adev, NULL);
>>       return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> index 124389a6bf48..601df0ce8290 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> @@ -972,7 +972,7 @@ int amdgpu_vm_ptes_update(struct 
>> amdgpu_vm_update_params *params,
>>               while (cursor.pfn < frag_start) {
>>                   /* Make sure previous mapping is freed */
>>                   if (cursor.entry->bo) {
>> -                    params->table_freed = true;
>> +                    params->needs_flush = true;
>>                       amdgpu_vm_pt_free_dfs(adev, params->vm,
>>                                     &cursor,
>>                                     params->unlocked);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>> index 349416e176a1..66e8a016126b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>> @@ -126,6 +126,10 @@ static int amdgpu_vm_sdma_commit(struct 
>> amdgpu_vm_update_params *p,
>>         WARN_ON(ib->length_dw == 0);
>>       amdgpu_ring_pad_ib(ring, ib);
>> +
>> +    if (p->needs_flush)
>> +        atomic64_inc(&p->vm->tlb_seq);
>> +
>>       WARN_ON(ib->length_dw > p->num_dw_left);
>>       f = amdgpu_job_submit(p->job);
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>> new file mode 100644
>> index 000000000000..51cddfa3f1e8
>> --- /dev/null
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>> @@ -0,0 +1,112 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR MIT
>> +/*
>> + * Copyright 2023 Advanced Micro Devices, Inc.
>> + *
>> + * Permission is hereby granted, free of charge, to any person 
>> obtaining a
>> + * copy of this software and associated documentation files (the 
>> "Software"),
>> + * to deal in the Software without restriction, including without 
>> limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, 
>> sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom 
>> the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be 
>> included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO 
>> EVENT SHALL
>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
>> DAMAGES OR
>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
>> OTHERWISE,
>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
>> USE OR
>> + * OTHER DEALINGS IN THE SOFTWARE.
>> + */
>> +
>> +#include <linux/dma-fence.h>
>> +#include <linux/workqueue.h>
>> +
>> +#include "amdgpu.h"
>> +#include "amdgpu_vm.h"
>> +#include "amdgpu_gmc.h"
>> +
>> +struct amdgpu_tlb_fence {
>> +    struct dma_fence    base;
>> +    struct amdgpu_device    *adev;
>> +    struct dma_fence    *dependency;
>> +    struct work_struct    work;
>> +    spinlock_t        lock;
>> +    uint16_t        pasid;
>> +
>> +};
>> +
>> +static const char *amdgpu_tlb_fence_get_driver_name(struct dma_fence 
>> *fence)
>> +{
>> +    return "amdgpu tlb fence";
>> +}
>> +
>> +static const char *amdgpu_tlb_fence_get_timeline_name(struct 
>> dma_fence *f)
>> +{
>> +    return "amdgpu tlb timeline";
>> +}
>> +
>> +static void amdgpu_tlb_fence_work(struct work_struct *work)
>> +{
>> +    struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
>> +    int r;
>> +
>> +    if (f->dependency) {
>> +        dma_fence_wait(f->dependency, false);
>> +        dma_fence_put(f->dependency);
>> +        f->dependency = NULL;
>> +    }
>> +
>> +    r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
>> +    if (r) {
>> +        dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n",
>> +            f->pasid);
>> +        dma_fence_set_error(&f->base, r);
>> +    }
>> +
>> +    dma_fence_signal(&f->base);
>> +    dma_fence_put(&f->base);
>> +}
>> +
>> +static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
>> +    .use_64bit_seqno = true,
>> +    .get_driver_name = amdgpu_tlb_fence_get_driver_name,
>> +    .get_timeline_name = amdgpu_tlb_fence_get_timeline_name
>> +};
>> +
>> +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct 
>> amdgpu_vm *vm,
>> +                struct dma_fence **fence)
>> +{
>> +    struct amdgpu_tlb_fence *f;
>> +
>> +    f = kmalloc(sizeof(*f), GFP_KERNEL);
>> +    if (!f) {
>> +        /*
>> +         * We can't fail since the PDEs and PTEs are already 
>> updated, so
>> +         * just block for the dependency and execute the TLB flush
>> +         */
>> +        if (*fence)
>> +            dma_fence_wait(*fence, false);
>> +
>> +        amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, 2, true, 0);
>> +        *fence = dma_fence_get_stub();
>> +        return;
>> +    }
>> +
>> +    f->adev = adev;
>> +    f->dependency = *fence;
>> +    f->pasid = vm->pasid;
>> +    INIT_WORK(&f->work, amdgpu_tlb_fence_work);
>> +    spin_lock_init(&f->lock);
>> +
>> +    dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock,
>> +               vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));
>> +
>> +    /* TODO: We probably need a separate wq here */
>> +    dma_fence_get(&f->base);
>> +    schedule_work(&f->work);
>> +
>> +    *fence = &f->base;
>> +}
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8] drm/amdgpu: sync page table freeing with tlb flush
  2024-03-18 15:01           ` Christian König
@ 2024-03-18 15:22             ` Sharma, Shashank
  2024-03-18 15:24               ` Christian König
  0 siblings, 1 reply; 12+ messages in thread
From: Sharma, Shashank @ 2024-03-18 15:22 UTC (permalink / raw)
  To: Christian König, amd-gfx
  Cc: Alex Deucher, Felix Kuehling, Rajneesh Bhardwaj


On 18/03/2024 16:01, Christian König wrote:
> On 18.03.24 15:44, Shashank Sharma wrote:
>> The idea behind this patch is to delay the freeing of PT entry objects
>> until the TLB flush is done.
>>
>> This patch:
>> - Adds a tlb_flush_waitlist in amdgpu_vm_update_params which will 
>> keep the
>>    objects that need to be freed after tlb_flush.
>> - Adds PT entries in this list in amdgpu_vm_ptes_update after finding
>>    the PT entry.
>> - Changes the functionality of amdgpu_vm_pt_free_dfs from (DFS search + free)
>>    to simply freeing the BOs, and renames it to
>>    amdgpu_vm_pt_free_list to reflect this.
>> - Exports function amdgpu_vm_pt_free_list to be called directly.
>> - Calls amdgpu_vm_pt_free_list directly from amdgpu_vm_update_range.
>>
>> V2: rebase
>> V4: Addressed review comments from Christian
>>      - add only locked PTEs entries in TLB flush waitlist.
>>      - do not create a separate function for list flush.
>>      - do not create a new lock for TLB flush.
>>      - there is no need to wait on tlb_flush_fence exclusively.
>>
>> V5: Addressed review comments from Christian
>>      - change the amdgpu_vm_pt_free_dfs's functionality to simple 
>> freeing
>>        of the objects and rename it.
>>      - add all the PTE objects in params->tlb_flush_waitlist
>>      - let amdgpu_vm_pt_free_root handle the freeing of BOs 
>> independently
>>      - call amdgpu_vm_pt_free_list directly
>>
>> V6: Rebase
>> V7: Rebase
>> V8: Added a NULL check to fix this backtrace issue:
>> [  415.351447] BUG: kernel NULL pointer dereference, address: 
>> 0000000000000008
>> [  415.359245] #PF: supervisor write access in kernel mode
>> [  415.365081] #PF: error_code(0x0002) - not-present page
>> [  415.370817] PGD 101259067 P4D 101259067 PUD 10125a067 PMD 0
>> [  415.377140] Oops: 0002 [#1] PREEMPT SMP NOPTI
>> [  415.382004] CPU: 0 PID: 25481 Comm: test_with_MPI.e Tainted: 
>> G           OE     5.18.2-mi300-build-140423-ubuntu-22.04+ #24
>> [  415.394437] Hardware name: AMD Corporation Sh51p/Sh51p, BIOS 
>> RMO1001AS 02/21/2024
>> [  415.402797] RIP: 0010:amdgpu_vm_ptes_update+0x6fd/0xa10 [amdgpu]
>> [  415.409648] Code: 4c 89 ff 4d 8d 66 30 e8 f1 ed ff ff 48 85 db 74 
>> 42 48 39 5d a0 74 40 48 8b 53 20 48 8b 4b 18 48 8d 43 18 48 8d 75 b0 
>> 4c 89 ff <48
>>> 89 51 08 48 89 0a 49 8b 56 30 48 89 42 08 48 89 53 18 4c 89 63
>> [  415.430621] RSP: 0018:ffffc9000401f990 EFLAGS: 00010287
>> [  415.436456] RAX: ffff888147bb82f0 RBX: ffff888147bb82d8 RCX: 
>> 0000000000000000
>> [  415.444426] RDX: 0000000000000000 RSI: ffffc9000401fa30 RDI: 
>> ffff888161f80000
>> [  415.452397] RBP: ffffc9000401fa80 R08: 0000000000000000 R09: 
>> ffffc9000401fa00
>> [  415.460368] R10: 00000007f0cc0000 R11: 00000007f0c85000 R12: 
>> ffffc9000401fb20
>> [  415.468340] R13: 00000007f0d00000 R14: ffffc9000401faf0 R15: 
>> ffff888161f80000
>> [  415.476312] FS:  00007f132ff89840(0000) GS:ffff889f87c00000(0000) 
>> knlGS:0000000000000000
>> [  415.485350] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [  415.491767] CR2: 0000000000000008 CR3: 0000000161d46003 CR4: 
>> 0000000000770ef0
>> [  415.499738] PKRU: 55555554
>> [  415.502750] Call Trace:
>> [  415.505482]  <TASK>
>> [  415.507825]  amdgpu_vm_update_range+0x32a/0x880 [amdgpu]
>> [  415.513869]  amdgpu_vm_clear_freed+0x117/0x250 [amdgpu]
>> [  415.519814] amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x18c/0x250 
>> [amdgpu]
>> [  415.527729]  kfd_ioctl_unmap_memory_from_gpu+0xed/0x340 [amdgpu]
>> [  415.534551]  kfd_ioctl+0x3b6/0x510 [amdgpu]
>>
>> Cc: Christian König <Christian.Koenig@amd.com>
>> Cc: Alex Deucher <alexander.deucher@amd.com>
>> Cc: Felix Kuehling <felix.kuehling@amd.com>
>> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>> Acked-by: Felix Kuehling <felix.kuehling@amd.com>
>> Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>> Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  5 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  7 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 58 +++++++++++++----------
>>   3 files changed, 45 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 26f1c3359642..eaa402f99fe0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -977,6 +977,7 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>       params.unlocked = unlocked;
>>       params.needs_flush = flush_tlb;
>>       params.allow_override = allow_override;
>> +    INIT_LIST_HEAD(&params.tlb_flush_waitlist);
>>         /* Implicitly sync to command submissions in the same VM before
>>        * unmapping. Sync to moving fences before mapping.
>> @@ -1062,8 +1063,10 @@ int amdgpu_vm_update_range(struct 
>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>       if (r)
>>           goto error_unlock;
>>   -    if (params.needs_flush)
>> +    if (params.needs_flush) {
>>           r = amdgpu_vm_tlb_flush(&params, fence);
>> +        amdgpu_vm_pt_free_list(adev, &params);
>
> This is completed independent of the VM flush and should always be 
> called.
>
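
(For illustration, the hunk with the freeing made unconditional, as this comment asks for, could look roughly like the sketch below. This is only a sketch of the suggestion, reusing the lines already visible in the patch, not the hunk that was eventually posted:)

    if (r)
        goto error_unlock;

    if (params.needs_flush)
        r = amdgpu_vm_tlb_flush(&params, fence);

    /* The PT freeing is independent of the flush result, so always
     * drain the waitlist collected during the update.
     */
    amdgpu_vm_pt_free_list(adev, &params);

error_unlock:
    amdgpu_vm_eviction_unlock(vm);
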
>> +    }
>>     error_unlock:
>>       amdgpu_vm_eviction_unlock(vm);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index b0a4fe683352..54d7da396de0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -266,6 +266,11 @@ struct amdgpu_vm_update_params {
>>        * to be overridden for NUMA local memory.
>>        */
>>       bool allow_override;
>> +
>> +    /**
>> +     * @tlb_flush_waitlist: temporary storage for BOs until tlb_flush
>> +     */
>> +    struct list_head tlb_flush_waitlist;
>>   };
>>     struct amdgpu_vm_update_funcs {
>> @@ -547,6 +552,8 @@ int amdgpu_vm_ptes_update(struct 
>> amdgpu_vm_update_params *params,
>>                 uint64_t start, uint64_t end,
>>                 uint64_t dst, uint64_t flags);
>>   void amdgpu_vm_pt_free_work(struct work_struct *work);
>> +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
>> +                struct amdgpu_vm_update_params *params);
>>     #if defined(CONFIG_DEBUG_FS)
>>   void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct 
>> seq_file *m);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> index 601df0ce8290..9231edfb427e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> @@ -622,40 +622,30 @@ void amdgpu_vm_pt_free_work(struct work_struct 
>> *work)
>>   }
>>     /**
>> - * amdgpu_vm_pt_free_dfs - free PD/PT levels
>> + * amdgpu_vm_pt_free_list - free PD/PT levels
>>    *
>>    * @adev: amdgpu device structure
>> - * @vm: amdgpu vm structure
>> - * @start: optional cursor where to start freeing PDs/PTs
>> - * @unlocked: vm resv unlock status
>> + * @params: see amdgpu_vm_update_params definition
>>    *
>> - * Free the page directory or page table level and all sub levels.
>> + * Free the page directory objects saved in the flush list
>>    */
>> -static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
>> -                  struct amdgpu_vm *vm,
>> -                  struct amdgpu_vm_pt_cursor *start,
>> -                  bool unlocked)
>> +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
>> +                struct amdgpu_vm_update_params *params)
>>   {
>> -    struct amdgpu_vm_pt_cursor cursor;
>> -    struct amdgpu_vm_bo_base *entry;
>> +    struct amdgpu_vm_bo_base *entry, *next;
>> +    struct amdgpu_vm *vm = params->vm;
>> +    bool unlocked = params->unlocked;
>>         if (unlocked) {
>>           spin_lock(&vm->status_lock);
>> -        for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
>> -            list_move(&entry->vm_status, &vm->pt_freed);
>> -
>> -        if (start)
>> -            list_move(&start->entry->vm_status, &vm->pt_freed);
>> +        list_splice_init(&vm->pt_freed, &params->tlb_flush_waitlist);
>>           spin_unlock(&vm->status_lock);
>>           schedule_work(&vm->pt_free_work);
>>           return;
>>       }
>>   -    for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
>> +    list_for_each_entry_safe(entry, next, 
>> &params->tlb_flush_waitlist, vm_status)
>>           amdgpu_vm_pt_free(entry);
>> -
>> -    if (start)
>> -        amdgpu_vm_pt_free(start->entry);
>>   }
>>     /**
>> @@ -667,7 +657,13 @@ static void amdgpu_vm_pt_free_dfs(struct 
>> amdgpu_device *adev,
>>    */
>>   void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct 
>> amdgpu_vm *vm)
>>   {
>> -    amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
>> +    struct amdgpu_vm_pt_cursor cursor;
>> +    struct amdgpu_vm_bo_base *entry;
>> +
>> +    for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) {
>> +        if (entry)
>
> Entry can't be NULL here I think.
Yeah, that makes sense; if there is a GPU process, it will have at least one 
page table entry.
>
>> +            amdgpu_vm_pt_free(entry);
>> +    }
>>   }
>>     /**
>> @@ -972,10 +968,24 @@ int amdgpu_vm_ptes_update(struct 
>> amdgpu_vm_update_params *params,
>>               while (cursor.pfn < frag_start) {
>>                   /* Make sure previous mapping is freed */
>>                   if (cursor.entry->bo) {
>> +                    struct amdgpu_vm_pt_cursor seek;
>> +                    struct amdgpu_vm_bo_base *entry;
>> +
>>                       params->needs_flush = true;
>> -                    amdgpu_vm_pt_free_dfs(adev, params->vm,
>> -                                  &cursor,
>> -                                  params->unlocked);
>> + spin_lock(&params->vm->status_lock);
>> +                    for_each_amdgpu_vm_pt_dfs_safe(adev, params->vm, 
>> &cursor,
>> +                                       seek, entry) {
>> +                        if (!entry || !entry->bo)
>> +                            continue;
>> +
>> +                        list_move(&entry->vm_status,
>> + &params->tlb_flush_waitlist);
>> +                    }
>> +
>> +                    /* enter start node now */
>> +                    list_move(&cursor.entry->vm_status,
>> +                          &params->tlb_flush_waitlist);
>> + spin_unlock(&params->vm->status_lock);
>
> I would put this into a separate function maybe.

Noted,

- Shashank
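
(For illustration, the separate function suggested above might look roughly like the sketch below; the helper name amdgpu_vm_pt_add_list() and its exact signature are assumptions made for this example, not something taken from the posted series:)

    /*
     * Hypothetical helper: move the PDs/PTs reachable from @cursor onto
     * params->tlb_flush_waitlist so they are only freed after the TLB flush.
     */
    static void amdgpu_vm_pt_add_list(struct amdgpu_device *adev,
                                      struct amdgpu_vm_update_params *params,
                                      struct amdgpu_vm_pt_cursor *cursor)
    {
        struct amdgpu_vm_pt_cursor seek;
        struct amdgpu_vm_bo_base *entry;

        spin_lock(&params->vm->status_lock);
        for_each_amdgpu_vm_pt_dfs_safe(adev, params->vm, cursor, seek, entry) {
            if (entry && entry->bo)
                list_move(&entry->vm_status, &params->tlb_flush_waitlist);
        }

        /* finally queue the start node itself */
        list_move(&cursor->entry->vm_status, &params->tlb_flush_waitlist);
        spin_unlock(&params->vm->status_lock);
    }

amdgpu_vm_ptes_update() would then just set params->needs_flush and call the helper when it finds a previous mapping that has to be torn down.
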

>
> Apart from those nit-picks looks good to me.
>
> Regards,
> Christian.
>
>>                   }
>>                   amdgpu_vm_pt_next(adev, &cursor);
>>               }
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8] drm/amdgpu: sync page table freeing with tlb flush
  2024-03-18 15:22             ` Sharma, Shashank
@ 2024-03-18 15:24               ` Christian König
  2024-03-21 14:18                 ` Sharma, Shashank
  0 siblings, 1 reply; 12+ messages in thread
From: Christian König @ 2024-03-18 15:24 UTC (permalink / raw)
  To: Sharma, Shashank, amd-gfx; +Cc: Alex Deucher, Felix Kuehling, Rajneesh Bhardwaj

On 18.03.24 16:22, Sharma, Shashank wrote:
>
> On 18/03/2024 16:01, Christian König wrote:
>> On 18.03.24 15:44, Shashank Sharma wrote:
>>> The idea behind this patch is to delay the freeing of PT entry objects
>>> until the TLB flush is done.
>>>
>>> This patch:
>>> - Adds a tlb_flush_waitlist in amdgpu_vm_update_params which will 
>>> keep the
>>>    objects that need to be freed after tlb_flush.
>>> - Adds PT entries in this list in amdgpu_vm_ptes_update after finding
>>>    the PT entry.
>>> - Changes functionality of amdgpu_vm_pt_free_dfs from (df_search + 
>>> free)
>>>    to simply freeing the BOs, and renames it to
>>>    amdgpu_vm_pt_free_list to reflect the same.
>>> - Exports function amdgpu_vm_pt_free_list to be called directly.
>>> - Calls amdgpu_vm_pt_free_list directly from amdgpu_vm_update_range.
>>>
>>> V2: rebase
>>> V4: Addressed review comments from Christian
>>>      - add only locked PTEs entries in TLB flush waitlist.
>>>      - do not create a separate function for list flush.
>>>      - do not create a new lock for TLB flush.
>>>      - there is no need to wait on tlb_flush_fence exclusively.
>>>
>>> V5: Addressed review comments from Christian
>>>      - change the amdgpu_vm_pt_free_dfs's functionality to simple 
>>> freeing
>>>        of the objects and rename it.
>>>      - add all the PTE objects in params->tlb_flush_waitlist
>>>      - let amdgpu_vm_pt_free_root handle the freeing of BOs 
>>> independently
>>>      - call amdgpu_vm_pt_free_list directly
>>>
>>> V6: Rebase
>>> V7: Rebase
>>> V8: Added a NULL check to fix this backtrace issue:
>>> [  415.351447] BUG: kernel NULL pointer dereference, address: 
>>> 0000000000000008
>>> [  415.359245] #PF: supervisor write access in kernel mode
>>> [  415.365081] #PF: error_code(0x0002) - not-present page
>>> [  415.370817] PGD 101259067 P4D 101259067 PUD 10125a067 PMD 0
>>> [  415.377140] Oops: 0002 [#1] PREEMPT SMP NOPTI
>>> [  415.382004] CPU: 0 PID: 25481 Comm: test_with_MPI.e Tainted: 
>>> G           OE 5.18.2-mi300-build-140423-ubuntu-22.04+ #24
>>> [  415.394437] Hardware name: AMD Corporation Sh51p/Sh51p, BIOS 
>>> RMO1001AS 02/21/2024
>>> [  415.402797] RIP: 0010:amdgpu_vm_ptes_update+0x6fd/0xa10 [amdgpu]
>>> [  415.409648] Code: 4c 89 ff 4d 8d 66 30 e8 f1 ed ff ff 48 85 db 74 
>>> 42 48 39 5d a0 74 40 48 8b 53 20 48 8b 4b 18 48 8d 43 18 48 8d 75 b0 
>>> 4c 89 ff <48
>>>> 89 51 08 48 89 0a 49 8b 56 30 48 89 42 08 48 89 53 18 4c 89 63
>>> [  415.430621] RSP: 0018:ffffc9000401f990 EFLAGS: 00010287
>>> [  415.436456] RAX: ffff888147bb82f0 RBX: ffff888147bb82d8 RCX: 
>>> 0000000000000000
>>> [  415.444426] RDX: 0000000000000000 RSI: ffffc9000401fa30 RDI: 
>>> ffff888161f80000
>>> [  415.452397] RBP: ffffc9000401fa80 R08: 0000000000000000 R09: 
>>> ffffc9000401fa00
>>> [  415.460368] R10: 00000007f0cc0000 R11: 00000007f0c85000 R12: 
>>> ffffc9000401fb20
>>> [  415.468340] R13: 00000007f0d00000 R14: ffffc9000401faf0 R15: 
>>> ffff888161f80000
>>> [  415.476312] FS:  00007f132ff89840(0000) GS:ffff889f87c00000(0000) 
>>> knlGS:0000000000000000
>>> [  415.485350] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> [  415.491767] CR2: 0000000000000008 CR3: 0000000161d46003 CR4: 
>>> 0000000000770ef0
>>> [  415.499738] PKRU: 55555554
>>> [  415.502750] Call Trace:
>>> [  415.505482]  <TASK>
>>> [  415.507825]  amdgpu_vm_update_range+0x32a/0x880 [amdgpu]
>>> [  415.513869]  amdgpu_vm_clear_freed+0x117/0x250 [amdgpu]
>>> [  415.519814] amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x18c/0x250 
>>> [amdgpu]
>>> [  415.527729]  kfd_ioctl_unmap_memory_from_gpu+0xed/0x340 [amdgpu]
>>> [  415.534551]  kfd_ioctl+0x3b6/0x510 [amdgpu]
>>>
>>> Cc: Christian König <Christian.Koenig@amd.com>
>>> Cc: Alex Deucher <alexander.deucher@amd.com>
>>> Cc: Felix Kuehling <felix.kuehling@amd.com>
>>> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>>> Acked-by: Felix Kuehling <felix.kuehling@amd.com>
>>> Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>>> Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  5 +-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  7 +++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 58 
>>> +++++++++++++----------
>>>   3 files changed, 45 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index 26f1c3359642..eaa402f99fe0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -977,6 +977,7 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>>> *adev, struct amdgpu_vm *vm,
>>>       params.unlocked = unlocked;
>>>       params.needs_flush = flush_tlb;
>>>       params.allow_override = allow_override;
>>> +    INIT_LIST_HEAD(&params.tlb_flush_waitlist);
>>>         /* Implicitly sync to command submissions in the same VM before
>>>        * unmapping. Sync to moving fences before mapping.
>>> @@ -1062,8 +1063,10 @@ int amdgpu_vm_update_range(struct 
>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>       if (r)
>>>           goto error_unlock;
>>>   -    if (params.needs_flush)
>>> +    if (params.needs_flush) {
>>>           r = amdgpu_vm_tlb_flush(&params, fence);
>>> +        amdgpu_vm_pt_free_list(adev, &params);
>>
>> This is completed independent of the VM flush and should always be 
>> called.
>>
>>> +    }
>>>     error_unlock:
>>>       amdgpu_vm_eviction_unlock(vm);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> index b0a4fe683352..54d7da396de0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>> @@ -266,6 +266,11 @@ struct amdgpu_vm_update_params {
>>>        * to be overridden for NUMA local memory.
>>>        */
>>>       bool allow_override;
>>> +
>>> +    /**
>>> +     * @tlb_flush_waitlist: temporary storage for BOs until tlb_flush
>>> +     */
>>> +    struct list_head tlb_flush_waitlist;
>>>   };
>>>     struct amdgpu_vm_update_funcs {
>>> @@ -547,6 +552,8 @@ int amdgpu_vm_ptes_update(struct 
>>> amdgpu_vm_update_params *params,
>>>                 uint64_t start, uint64_t end,
>>>                 uint64_t dst, uint64_t flags);
>>>   void amdgpu_vm_pt_free_work(struct work_struct *work);
>>> +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
>>> +                struct amdgpu_vm_update_params *params);
>>>     #if defined(CONFIG_DEBUG_FS)
>>>   void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct 
>>> seq_file *m);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>> index 601df0ce8290..9231edfb427e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>> @@ -622,40 +622,30 @@ void amdgpu_vm_pt_free_work(struct work_struct 
>>> *work)
>>>   }
>>>     /**
>>> - * amdgpu_vm_pt_free_dfs - free PD/PT levels
>>> + * amdgpu_vm_pt_free_list - free PD/PT levels
>>>    *
>>>    * @adev: amdgpu device structure
>>> - * @vm: amdgpu vm structure
>>> - * @start: optional cursor where to start freeing PDs/PTs
>>> - * @unlocked: vm resv unlock status
>>> + * @params: see amdgpu_vm_update_params definition
>>>    *
>>> - * Free the page directory or page table level and all sub levels.
>>> + * Free the page directory objects saved in the flush list
>>>    */
>>> -static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
>>> -                  struct amdgpu_vm *vm,
>>> -                  struct amdgpu_vm_pt_cursor *start,
>>> -                  bool unlocked)
>>> +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
>>> +                struct amdgpu_vm_update_params *params)
>>>   {
>>> -    struct amdgpu_vm_pt_cursor cursor;
>>> -    struct amdgpu_vm_bo_base *entry;
>>> +    struct amdgpu_vm_bo_base *entry, *next;
>>> +    struct amdgpu_vm *vm = params->vm;
>>> +    bool unlocked = params->unlocked;
>>>         if (unlocked) {
>>>           spin_lock(&vm->status_lock);
>>> -        for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
>>> -            list_move(&entry->vm_status, &vm->pt_freed);
>>> -
>>> -        if (start)
>>> -            list_move(&start->entry->vm_status, &vm->pt_freed);
>>> +        list_splice_init(&vm->pt_freed, &params->tlb_flush_waitlist);
>>>           spin_unlock(&vm->status_lock);
>>>           schedule_work(&vm->pt_free_work);
>>>           return;
>>>       }
>>>   -    for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
>>> +    list_for_each_entry_safe(entry, next, 
>>> &params->tlb_flush_waitlist, vm_status)
>>>           amdgpu_vm_pt_free(entry);
>>> -
>>> -    if (start)
>>> -        amdgpu_vm_pt_free(start->entry);
>>>   }
>>>     /**
>>> @@ -667,7 +657,13 @@ static void amdgpu_vm_pt_free_dfs(struct 
>>> amdgpu_device *adev,
>>>    */
>>>   void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct 
>>> amdgpu_vm *vm)
>>>   {
>>> -    amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
>>> +    struct amdgpu_vm_pt_cursor cursor;
>>> +    struct amdgpu_vm_bo_base *entry;
>>> +
>>> +    for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) {
>>> +        if (entry)
>>
>> Entry can't be NULL here I think.
> Yeah, that makes sense; if there is a GPU process, it will have at least 
> one page table entry.

The problem is rather that entry is either the root entry or a child 
element within the array of entries.

So entry can never be NULL, only entry->bo can be NULL if there is no 
PD/PT allocated for this slot.
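
(Following that reasoning, the DFS walk in amdgpu_vm_pt_free_root() would only need to skip slots whose BO was never allocated. A minimal sketch, assuming the check simply moves from the entry itself to entry->bo; illustrative only, not a tested patch:)

    void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm)
    {
        struct amdgpu_vm_pt_cursor cursor;
        struct amdgpu_vm_bo_base *entry;

        for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) {
            /* entry is always a valid slot here, only its BO may be missing */
            if (entry->bo)
                amdgpu_vm_pt_free(entry);
        }
    }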

Regards,
Christian.

>>
>>> +            amdgpu_vm_pt_free(entry);
>>> +    }
>>>   }
>>>     /**
>>> @@ -972,10 +968,24 @@ int amdgpu_vm_ptes_update(struct 
>>> amdgpu_vm_update_params *params,
>>>               while (cursor.pfn < frag_start) {
>>>                   /* Make sure previous mapping is freed */
>>>                   if (cursor.entry->bo) {
>>> +                    struct amdgpu_vm_pt_cursor seek;
>>> +                    struct amdgpu_vm_bo_base *entry;
>>> +
>>>                       params->needs_flush = true;
>>> -                    amdgpu_vm_pt_free_dfs(adev, params->vm,
>>> -                                  &cursor,
>>> -                                  params->unlocked);
>>> + spin_lock(&params->vm->status_lock);
>>> +                    for_each_amdgpu_vm_pt_dfs_safe(adev, 
>>> params->vm, &cursor,
>>> +                                       seek, entry) {
>>> +                        if (!entry || !entry->bo)
>>> +                            continue;
>>> +
>>> +                        list_move(&entry->vm_status,
>>> + &params->tlb_flush_waitlist);
>>> +                    }
>>> +
>>> +                    /* enter start node now */
>>> + list_move(&cursor.entry->vm_status,
>>> + &params->tlb_flush_waitlist);
>>> + spin_unlock(&params->vm->status_lock);
>>
>> I would put this into a separate function maybe.
>
> Noted,
>
> - Shashank
>
>>
>> Apart from those nit-picks looks good to me.
>>
>> Regards,
>> Christian.
>>
>>>                   }
>>>                   amdgpu_vm_pt_next(adev, &cursor);
>>>               }
>>


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8] drm/amdgpu: sync page table freeing with tlb flush
  2024-03-18 15:24               ` Christian König
@ 2024-03-21 14:18                 ` Sharma, Shashank
  0 siblings, 0 replies; 12+ messages in thread
From: Sharma, Shashank @ 2024-03-21 14:18 UTC (permalink / raw)
  To: Christian König, amd-gfx
  Cc: Alex Deucher, Felix Kuehling, Rajneesh Bhardwaj



On 18/03/2024 16:24, Christian König wrote:
> On 18.03.24 16:22, Sharma, Shashank wrote:
>>
>> On 18/03/2024 16:01, Christian König wrote:
>>> On 18.03.24 15:44, Shashank Sharma wrote:
>>>> The idea behind this patch is to delay the freeing of PT entry objects
>>>> until the TLB flush is done.
>>>>
>>>> This patch:
>>>> - Adds a tlb_flush_waitlist in amdgpu_vm_update_params which will 
>>>> keep the
>>>>    objects that need to be freed after tlb_flush.
>>>> - Adds PT entries in this list in amdgpu_vm_ptes_update after finding
>>>>    the PT entry.
>>>> - Changes functionality of amdgpu_vm_pt_free_dfs from (df_search + 
>>>> free)
>>>>    to simply freeing the BOs, and renames it to
>>>>    amdgpu_vm_pt_free_list to reflect the same.
>>>> - Exports function amdgpu_vm_pt_free_list to be called directly.
>>>> - Calls amdgpu_vm_pt_free_list directly from amdgpu_vm_update_range.
>>>>
>>>> V2: rebase
>>>> V4: Addressed review comments from Christian
>>>>      - add only locked PTEs entries in TLB flush waitlist.
>>>>      - do not create a separate function for list flush.
>>>>      - do not create a new lock for TLB flush.
>>>>      - there is no need to wait on tlb_flush_fence exclusively.
>>>>
>>>> V5: Addressed review comments from Christian
>>>>      - change the amdgpu_vm_pt_free_dfs's functionality to simple 
>>>> freeing
>>>>        of the objects and rename it.
>>>>      - add all the PTE objects in params->tlb_flush_waitlist
>>>>      - let amdgpu_vm_pt_free_root handle the freeing of BOs 
>>>> independently
>>>>      - call amdgpu_vm_pt_free_list directly
>>>>
>>>> V6: Rebase
>>>> V7: Rebase
>>>> V8: Added a NULL check to fix this backtrace issue:
>>>> [  415.351447] BUG: kernel NULL pointer dereference, address: 
>>>> 0000000000000008
>>>> [  415.359245] #PF: supervisor write access in kernel mode
>>>> [  415.365081] #PF: error_code(0x0002) - not-present page
>>>> [  415.370817] PGD 101259067 P4D 101259067 PUD 10125a067 PMD 0
>>>> [  415.377140] Oops: 0002 [#1] PREEMPT SMP NOPTI
>>>> [  415.382004] CPU: 0 PID: 25481 Comm: test_with_MPI.e Tainted: 
>>>> G           OE 5.18.2-mi300-build-140423-ubuntu-22.04+ #24
>>>> [  415.394437] Hardware name: AMD Corporation Sh51p/Sh51p, BIOS 
>>>> RMO1001AS 02/21/2024
>>>> [  415.402797] RIP: 0010:amdgpu_vm_ptes_update+0x6fd/0xa10 [amdgpu]
>>>> [  415.409648] Code: 4c 89 ff 4d 8d 66 30 e8 f1 ed ff ff 48 85 db 
>>>> 74 42 48 39 5d a0 74 40 48 8b 53 20 48 8b 4b 18 48 8d 43 18 48 8d 
>>>> 75 b0 4c 89 ff <48
>>>>> 89 51 08 48 89 0a 49 8b 56 30 48 89 42 08 48 89 53 18 4c 89 63
>>>> [  415.430621] RSP: 0018:ffffc9000401f990 EFLAGS: 00010287
>>>> [  415.436456] RAX: ffff888147bb82f0 RBX: ffff888147bb82d8 RCX: 
>>>> 0000000000000000
>>>> [  415.444426] RDX: 0000000000000000 RSI: ffffc9000401fa30 RDI: 
>>>> ffff888161f80000
>>>> [  415.452397] RBP: ffffc9000401fa80 R08: 0000000000000000 R09: 
>>>> ffffc9000401fa00
>>>> [  415.460368] R10: 00000007f0cc0000 R11: 00000007f0c85000 R12: 
>>>> ffffc9000401fb20
>>>> [  415.468340] R13: 00000007f0d00000 R14: ffffc9000401faf0 R15: 
>>>> ffff888161f80000
>>>> [  415.476312] FS:  00007f132ff89840(0000) 
>>>> GS:ffff889f87c00000(0000) knlGS:0000000000000000
>>>> [  415.485350] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>> [  415.491767] CR2: 0000000000000008 CR3: 0000000161d46003 CR4: 
>>>> 0000000000770ef0
>>>> [  415.499738] PKRU: 55555554
>>>> [  415.502750] Call Trace:
>>>> [  415.505482]  <TASK>
>>>> [  415.507825]  amdgpu_vm_update_range+0x32a/0x880 [amdgpu]
>>>> [  415.513869]  amdgpu_vm_clear_freed+0x117/0x250 [amdgpu]
>>>> [  415.519814] 
>>>> amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x18c/0x250 [amdgpu]
>>>> [  415.527729]  kfd_ioctl_unmap_memory_from_gpu+0xed/0x340 [amdgpu]
>>>> [  415.534551]  kfd_ioctl+0x3b6/0x510 [amdgpu]
>>>>
>>>> Cc: Christian König <Christian.Koenig@amd.com>
>>>> Cc: Alex Deucher <alexander.deucher@amd.com>
>>>> Cc: Felix Kuehling <felix.kuehling@amd.com>
>>>> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>>>> Acked-by: Felix Kuehling <felix.kuehling@amd.com>
>>>> Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>>>> Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
>>>> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  5 +-
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |  7 +++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 58 
>>>> +++++++++++++----------
>>>>   3 files changed, 45 insertions(+), 25 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index 26f1c3359642..eaa402f99fe0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -977,6 +977,7 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>>>> *adev, struct amdgpu_vm *vm,
>>>>       params.unlocked = unlocked;
>>>>       params.needs_flush = flush_tlb;
>>>>       params.allow_override = allow_override;
>>>> +    INIT_LIST_HEAD(&params.tlb_flush_waitlist);
>>>>         /* Implicitly sync to command submissions in the same VM 
>>>> before
>>>>        * unmapping. Sync to moving fences before mapping.
>>>> @@ -1062,8 +1063,10 @@ int amdgpu_vm_update_range(struct 
>>>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>>>       if (r)
>>>>           goto error_unlock;
>>>>   -    if (params.needs_flush)
>>>> +    if (params.needs_flush) {
>>>>           r = amdgpu_vm_tlb_flush(&params, fence);
>>>> +        amdgpu_vm_pt_free_list(adev, &params);
>>>
>>> This is completed independent of the VM flush and should always be 
>>> called.
>>>
>>>> +    }
>>>>     error_unlock:
>>>>       amdgpu_vm_eviction_unlock(vm);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>> index b0a4fe683352..54d7da396de0 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>> @@ -266,6 +266,11 @@ struct amdgpu_vm_update_params {
>>>>        * to be overridden for NUMA local memory.
>>>>        */
>>>>       bool allow_override;
>>>> +
>>>> +    /**
>>>> +     * @tlb_flush_waitlist: temporary storage for BOs until tlb_flush
>>>> +     */
>>>> +    struct list_head tlb_flush_waitlist;
>>>>   };
>>>>     struct amdgpu_vm_update_funcs {
>>>> @@ -547,6 +552,8 @@ int amdgpu_vm_ptes_update(struct 
>>>> amdgpu_vm_update_params *params,
>>>>                 uint64_t start, uint64_t end,
>>>>                 uint64_t dst, uint64_t flags);
>>>>   void amdgpu_vm_pt_free_work(struct work_struct *work);
>>>> +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
>>>> +                struct amdgpu_vm_update_params *params);
>>>>     #if defined(CONFIG_DEBUG_FS)
>>>>   void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct 
>>>> seq_file *m);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>>> index 601df0ce8290..9231edfb427e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>>> @@ -622,40 +622,30 @@ void amdgpu_vm_pt_free_work(struct 
>>>> work_struct *work)
>>>>   }
>>>>     /**
>>>> - * amdgpu_vm_pt_free_dfs - free PD/PT levels
>>>> + * amdgpu_vm_pt_free_list - free PD/PT levels
>>>>    *
>>>>    * @adev: amdgpu device structure
>>>> - * @vm: amdgpu vm structure
>>>> - * @start: optional cursor where to start freeing PDs/PTs
>>>> - * @unlocked: vm resv unlock status
>>>> + * @params: see amdgpu_vm_update_params definition
>>>>    *
>>>> - * Free the page directory or page table level and all sub levels.
>>>> + * Free the page directory objects saved in the flush list
>>>>    */
>>>> -static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
>>>> -                  struct amdgpu_vm *vm,
>>>> -                  struct amdgpu_vm_pt_cursor *start,
>>>> -                  bool unlocked)
>>>> +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
>>>> +                struct amdgpu_vm_update_params *params)
>>>>   {
>>>> -    struct amdgpu_vm_pt_cursor cursor;
>>>> -    struct amdgpu_vm_bo_base *entry;
>>>> +    struct amdgpu_vm_bo_base *entry, *next;
>>>> +    struct amdgpu_vm *vm = params->vm;
>>>> +    bool unlocked = params->unlocked;
>>>>         if (unlocked) {
>>>>           spin_lock(&vm->status_lock);
>>>> -        for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, 
>>>> entry)
>>>> -            list_move(&entry->vm_status, &vm->pt_freed);
>>>> -
>>>> -        if (start)
>>>> -            list_move(&start->entry->vm_status, &vm->pt_freed);
>>>> +        list_splice_init(&vm->pt_freed, &params->tlb_flush_waitlist);
>>>>           spin_unlock(&vm->status_lock);
>>>>           schedule_work(&vm->pt_free_work);
>>>>           return;
>>>>       }
>>>>   -    for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
>>>> +    list_for_each_entry_safe(entry, next, 
>>>> &params->tlb_flush_waitlist, vm_status)
>>>>           amdgpu_vm_pt_free(entry);
>>>> -
>>>> -    if (start)
>>>> -        amdgpu_vm_pt_free(start->entry);
>>>>   }
>>>>     /**
>>>> @@ -667,7 +657,13 @@ static void amdgpu_vm_pt_free_dfs(struct 
>>>> amdgpu_device *adev,
>>>>    */
>>>>   void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct 
>>>> amdgpu_vm *vm)
>>>>   {
>>>> -    amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
>>>> +    struct amdgpu_vm_pt_cursor cursor;
>>>> +    struct amdgpu_vm_bo_base *entry;
>>>> +
>>>> +    for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) {
>>>> +        if (entry)
>>>
>>> Entry can't be NULL here I think.
>> Yeah, that makes sense; if there is a GPU process, it will have at least 
>> one page table entry.
>
> The problem is rather that entry is either the root entry or a child 
> element within the array of entries.
>
> So entry can never be NULL, only entry->bo can be NULL if there is no 
> PD/PT allocated for this slot.

It looks like it can be NULL; Rajneesh has just shared a backtrace from 
testing which shows a NULL here:

[Mar21 06:55] BUG: unable to handle page fault for address: ffffc9002d637aa0
[  +0.007689] #PF: supervisor write access in kernel mode
[  +0.005833] #PF: error_code(0x0002) - not-present page
[  +0.005732] PGD 100000067 P4D 100000067 PUD 1001ec067 PMD 4882af067 PTE 0
[  +0.007579] Oops: 0002 [#1] PREEMPT SMP NOPTI
[  +0.004861] CPU: 52 PID: 8146 Comm: kworker/52:2 Tainted: G           
OE     5.18.2-mi300-build-140423-ubuntu-22.04+ #24
[  +0.012135] Hardware name: AMD Corporation Sh54p/Sh54p, BIOS WPP4311S 
03/11/2024
[  +0.008254] Workqueue: events delayed_fput
[  +0.004573] RIP: 0010:amdgpu_vm_pt_free+0x66/0xe0 [amdgpu]
[  +0.006270] Code: 01 74 6e 48 c7 45 e8 00 00 00 00 31 f6 48 83 c7 58 
e8 0e ea 3b ff 48 8b 03 48 8d 78 38 e8 f2 9b 90 c0 48 8b 43 20 48 8b 53 
18 <48> 89 42 08 48 89 10 48 b8 00 01 00 00 00 00 ad de 48 89 43 18 48
[  +0.020954] RSP: 0018:ffffc9002e117c08 EFLAGS: 00010246
[  +0.005830] RAX: ffff8884867bda20 RBX: ffff8884867bd9a8 RCX: 
0000000000000000
[  +0.007961] RDX: ffffc9002d637a98 RSI: ffff888482845458 RDI: 
ffffffffc155916e
[  +0.007958] RBP: ffffc9002e117c20 R08: 0000000000000000 R09: 
0000000000000001
[  +0.007961] R10: ffff888482843000 R11: 0000000141eed000 R12: 
ffff8884867bd9a8
[  +0.007959] R13: ffff888471d68098 R14: ffff888471d68098 R15: 
00000000c1dab300
[  +0.007960] FS:  0000000000000000(0000) GS:ffff88e1cf700000(0000) 
knlGS:0000000000000000
[  +0.009027] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  +0.006409] CR2: ffffc9002d637aa0 CR3: 0000000006410006 CR4: 
0000000000770ee0
[  +0.007961] PKRU: 55555554
[  +0.003016] Call Trace:
[  +0.002726]  <TASK>
[  +0.002340]  amdgpu_vm_pt_free_root+0x60/0xa0 [amdgpu]
[  +0.005843]  amdgpu_vm_fini+0x2cb/0x5d0 [amdgpu]
[  +0.005248]  ? amdgpu_ctx_mgr_entity_fini+0x53/0x1c0 [amdgpu]
[  +0.006520]  amdgpu_driver_postclose_kms+0x191/0x2d0 [amdgpu]
[  +0.006520]  drm_file_free.part.0+0x1e5/0x260 [drm]

I might have to add a follow-up patch here.

- Shashank

>
> Regards,
> Christian.
>
>>>
>>>> +            amdgpu_vm_pt_free(entry);
>>>> +    }
>>>>   }
>>>>     /**
>>>> @@ -972,10 +968,24 @@ int amdgpu_vm_ptes_update(struct 
>>>> amdgpu_vm_update_params *params,
>>>>               while (cursor.pfn < frag_start) {
>>>>                   /* Make sure previous mapping is freed */
>>>>                   if (cursor.entry->bo) {
>>>> +                    struct amdgpu_vm_pt_cursor seek;
>>>> +                    struct amdgpu_vm_bo_base *entry;
>>>> +
>>>>                       params->needs_flush = true;
>>>> -                    amdgpu_vm_pt_free_dfs(adev, params->vm,
>>>> -                                  &cursor,
>>>> -                                  params->unlocked);
>>>> + spin_lock(&params->vm->status_lock);
>>>> +                    for_each_amdgpu_vm_pt_dfs_safe(adev, 
>>>> params->vm, &cursor,
>>>> +                                       seek, entry) {
>>>> +                        if (!entry || !entry->bo)
>>>> +                            continue;
>>>> +
>>>> +                        list_move(&entry->vm_status,
>>>> + &params->tlb_flush_waitlist);
>>>> +                    }
>>>> +
>>>> +                    /* enter start node now */
>>>> + list_move(&cursor.entry->vm_status,
>>>> + &params->tlb_flush_waitlist);
>>>> + spin_unlock(&params->vm->status_lock);
>>>
>>> I would put this into a separate function maybe.
>>
>> Noted,
>>
>> - Shashank
>>
>>>
>>> Apart from those nit-picks looks good to me.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>                   }
>>>>                   amdgpu_vm_pt_next(adev, &cursor);
>>>>               }
>>>
>


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-03-21 14:18 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-18 12:08 [PATCH v7 1/2] drm/amdgpu: implement TLB flush fence Shashank Sharma
2024-03-18 12:08 ` [PATCH v7 2/2] drm/amdgpu: sync page table freeing with tlb flush Shashank Sharma
2024-03-18 14:04   ` Bhardwaj, Rajneesh
2024-03-18 14:06     ` Sharma, Shashank
2024-03-18 14:40       ` Bhardwaj, Rajneesh
2024-03-18 14:44         ` [PATCH v8] " Shashank Sharma
2024-03-18 15:01           ` Christian König
2024-03-18 15:22             ` Sharma, Shashank
2024-03-18 15:24               ` Christian König
2024-03-21 14:18                 ` Sharma, Shashank
2024-03-18 14:58 ` [PATCH v7 1/2] drm/amdgpu: implement TLB flush fence Christian König
2024-03-18 15:19   ` Sharma, Shashank
