Hi Shashank, we'll probably need a v8 with the null pointer crash fixed — i.e., before freeing the PT entries, check for a valid entry before calling amdgpu_vm_pt_free. The crash is seen with the device memory allocators, but the system memory allocators are looking fine. [  127.255863] [drm] Using MTYPE_RW for local memory [  333.606136] hugetlbfs: test_with_MPI.e (25268): Using mlock ulimits for SHM_HUGETLB is obsolete [  415.351447] BUG: kernel NULL pointer dereference, address: 0000000000000008 [  415.359245] #PF: supervisor write access in kernel mode [  415.365081] #PF: error_code(0x0002) - not-present page [  415.370817] PGD 101259067 P4D 101259067 PUD 10125a067 PMD 0 [  415.377140] Oops: 0002 [#1] PREEMPT SMP NOPTI [  415.382004] CPU: 0 PID: 25481 Comm: test_with_MPI.e Tainted: G           OE 5.18.2-mi300-build-140423-ubuntu-22.04+ #24 [  415.394437] Hardware name: AMD Corporation Sh51p/Sh51p, BIOS RMO1001AS 02/21/2024 [  415.402797] RIP: 0010:amdgpu_vm_ptes_update+0x6fd/0xa10 [amdgpu] [  415.409648] Code: 4c 89 ff 4d 8d 66 30 e8 f1 ed ff ff 48 85 db 74 42 48 39 5d a0 74 40 48 8b 53 20 48 8b 4b 18 48 8d 43 18 48 8d 75 b0 4c 89 ff <48 > 89 51 08 48 89 0a 49 8b 56 30 48 89 42 08 48 89 53 18 4c 89 63 [  415.430621] RSP: 0018:ffffc9000401f990 EFLAGS: 00010287 [  415.436456] RAX: ffff888147bb82f0 RBX: ffff888147bb82d8 RCX: 0000000000000000 [  415.444426] RDX: 0000000000000000 RSI: ffffc9000401fa30 RDI: ffff888161f80000 [  415.452397] RBP: ffffc9000401fa80 R08: 0000000000000000 R09: ffffc9000401fa00 [  415.460368] R10: 00000007f0cc0000 R11: 00000007f0c85000 R12: ffffc9000401fb20 [  415.468340] R13: 00000007f0d00000 R14: ffffc9000401faf0 R15: ffff888161f80000 [  415.476312] FS:  00007f132ff89840(0000) GS:ffff889f87c00000(0000) knlGS:0000000000000000 [  415.485350] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [  415.491767] CR2: 0000000000000008 CR3: 0000000161d46003 CR4: 0000000000770ef0 [  415.499738] PKRU: 55555554 [  415.502750] Call Trace: [  415.505482]  [  
415.507825]  amdgpu_vm_update_range+0x32a/0x880 [amdgpu] [  415.513869]  amdgpu_vm_clear_freed+0x117/0x250 [amdgpu] [  415.519814] amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x18c/0x250 [amdgpu] [  415.527729]  kfd_ioctl_unmap_memory_from_gpu+0xed/0x340 [amdgpu] [  415.534551]  kfd_ioctl+0x3b6/0x510 [amdgpu] [  415.539324]  ? kfd_ioctl_get_dmabuf_info+0x1d0/0x1d0 [amdgpu] [  415.545844]  ? __fget_light+0xc5/0x100 [  415.550037]  __x64_sys_ioctl+0x91/0xc0 [  415.554227]  do_syscall_64+0x5c/0x80 [  415.558223]  ? debug_smp_processor_id+0x17/0x20 [  415.563285]  ? fpregs_assert_state_consistent+0x23/0x60 [  415.569124]  ? exit_to_user_mode_prepare+0x45/0x190 [  415.574572]  ? ksys_write+0xce/0xe0 On 3/18/2024 8:08 AM, Shashank Sharma wrote: > The idea behind this patch is to delay the freeing of PT entry objects > until the TLB flush is done. > > This patch: > - Adds a tlb_flush_waitlist in amdgpu_vm_update_params which will keep the > objects that need to be freed after tlb_flush. > - Adds PT entries in this list in amdgpu_vm_ptes_update after finding > the PT entry. > - Changes functionality of amdgpu_vm_pt_free_dfs from (df_search + free) > to simply freeing of the BOs, also renames it to > amdgpu_vm_pt_free_list to reflect this same. > - Exports function amdgpu_vm_pt_free_list to be called directly. > - Calls amdgpu_vm_pt_free_list directly from amdgpu_vm_update_range. > > V2: rebase > V4: Addressed review comments from Christian > - add only locked PTEs entries in TLB flush waitlist. > - do not create a separate function for list flush. > - do not create a new lock for TLB flush. > - there is no need to wait on tlb_flush_fence exclusively. > > V5: Addressed review comments from Christian > - change the amdgpu_vm_pt_free_dfs's functionality to simple freeing > of the objects and rename it. 
> - add all the PTE objects in params->tlb_flush_waitlist > - let amdgpu_vm_pt_free_root handle the freeing of BOs independently > - call amdgpu_vm_pt_free_list directly > > V6: Rebase > V7: Rebase > > Cc: Christian König > Cc: Alex Deucher > Cc: Felix Kuehling > Cc: Rajneesh Bhardwaj > Acked-by: Felix Kuehling > Acked-by: Rajneesh Bhardwaj > Tested-by: Rajneesh Bhardwaj > Signed-off-by: Shashank Sharma > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 ++- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 7 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 53 +++++++++++++---------- > 3 files changed, 40 insertions(+), 25 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index 26f1c3359642..eaa402f99fe0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -977,6 +977,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, > params.unlocked = unlocked; > params.needs_flush = flush_tlb; > params.allow_override = allow_override; > + INIT_LIST_HEAD(¶ms.tlb_flush_waitlist); > > /* Implicitly sync to command submissions in the same VM before > * unmapping. Sync to moving fences before mapping. > @@ -1062,8 +1063,10 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, > if (r) > goto error_unlock; > > - if (params.needs_flush) > + if (params.needs_flush) { > r = amdgpu_vm_tlb_flush(¶ms, fence); > + amdgpu_vm_pt_free_list(adev, ¶ms); > + } > > error_unlock: > amdgpu_vm_eviction_unlock(vm); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index b0a4fe683352..54d7da396de0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -266,6 +266,11 @@ struct amdgpu_vm_update_params { > * to be overridden for NUMA local memory. 
> */ > bool allow_override; > + > + /** > + * @tlb_flush_waitlist: temporary storage for BOs until tlb_flush > + */ > + struct list_head tlb_flush_waitlist; > }; > > struct amdgpu_vm_update_funcs { > @@ -547,6 +552,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, > uint64_t start, uint64_t end, > uint64_t dst, uint64_t flags); > void amdgpu_vm_pt_free_work(struct work_struct *work); > +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev, > + struct amdgpu_vm_update_params *params); > > #if defined(CONFIG_DEBUG_FS) > void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > index 601df0ce8290..440dc8c581fc 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > @@ -622,40 +622,30 @@ void amdgpu_vm_pt_free_work(struct work_struct *work) > } > > /** > - * amdgpu_vm_pt_free_dfs - free PD/PT levels > + * amdgpu_vm_pt_free_list - free PD/PT levels > * > * @adev: amdgpu device structure > - * @vm: amdgpu vm structure > - * @start: optional cursor where to start freeing PDs/PTs > - * @unlocked: vm resv unlock status > + * @params: see amdgpu_vm_update_params definition > * > - * Free the page directory or page table level and all sub levels. 
> + * Free the page directory objects saved in the flush list > */ > -static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev, > - struct amdgpu_vm *vm, > - struct amdgpu_vm_pt_cursor *start, > - bool unlocked) > +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev, > + struct amdgpu_vm_update_params *params) > { > - struct amdgpu_vm_pt_cursor cursor; > - struct amdgpu_vm_bo_base *entry; > + struct amdgpu_vm_bo_base *entry, *next; > + struct amdgpu_vm *vm = params->vm; > + bool unlocked = params->unlocked; > > if (unlocked) { > spin_lock(&vm->status_lock); > - for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) > - list_move(&entry->vm_status, &vm->pt_freed); > - > - if (start) > - list_move(&start->entry->vm_status, &vm->pt_freed); > + list_splice_init(&vm->pt_freed, ¶ms->tlb_flush_waitlist); > spin_unlock(&vm->status_lock); > schedule_work(&vm->pt_free_work); > return; > } > > - for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) > + list_for_each_entry_safe(entry, next, ¶ms->tlb_flush_waitlist, vm_status) > amdgpu_vm_pt_free(entry); > - > - if (start) > - amdgpu_vm_pt_free(start->entry); > } > > /** > @@ -667,7 +657,11 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev, > */ > void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm) > { > - amdgpu_vm_pt_free_dfs(adev, vm, NULL, false); > + struct amdgpu_vm_pt_cursor cursor; > + struct amdgpu_vm_bo_base *entry; > + > + for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) > + amdgpu_vm_pt_free(entry); > } > > /** > @@ -972,10 +966,21 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, > while (cursor.pfn < frag_start) { > /* Make sure previous mapping is freed */ > if (cursor.entry->bo) { > + struct amdgpu_vm_pt_cursor seek; > + struct amdgpu_vm_bo_base *entry; > + > params->needs_flush = true; > - amdgpu_vm_pt_free_dfs(adev, params->vm, > - &cursor, > - params->unlocked); > + spin_lock(¶ms->vm->status_lock); > + 
for_each_amdgpu_vm_pt_dfs_safe(adev, params->vm, &cursor, > + seek, entry) { > + list_move(&entry->vm_status, > + ¶ms->tlb_flush_waitlist); > + } > + > + /* enter start node now */ > + list_move(&cursor.entry->vm_status, > + ¶ms->tlb_flush_waitlist); > + spin_unlock(¶ms->vm->status_lock); > } > amdgpu_vm_pt_next(adev, &cursor); > }