All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/amdgpu: Dump PDEs and PTEs on VM faults (v3)
@ 2019-07-13  6:42 Kuehling, Felix
       [not found] ` <20190713064211.20047-1-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Kuehling, Felix @ 2019-07-13  6:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Walk page table for the faulting address and dump PDEs and PTEs at
all levels. Also flag discrepancies where a PDE points to a different
address than the next level PDB or PTB BO.

v2:
* Fix address shift for GFXv8
* Redo PDB/PTB address checking to work on all generations

v3:
* Simplified pde address and flag check

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 79 ++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  7 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  6 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +-
 6 files changed, 95 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index bbbf069efb77..78440748c87f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1505,9 +1505,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
  * This is used to access VRAM that backs a buffer object via MMIO
  * access for debugging purposes.
  */
-static int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo,
-				    unsigned long offset,
-				    void *buf, int len, int write)
+int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long offset,
+			     void *buf, int len, int write)
 {
 	struct amdgpu_bo *abo = ttm_to_amdgpu_bo(bo);
 	struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index bccb8c49e597..cffbafffa9d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -83,6 +83,8 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev);
 void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev,
 					bool enable);
 
+int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long offset,
+			     void *buf, int len, int write);
 int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
 		       uint64_t dst_offset, uint32_t byte_count,
 		       struct reservation_object *resv,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 1951f2abbdbc..64ee46eaa041 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -544,6 +544,78 @@ static void amdgpu_vm_pt_next_dfs(struct amdgpu_device *adev,
 	     amdgpu_vm_pt_continue_dfs((start), (entry));			\
 	     (entry) = (cursor).entry, amdgpu_vm_pt_next_dfs((adev), &(cursor)))
 
+/**
+ * amdgpu_vm_dump_pte - dump PTEs along a page table walk
+ *
+ * @adev: amdgpu device pointer
+ * @vm: VM address space
+ * @addr: virtual address
+ *
+ * Walks the page table of @vm at the given @addr and prints the PDEs
+ * and PTEs along the way on a single line.
+ */
+void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+			uint64_t addr)
+{
+	static const char *level_entry[4] = {"PDE2", "PDE1", "PDE0", "PTE"};
+	static const char *level_block[4] = {"PDB2", "PDB1", "PDB0", "PTB"};
+	struct amdgpu_vm_pt_cursor cursor;
+	uint64_t pde_addr, pde_flags, last_pde;
+	char buf[128];
+	int i = 0;
+
+	amdgpu_gmc_get_pde_for_bo(vm->root.base.bo, adev->vm_manager.root_level,
+				  &pde_addr, &pde_flags);
+	last_pde = pde_addr | pde_flags;
+
+	amdgpu_vm_pt_start(adev, vm, addr >> PAGE_SHIFT, &cursor);
+
+	do {
+		unsigned int mask, shift, idx;
+		struct amdgpu_bo *bo;
+		uint64_t pte;
+
+		mask = amdgpu_vm_entries_mask(adev, cursor.level);
+		shift = amdgpu_vm_level_shift(adev, cursor.level);
+		idx = (cursor.pfn >> shift) & mask;
+
+		bo = cursor.entry->base.bo;
+		if (bo) {
+			/* Flag discrepancy between previous level PDE
+			 * and the actual address of this PTB or PDB.
+			 */
+			amdgpu_gmc_get_pde_for_bo(bo, cursor.level,
+						  &pde_addr, &pde_flags);
+			if ((pde_addr | pde_flags) != last_pde)
+				i += snprintf(buf + i, sizeof(buf) - i, "!");
+
+			amdgpu_ttm_access_memory(&bo->tbo, idx * sizeof(pte),
+						 &pte, sizeof(pte), false);
+			i += snprintf(buf + i, sizeof(buf) - i,
+				      "%s[%d]=0x%llx ",
+				      level_entry[cursor.level], idx, pte);
+			last_pde = pte;
+		} else {
+			/* Flag discrepancy if previous level PDE had
+			 * a valid entry but there is no PTB or PDB BO.
+			 */
+			if ((last_pde & AMDGPU_PTE_VALID) &&
+			    !(last_pde & AMDGPU_PDE_PTE))
+				i += snprintf(buf + i, sizeof(buf) - i, "!");
+			i += snprintf(buf + i, sizeof(buf) - i,
+				      "no %s ", level_block[cursor.level]);
+			last_pde = 0;
+		}
+
+		++cursor.level;
+		cursor.parent = cursor.entry;
+		if (!cursor.entry->entries)
+			break;
+		cursor.entry = &cursor.entry->entries[idx];
+	} while (cursor.entry);
+	dev_err(adev->dev, "%s", buf);
+}
+
 /**
  * amdgpu_vm_get_pd_bo - add the VM PD to a validation list
  *
@@ -3081,8 +3153,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
  * @pasid: PASID identifier for VM
  * @task_info: task_info to fill.
  */
-void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
-			 struct amdgpu_task_info *task_info)
+struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
+					  unsigned int pasid,
+					  struct amdgpu_task_info *task_info)
 {
 	struct amdgpu_vm *vm;
 	unsigned long flags;
@@ -3094,6 +3167,8 @@ void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
 		*task_info = vm->task_info;
 
 	spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
+
+	return vm;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 489a162ca620..6a8b833d180e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -348,6 +348,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned int pasid);
 void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm);
 void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
+void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+			uint64_t addr);
 void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
 			 struct list_head *validated,
 			 struct amdgpu_bo_list_entry *entry);
@@ -401,8 +403,9 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
 				  struct amdgpu_job *job);
 void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
 
-void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
-			     struct amdgpu_task_info *task_info);
+struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
+					  unsigned int pasid,
+					  struct amdgpu_task_info *task_info);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 8bf2ba310fd9..18207ecfd85c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1448,9 +1448,10 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
 
 	if (printk_ratelimit()) {
 		struct amdgpu_task_info task_info;
+		struct amdgpu_vm *vm;
 
 		memset(&task_info, 0, sizeof(struct amdgpu_task_info));
-		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+		vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
 
 		dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
 			entry->src_id, entry->src_data[0], task_info.process_name,
@@ -1461,6 +1462,9 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
 			status);
 		gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
 					 entry->pasid);
+		if (vm)
+			amdgpu_vm_dump_pte(adev, vm, (uint64_t)addr
+					   << AMDGPU_GPU_PAGE_SHIFT);
 	}
 
 	vmid = REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index bd5d36944481..f27e88af4016 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -331,9 +331,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 
 	if (printk_ratelimit()) {
 		struct amdgpu_task_info task_info;
+		struct amdgpu_vm *vm;
 
 		memset(&task_info, 0, sizeof(struct amdgpu_task_info));
-		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+		vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
 
 		dev_err(adev->dev,
 			"[%s] %s page fault (src_id:%u ring:%u vmid:%u "
@@ -349,6 +350,8 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			dev_err(adev->dev,
 				"VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
 				status);
+		if (vm)
+			amdgpu_vm_dump_pte(adev, vm, addr);
 	}
 
 	return 0;
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/2] drm/amdgpu: Fix silent amdgpu_bo_move failures
       [not found] ` <20190713064211.20047-1-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2019-07-13  6:42   ` Kuehling, Felix
       [not found]     ` <20190713064211.20047-2-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2019-07-16  9:27   ` [PATCH 1/2] drm/amdgpu: Dump PDEs and PTEs on VM faults (v3) Christian König
  1 sibling, 1 reply; 5+ messages in thread
From: Kuehling, Felix @ 2019-07-13  6:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Under memory pressure, buffer moves between RAM and VRAM can
fail when there is no GTT space available. In those cases
amdgpu_bo_move falls back to ttm_bo_move_memcpy, which seems to
succeed, although it doesn't really support non-contiguous or
invisible VRAM. This manifests as VM faults with corrupted page
table entries in KFD eviction stress tests.

Print some helpful messages when lack of GTT space is causing buffer
moves to fail. Check that source and destination memory regions are
supported by ttm_bo_move_memcpy before taking that fallback.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 40 +++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 78440748c87f..37d9a3b09946 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -498,6 +498,7 @@ static int amdgpu_move_vram_ram(struct ttm_buffer_object *bo, bool evict,
 	placements.flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT;
 	r = ttm_bo_mem_space(bo, &placement, &tmp_mem, ctx);
 	if (unlikely(r)) {
+		pr_err("Failed to find GTT space for blit from VRAM\n");
 		return r;
 	}
 
@@ -556,6 +557,7 @@ static int amdgpu_move_ram_vram(struct ttm_buffer_object *bo, bool evict,
 	placements.flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT;
 	r = ttm_bo_mem_space(bo, &placement, &tmp_mem, ctx);
 	if (unlikely(r)) {
+		pr_err("Failed to find GTT space for blit to VRAM\n");
 		return r;
 	}
 
@@ -575,6 +577,30 @@ static int amdgpu_move_ram_vram(struct ttm_buffer_object *bo, bool evict,
 	return r;
 }
 
+/**
+ * amdgpu_mem_visible - Check that memory can be accessed by ttm_bo_move_memcpy
+ *
+ * Called by amdgpu_bo_move()
+ */
+static bool amdgpu_mem_visible(struct amdgpu_device *adev,
+			       struct ttm_mem_reg *mem)
+{
+	struct drm_mm_node *nodes = mem->mm_node;
+
+	if (mem->mem_type == TTM_PL_SYSTEM ||
+	    mem->mem_type == TTM_PL_TT)
+		return true;
+	if (mem->mem_type != TTM_PL_VRAM)
+		return false;
+
+	/* ttm_mem_reg_ioremap only supports contiguous memory */
+	if (nodes->size != mem->num_pages)
+		return false;
+
+	return ((nodes->start + nodes->size) << PAGE_SHIFT)
+		<= adev->gmc.visible_vram_size;
+}
+
 /**
  * amdgpu_bo_move - Move a buffer object to a new memory location
  *
@@ -619,8 +645,10 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
 		return 0;
 	}
 
-	if (!adev->mman.buffer_funcs_enabled)
+	if (!adev->mman.buffer_funcs_enabled) {
+		r = -ENODEV;
 		goto memcpy;
+	}
 
 	if (old_mem->mem_type == TTM_PL_VRAM &&
 	    new_mem->mem_type == TTM_PL_SYSTEM) {
@@ -635,10 +663,16 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
 
 	if (r) {
 memcpy:
-		r = ttm_bo_move_memcpy(bo, ctx, new_mem);
-		if (r) {
+		/* Check that all memory is CPU accessible */
+		if (!amdgpu_mem_visible(adev, old_mem) ||
+		    !amdgpu_mem_visible(adev, new_mem)) {
+			pr_err("Move buffer fallback to memcpy unavailable\n");
 			return r;
 		}
+
+		r = ttm_bo_move_memcpy(bo, ctx, new_mem);
+		if (r)
+			return r;
 	}
 
 	if (bo->type == ttm_bo_type_device &&
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Dump PDEs and PTEs on VM faults (v3)
       [not found] ` <20190713064211.20047-1-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
  2019-07-13  6:42   ` [PATCH 2/2] drm/amdgpu: Fix silent amdgpu_bo_move failures Kuehling, Felix
@ 2019-07-16  9:27   ` Christian König
       [not found]     ` <79761463-5821-006a-6537-4cf28d43452b-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  1 sibling, 1 reply; 5+ messages in thread
From: Christian König @ 2019-07-16  9:27 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 13.07.19 um 08:42 schrieb Kuehling, Felix:
> Walk page table for the faulting address and dump PDEs and PTEs at
> all levels. Also flag discrepancies where a PDE points to a different
> address than the next level PDB or PTB BO.
>
> v2:
> * Fix address shift for GFXv8
> * Redo PDB/PTB address checking to work on all generations
>
> v3:
> * Simplified pde address and flag check
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  5 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 79 ++++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  7 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  6 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +-
>   6 files changed, 95 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index bbbf069efb77..78440748c87f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -1505,9 +1505,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
>    * This is used to access VRAM that backs a buffer object via MMIO
>    * access for debugging purposes.
>    */
> -static int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo,
> -				    unsigned long offset,
> -				    void *buf, int len, int write)
> +int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long offset,
> +			     void *buf, int len, int write)
>   {
>   	struct amdgpu_bo *abo = ttm_to_amdgpu_bo(bo);
>   	struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index bccb8c49e597..cffbafffa9d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -83,6 +83,8 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev);
>   void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev,
>   					bool enable);
>   
> +int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long offset,
> +			     void *buf, int len, int write);
>   int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
>   		       uint64_t dst_offset, uint32_t byte_count,
>   		       struct reservation_object *resv,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 1951f2abbdbc..64ee46eaa041 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -544,6 +544,78 @@ static void amdgpu_vm_pt_next_dfs(struct amdgpu_device *adev,
>   	     amdgpu_vm_pt_continue_dfs((start), (entry));			\
>   	     (entry) = (cursor).entry, amdgpu_vm_pt_next_dfs((adev), &(cursor)))
>   
> +/**
> + * amdgpu_vm_dump_pte - dump PTEs along a page table walk
> + *
> + * @adev: amdgpu device pointer
> + * @vm: VM address space
> + * @addr: virtual address
> + *
> + * Walks the page table of @vm at the given @addr and prints the PDEs
> + * and PTEs along the way on a single line.
> + */
> +void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> +			uint64_t addr)
> +{
> +	static const char *level_entry[4] = {"PDE2", "PDE1", "PDE0", "PTE"};
> +	static const char *level_block[4] = {"PDB2", "PDB1", "PDB0", "PTB"};
> +	struct amdgpu_vm_pt_cursor cursor;
> +	uint64_t pde_addr, pde_flags, last_pde;
> +	char buf[128];
> +	int i = 0;
> +
> +	amdgpu_gmc_get_pde_for_bo(vm->root.base.bo, adev->vm_manager.root_level,
> +				  &pde_addr, &pde_flags);
> +	last_pde = pde_addr | pde_flags;
> +
> +	amdgpu_vm_pt_start(adev, vm, addr >> PAGE_SHIFT, &cursor);

Walking the VM structure without a lock is dangerous, but we can only 
take the lock in the fault worker on Vega10.

> +
> +	do {
> +		unsigned int mask, shift, idx;
> +		struct amdgpu_bo *bo;
> +		uint64_t pte;
> +
> +		mask = amdgpu_vm_entries_mask(adev, cursor.level);
> +		shift = amdgpu_vm_level_shift(adev, cursor.level);
> +		idx = (cursor.pfn >> shift) & mask;
> +
> +		bo = cursor.entry->base.bo;
> +		if (bo) {
> +			/* Flag discrepancy between previous level PDE
> +			 * and the actual address of this PTB or PDB.
> +			 */
> +			amdgpu_gmc_get_pde_for_bo(bo, cursor.level,
> +						  &pde_addr, &pde_flags);
> +			if ((pde_addr | pde_flags) != last_pde)
> +				i += snprintf(buf + i, sizeof(buf) - i, "!");
> +
> +			amdgpu_ttm_access_memory(&bo->tbo, idx * sizeof(pte),
> +						 &pte, sizeof(pte), false);
> +			i += snprintf(buf + i, sizeof(buf) - i,
> +				      "%s[%d]=0x%llx ",
> +				      level_entry[cursor.level], idx, pte);
> +			last_pde = pte;
> +		} else {
> +			/* Flag discrepancy if previous level PDE had
> +			 * a valid entry but there is no PTB or PDB BO.
> +			 */
> +			if ((last_pde & AMDGPU_PTE_VALID) &&
> +			    !(last_pde & AMDGPU_PDE_PTE))
> +				i += snprintf(buf + i, sizeof(buf) - i, "!");
> +			i += snprintf(buf + i, sizeof(buf) - i,
> +				      "no %s ", level_block[cursor.level]);
> +			last_pde = 0;
> +		}
> +
> +		++cursor.level;
> +		cursor.parent = cursor.entry;
> +		if (!cursor.entry->entries)
> +			break;
> +		cursor.entry = &cursor.entry->entries[idx];
> +	} while (cursor.entry);
> +	dev_err(adev->dev, "%s", buf);
> +}
> +
>   /**
>    * amdgpu_vm_get_pd_bo - add the VM PD to a validation list
>    *
> @@ -3081,8 +3153,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>    * @pasid: PASID identifier for VM
>    * @task_info: task_info to fill.
>    */
> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
> -			 struct amdgpu_task_info *task_info)
> +struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
> +					  unsigned int pasid,
> +					  struct amdgpu_task_info *task_info)
>   {
>   	struct amdgpu_vm *vm;
>   	unsigned long flags;
> @@ -3094,6 +3167,8 @@ void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
>   		*task_info = vm->task_info;
>   
>   	spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
> +
> +	return vm;

This is dangerous as well when we are in the interrupt handler.

As soon as the spinlock is dropped the VM structure can be freed by 
another thread.

Christian.


>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 489a162ca620..6a8b833d180e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -348,6 +348,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned int pasid);
>   void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm);
>   void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
> +void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> +			uint64_t addr);
>   void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
>   			 struct list_head *validated,
>   			 struct amdgpu_bo_list_entry *entry);
> @@ -401,8 +403,9 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
>   				  struct amdgpu_job *job);
>   void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
>   
> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
> -			     struct amdgpu_task_info *task_info);
> +struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
> +					  unsigned int pasid,
> +					  struct amdgpu_task_info *task_info);
>   
>   void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> index 8bf2ba310fd9..18207ecfd85c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> @@ -1448,9 +1448,10 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>   
>   	if (printk_ratelimit()) {
>   		struct amdgpu_task_info task_info;
> +		struct amdgpu_vm *vm;
>   
>   		memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> -		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> +		vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>   
>   		dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
>   			entry->src_id, entry->src_data[0], task_info.process_name,
> @@ -1461,6 +1462,9 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>   			status);
>   		gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
>   					 entry->pasid);
> +		if (vm)
> +			amdgpu_vm_dump_pte(adev, vm, (uint64_t)addr
> +					   << AMDGPU_GPU_PAGE_SHIFT);
>   	}
>   
>   	vmid = REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index bd5d36944481..f27e88af4016 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -331,9 +331,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   
>   	if (printk_ratelimit()) {
>   		struct amdgpu_task_info task_info;
> +		struct amdgpu_vm *vm;
>   
>   		memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> -		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> +		vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>   
>   		dev_err(adev->dev,
>   			"[%s] %s page fault (src_id:%u ring:%u vmid:%u "
> @@ -349,6 +350,8 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   			dev_err(adev->dev,
>   				"VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
>   				status);
> +		if (vm)
> +			amdgpu_vm_dump_pte(adev, vm, addr);
>   	}
>   
>   	return 0;

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Fix silent amdgpu_bo_move failures
       [not found]     ` <20190713064211.20047-2-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
@ 2019-07-16  9:29       ` Christian König
  0 siblings, 0 replies; 5+ messages in thread
From: Christian König @ 2019-07-16  9:29 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 13.07.19 um 08:42 schrieb Kuehling, Felix:
> Under memory pressure, buffer moves between RAM to VRAM  can
> fail when there is no GTT space available. In those cases
> amdgpu_bo_move falls back to ttm_bo_move_memcpy, which seems to
> succeed, although it doesn't really support non-contiguous or
> invisible VRAM. This manifests as VM faults with corrupted page
> table entries in KFD eviction stress tests.
>
> Print some helpful messages when lack of GTT space is causing buffer
> moves to fail. Check that source and destination memory regions are
> supported by ttm_bo_move_memcpy before taking that fallback.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 40 +++++++++++++++++++++++--
>   1 file changed, 37 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 78440748c87f..37d9a3b09946 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -498,6 +498,7 @@ static int amdgpu_move_vram_ram(struct ttm_buffer_object *bo, bool evict,
>   	placements.flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT;
>   	r = ttm_bo_mem_space(bo, &placement, &tmp_mem, ctx);
>   	if (unlikely(r)) {
> +		pr_err("Failed to find GTT space for blit from VRAM\n");
>   		return r;
>   	}
>   
> @@ -556,6 +557,7 @@ static int amdgpu_move_ram_vram(struct ttm_buffer_object *bo, bool evict,
>   	placements.flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT;
>   	r = ttm_bo_mem_space(bo, &placement, &tmp_mem, ctx);
>   	if (unlikely(r)) {
> +		pr_err("Failed to find GTT space for blit to VRAM\n");
>   		return r;
>   	}
>   
> @@ -575,6 +577,30 @@ static int amdgpu_move_ram_vram(struct ttm_buffer_object *bo, bool evict,
>   	return r;
>   }
>   
> +/**
> + * amdgpu_mem_visible - Check that memory can be accessed by ttm_bo_move_memcpy
> + *
> + * Called by amdgpu_bo_move()
> + */
> +static bool amdgpu_mem_visible(struct amdgpu_device *adev,
> +			       struct ttm_mem_reg *mem)
> +{
> +	struct drm_mm_node *nodes = mem->mm_node;
> +
> +	if (mem->mem_type == TTM_PL_SYSTEM ||
> +	    mem->mem_type == TTM_PL_TT)
> +		return true;
> +	if (mem->mem_type != TTM_PL_VRAM)
> +		return false;
> +
> +	/* ttm_mem_reg_ioremap only supports contiguous memory */
> +	if (nodes->size != mem->num_pages)
> +		return false;
> +
> +	return ((nodes->start + nodes->size) << PAGE_SHIFT)
> +		<= adev->gmc.visible_vram_size;
> +}
> +
>   /**
>    * amdgpu_bo_move - Move a buffer object to a new memory location
>    *
> @@ -619,8 +645,10 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   		return 0;
>   	}
>   
> -	if (!adev->mman.buffer_funcs_enabled)
> +	if (!adev->mman.buffer_funcs_enabled) {
> +		r = -ENODEV;
>   		goto memcpy;
> +	}
>   
>   	if (old_mem->mem_type == TTM_PL_VRAM &&
>   	    new_mem->mem_type == TTM_PL_SYSTEM) {
> @@ -635,10 +663,16 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>   
>   	if (r) {
>   memcpy:
> -		r = ttm_bo_move_memcpy(bo, ctx, new_mem);
> -		if (r) {
> +		/* Check that all memory is CPU accessible */
> +		if (!amdgpu_mem_visible(adev, old_mem) ||
> +		    !amdgpu_mem_visible(adev, new_mem)) {
> +			pr_err("Move buffer fallback to memcpy unavailable\n");
>   			return r;
>   		}
> +
> +		r = ttm_bo_move_memcpy(bo, ctx, new_mem);
> +		if (r)
> +			return r;
>   	}
>   
>   	if (bo->type == ttm_bo_type_device &&

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: Dump PDEs and PTEs on VM faults (v3)
       [not found]     ` <79761463-5821-006a-6537-4cf28d43452b-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2019-07-16 19:57       ` Kuehling, Felix
  0 siblings, 0 replies; 5+ messages in thread
From: Kuehling, Felix @ 2019-07-16 19:57 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2019-07-16 5:27 a.m., Christian König wrote:
> Am 13.07.19 um 08:42 schrieb Kuehling, Felix:
>> Walk page table for the faulting address and dump PDEs and PTEs at
>> all levels. Also flag discrepancies where a PDE points to a different
>> address than the next level PDB or PTB BO.
>>
>> v2:
>> * Fix address shift for GFXv8
>> * Redo PDB/PTB address checking to work on all generations
>>
>> v3:
>> * Simplified pde address and flag check
>>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  5 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  2 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 79 ++++++++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  7 ++-
>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  6 +-
>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +-
>>   6 files changed, 95 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index bbbf069efb77..78440748c87f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -1505,9 +1505,8 @@ static bool 
>> amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
>>    * This is used to access VRAM that backs a buffer object via MMIO
>>    * access for debugging purposes.
>>    */
>> -static int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo,
>> -                    unsigned long offset,
>> -                    void *buf, int len, int write)
>> +int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned 
>> long offset,
>> +                 void *buf, int len, int write)
>>   {
>>       struct amdgpu_bo *abo = ttm_to_amdgpu_bo(bo);
>>       struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> index bccb8c49e597..cffbafffa9d7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> @@ -83,6 +83,8 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev);
>>   void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev,
>>                       bool enable);
>>   +int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, 
>> unsigned long offset,
>> +                 void *buf, int len, int write);
>>   int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
>>                  uint64_t dst_offset, uint32_t byte_count,
>>                  struct reservation_object *resv,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 1951f2abbdbc..64ee46eaa041 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -544,6 +544,78 @@ static void amdgpu_vm_pt_next_dfs(struct 
>> amdgpu_device *adev,
>>            amdgpu_vm_pt_continue_dfs((start), (entry));            \
>>            (entry) = (cursor).entry, amdgpu_vm_pt_next_dfs((adev), 
>> &(cursor)))
>>   +/**
>> + * amdgpu_vm_dump_pte - dump PTEs along a page table walk
>> + *
>> + * @adev: amdgpu device pointer
>> + * @vm: VM address space
>> + * @addr: virtual address
>> + *
>> + * Walks the page table of @vm at the given @addr and prints the PDEs
>> + * and PTEs along the way on a single line.
>> + */
>> +void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm 
>> *vm,
>> +            uint64_t addr)
>> +{
>> +    static const char *level_entry[4] = {"PDE2", "PDE1", "PDE0", 
>> "PTE"};
>> +    static const char *level_block[4] = {"PDB2", "PDB1", "PDB0", 
>> "PTB"};
>> +    struct amdgpu_vm_pt_cursor cursor;
>> +    uint64_t pde_addr, pde_flags, last_pde;
>> +    char buf[128];
>> +    int i = 0;
>> +
>> +    amdgpu_gmc_get_pde_for_bo(vm->root.base.bo, 
>> adev->vm_manager.root_level,
>> +                  &pde_addr, &pde_flags);
>> +    last_pde = pde_addr | pde_flags;
>> +
>> +    amdgpu_vm_pt_start(adev, vm, addr >> PAGE_SHIFT, &cursor);
>
> Walking the VM structure without a lock is dangerous, but we can only 
> take the lock in the fault worker on Vega10.

Not sure what you mean by fault worker. As far as I can tell, the VM 
structure is protected by the page table reservation lock, which I can't 
take in an interrupt handler. You're referring to a fault worker, which 
could take the lock, but I don't see that in staging yet.


>
>> +
>> +    do {
>> +        unsigned int mask, shift, idx;
>> +        struct amdgpu_bo *bo;
>> +        uint64_t pte;
>> +
>> +        mask = amdgpu_vm_entries_mask(adev, cursor.level);
>> +        shift = amdgpu_vm_level_shift(adev, cursor.level);
>> +        idx = (cursor.pfn >> shift) & mask;
>> +
>> +        bo = cursor.entry->base.bo;
>> +        if (bo) {
>> +            /* Flag discrepancy between previous level PDE
>> +             * and the actual address of this PTB or PDB.
>> +             */
>> +            amdgpu_gmc_get_pde_for_bo(bo, cursor.level,
>> +                          &pde_addr, &pde_flags);
>> +            if ((pde_addr | pde_flags) != last_pde)
>> +                i += snprintf(buf + i, sizeof(buf) - i, "!");
>> +
>> +            amdgpu_ttm_access_memory(&bo->tbo, idx * sizeof(pte),
>> +                         &pte, sizeof(pte), false);
>> +            i += snprintf(buf + i, sizeof(buf) - i,
>> +                      "%s[%d]=0x%llx ",
>> +                      level_entry[cursor.level], idx, pte);
>> +            last_pde = pte;
>> +        } else {
>> +            /* Flag discrepancy if previous level PDE had
>> +             * a valid entry but there is no PTB or PDB BO.
>> +             */
>> +            if ((last_pde & AMDGPU_PTE_VALID) &&
>> +                !(last_pde & AMDGPU_PDE_PTE))
>> +                i += snprintf(buf + i, sizeof(buf) - i, "!");
>> +            i += snprintf(buf + i, sizeof(buf) - i,
>> +                      "no %s ", level_block[cursor.level]);
>> +            last_pde = 0;
>> +        }
>> +
>> +        ++cursor.level;
>> +        cursor.parent = cursor.entry;
>> +        if (!cursor.entry->entries)
>> +            break;
>> +        cursor.entry = &cursor.entry->entries[idx];
>> +    } while (cursor.entry);
>> +    dev_err(adev->dev, "%s", buf);
>> +}
>> +
>>   /**
>>    * amdgpu_vm_get_pd_bo - add the VM PD to a validation list
>>    *
>> @@ -3081,8 +3153,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, 
>> void *data, struct drm_file *filp)
>>    * @pasid: PASID identifier for VM
>>    * @task_info: task_info to fill.
>>    */
>> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned 
>> int pasid,
>> -             struct amdgpu_task_info *task_info)
>> +struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
>> +                      unsigned int pasid,
>> +                      struct amdgpu_task_info *task_info)
>>   {
>>       struct amdgpu_vm *vm;
>>       unsigned long flags;
>> @@ -3094,6 +3167,8 @@ void amdgpu_vm_get_task_info(struct 
>> amdgpu_device *adev, unsigned int pasid,
>>           *task_info = vm->task_info;
>>       spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>> +
>> +    return vm;
>
> This is dangerous as well when we are in the interrupt handler.
>
> As soon as the spinlock is dropped the VM structure can be freed by 
> another thread.

OK, that's easier to fix. I was trying to avoid looking up the VM from 
the PASID twice. But I could do it in amdgpu_vm_dump_pte and hold the 
spin lock as long as I'm accessing the VM.

The reservation lock is the bigger problem. I'll keep this patch around, 
because I find it useful, but I won't submit it until I find a safe 
place to access the VM structure in a fault handler or worker.

Regards,
   Felix


>
> Christian.
>
>
>>   }
>>     /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index 489a162ca620..6a8b833d180e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -348,6 +348,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, 
>> struct amdgpu_vm *vm,
>>   int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct 
>> amdgpu_vm *vm, unsigned int pasid);
>>   void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct 
>> amdgpu_vm *vm);
>>   void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
>> +void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm 
>> *vm,
>> +            uint64_t addr);
>>   void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
>>                struct list_head *validated,
>>                struct amdgpu_bo_list_entry *entry);
>> @@ -401,8 +403,9 @@ bool amdgpu_vm_need_pipeline_sync(struct 
>> amdgpu_ring *ring,
>>                     struct amdgpu_job *job);
>>   void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
>>   -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned 
>> int pasid,
>> -                 struct amdgpu_task_info *task_info);
>> +struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
>> +                      unsigned int pasid,
>> +                      struct amdgpu_task_info *task_info);
>>     void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> index 8bf2ba310fd9..18207ecfd85c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> @@ -1448,9 +1448,10 @@ static int gmc_v8_0_process_interrupt(struct 
>> amdgpu_device *adev,
>>         if (printk_ratelimit()) {
>>           struct amdgpu_task_info task_info;
>> +        struct amdgpu_vm *vm;
>>             memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> -        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> +        vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>             dev_err(adev->dev, "GPU fault detected: %d 0x%08x for 
>> process %s pid %d thread %s pid %d\n",
>>               entry->src_id, entry->src_data[0], task_info.process_name,
>> @@ -1461,6 +1462,9 @@ static int gmc_v8_0_process_interrupt(struct 
>> amdgpu_device *adev,
>>               status);
>>           gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
>>                        entry->pasid);
>> +        if (vm)
>> +            amdgpu_vm_dump_pte(adev, vm, (uint64_t)addr
>> +                       << AMDGPU_GPU_PAGE_SHIFT);
>>       }
>>         vmid = REG_GET_FIELD(status, 
>> VM_CONTEXT1_PROTECTION_FAULT_STATUS,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index bd5d36944481..f27e88af4016 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -331,9 +331,10 @@ static int gmc_v9_0_process_interrupt(struct 
>> amdgpu_device *adev,
>>         if (printk_ratelimit()) {
>>           struct amdgpu_task_info task_info;
>> +        struct amdgpu_vm *vm;
>>             memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> -        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> +        vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>             dev_err(adev->dev,
>>               "[%s] %s page fault (src_id:%u ring:%u vmid:%u "
>> @@ -349,6 +350,8 @@ static int gmc_v9_0_process_interrupt(struct 
>> amdgpu_device *adev,
>>               dev_err(adev->dev,
>>                   "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
>>                   status);
>> +        if (vm)
>> +            amdgpu_vm_dump_pte(adev, vm, addr);
>>       }
>>         return 0;
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-07-16 19:57 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-13  6:42 [PATCH 1/2] drm/amdgpu: Dump PDEs and PTEs on VM faults (v3) Kuehling, Felix
     [not found] ` <20190713064211.20047-1-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2019-07-13  6:42   ` [PATCH 2/2] drm/amdgpu: Fix silent amdgpu_bo_move failures Kuehling, Felix
     [not found]     ` <20190713064211.20047-2-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2019-07-16  9:29       ` Christian König
2019-07-16  9:27   ` [PATCH 1/2] drm/amdgpu: Dump PDEs and PTEs on VM faults (v3) Christian König
     [not found]     ` <79761463-5821-006a-6537-4cf28d43452b-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-07-16 19:57       ` Kuehling, Felix

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.