All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Christian König" <ckoenig.leichtzumerken-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
To: "Kuehling, Felix" <Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>,
	"amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org"
	<amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Subject: Re: [PATCH 1/2] drm/amdgpu: Dump PDEs and PTEs on VM faults (v3)
Date: Tue, 16 Jul 2019 11:27:14 +0200	[thread overview]
Message-ID: <79761463-5821-006a-6537-4cf28d43452b@gmail.com> (raw)
In-Reply-To: <20190713064211.20047-1-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>

Am 13.07.19 um 08:42 schrieb Kuehling, Felix:
> Walk page table for the faulting address and dump PDEs and PTEs at
> all levels. Also flag discrepancies where a PDE points to a different
> address than the next level PDB or PTB BO.
>
> v2:
> * Fix address shift for GFXv8
> * Redo PDB/PTB address checking to work on all generations
>
> v3:
> * Simplified pde address and flag check
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  5 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 79 ++++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  7 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  6 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +-
>   6 files changed, 95 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index bbbf069efb77..78440748c87f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -1505,9 +1505,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
>    * This is used to access VRAM that backs a buffer object via MMIO
>    * access for debugging purposes.
>    */
> -static int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo,
> -				    unsigned long offset,
> -				    void *buf, int len, int write)
> +int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long offset,
> +			     void *buf, int len, int write)
>   {
>   	struct amdgpu_bo *abo = ttm_to_amdgpu_bo(bo);
>   	struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index bccb8c49e597..cffbafffa9d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -83,6 +83,8 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev);
>   void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev,
>   					bool enable);
>   
> +int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long offset,
> +			     void *buf, int len, int write);
>   int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
>   		       uint64_t dst_offset, uint32_t byte_count,
>   		       struct reservation_object *resv,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 1951f2abbdbc..64ee46eaa041 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -544,6 +544,78 @@ static void amdgpu_vm_pt_next_dfs(struct amdgpu_device *adev,
>   	     amdgpu_vm_pt_continue_dfs((start), (entry));			\
>   	     (entry) = (cursor).entry, amdgpu_vm_pt_next_dfs((adev), &(cursor)))
>   
> +/**
> + * amdgpu_vm_dump_pte - dump PTEs along a page table walk
> + *
> + * @adev: amdgpu device pointer
> + * @vm: VM address space
> + * @addr: virtual address
> + *
> + * Walks the page table of @vm at the given @addr and prints the PDEs
> + * and PTEs along the way on a single line.
> + */
> +void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> +			uint64_t addr)
> +{
> +	static const char *level_entry[4] = {"PDE2", "PDE1", "PDE0", "PTE"};
> +	static const char *level_block[4] = {"PDB2", "PDB1", "PDB0", "PTB"};
> +	struct amdgpu_vm_pt_cursor cursor;
> +	uint64_t pde_addr, pde_flags, last_pde;
> +	char buf[128];
> +	int i = 0;
> +
> +	amdgpu_gmc_get_pde_for_bo(vm->root.base.bo, adev->vm_manager.root_level,
> +				  &pde_addr, &pde_flags);
> +	last_pde = pde_addr | pde_flags;
> +
> +	amdgpu_vm_pt_start(adev, vm, addr >> PAGE_SHIFT, &cursor);

Walking the VM structure without a lock is dangerous, but we can only 
take the lock in the fault worker on Vega10.

> +
> +	do {
> +		unsigned int mask, shift, idx;
> +		struct amdgpu_bo *bo;
> +		uint64_t pte;
> +
> +		mask = amdgpu_vm_entries_mask(adev, cursor.level);
> +		shift = amdgpu_vm_level_shift(adev, cursor.level);
> +		idx = (cursor.pfn >> shift) & mask;
> +
> +		bo = cursor.entry->base.bo;
> +		if (bo) {
> +			/* Flag discrepancy between previous level PDE
> +			 * and the actual address of this PTB or PDB.
> +			 */
> +			amdgpu_gmc_get_pde_for_bo(bo, cursor.level,
> +						  &pde_addr, &pde_flags);
> +			if ((pde_addr | pde_flags) != last_pde)
> +				i += snprintf(buf + i, sizeof(buf) - i, "!");
> +
> +			amdgpu_ttm_access_memory(&bo->tbo, idx * sizeof(pte),
> +						 &pte, sizeof(pte), false);
> +			i += snprintf(buf + i, sizeof(buf) - i,
> +				      "%s[%d]=0x%llx ",
> +				      level_entry[cursor.level], idx, pte);
> +			last_pde = pte;
> +		} else {
> +			/* Flag discrepancy if previous level PDE had
> +			 * a valid entry but there is no PTB or PDB BO.
> +			 */
> +			if ((last_pde & AMDGPU_PTE_VALID) &&
> +			    !(last_pde & AMDGPU_PDE_PTE))
> +				i += snprintf(buf + i, sizeof(buf) - i, "!");
> +			i += snprintf(buf + i, sizeof(buf) - i,
> +				      "no %s ", level_block[cursor.level]);
> +			last_pde = 0;
> +		}
> +
> +		++cursor.level;
> +		cursor.parent = cursor.entry;
> +		if (!cursor.entry->entries)
> +			break;
> +		cursor.entry = &cursor.entry->entries[idx];
> +	} while (cursor.entry);
> +	dev_err(adev->dev, "%s", buf);
> +}
> +
>   /**
>    * amdgpu_vm_get_pd_bo - add the VM PD to a validation list
>    *
> @@ -3081,8 +3153,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>    * @pasid: PASID identifier for VM
>    * @task_info: task_info to fill.
>    */
> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
> -			 struct amdgpu_task_info *task_info)
> +struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
> +					  unsigned int pasid,
> +					  struct amdgpu_task_info *task_info)
>   {
>   	struct amdgpu_vm *vm;
>   	unsigned long flags;
> @@ -3094,6 +3167,8 @@ void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
>   		*task_info = vm->task_info;
>   
>   	spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
> +
> +	return vm;

This is dangerous as well when we are in the interrupt handler.

As soon as the spinlock is dropped the VM structure can be freed by 
another thread.

Christian.


>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 489a162ca620..6a8b833d180e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -348,6 +348,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned int pasid);
>   void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm);
>   void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
> +void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> +			uint64_t addr);
>   void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
>   			 struct list_head *validated,
>   			 struct amdgpu_bo_list_entry *entry);
> @@ -401,8 +403,9 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
>   				  struct amdgpu_job *job);
>   void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
>   
> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
> -			     struct amdgpu_task_info *task_info);
> +struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
> +					  unsigned int pasid,
> +					  struct amdgpu_task_info *task_info);
>   
>   void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> index 8bf2ba310fd9..18207ecfd85c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> @@ -1448,9 +1448,10 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>   
>   	if (printk_ratelimit()) {
>   		struct amdgpu_task_info task_info;
> +		struct amdgpu_vm *vm;
>   
>   		memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> -		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> +		vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>   
>   		dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
>   			entry->src_id, entry->src_data[0], task_info.process_name,
> @@ -1461,6 +1462,9 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>   			status);
>   		gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
>   					 entry->pasid);
> +		if (vm)
> +			amdgpu_vm_dump_pte(adev, vm, (uint64_t)addr
> +					   << AMDGPU_GPU_PAGE_SHIFT);
>   	}
>   
>   	vmid = REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index bd5d36944481..f27e88af4016 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -331,9 +331,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   
>   	if (printk_ratelimit()) {
>   		struct amdgpu_task_info task_info;
> +		struct amdgpu_vm *vm;
>   
>   		memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> -		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> +		vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>   
>   		dev_err(adev->dev,
>   			"[%s] %s page fault (src_id:%u ring:%u vmid:%u "
> @@ -349,6 +350,8 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   			dev_err(adev->dev,
>   				"VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
>   				status);
> +		if (vm)
> +			amdgpu_vm_dump_pte(adev, vm, addr);
>   	}
>   
>   	return 0;

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

  parent reply	other threads:[~2019-07-16  9:27 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-07-13  6:42 [PATCH 1/2] drm/amdgpu: Dump PDEs and PTEs on VM faults (v3) Kuehling, Felix
     [not found] ` <20190713064211.20047-1-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2019-07-13  6:42   ` [PATCH 2/2] drm/amdgpu: Fix silent amdgpu_bo_move failures Kuehling, Felix
     [not found]     ` <20190713064211.20047-2-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2019-07-16  9:29       ` Christian König
2019-07-16  9:27   ` Christian König [this message]
     [not found]     ` <79761463-5821-006a-6537-4cf28d43452b-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-07-16 19:57       ` [PATCH 1/2] drm/amdgpu: Dump PDEs and PTEs on VM faults (v3) Kuehling, Felix

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=79761463-5821-006a-6537-4cf28d43452b@gmail.com \
    --to=ckoenig.leichtzumerken-re5jqeeqqe8avxtiumwx3w@public.gmane.org \
    --cc=Felix.Kuehling-5C7GfCeVMHo@public.gmane.org \
    --cc=amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org \
    --cc=christian.koenig-5C7GfCeVMHo@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.