From: "Christian König" <christian.koenig@amd.com>
To: "Pan, Xinhui" <Xinhui.Pan@amd.com>,
	"amd-gfx@lists.freedesktop.org" <amd-gfx@lists.freedesktop.org>
Cc: "Deucher, Alexander" <Alexander.Deucher@amd.com>,
	"Kuehling, Felix" <Felix.Kuehling@amd.com>
Subject: Re: [RFC PATCH v4] drm/amdgpu: Remove kfd eviction fence before release bo
Date: Sat, 8 Feb 2020 16:57:25 +0100
Message-ID: <4733e64d-fe03-962a-b07d-70e4b1582605@amd.com>
In-Reply-To: <SN6PR12MB28001FE5A3616729FC7D5E03871F0@SN6PR12MB2800.namprd12.prod.outlook.com>

On 08.02.20 at 16:09, Pan, Xinhui wrote:
> No need to trigger an eviction, as the memory mapping will not be used anymore.
>
> All pt/pd bos share the same resv, hence the same shared eviction fence. Every time a page table is freed, the fence is signaled, which causes unexpected KFD evictions.
>
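For clarity, a minimal userspace sketch of the situation described above; all types and names are simplified stand-ins, not the real amdgpu/TTM structures:

/*
 * Every page-table/page-directory BO shares one reservation object,
 * and that reservation object carries a single KFD eviction fence.
 * Releasing any one of the BOs signals the fences on the shared
 * reservation object, which the KFD side sees as an eviction trigger,
 * even though the mapping is going away anyway.
 */
#include <stdbool.h>
#include <stdio.h>

struct eviction_fence {
	bool signaled;
};

struct reservation {
	struct eviction_fence *ef;	/* one shared eviction fence */
};

struct pt_bo {
	struct reservation *resv;	/* all PT/PD BOs point at the same resv */
};

/* Stand-in for BO release: the fences on its resv get signaled. */
static void release_bo(struct pt_bo *bo)
{
	struct eviction_fence *ef = bo->resv->ef;

	if (ef && !ef->signaled) {
		ef->signaled = true;
		printf("KFD eviction triggered\n");	/* unwanted side effect */
	}
}

int main(void)
{
	struct eviction_fence ef = { .signaled = false };
	struct reservation shared = { .ef = &ef };
	struct pt_bo pt0 = { .resv = &shared };
	struct pt_bo pt1 = { .resv = &shared };

	/* Freeing just one page table already fires the shared fence. */
	release_bo(&pt0);

	/*
	 * The patch avoids this by removing the eviction fence from the
	 * reservation object before the BO is released.
	 */
	shared.ef = NULL;
	release_bo(&pt1);	/* no spurious eviction this time */

	return 0;
}

With a shared reservation object, freeing any one page table fires the fence for all of them; the patch therefore strips the KFD eviction fence from the reservation object before the BO is released.
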
> Signed-off-by: xinhui pan <xinhui.pan@example.com>
> ---
> changes from v3:
> fix a coding error
>
> changes from v2:
> based on Chris' "drm/ttm: rework BO delayed delete" patch set.
>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 36 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c    |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  1 +
>   drivers/gpu/drm/ttm/ttm_bo.c                  | 16 +++++----
>   5 files changed, 49 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 47b0f2957d1f..265b1ed7264c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -96,6 +96,7 @@ struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
>   						       struct mm_struct *mm);
>   bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm);
>   struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
> +int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo);
>   
>   struct amdkfd_process_info {
>   	/* List head of all VMs that belong to a KFD process */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index ef721cb65868..d4b117065c1e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -276,6 +276,41 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
>   	return 0;
>   }
>   
> +int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
> +{
> +	struct amdgpu_bo *root = bo;
> +	struct amdgpu_vm_bo_base *vm_bo;
> +	struct amdgpu_vm *vm;
> +	struct amdkfd_process_info *info;
> +	struct amdgpu_amdkfd_fence *ef;
> +	int ret;
> +
> +	while (root->parent)
> +		root = root->parent;
> +
> +	vm_bo = root->vm_bo;
> +	if (!vm_bo)
> +		return 0;
> +
> +	vm = vm_bo->vm;
> +	if (!vm)
> +		return 0;
> +
> +	info = vm->process_info;
> +	if (!info || !info->eviction_fence)
> +		return 0;
> +
> +	ef = container_of(dma_fence_get(&info->eviction_fence->base),
> +			struct amdgpu_amdkfd_fence, base);
> +
> +	BUG_ON(!dma_resv_trylock(&bo->tbo.base._resv));
> +	ret = amdgpu_amdkfd_remove_eviction_fence(bo, ef);
> +	dma_resv_unlock(&bo->tbo.base._resv);
> +
> +	dma_fence_put(&ef->base);
> +	return ret;
> +}
> +
>   static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
>   				     bool wait)
>   {
> @@ -1051,6 +1086,7 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
>   		WARN_ON(!list_empty(&process_info->userptr_valid_list));
>   		WARN_ON(!list_empty(&process_info->userptr_inval_list));
>   
> +		vm->process_info = NULL;
>   		dma_fence_put(&process_info->eviction_fence->base);
>   		cancel_delayed_work_sync(&process_info->restore_userptr_work);
>   		put_pid(process_info->pid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 6f60a581e3ba..3784d178c965 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -1307,6 +1307,8 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
>   	if (abo->kfd_bo)
>   		amdgpu_amdkfd_unreserve_memory_limit(abo);
>   
> +	amdgpu_amdkfd_remove_fence_on_pt_pd_bos(abo);
> +
>   	if (bo->mem.mem_type != TTM_PL_VRAM || !bo->mem.mm_node ||
>   	    !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE))
>   		return;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 247f328b7223..eca4ec66c1ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -3109,6 +3109,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   	}
>   
>   	amdgpu_vm_free_pts(adev, vm, NULL);
> +	root->vm_bo = NULL;
>   	amdgpu_bo_unreserve(root);
>   	amdgpu_bo_unref(&root);
>   	WARN_ON(vm->root.base.bo);
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 6c3cea509e25..855d3566381e 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -399,8 +399,7 @@ static int ttm_bo_individualize_resv(struct ttm_buffer_object *bo)
>   	BUG_ON(!dma_resv_trylock(&bo->base._resv));
>   
>   	r = dma_resv_copy_fences(&bo->base._resv, bo->base.resv);
> -	if (r)
> -		dma_resv_unlock(&bo->base._resv);
> +	dma_resv_unlock(&bo->base._resv);
>   
>   	return r;
>   }
> @@ -565,9 +564,6 @@ static void ttm_bo_release(struct kref *kref)
>   	int ret;
>   
>   	if (!bo->deleted) {
> -		if (bo->bdev->driver->release_notify)
> -			bo->bdev->driver->release_notify(bo);
> -
>   		drm_vma_offset_remove(bdev->vma_manager, &bo->base.vma_node);
>   		ttm_mem_io_lock(man, false);
>   		ttm_mem_io_free_vm(bo);
> @@ -581,6 +577,14 @@ static void ttm_bo_release(struct kref *kref)
>   			dma_resv_wait_timeout_rcu(bo->base.resv, true, false,
>   						  30 * HZ);
>   		}
> +
> +		spin_lock(&ttm_bo_glob.lru_lock);
> +		if (bo->type != ttm_bo_type_sg)
> +			bo->base.resv = &bo->base._resv;

This still doesn't work correctly and can cause very subtle crashes.

I will try to send out a patch set on Monday which should work.

Christian.

> +		spin_unlock(&ttm_bo_glob.lru_lock);
> +
> +		if (bo->bdev->driver->release_notify)
> +			bo->bdev->driver->release_notify(bo);
>   	}
>   
>   	if (!dma_resv_test_signaled_rcu(bo->base.resv, true)) {
> @@ -599,8 +603,6 @@ static void ttm_bo_release(struct kref *kref)
>   		}
>   
>   		spin_lock(&ttm_bo_glob.lru_lock);
> -		if (bo->type != ttm_bo_type_sg)
> -			bo->base.resv = &bo->base._resv;
>   		kref_init(&bo->kref);
>   		list_add_tail(&bo->ddestroy, &bdev->ddestroy);
>   		spin_unlock(&ttm_bo_glob.lru_lock);

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
