All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling
@ 2019-06-15  1:52 Yang, Philip
       [not found] ` <20190615015231.31871-1-Philip.Yang-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 2+ messages in thread
From: Yang, Philip @ 2019-06-15  1:52 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yang, Philip

Under memory pressure, hmm_range_fault may return -ENOMEM or -EBUSY.
Change pr_info to pr_debug to avoid unnecessary kernel log messages,
because the restore will be retried anyway.

Calling get_user_pages_done after TTM get user pages has failed
triggers a WARN_ONCE call stack dump in the kernel log, so only call
it when getting the user pages succeeded.

Change-Id: I086f92944630f9d1a70365c00417cb9440662464
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 38 +++----------------
 1 file changed, 6 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 74e86952553f..10abae398e51 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1731,35 +1731,17 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
 		ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
 						   bo->tbo.ttm->pages);
 		if (ret) {
-			bo->tbo.ttm->pages[0] = NULL;
-			pr_info("%s: Failed to get user pages: %d\n",
+			pr_debug("%s: Failed to get user pages: %d\n",
 				__func__, ret);
-			/* Pretend it succeeded. It will fail later
-			 * with a VM fault if the GPU tries to access
-			 * it. Better than hanging indefinitely with
-			 * stalled user mode queues.
-			 */
-		}
-	}
-
-	return 0;
-}
 
-/* Remove invalid userptr BOs from hmm track list
- *
- * Stop HMM track the userptr update
- */
-static void untrack_invalid_user_pages(struct amdkfd_process_info *process_info)
-{
-	struct kgd_mem *mem, *tmp_mem;
-	struct amdgpu_bo *bo;
+			/* Return error -EBUSY or -ENOMEM, retry restore */
+			return ret;
+		}
 
-	list_for_each_entry_safe(mem, tmp_mem,
-				 &process_info->userptr_inval_list,
-				 validate_list.head) {
-		bo = mem->bo;
 		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
 	}
+
+	return 0;
 }
 
 /* Validate invalid userptr BOs
@@ -1841,13 +1823,6 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
 		list_move_tail(&mem->validate_list.head,
 			       &process_info->userptr_valid_list);
 
-		/* Stop HMM track the userptr update. We dont check the return
-		 * value for concurrent CPU page table update because we will
-		 * reschedule the restore worker if process_info->evicted_bos
-		 * is updated.
-		 */
-		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
-
 		/* Update mapping. If the BO was not validated
 		 * (because we couldn't get user pages), this will
 		 * clear the page table entries, which will result in
@@ -1946,7 +1921,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
 	}
 
 unlock_out:
-	untrack_invalid_user_pages(process_info);
 	mutex_unlock(&process_info->lock);
 	mmput(mm);
 	put_task_struct(usertask);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH] drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling
       [not found] ` <20190615015231.31871-1-Philip.Yang-5C7GfCeVMHo@public.gmane.org>
@ 2019-06-17 20:45   ` Kuehling, Felix
  0 siblings, 0 replies; 2+ messages in thread
From: Kuehling, Felix @ 2019-06-17 20:45 UTC (permalink / raw)
  To: Yang, Philip, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2019-06-14 9:52 p.m., Yang, Philip wrote:
> Under memory pressure, hmm_range_fault may return error code -ENOMEM
> or -EBUSY, change pr_info to pr_debug to remove unnecessary kernel log
> message because we will retry restore again.
>
> Call get_user_pages_done if TTM get user pages failed will have
> WARN_ONCE kernel calling stack dump log.
>
> Change-Id: I086f92944630f9d1a70365c00417cb9440662464
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 38 +++----------------
>   1 file changed, 6 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 74e86952553f..10abae398e51 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1731,35 +1731,17 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
>   		ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
>   						   bo->tbo.ttm->pages);
>   		if (ret) {
> -			bo->tbo.ttm->pages[0] = NULL;
> -			pr_info("%s: Failed to get user pages: %d\n",
> +			pr_debug("%s: Failed to get user pages: %d\n",
>   				__func__, ret);
> -			/* Pretend it succeeded. It will fail later
> -			 * with a VM fault if the GPU tries to access
> -			 * it. Better than hanging indefinitely with
> -			 * stalled user mode queues.
> -			 */
> -		}
> -	}
> -
> -	return 0;
> -}
>   
> -/* Remove invalid userptr BOs from hmm track list
> - *
> - * Stop HMM track the userptr update
> - */
> -static void untrack_invalid_user_pages(struct amdkfd_process_info *process_info)
> -{
> -	struct kgd_mem *mem, *tmp_mem;
> -	struct amdgpu_bo *bo;
> +			/* Return error -EBUSY or -ENOMEM, retry restore */
> +			return ret;
> +		}
>   
> -	list_for_each_entry_safe(mem, tmp_mem,
> -				 &process_info->userptr_inval_list,
> -				 validate_list.head) {
> -		bo = mem->bo;
>   		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>   	}
> +
> +	return 0;
>   }
>   
>   /* Validate invalid userptr BOs
> @@ -1841,13 +1823,6 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
>   		list_move_tail(&mem->validate_list.head,
>   			       &process_info->userptr_valid_list);
>   
> -		/* Stop HMM track the userptr update. We dont check the return
> -		 * value for concurrent CPU page table update because we will
> -		 * reschedule the restore worker if process_info->evicted_bos
> -		 * is updated.
> -		 */
> -		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> -
>   		/* Update mapping. If the BO was not validated
>   		 * (because we couldn't get user pages), this will
>   		 * clear the page table entries, which will result in
> @@ -1946,7 +1921,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>   	}
>   
>   unlock_out:
> -	untrack_invalid_user_pages(process_info);
>   	mutex_unlock(&process_info->lock);
>   	mmput(mm);
>   	put_task_struct(usertask);
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-06-17 20:45 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-15  1:52 [PATCH] drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling Yang, Philip
     [not found] ` <20190615015231.31871-1-Philip.Yang-5C7GfCeVMHo@public.gmane.org>
2019-06-17 20:45   ` Kuehling, Felix

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.