All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6
@ 2019-05-07  9:36 Chunming Zhou
       [not found] ` <20190507093642.7859-1-david1.zhou-5C7GfCeVMHo@public.gmane.org>
  2019-05-07 10:53 ` [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6 Koenig, Christian
  0 siblings, 2 replies; 14+ messages in thread
From: Chunming Zhou @ 2019-05-07  9:36 UTC (permalink / raw)
  To: Christian.Koenig, Prike.Liang, dri-devel, amd-gfx

A heavy GPU job can occupy memory for a long time, which causes other users to fail to get memory.

basically pick up Christian idea:

1. Reserve the BO in DC using a ww_mutex ticket (trivial).
2. If we then run into this EBUSY condition in TTM check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
3. If we have a ticket we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
4. If getting the reservation lock with the ticket succeeded we check if the BO is still the first one on the LRU in question (the BO could have moved).
5. If the BO is still the first one on the LRU in question we try to evict it as we would evict any other BO.
6. If any of the "If's" above fail we just back off and return -EBUSY.

v2: fix some minor check
v3: address Christian v2 comments.
v4: fix some missing
v5: handle first_bo unlock and bo_get/put
v6: abstract unified iterate function, and handle all possible usecase not only pinned bo.

Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 113 ++++++++++++++++++++++++++++++-----
 1 file changed, 97 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 8502b3ed2d88..bbf1d14d00a7 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
  * b. Otherwise, trylock it.
  */
 static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
-			struct ttm_operation_ctx *ctx, bool *locked)
+			struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
 {
 	bool ret = false;
 
 	*locked = false;
+	if (busy)
+		*busy = false;
 	if (bo->resv == ctx->resv) {
 		reservation_object_assert_held(bo->resv);
 		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
@@ -779,35 +781,45 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
 	} else {
 		*locked = reservation_object_trylock(bo->resv);
 		ret = *locked;
+		if (!ret && busy)
+			*busy = true;
 	}
 
 	return ret;
 }
 
-static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
-			       uint32_t mem_type,
-			       const struct ttm_place *place,
-			       struct ttm_operation_ctx *ctx)
+static struct ttm_buffer_object*
+ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
+			 struct ttm_mem_type_manager *man,
+			 const struct ttm_place *place,
+			 struct ttm_operation_ctx *ctx,
+			 struct ttm_buffer_object **first_bo,
+			 bool *locked)
 {
-	struct ttm_bo_global *glob = bdev->glob;
-	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
 	struct ttm_buffer_object *bo = NULL;
-	bool locked = false;
-	unsigned i;
-	int ret;
+	int i;
 
-	spin_lock(&glob->lru_lock);
+	if (first_bo)
+		*first_bo = NULL;
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &man->lru[i], lru) {
-			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
+			bool busy = false;
+			if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
+							    &busy)) {
+				if (first_bo && !(*first_bo) && busy) {
+					ttm_bo_get(bo);
+					*first_bo = bo;
+				}
 				continue;
+			}
 
 			if (place && !bdev->driver->eviction_valuable(bo,
 								      place)) {
-				if (locked)
+				if (*locked)
 					reservation_object_unlock(bo->resv);
 				continue;
 			}
+
 			break;
 		}
 
@@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 		bo = NULL;
 	}
 
+	return bo;
+}
+
+static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
+			       uint32_t mem_type,
+			       const struct ttm_place *place,
+			       struct ttm_operation_ctx *ctx)
+{
+	struct ttm_bo_global *glob = bdev->glob;
+	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
+	struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
+	bool locked = false;
+	int ret;
+
+	spin_lock(&glob->lru_lock);
+	bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
+				      &locked);
 	if (!bo) {
+		struct ttm_operation_ctx busy_ctx;
+
 		spin_unlock(&glob->lru_lock);
-		return -EBUSY;
+		/* check if other user occupy memory too long time */
+		if (!first_bo || !ctx || !ctx->resv || !ctx->resv->lock.ctx) {
+			if (first_bo)
+				ttm_bo_put(first_bo);
+			return -EBUSY;
+		}
+		if (first_bo->resv == ctx->resv) {
+			ttm_bo_put(first_bo);
+			return -EBUSY;
+		}
+		if (ctx->interruptible)
+			ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
+							  ctx->resv->lock.ctx);
+		else
+			ret = ww_mutex_lock(&first_bo->resv->lock, ctx->resv->lock.ctx);
+		if (ret) {
+			ttm_bo_put(first_bo);
+			return ret;
+		}
+		spin_lock(&glob->lru_lock);
+		/* previous busy resv lock is held by above, idle now,
+		 * so let them evictable.
+		 */
+		busy_ctx.interruptible = ctx->interruptible;
+		busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
+		busy_ctx.resv	       = first_bo->resv;
+		busy_ctx.flags	       = TTM_OPT_FLAG_ALLOW_RES_EVICT;
+
+		bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, NULL,
+					      &locked);
+		if (bo && (bo->resv == first_bo->resv))
+			locked = true;
+		else if (bo)
+			ww_mutex_unlock(&first_bo->resv->lock);
+		if (!bo) {
+			spin_unlock(&glob->lru_lock);
+			ttm_bo_put(first_bo);
+			return -EBUSY;
+		}
 	}
 
 	kref_get(&bo->list_kref);
@@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 		ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
 					  ctx->no_wait_gpu, locked);
 		kref_put(&bo->list_kref, ttm_bo_release_list);
+		if (first_bo)
+			ttm_bo_put(first_bo);
 		return ret;
 	}
 
 	ttm_bo_del_from_lru(bo);
 	spin_unlock(&glob->lru_lock);
+	if (first_bo)
+		ttm_bo_put(first_bo);
 
 	ret = ttm_bo_evict(bo, ctx);
 	if (locked) {
@@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
+	struct ttm_operation_ctx native_ctx = {
+		.interruptible = false,
+		.no_wait_gpu = false,
+		.resv = bo->resv,
+		.flags = 0
+	};
+	struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
 	int ret;
 
 	do {
@@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 			return ret;
 		if (mem->mm_node)
 			break;
-		ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
+		ret = ttm_mem_evict_first(bdev, mem_type, place, evict_ctx);
 		if (unlikely(ret != 0))
 			return ret;
 	} while (1);
@@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
 	spin_lock(&glob->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
-			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
+			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
+							   NULL)) {
 				ret = 0;
 				break;
 			}
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve
       [not found] ` <20190507093642.7859-1-david1.zhou-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-07  9:36   ` Chunming Zhou
       [not found]     ` <20190507093642.7859-2-david1.zhou-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Chunming Zhou @ 2019-05-07  9:36 UTC (permalink / raw)
  To: Christian.Koenig-5C7GfCeVMHo, Prike.Liang-5C7GfCeVMHo,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Chunming Zhou

Add a ticket for the display BO, so that it can preempt busy BOs.

Change-Id: I9f031cdcc8267de00e819ae303baa0a52df8ebb9
Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 22 ++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index ac22f7351a42..8633d52e3fbe 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4176,6 +4176,9 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	struct amdgpu_device *adev;
 	struct amdgpu_bo *rbo;
 	struct dm_plane_state *dm_plane_state_new, *dm_plane_state_old;
+	struct list_head list, duplicates;
+	struct ttm_validate_buffer tv;
+	struct ww_acquire_ctx ticket;
 	uint64_t tiling_flags;
 	uint32_t domain;
 	int r;
@@ -4192,9 +4195,18 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	obj = new_state->fb->obj[0];
 	rbo = gem_to_amdgpu_bo(obj);
 	adev = amdgpu_ttm_adev(rbo->tbo.bdev);
-	r = amdgpu_bo_reserve(rbo, false);
-	if (unlikely(r != 0))
+	INIT_LIST_HEAD(&list);
+	INIT_LIST_HEAD(&duplicates);
+
+	tv.bo = &rbo->tbo;
+	tv.num_shared = 1;
+	list_add(&tv.head, &list);
+
+	r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates);
+	if (r) {
+		dev_err(adev->dev, "fail to reserve bo (%d)\n", r);
 		return r;
+	}
 
 	if (plane->type != DRM_PLANE_TYPE_CURSOR)
 		domain = amdgpu_display_supported_domains(adev);
@@ -4205,21 +4217,21 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	if (unlikely(r != 0)) {
 		if (r != -ERESTARTSYS)
 			DRM_ERROR("Failed to pin framebuffer with error %d\n", r);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		return r;
 	}
 
 	r = amdgpu_ttm_alloc_gart(&rbo->tbo);
 	if (unlikely(r != 0)) {
 		amdgpu_bo_unpin(rbo);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		DRM_ERROR("%p bind failed\n", rbo);
 		return r;
 	}
 
 	amdgpu_bo_get_tiling_flags(rbo, &tiling_flags);
 
-	amdgpu_bo_unreserve(rbo);
+	ttm_eu_backoff_reservation(&ticket, &list);
 
 	afb->address = amdgpu_bo_gpu_offset(rbo);
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve
       [not found]     ` <20190507093642.7859-2-david1.zhou-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-07 10:46       ` Koenig, Christian
  0 siblings, 0 replies; 14+ messages in thread
From: Koenig, Christian @ 2019-05-07 10:46 UTC (permalink / raw)
  To: Zhou, David(ChunMing),
	Liang, Prike, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 07.05.19 um 11:36 schrieb Chunming Zhou:
> add ticket for display bo, so that it can preempt busy bo.
>
> Change-Id: I9f031cdcc8267de00e819ae303baa0a52df8ebb9
> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
> ---
>   .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 22 ++++++++++++++-----
>   1 file changed, 17 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index ac22f7351a42..8633d52e3fbe 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -4176,6 +4176,9 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
>   	struct amdgpu_device *adev;
>   	struct amdgpu_bo *rbo;
>   	struct dm_plane_state *dm_plane_state_new, *dm_plane_state_old;
> +	struct list_head list, duplicates;
> +	struct ttm_validate_buffer tv;
> +	struct ww_acquire_ctx ticket;
>   	uint64_t tiling_flags;
>   	uint32_t domain;
>   	int r;
> @@ -4192,9 +4195,18 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
>   	obj = new_state->fb->obj[0];
>   	rbo = gem_to_amdgpu_bo(obj);
>   	adev = amdgpu_ttm_adev(rbo->tbo.bdev);
> -	r = amdgpu_bo_reserve(rbo, false);
> -	if (unlikely(r != 0))
> +	INIT_LIST_HEAD(&list);
> +	INIT_LIST_HEAD(&duplicates);
> +
> +	tv.bo = &rbo->tbo;
> +	tv.num_shared = 1;
> +	list_add(&tv.head, &list);
> +
> +	r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates);

duplicates are superfluous and can be NULL in this case.

Apart from that the patch is Reviewed-by: Christian König 
<christian.koenig@amd.com>

Regards,
Christian.

> +	if (r) {
> +		dev_err(adev->dev, "fail to reserve bo (%d)\n", r);
>   		return r;
> +	}
>   
>   	if (plane->type != DRM_PLANE_TYPE_CURSOR)
>   		domain = amdgpu_display_supported_domains(adev);
> @@ -4205,21 +4217,21 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
>   	if (unlikely(r != 0)) {
>   		if (r != -ERESTARTSYS)
>   			DRM_ERROR("Failed to pin framebuffer with error %d\n", r);
> -		amdgpu_bo_unreserve(rbo);
> +		ttm_eu_backoff_reservation(&ticket, &list);
>   		return r;
>   	}
>   
>   	r = amdgpu_ttm_alloc_gart(&rbo->tbo);
>   	if (unlikely(r != 0)) {
>   		amdgpu_bo_unpin(rbo);
> -		amdgpu_bo_unreserve(rbo);
> +		ttm_eu_backoff_reservation(&ticket, &list);
>   		DRM_ERROR("%p bind failed\n", rbo);
>   		return r;
>   	}
>   
>   	amdgpu_bo_get_tiling_flags(rbo, &tiling_flags);
>   
> -	amdgpu_bo_unreserve(rbo);
> +	ttm_eu_backoff_reservation(&ticket, &list);
>   
>   	afb->address = amdgpu_bo_gpu_offset(rbo);
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6
  2019-05-07  9:36 [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6 Chunming Zhou
       [not found] ` <20190507093642.7859-1-david1.zhou-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-07 10:53 ` Koenig, Christian
       [not found]   ` <f4b1ddf2-b80b-260e-54c9-b0e62ecbe90b-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 14+ messages in thread
From: Koenig, Christian @ 2019-05-07 10:53 UTC (permalink / raw)
  To: Zhou, David(ChunMing), Liang, Prike, dri-devel, amd-gfx

Am 07.05.19 um 11:36 schrieb Chunming Zhou:
> heavy gpu job could occupy memory long time, which lead other user fail to get memory.
>
> basically pick up Christian idea:
>
> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
> 2. If we then run into this EBUSY condition in TTM check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
> 3. If we have a ticket we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
> 4. If getting the reservation lock with the ticket succeeded we check if the BO is still the first one on the LRU in question (the BO could have moved).
> 5. If the BO is still the first one on the LRU in question we try to evict it as we would evict any other BO.
> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>
> v2: fix some minor check
> v3: address Christian v2 comments.
> v4: fix some missing
> v5: handle first_bo unlock and bo_get/put
> v6: abstract unified iterate function, and handle all possible usecase not only pinned bo.
>
> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c | 113 ++++++++++++++++++++++++++++++-----
>   1 file changed, 97 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 8502b3ed2d88..bbf1d14d00a7 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>    * b. Otherwise, trylock it.
>    */
>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> -			struct ttm_operation_ctx *ctx, bool *locked)
> +			struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>   {
>   	bool ret = false;
>   
>   	*locked = false;
> +	if (busy)
> +		*busy = false;
>   	if (bo->resv == ctx->resv) {
>   		reservation_object_assert_held(bo->resv);
>   		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
> @@ -779,35 +781,45 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>   	} else {
>   		*locked = reservation_object_trylock(bo->resv);
>   		ret = *locked;
> +		if (!ret && busy)
> +			*busy = true;
>   	}
>   
>   	return ret;
>   }
>   
> -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> -			       uint32_t mem_type,
> -			       const struct ttm_place *place,
> -			       struct ttm_operation_ctx *ctx)
> +static struct ttm_buffer_object*
> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
> +			 struct ttm_mem_type_manager *man,
> +			 const struct ttm_place *place,
> +			 struct ttm_operation_ctx *ctx,
> +			 struct ttm_buffer_object **first_bo,
> +			 bool *locked)
>   {
> -	struct ttm_bo_global *glob = bdev->glob;
> -	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>   	struct ttm_buffer_object *bo = NULL;
> -	bool locked = false;
> -	unsigned i;
> -	int ret;
> +	int i;
>   
> -	spin_lock(&glob->lru_lock);
> +	if (first_bo)
> +		*first_bo = NULL;
>   	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>   		list_for_each_entry(bo, &man->lru[i], lru) {
> -			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
> +			bool busy = false;
> +			if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
> +							    &busy)) {

A newline between declaration and code please.

> +				if (first_bo && !(*first_bo) && busy) {
> +					ttm_bo_get(bo);
> +					*first_bo = bo;
> +				}
>   				continue;
> +			}
>   
>   			if (place && !bdev->driver->eviction_valuable(bo,
>   								      place)) {
> -				if (locked)
> +				if (*locked)
>   					reservation_object_unlock(bo->resv);
>   				continue;
>   			}
> +
>   			break;
>   		}
>   
> @@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>   		bo = NULL;
>   	}
>   
> +	return bo;
> +}
> +
> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> +			       uint32_t mem_type,
> +			       const struct ttm_place *place,
> +			       struct ttm_operation_ctx *ctx)
> +{
> +	struct ttm_bo_global *glob = bdev->glob;
> +	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> +	struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
> +	bool locked = false;
> +	int ret;
> +
> +	spin_lock(&glob->lru_lock);
> +	bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
> +				      &locked);
>   	if (!bo) {
> +		struct ttm_operation_ctx busy_ctx;
> +
>   		spin_unlock(&glob->lru_lock);
> -		return -EBUSY;
> +		/* check if other user occupy memory too long time */
> +		if (!first_bo || !ctx || !ctx->resv || !ctx->resv->lock.ctx) {
> +			if (first_bo)
> +				ttm_bo_put(first_bo);
> +			return -EBUSY;
> +		}
> +		if (first_bo->resv == ctx->resv) {
> +			ttm_bo_put(first_bo);
> +			return -EBUSY;
> +		}
> +		if (ctx->interruptible)
> +			ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
> +							  ctx->resv->lock.ctx);
> +		else
> +			ret = ww_mutex_lock(&first_bo->resv->lock, ctx->resv->lock.ctx);
> +		if (ret) {
> +			ttm_bo_put(first_bo);
> +			return ret;
> +		}
> +		spin_lock(&glob->lru_lock);
> +		/* previous busy resv lock is held by above, idle now,
> +		 * so let them evictable.
> +		 */
> +		busy_ctx.interruptible = ctx->interruptible;
> +		busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
> +		busy_ctx.resv	       = first_bo->resv;
> +		busy_ctx.flags	       = TTM_OPT_FLAG_ALLOW_RES_EVICT;
> +
> +		bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, NULL,
> +					      &locked);
> +		if (bo && (bo->resv == first_bo->resv))
> +			locked = true;
> +		else if (bo)
> +			ww_mutex_unlock(&first_bo->resv->lock);
> +		if (!bo) {
> +			spin_unlock(&glob->lru_lock);
> +			ttm_bo_put(first_bo);
> +			return -EBUSY;
> +		}
>   	}
>   
>   	kref_get(&bo->list_kref);
> @@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>   		ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>   					  ctx->no_wait_gpu, locked);
>   		kref_put(&bo->list_kref, ttm_bo_release_list);
> +		if (first_bo)
> +			ttm_bo_put(first_bo);
>   		return ret;
>   	}
>   
>   	ttm_bo_del_from_lru(bo);
>   	spin_unlock(&glob->lru_lock);
> +	if (first_bo)
> +		ttm_bo_put(first_bo);
>   
>   	ret = ttm_bo_evict(bo, ctx);
>   	if (locked) {
> @@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>   {
>   	struct ttm_bo_device *bdev = bo->bdev;
>   	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> +	struct ttm_operation_ctx native_ctx = {
> +		.interruptible = false,
> +		.no_wait_gpu = false,
> +		.resv = bo->resv,
> +		.flags = 0
> +	};
> +	struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;

I thought we made the ctx parameter mandatory, didn't we? Could be that 
I remember that incorrectly.

Christian.

>   	int ret;
>   
>   	do {
> @@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>   			return ret;
>   		if (mem->mm_node)
>   			break;
> -		ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
> +		ret = ttm_mem_evict_first(bdev, mem_type, place, evict_ctx);
>   		if (unlikely(ret != 0))
>   			return ret;
>   	} while (1);
> @@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>   	spin_lock(&glob->lru_lock);
>   	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>   		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> -			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
> +			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +							   NULL)) {
>   				ret = 0;
>   				break;
>   			}

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6
       [not found]   ` <f4b1ddf2-b80b-260e-54c9-b0e62ecbe90b-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-07 11:08     ` zhoucm1
  2019-05-07 11:13       ` Koenig, Christian
  0 siblings, 1 reply; 14+ messages in thread
From: zhoucm1 @ 2019-05-07 11:08 UTC (permalink / raw)
  To: Koenig, Christian, Zhou, David(ChunMing),
	Liang, Prike, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2019年05月07日 18:53, Koenig, Christian wrote:
> Am 07.05.19 um 11:36 schrieb Chunming Zhou:
>> heavy gpu job could occupy memory long time, which lead other user fail to get memory.
>>
>> basically pick up Christian idea:
>>
>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>> 2. If we then run into this EBUSY condition in TTM check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
>> 3. If we have a ticket we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
>> 4. If getting the reservation lock with the ticket succeeded we check if the BO is still the first one on the LRU in question (the BO could have moved).
>> 5. If the BO is still the first one on the LRU in question we try to evict it as we would evict any other BO.
>> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>>
>> v2: fix some minor check
>> v3: address Christian v2 comments.
>> v4: fix some missing
>> v5: handle first_bo unlock and bo_get/put
>> v6: abstract unified iterate function, and handle all possible usecase not only pinned bo.
>>
>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
>> ---
>>    drivers/gpu/drm/ttm/ttm_bo.c | 113 ++++++++++++++++++++++++++++++-----
>>    1 file changed, 97 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>> index 8502b3ed2d88..bbf1d14d00a7 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>     * b. Otherwise, trylock it.
>>     */
>>    static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>> -			struct ttm_operation_ctx *ctx, bool *locked)
>> +			struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>>    {
>>    	bool ret = false;
>>    
>>    	*locked = false;
>> +	if (busy)
>> +		*busy = false;
>>    	if (bo->resv == ctx->resv) {
>>    		reservation_object_assert_held(bo->resv);
>>    		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>> @@ -779,35 +781,45 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>>    	} else {
>>    		*locked = reservation_object_trylock(bo->resv);
>>    		ret = *locked;
>> +		if (!ret && busy)
>> +			*busy = true;
>>    	}
>>    
>>    	return ret;
>>    }
>>    
>> -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>> -			       uint32_t mem_type,
>> -			       const struct ttm_place *place,
>> -			       struct ttm_operation_ctx *ctx)
>> +static struct ttm_buffer_object*
>> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
>> +			 struct ttm_mem_type_manager *man,
>> +			 const struct ttm_place *place,
>> +			 struct ttm_operation_ctx *ctx,
>> +			 struct ttm_buffer_object **first_bo,
>> +			 bool *locked)
>>    {
>> -	struct ttm_bo_global *glob = bdev->glob;
>> -	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>    	struct ttm_buffer_object *bo = NULL;
>> -	bool locked = false;
>> -	unsigned i;
>> -	int ret;
>> +	int i;
>>    
>> -	spin_lock(&glob->lru_lock);
>> +	if (first_bo)
>> +		*first_bo = NULL;
>>    	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>    		list_for_each_entry(bo, &man->lru[i], lru) {
>> -			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>> +			bool busy = false;
>> +			if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
>> +							    &busy)) {
> A newline between declaration and code please.
>
>> +				if (first_bo && !(*first_bo) && busy) {
>> +					ttm_bo_get(bo);
>> +					*first_bo = bo;
>> +				}
>>    				continue;
>> +			}
>>    
>>    			if (place && !bdev->driver->eviction_valuable(bo,
>>    								      place)) {
>> -				if (locked)
>> +				if (*locked)
>>    					reservation_object_unlock(bo->resv);
>>    				continue;
>>    			}
>> +
>>    			break;
>>    		}
>>    
>> @@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>    		bo = NULL;
>>    	}
>>    
>> +	return bo;
>> +}
>> +
>> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>> +			       uint32_t mem_type,
>> +			       const struct ttm_place *place,
>> +			       struct ttm_operation_ctx *ctx)
>> +{
>> +	struct ttm_bo_global *glob = bdev->glob;
>> +	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>> +	struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>> +	bool locked = false;
>> +	int ret;
>> +
>> +	spin_lock(&glob->lru_lock);
>> +	bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
>> +				      &locked);
>>    	if (!bo) {
>> +		struct ttm_operation_ctx busy_ctx;
>> +
>>    		spin_unlock(&glob->lru_lock);
>> -		return -EBUSY;
>> +		/* check if other user occupy memory too long time */
>> +		if (!first_bo || !ctx || !ctx->resv || !ctx->resv->lock.ctx) {
>> +			if (first_bo)
>> +				ttm_bo_put(first_bo);
>> +			return -EBUSY;
>> +		}
>> +		if (first_bo->resv == ctx->resv) {
>> +			ttm_bo_put(first_bo);
>> +			return -EBUSY;
>> +		}
>> +		if (ctx->interruptible)
>> +			ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
>> +							  ctx->resv->lock.ctx);
>> +		else
>> +			ret = ww_mutex_lock(&first_bo->resv->lock, ctx->resv->lock.ctx);
>> +		if (ret) {
>> +			ttm_bo_put(first_bo);
>> +			return ret;
>> +		}
>> +		spin_lock(&glob->lru_lock);
>> +		/* previous busy resv lock is held by above, idle now,
>> +		 * so let them evictable.
>> +		 */
>> +		busy_ctx.interruptible = ctx->interruptible;
>> +		busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
>> +		busy_ctx.resv	       = first_bo->resv;
>> +		busy_ctx.flags	       = TTM_OPT_FLAG_ALLOW_RES_EVICT;
>> +
>> +		bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, NULL,
>> +					      &locked);
>> +		if (bo && (bo->resv == first_bo->resv))
>> +			locked = true;
>> +		else if (bo)
>> +			ww_mutex_unlock(&first_bo->resv->lock);
>> +		if (!bo) {
>> +			spin_unlock(&glob->lru_lock);
>> +			ttm_bo_put(first_bo);
>> +			return -EBUSY;
>> +		}
>>    	}
>>    
>>    	kref_get(&bo->list_kref);
>> @@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>    		ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>>    					  ctx->no_wait_gpu, locked);
>>    		kref_put(&bo->list_kref, ttm_bo_release_list);
>> +		if (first_bo)
>> +			ttm_bo_put(first_bo);
>>    		return ret;
>>    	}
>>    
>>    	ttm_bo_del_from_lru(bo);
>>    	spin_unlock(&glob->lru_lock);
>> +	if (first_bo)
>> +		ttm_bo_put(first_bo);
>>    
>>    	ret = ttm_bo_evict(bo, ctx);
>>    	if (locked) {
>> @@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>>    {
>>    	struct ttm_bo_device *bdev = bo->bdev;
>>    	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>> +	struct ttm_operation_ctx native_ctx = {
>> +		.interruptible = false,
>> +		.no_wait_gpu = false,
>> +		.resv = bo->resv,
>> +		.flags = 0
>> +	};
>> +	struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
> I thought we made the ctx parameter mandatory, didn't we? Could be that
> I remember that incorrectly.
Prike said he saw that ctx->resv is NULL; in that case, the code doesn't run into 
the busy path.
Oh, as you mentioned here, we would need to add .resv = bo->resv for every 
ttm_operation_ctx. That's a huge change which would cut across all vendor drivers.

Can we instead just force-assign evict_ctx->resv = bo->resv? That means we 
would only add one extra line: evict_ctx->resv = bo->resv. How about that?

-David
>
> Christian.
>
>>    	int ret;
>>    
>>    	do {
>> @@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>>    			return ret;
>>    		if (mem->mm_node)
>>    			break;
>> -		ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
>> +		ret = ttm_mem_evict_first(bdev, mem_type, place, evict_ctx);
>>    		if (unlikely(ret != 0))
>>    			return ret;
>>    	} while (1);
>> @@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>>    	spin_lock(&glob->lru_lock);
>>    	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>    		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> -			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
>> +			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>> +							   NULL)) {
>>    				ret = 0;
>>    				break;
>>    			}

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6
  2019-05-07 11:08     ` zhoucm1
@ 2019-05-07 11:13       ` Koenig, Christian
  2019-05-07 11:22         ` zhoucm1
  0 siblings, 1 reply; 14+ messages in thread
From: Koenig, Christian @ 2019-05-07 11:13 UTC (permalink / raw)
  To: Zhou, David(ChunMing)

Am 07.05.19 um 13:08 schrieb zhoucm1:
>
>
> On 2019年05月07日 18:53, Koenig, Christian wrote:
>> Am 07.05.19 um 11:36 schrieb Chunming Zhou:
>>> heavy gpu job could occupy memory long time, which lead other user 
>>> fail to get memory.
>>>
>>> basically pick up Christian idea:
>>>
>>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>>> 2. If we then run into this EBUSY condition in TTM check if the BO 
>>> we need memory for (or rather the ww_mutex of its reservation 
>>> object) has a ticket assigned.
>>> 3. If we have a ticket we grab a reference to the first BO on the 
>>> LRU, drop the LRU lock and try to grab the reservation lock with the 
>>> ticket.
>>> 4. If getting the reservation lock with the ticket succeeded we 
>>> check if the BO is still the first one on the LRU in question (the 
>>> BO could have moved).
>>> 5. If the BO is still the first one on the LRU in question we try to 
>>> evict it as we would evict any other BO.
>>> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>>>
>>> v2: fix some minor check
>>> v3: address Christian v2 comments.
>>> v4: fix some missing
>>> v5: handle first_bo unlock and bo_get/put
>>> v6: abstract unified iterate function, and handle all possible 
>>> usecase not only pinned bo.
>>>
>>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>>> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
>>> ---
>>>    drivers/gpu/drm/ttm/ttm_bo.c | 113 
>>> ++++++++++++++++++++++++++++++-----
>>>    1 file changed, 97 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 
>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>> index 8502b3ed2d88..bbf1d14d00a7 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>     * b. Otherwise, trylock it.
>>>     */
>>>    static bool ttm_bo_evict_swapout_allowable(struct 
>>> ttm_buffer_object *bo,
>>> -            struct ttm_operation_ctx *ctx, bool *locked)
>>> +            struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>>>    {
>>>        bool ret = false;
>>>           *locked = false;
>>> +    if (busy)
>>> +        *busy = false;
>>>        if (bo->resv == ctx->resv) {
>>>            reservation_object_assert_held(bo->resv);
>>>            if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>> @@ -779,35 +781,45 @@ static bool 
>>> ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>>>        } else {
>>>            *locked = reservation_object_trylock(bo->resv);
>>>            ret = *locked;
>>> +        if (!ret && busy)
>>> +            *busy = true;
>>>        }
>>>           return ret;
>>>    }
>>>    -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>> -                   uint32_t mem_type,
>>> -                   const struct ttm_place *place,
>>> -                   struct ttm_operation_ctx *ctx)
>>> +static struct ttm_buffer_object*
>>> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
>>> +             struct ttm_mem_type_manager *man,
>>> +             const struct ttm_place *place,
>>> +             struct ttm_operation_ctx *ctx,
>>> +             struct ttm_buffer_object **first_bo,
>>> +             bool *locked)
>>>    {
>>> -    struct ttm_bo_global *glob = bdev->glob;
>>> -    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>        struct ttm_buffer_object *bo = NULL;
>>> -    bool locked = false;
>>> -    unsigned i;
>>> -    int ret;
>>> +    int i;
>>>    -    spin_lock(&glob->lru_lock);
>>> +    if (first_bo)
>>> +        *first_bo = NULL;
>>>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>            list_for_each_entry(bo, &man->lru[i], lru) {
>>> -            if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>>> +            bool busy = false;
>>> +            if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
>>> +                                &busy)) {
>> A newline between declaration and code please.
>>
>>> +                if (first_bo && !(*first_bo) && busy) {
>>> +                    ttm_bo_get(bo);
>>> +                    *first_bo = bo;
>>> +                }
>>>                    continue;
>>> +            }
>>>                   if (place && !bdev->driver->eviction_valuable(bo,
>>>                                          place)) {
>>> -                if (locked)
>>> +                if (*locked)
>>>                        reservation_object_unlock(bo->resv);
>>>                    continue;
>>>                }
>>> +
>>>                break;
>>>            }
>>>    @@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct 
>>> ttm_bo_device *bdev,
>>>            bo = NULL;
>>>        }
>>>    +    return bo;
>>> +}
>>> +
>>> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>> +                   uint32_t mem_type,
>>> +                   const struct ttm_place *place,
>>> +                   struct ttm_operation_ctx *ctx)
>>> +{
>>> +    struct ttm_bo_global *glob = bdev->glob;
>>> +    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>> +    struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>>> +    bool locked = false;
>>> +    int ret;
>>> +
>>> +    spin_lock(&glob->lru_lock);
>>> +    bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
>>> +                      &locked);
>>>        if (!bo) {
>>> +        struct ttm_operation_ctx busy_ctx;
>>> +
>>>            spin_unlock(&glob->lru_lock);
>>> -        return -EBUSY;
>>> +        /* check if other user occupy memory too long time */
>>> +        if (!first_bo || !ctx || !ctx->resv || !ctx->resv->lock.ctx) {
>>> +            if (first_bo)
>>> +                ttm_bo_put(first_bo);
>>> +            return -EBUSY;
>>> +        }
>>> +        if (first_bo->resv == ctx->resv) {
>>> +            ttm_bo_put(first_bo);
>>> +            return -EBUSY;
>>> +        }
>>> +        if (ctx->interruptible)
>>> +            ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
>>> +                              ctx->resv->lock.ctx);
>>> +        else
>>> +            ret = ww_mutex_lock(&first_bo->resv->lock, 
>>> ctx->resv->lock.ctx);
>>> +        if (ret) {
>>> +            ttm_bo_put(first_bo);
>>> +            return ret;
>>> +        }
>>> +        spin_lock(&glob->lru_lock);
>>> +        /* previous busy resv lock is held by above, idle now,
>>> +         * so let them evictable.
>>> +         */
>>> +        busy_ctx.interruptible = ctx->interruptible;
>>> +        busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
>>> +        busy_ctx.resv           = first_bo->resv;
>>> +        busy_ctx.flags           = TTM_OPT_FLAG_ALLOW_RES_EVICT;
>>> +
>>> +        bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, 
>>> NULL,
>>> +                          &locked);
>>> +        if (bo && (bo->resv == first_bo->resv))
>>> +            locked = true;
>>> +        else if (bo)
>>> +            ww_mutex_unlock(&first_bo->resv->lock);
>>> +        if (!bo) {
>>> +            spin_unlock(&glob->lru_lock);
>>> +            ttm_bo_put(first_bo);
>>> +            return -EBUSY;
>>> +        }
>>>        }
>>>           kref_get(&bo->list_kref);
>>> @@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct 
>>> ttm_bo_device *bdev,
>>>            ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>>>                          ctx->no_wait_gpu, locked);
>>>            kref_put(&bo->list_kref, ttm_bo_release_list);
>>> +        if (first_bo)
>>> +            ttm_bo_put(first_bo);
>>>            return ret;
>>>        }
>>>           ttm_bo_del_from_lru(bo);
>>>        spin_unlock(&glob->lru_lock);
>>> +    if (first_bo)
>>> +        ttm_bo_put(first_bo);
>>>           ret = ttm_bo_evict(bo, ctx);
>>>        if (locked) {
>>> @@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct 
>>> ttm_buffer_object *bo,
>>>    {
>>>        struct ttm_bo_device *bdev = bo->bdev;
>>>        struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>> +    struct ttm_operation_ctx native_ctx = {
>>> +        .interruptible = false,
>>> +        .no_wait_gpu = false,
>>> +        .resv = bo->resv,
>>> +        .flags = 0
>>> +    };
>>> +    struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
>> I thought we made the ctx parameter mandatory, didn't we? Could be that
>> I remember that incorrectly.
> Prike said he see ctx->resv is null, in that case, code doesn't run 
> into busy path.
> Oh, as you mentioned here, we need add .resv=bo->resv for every 
> ttm_operation_ctx. That's a huge change which will cross all vendor 
> drivers.
>
> Can we just force to evaluate evict_ctx->resv = bo->resv? That means 
> we just add one extra line: evict_ctx->resv = bo->resv. How about that?

Well only if ctx->resv is NULL, otherwise we would overwrite some 
reservation context given by the driver.

Probably better to pass the acquire_ctx as a separate parameter to 
ttm_mem_evict_first().

Christian.

>
> -David
>>
>> Christian.
>>
>>>        int ret;
>>>           do {
>>> @@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct 
>>> ttm_buffer_object *bo,
>>>                return ret;
>>>            if (mem->mm_node)
>>>                break;
>>> -        ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
>>> +        ret = ttm_mem_evict_first(bdev, mem_type, place, evict_ctx);
>>>            if (unlikely(ret != 0))
>>>                return ret;
>>>        } while (1);
>>> @@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, 
>>> struct ttm_operation_ctx *ctx)
>>>        spin_lock(&glob->lru_lock);
>>>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>            list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>> -            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
>>> +            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>>> +                               NULL)) {
>>>                    ret = 0;
>>>                    break;
>>>                }
>

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6
  2019-05-07 11:13       ` Koenig, Christian
@ 2019-05-07 11:22         ` zhoucm1
  2019-05-07 11:24           ` Christian König
  0 siblings, 1 reply; 14+ messages in thread
From: zhoucm1 @ 2019-05-07 11:22 UTC (permalink / raw)
  To: Koenig, Christian, Zhou, David(ChunMing),
	Liang, Prike, dri-devel, amd-gfx



On 2019年05月07日 19:13, Koenig, Christian wrote:
> Am 07.05.19 um 13:08 schrieb zhoucm1:
>>
>> On 2019年05月07日 18:53, Koenig, Christian wrote:
>>> Am 07.05.19 um 11:36 schrieb Chunming Zhou:
>>>> heavy gpu job could occupy memory long time, which lead other user
>>>> fail to get memory.
>>>>
>>>> basically pick up Christian idea:
>>>>
>>>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>>>> 2. If we then run into this EBUSY condition in TTM check if the BO
>>>> we need memory for (or rather the ww_mutex of its reservation
>>>> object) has a ticket assigned.
>>>> 3. If we have a ticket we grab a reference to the first BO on the
>>>> LRU, drop the LRU lock and try to grab the reservation lock with the
>>>> ticket.
>>>> 4. If getting the reservation lock with the ticket succeeded we
>>>> check if the BO is still the first one on the LRU in question (the
>>>> BO could have moved).
>>>> 5. If the BO is still the first one on the LRU in question we try to
>>>> evict it as we would evict any other BO.
>>>> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>>>>
>>>> v2: fix some minor check
>>>> v3: address Christian v2 comments.
>>>> v4: fix some missing
>>>> v5: handle first_bo unlock and bo_get/put
>>>> v6: abstract unified iterate function, and handle all possible
>>>> usecase not only pinned bo.
>>>>
>>>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>>>> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
>>>> ---
>>>>     drivers/gpu/drm/ttm/ttm_bo.c | 113
>>>> ++++++++++++++++++++++++++++++-----
>>>>     1 file changed, 97 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>> index 8502b3ed2d88..bbf1d14d00a7 100644
>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>>      * b. Otherwise, trylock it.
>>>>      */
>>>>     static bool ttm_bo_evict_swapout_allowable(struct
>>>> ttm_buffer_object *bo,
>>>> -            struct ttm_operation_ctx *ctx, bool *locked)
>>>> +            struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>>>>     {
>>>>         bool ret = false;
>>>>            *locked = false;
>>>> +    if (busy)
>>>> +        *busy = false;
>>>>         if (bo->resv == ctx->resv) {
>>>>             reservation_object_assert_held(bo->resv);
>>>>             if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>>> @@ -779,35 +781,45 @@ static bool
>>>> ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>>>>         } else {
>>>>             *locked = reservation_object_trylock(bo->resv);
>>>>             ret = *locked;
>>>> +        if (!ret && busy)
>>>> +            *busy = true;
>>>>         }
>>>>            return ret;
>>>>     }
>>>>     -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>> -                   uint32_t mem_type,
>>>> -                   const struct ttm_place *place,
>>>> -                   struct ttm_operation_ctx *ctx)
>>>> +static struct ttm_buffer_object*
>>>> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
>>>> +             struct ttm_mem_type_manager *man,
>>>> +             const struct ttm_place *place,
>>>> +             struct ttm_operation_ctx *ctx,
>>>> +             struct ttm_buffer_object **first_bo,
>>>> +             bool *locked)
>>>>     {
>>>> -    struct ttm_bo_global *glob = bdev->glob;
>>>> -    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>         struct ttm_buffer_object *bo = NULL;
>>>> -    bool locked = false;
>>>> -    unsigned i;
>>>> -    int ret;
>>>> +    int i;
>>>>     -    spin_lock(&glob->lru_lock);
>>>> +    if (first_bo)
>>>> +        *first_bo = NULL;
>>>>         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>             list_for_each_entry(bo, &man->lru[i], lru) {
>>>> -            if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>>>> +            bool busy = false;
>>>> +            if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
>>>> +                                &busy)) {
>>> A newline between declaration and code please.
>>>
>>>> +                if (first_bo && !(*first_bo) && busy) {
>>>> +                    ttm_bo_get(bo);
>>>> +                    *first_bo = bo;
>>>> +                }
>>>>                     continue;
>>>> +            }
>>>>                    if (place && !bdev->driver->eviction_valuable(bo,
>>>>                                           place)) {
>>>> -                if (locked)
>>>> +                if (*locked)
>>>>                         reservation_object_unlock(bo->resv);
>>>>                     continue;
>>>>                 }
>>>> +
>>>>                 break;
>>>>             }
>>>>     @@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct
>>>> ttm_bo_device *bdev,
>>>>             bo = NULL;
>>>>         }
>>>>     +    return bo;
>>>> +}
>>>> +
>>>> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>> +                   uint32_t mem_type,
>>>> +                   const struct ttm_place *place,
>>>> +                   struct ttm_operation_ctx *ctx)
>>>> +{
>>>> +    struct ttm_bo_global *glob = bdev->glob;
>>>> +    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>> +    struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>>>> +    bool locked = false;
>>>> +    int ret;
>>>> +
>>>> +    spin_lock(&glob->lru_lock);
>>>> +    bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
>>>> +                      &locked);
>>>>         if (!bo) {
>>>> +        struct ttm_operation_ctx busy_ctx;
>>>> +
>>>>             spin_unlock(&glob->lru_lock);
>>>> -        return -EBUSY;
>>>> +        /* check if other user occupy memory too long time */
>>>> +        if (!first_bo || !ctx || !ctx->resv || !ctx->resv->lock.ctx) {
>>>> +            if (first_bo)
>>>> +                ttm_bo_put(first_bo);
>>>> +            return -EBUSY;
>>>> +        }
>>>> +        if (first_bo->resv == ctx->resv) {
>>>> +            ttm_bo_put(first_bo);
>>>> +            return -EBUSY;
>>>> +        }
>>>> +        if (ctx->interruptible)
>>>> +            ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
>>>> +                              ctx->resv->lock.ctx);
>>>> +        else
>>>> +            ret = ww_mutex_lock(&first_bo->resv->lock,
>>>> ctx->resv->lock.ctx);
>>>> +        if (ret) {
>>>> +            ttm_bo_put(first_bo);
>>>> +            return ret;
>>>> +        }
>>>> +        spin_lock(&glob->lru_lock);
>>>> +        /* previous busy resv lock is held by above, idle now,
>>>> +         * so let them evictable.
>>>> +         */
>>>> +        busy_ctx.interruptible = ctx->interruptible;
>>>> +        busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
>>>> +        busy_ctx.resv           = first_bo->resv;
>>>> +        busy_ctx.flags           = TTM_OPT_FLAG_ALLOW_RES_EVICT;
>>>> +
>>>> +        bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx,
>>>> NULL,
>>>> +                          &locked);
>>>> +        if (bo && (bo->resv == first_bo->resv))
>>>> +            locked = true;
>>>> +        else if (bo)
>>>> +            ww_mutex_unlock(&first_bo->resv->lock);
>>>> +        if (!bo) {
>>>> +            spin_unlock(&glob->lru_lock);
>>>> +            ttm_bo_put(first_bo);
>>>> +            return -EBUSY;
>>>> +        }
>>>>         }
>>>>            kref_get(&bo->list_kref);
>>>> @@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct
>>>> ttm_bo_device *bdev,
>>>>             ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>>>>                           ctx->no_wait_gpu, locked);
>>>>             kref_put(&bo->list_kref, ttm_bo_release_list);
>>>> +        if (first_bo)
>>>> +            ttm_bo_put(first_bo);
>>>>             return ret;
>>>>         }
>>>>            ttm_bo_del_from_lru(bo);
>>>>         spin_unlock(&glob->lru_lock);
>>>> +    if (first_bo)
>>>> +        ttm_bo_put(first_bo);
>>>>            ret = ttm_bo_evict(bo, ctx);
>>>>         if (locked) {
>>>> @@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct
>>>> ttm_buffer_object *bo,
>>>>     {
>>>>         struct ttm_bo_device *bdev = bo->bdev;
>>>>         struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>> +    struct ttm_operation_ctx native_ctx = {
>>>> +        .interruptible = false,
>>>> +        .no_wait_gpu = false,
>>>> +        .resv = bo->resv,
>>>> +        .flags = 0
>>>> +    };
>>>> +    struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
>>> I thought we made the ctx parameter mandatory, didn't we? Could be that
>>> I remember that incorrectly.
>> Prike said he see ctx->resv is null, in that case, code doesn't run
>> into busy path.
>> Oh, as you mentioned here, we need add .resv=bo->resv for every
>> ttm_operation_ctx. That's a huge change which will cross all vendor
>> drivers.
>>
>> Can we just force to evaluate evict_ctx->resv = bo->resv? That means
>> we just add one extra line: evict_ctx->resv = bo->resv. How about that?
> Well only if ctx->resv is NULL, otherwise we would overwrite some
> reservation context given by the driver.
>
> Probably better to give the acquir_ctx as separate parameter to
> ttm_mem_evict_first().
Do you mean we should still put the acquire_ctx into ttm_operation_ctx? Then that's the same as ctx->resv.
The current problem is that we don't pass resv anywhere except in the ALLOW_EVICT case.
If you are concerned about it being overwritten, we would have to add ".resv = bo->resv" to 
every ttm_operation_ctx definition.

-David
>
> Christian.
>
>> -David
>>> Christian.
>>>
>>>>         int ret;
>>>>            do {
>>>> @@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct
>>>> ttm_buffer_object *bo,
>>>>                 return ret;
>>>>             if (mem->mm_node)
>>>>                 break;
>>>> -        ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
>>>> +        ret = ttm_mem_evict_first(bdev, mem_type, place, evict_ctx);
>>>>             if (unlikely(ret != 0))
>>>>                 return ret;
>>>>         } while (1);
>>>> @@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
>>>> struct ttm_operation_ctx *ctx)
>>>>         spin_lock(&glob->lru_lock);
>>>>         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>             list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>>> -            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
>>>> +            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>>>> +                               NULL)) {
>>>>                     ret = 0;
>>>>                     break;
>>>>                 }

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6
  2019-05-07 11:22         ` zhoucm1
@ 2019-05-07 11:24           ` Christian König
       [not found]             ` <968487eb-f78e-9922-a073-8ed08111e307-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2019-05-07 11:24 UTC (permalink / raw)
  To: zhoucm1, Koenig, Christian, Zhou, David(ChunMing),
	Liang, Prike, dri-devel, amd-gfx

Am 07.05.19 um 13:22 schrieb zhoucm1:
>
>
> On 2019年05月07日 19:13, Koenig, Christian wrote:
>> Am 07.05.19 um 13:08 schrieb zhoucm1:
>>>
>>> On 2019年05月07日 18:53, Koenig, Christian wrote:
>>>> Am 07.05.19 um 11:36 schrieb Chunming Zhou:
>>>>> heavy gpu job could occupy memory long time, which lead other user
>>>>> fail to get memory.
>>>>>
>>>>> basically pick up Christian idea:
>>>>>
>>>>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>>>>> 2. If we then run into this EBUSY condition in TTM check if the BO
>>>>> we need memory for (or rather the ww_mutex of its reservation
>>>>> object) has a ticket assigned.
>>>>> 3. If we have a ticket we grab a reference to the first BO on the
>>>>> LRU, drop the LRU lock and try to grab the reservation lock with the
>>>>> ticket.
>>>>> 4. If getting the reservation lock with the ticket succeeded we
>>>>> check if the BO is still the first one on the LRU in question (the
>>>>> BO could have moved).
>>>>> 5. If the BO is still the first one on the LRU in question we try to
>>>>> evict it as we would evict any other BO.
>>>>> 6. If any of the "If's" above fail we just back off and return 
>>>>> -EBUSY.
>>>>>
>>>>> v2: fix some minor check
>>>>> v3: address Christian v2 comments.
>>>>> v4: fix some missing
>>>>> v5: handle first_bo unlock and bo_get/put
>>>>> v6: abstract unified iterate function, and handle all possible
>>>>> usecase not only pinned bo.
>>>>>
>>>>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>>>>> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/ttm/ttm_bo.c | 113
>>>>> ++++++++++++++++++++++++++++++-----
>>>>>     1 file changed, 97 insertions(+), 16 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>> index 8502b3ed2d88..bbf1d14d00a7 100644
>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>>>      * b. Otherwise, trylock it.
>>>>>      */
>>>>>     static bool ttm_bo_evict_swapout_allowable(struct
>>>>> ttm_buffer_object *bo,
>>>>> -            struct ttm_operation_ctx *ctx, bool *locked)
>>>>> +            struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>>>>>     {
>>>>>         bool ret = false;
>>>>>            *locked = false;
>>>>> +    if (busy)
>>>>> +        *busy = false;
>>>>>         if (bo->resv == ctx->resv) {
>>>>>             reservation_object_assert_held(bo->resv);
>>>>>             if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>>>> @@ -779,35 +781,45 @@ static bool
>>>>> ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>>>>>         } else {
>>>>>             *locked = reservation_object_trylock(bo->resv);
>>>>>             ret = *locked;
>>>>> +        if (!ret && busy)
>>>>> +            *busy = true;
>>>>>         }
>>>>>            return ret;
>>>>>     }
>>>>>     -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>>> -                   uint32_t mem_type,
>>>>> -                   const struct ttm_place *place,
>>>>> -                   struct ttm_operation_ctx *ctx)
>>>>> +static struct ttm_buffer_object*
>>>>> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
>>>>> +             struct ttm_mem_type_manager *man,
>>>>> +             const struct ttm_place *place,
>>>>> +             struct ttm_operation_ctx *ctx,
>>>>> +             struct ttm_buffer_object **first_bo,
>>>>> +             bool *locked)
>>>>>     {
>>>>> -    struct ttm_bo_global *glob = bdev->glob;
>>>>> -    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>         struct ttm_buffer_object *bo = NULL;
>>>>> -    bool locked = false;
>>>>> -    unsigned i;
>>>>> -    int ret;
>>>>> +    int i;
>>>>>     -    spin_lock(&glob->lru_lock);
>>>>> +    if (first_bo)
>>>>> +        *first_bo = NULL;
>>>>>         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>>             list_for_each_entry(bo, &man->lru[i], lru) {
>>>>> -            if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>>>>> +            bool busy = false;
>>>>> +            if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
>>>>> +                                &busy)) {
>>>> A newline between declaration and code please.
>>>>
>>>>> +                if (first_bo && !(*first_bo) && busy) {
>>>>> +                    ttm_bo_get(bo);
>>>>> +                    *first_bo = bo;
>>>>> +                }
>>>>>                     continue;
>>>>> +            }
>>>>>                    if (place && !bdev->driver->eviction_valuable(bo,
>>>>>                                           place)) {
>>>>> -                if (locked)
>>>>> +                if (*locked)
>>>>> reservation_object_unlock(bo->resv);
>>>>>                     continue;
>>>>>                 }
>>>>> +
>>>>>                 break;
>>>>>             }
>>>>>     @@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct
>>>>> ttm_bo_device *bdev,
>>>>>             bo = NULL;
>>>>>         }
>>>>>     +    return bo;
>>>>> +}
>>>>> +
>>>>> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>>> +                   uint32_t mem_type,
>>>>> +                   const struct ttm_place *place,
>>>>> +                   struct ttm_operation_ctx *ctx)
>>>>> +{
>>>>> +    struct ttm_bo_global *glob = bdev->glob;
>>>>> +    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>> +    struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>>>>> +    bool locked = false;
>>>>> +    int ret;
>>>>> +
>>>>> +    spin_lock(&glob->lru_lock);
>>>>> +    bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
>>>>> +                      &locked);
>>>>>         if (!bo) {
>>>>> +        struct ttm_operation_ctx busy_ctx;
>>>>> +
>>>>>             spin_unlock(&glob->lru_lock);
>>>>> -        return -EBUSY;
>>>>> +        /* check if other user occupy memory too long time */
>>>>> +        if (!first_bo || !ctx || !ctx->resv || 
>>>>> !ctx->resv->lock.ctx) {
>>>>> +            if (first_bo)
>>>>> +                ttm_bo_put(first_bo);
>>>>> +            return -EBUSY;
>>>>> +        }
>>>>> +        if (first_bo->resv == ctx->resv) {
>>>>> +            ttm_bo_put(first_bo);
>>>>> +            return -EBUSY;
>>>>> +        }
>>>>> +        if (ctx->interruptible)
>>>>> +            ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
>>>>> +                              ctx->resv->lock.ctx);
>>>>> +        else
>>>>> +            ret = ww_mutex_lock(&first_bo->resv->lock,
>>>>> ctx->resv->lock.ctx);
>>>>> +        if (ret) {
>>>>> +            ttm_bo_put(first_bo);
>>>>> +            return ret;
>>>>> +        }
>>>>> +        spin_lock(&glob->lru_lock);
>>>>> +        /* previous busy resv lock is held by above, idle now,
>>>>> +         * so let them evictable.
>>>>> +         */
>>>>> +        busy_ctx.interruptible = ctx->interruptible;
>>>>> +        busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
>>>>> +        busy_ctx.resv           = first_bo->resv;
>>>>> +        busy_ctx.flags           = TTM_OPT_FLAG_ALLOW_RES_EVICT;
>>>>> +
>>>>> +        bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx,
>>>>> NULL,
>>>>> +                          &locked);
>>>>> +        if (bo && (bo->resv == first_bo->resv))
>>>>> +            locked = true;
>>>>> +        else if (bo)
>>>>> + ww_mutex_unlock(&first_bo->resv->lock);
>>>>> +        if (!bo) {
>>>>> +            spin_unlock(&glob->lru_lock);
>>>>> +            ttm_bo_put(first_bo);
>>>>> +            return -EBUSY;
>>>>> +        }
>>>>>         }
>>>>>            kref_get(&bo->list_kref);
>>>>> @@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct
>>>>> ttm_bo_device *bdev,
>>>>>             ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>>>>>                           ctx->no_wait_gpu, locked);
>>>>>             kref_put(&bo->list_kref, ttm_bo_release_list);
>>>>> +        if (first_bo)
>>>>> +            ttm_bo_put(first_bo);
>>>>>             return ret;
>>>>>         }
>>>>>            ttm_bo_del_from_lru(bo);
>>>>>         spin_unlock(&glob->lru_lock);
>>>>> +    if (first_bo)
>>>>> +        ttm_bo_put(first_bo);
>>>>>            ret = ttm_bo_evict(bo, ctx);
>>>>>         if (locked) {
>>>>> @@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct
>>>>> ttm_buffer_object *bo,
>>>>>     {
>>>>>         struct ttm_bo_device *bdev = bo->bdev;
>>>>>         struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>> +    struct ttm_operation_ctx native_ctx = {
>>>>> +        .interruptible = false,
>>>>> +        .no_wait_gpu = false,
>>>>> +        .resv = bo->resv,
>>>>> +        .flags = 0
>>>>> +    };
>>>>> +    struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
>>>> I thought we made the ctx parameter mandatory, didn't we? Could be 
>>>> that
>>>> I remember that incorrectly.
>>> Prike said he see ctx->resv is null, in that case, code doesn't run
>>> into busy path.
>>> Oh, as you mentioned here, we need add .resv=bo->resv for every
>>> ttm_operation_ctx. That's a huge change which will cross all vendor
>>> drivers.
>>>
>>> Can we just force to evaluate evict_ctx->resv = bo->resv? That means
>>> we just add one extra line: evict_ctx->resv = bo->resv. How about that?
>> Well only if ctx->resv is NULL, otherwise we would overwrite some
>> reservation context given by the driver.
>>
>> Probably better to give the acquir_ctx as separate parameter to
>> ttm_mem_evict_first().
> still put acquire_ctx into ttm_operation_ctx? Then that's same ctx->resv.
> Current problem is we don't pass resv anywhere except ALLOW_EVICT case.
> If you have concern for overwritten, we have to do ".resv = bo->resv" 
> in every ttm_operation_ctx definitions.

No, what I mean is to add the acquire_ctx as a separate parameter to 
ttm_mem_evict_first().

We only need it in this function, and it is actually not related to 
the ttm_operation_ctx filled in by the driver.

Christian.

>
> -David
>>
>> Christian.
>>
>>> -David
>>>> Christian.
>>>>
>>>>>         int ret;
>>>>>            do {
>>>>> @@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct
>>>>> ttm_buffer_object *bo,
>>>>>                 return ret;
>>>>>             if (mem->mm_node)
>>>>>                 break;
>>>>> -        ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
>>>>> +        ret = ttm_mem_evict_first(bdev, mem_type, place, evict_ctx);
>>>>>             if (unlikely(ret != 0))
>>>>>                 return ret;
>>>>>         } while (1);
>>>>> @@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
>>>>> struct ttm_operation_ctx *ctx)
>>>>>         spin_lock(&glob->lru_lock);
>>>>>         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>>             list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>>>> -            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
>>>>> +            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>>>>> +                               NULL)) {
>>>>>                     ret = 0;
>>>>>                     break;
>>>>>                 }
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6
       [not found]             ` <968487eb-f78e-9922-a073-8ed08111e307-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2019-05-07 11:37               ` Thomas Hellstrom
       [not found]                 ` <93fbb994-d305-dfc4-f8e5-502647d7386f-4+hqylr40dJg9hUCZPvPmw@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Thomas Hellstrom @ 2019-05-07 11:37 UTC (permalink / raw)
  To: christian.koenig-5C7GfCeVMHo, zhoucm1, Zhou, David(ChunMing),
	Liang, Prike, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 5/7/19 1:24 PM, Christian König wrote:
> Am 07.05.19 um 13:22 schrieb zhoucm1:
>>
>>
>> On 2019年05月07日 19:13, Koenig, Christian wrote:
>>> Am 07.05.19 um 13:08 schrieb zhoucm1:
>>>>
>>>> On 2019年05月07日 18:53, Koenig, Christian wrote:
>>>>> Am 07.05.19 um 11:36 schrieb Chunming Zhou:
>>>>>> heavy gpu job could occupy memory long time, which lead other user
>>>>>> fail to get memory.
>>>>>>
>>>>>> basically pick up Christian idea:
>>>>>>
>>>>>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>>>>>> 2. If we then run into this EBUSY condition in TTM check if the BO
>>>>>> we need memory for (or rather the ww_mutex of its reservation
>>>>>> object) has a ticket assigned.
>>>>>> 3. If we have a ticket we grab a reference to the first BO on the
>>>>>> LRU, drop the LRU lock and try to grab the reservation lock with the
>>>>>> ticket.
>>>>>> 4. If getting the reservation lock with the ticket succeeded we
>>>>>> check if the BO is still the first one on the LRU in question (the
>>>>>> BO could have moved).
>>>>>> 5. If the BO is still the first one on the LRU in question we try to
>>>>>> evict it as we would evict any other BO.
>>>>>> 6. If any of the "If's" above fail we just back off and return 
>>>>>> -EBUSY.
>>>>>>
>>>>>> v2: fix some minor check
>>>>>> v3: address Christian v2 comments.
>>>>>> v4: fix some missing
>>>>>> v5: handle first_bo unlock and bo_get/put
>>>>>> v6: abstract unified iterate function, and handle all possible
>>>>>> usecase not only pinned bo.
>>>>>>
>>>>>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>>>>>> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
>>>>>> ---
>>>>>>     drivers/gpu/drm/ttm/ttm_bo.c | 113
>>>>>> ++++++++++++++++++++++++++++++-----
>>>>>>     1 file changed, 97 insertions(+), 16 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>> index 8502b3ed2d88..bbf1d14d00a7 100644
>>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>>>>      * b. Otherwise, trylock it.
>>>>>>      */
>>>>>>     static bool ttm_bo_evict_swapout_allowable(struct
>>>>>> ttm_buffer_object *bo,
>>>>>> -            struct ttm_operation_ctx *ctx, bool *locked)
>>>>>> +            struct ttm_operation_ctx *ctx, bool *locked, bool 
>>>>>> *busy)
>>>>>>     {
>>>>>>         bool ret = false;
>>>>>>            *locked = false;
>>>>>> +    if (busy)
>>>>>> +        *busy = false;
>>>>>>         if (bo->resv == ctx->resv) {
>>>>>>             reservation_object_assert_held(bo->resv);
>>>>>>             if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>>>>> @@ -779,35 +781,45 @@ static bool
>>>>>> ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>>>>>>         } else {
>>>>>>             *locked = reservation_object_trylock(bo->resv);
>>>>>>             ret = *locked;
>>>>>> +        if (!ret && busy)
>>>>>> +            *busy = true;
>>>>>>         }
>>>>>>            return ret;
>>>>>>     }
>>>>>>     -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>>>> -                   uint32_t mem_type,
>>>>>> -                   const struct ttm_place *place,
>>>>>> -                   struct ttm_operation_ctx *ctx)
>>>>>> +static struct ttm_buffer_object*
>>>>>> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
>>>>>> +             struct ttm_mem_type_manager *man,
>>>>>> +             const struct ttm_place *place,
>>>>>> +             struct ttm_operation_ctx *ctx,
>>>>>> +             struct ttm_buffer_object **first_bo,
>>>>>> +             bool *locked)
>>>>>>     {
>>>>>> -    struct ttm_bo_global *glob = bdev->glob;
>>>>>> -    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>>         struct ttm_buffer_object *bo = NULL;
>>>>>> -    bool locked = false;
>>>>>> -    unsigned i;
>>>>>> -    int ret;
>>>>>> +    int i;
>>>>>>     -    spin_lock(&glob->lru_lock);
>>>>>> +    if (first_bo)
>>>>>> +        *first_bo = NULL;
>>>>>>         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>>>             list_for_each_entry(bo, &man->lru[i], lru) {
>>>>>> -            if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>>>>>> +            bool busy = false;
>>>>>> +            if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
>>>>>> +                                &busy)) {
>>>>> A newline between declaration and code please.
>>>>>
>>>>>> +                if (first_bo && !(*first_bo) && busy) {
>>>>>> +                    ttm_bo_get(bo);
>>>>>> +                    *first_bo = bo;
>>>>>> +                }
>>>>>>                     continue;
>>>>>> +            }
>>>>>>                    if (place && !bdev->driver->eviction_valuable(bo,
>>>>>>                                           place)) {
>>>>>> -                if (locked)
>>>>>> +                if (*locked)
>>>>>> reservation_object_unlock(bo->resv);
>>>>>>                     continue;
>>>>>>                 }
>>>>>> +
>>>>>>                 break;
>>>>>>             }
>>>>>>     @@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct
>>>>>> ttm_bo_device *bdev,
>>>>>>             bo = NULL;
>>>>>>         }
>>>>>>     +    return bo;
>>>>>> +}
>>>>>> +
>>>>>> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>>>> +                   uint32_t mem_type,
>>>>>> +                   const struct ttm_place *place,
>>>>>> +                   struct ttm_operation_ctx *ctx)
>>>>>> +{
>>>>>> +    struct ttm_bo_global *glob = bdev->glob;
>>>>>> +    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>> +    struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>>>>>> +    bool locked = false;
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    spin_lock(&glob->lru_lock);
>>>>>> +    bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
>>>>>> +                      &locked);
>>>>>>         if (!bo) {
>>>>>> +        struct ttm_operation_ctx busy_ctx;
>>>>>> +
>>>>>>             spin_unlock(&glob->lru_lock);
>>>>>> -        return -EBUSY;
>>>>>> +        /* check if other user occupy memory too long time */
>>>>>> +        if (!first_bo || !ctx || !ctx->resv || 
>>>>>> !ctx->resv->lock.ctx) {
>>>>>> +            if (first_bo)
>>>>>> +                ttm_bo_put(first_bo);
>>>>>> +            return -EBUSY;
>>>>>> +        }
>>>>>> +        if (first_bo->resv == ctx->resv) {
>>>>>> +            ttm_bo_put(first_bo);
>>>>>> +            return -EBUSY;
>>>>>> +        }
>>>>>> +        if (ctx->interruptible)
>>>>>> +            ret = 
>>>>>> ww_mutex_lock_interruptible(&first_bo->resv->lock,
>>>>>> + ctx->resv->lock.ctx);
>>>>>> +        else
>>>>>> +            ret = ww_mutex_lock(&first_bo->resv->lock,
>>>>>> ctx->resv->lock.ctx);
>>>>>> +        if (ret) {
>>>>>> +            ttm_bo_put(first_bo);
>>>>>> +            return ret;
>>>>>> +        }
>>>>>> +        spin_lock(&glob->lru_lock);
>>>>>> +        /* previous busy resv lock is held by above, idle now,
>>>>>> +         * so let them evictable.
>>>>>> +         */
>>>>>> +        busy_ctx.interruptible = ctx->interruptible;
>>>>>> +        busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
>>>>>> +        busy_ctx.resv           = first_bo->resv;
>>>>>> +        busy_ctx.flags           = TTM_OPT_FLAG_ALLOW_RES_EVICT;
>>>>>> +
>>>>>> +        bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx,
>>>>>> NULL,
>>>>>> +                          &locked);
>>>>>> +        if (bo && (bo->resv == first_bo->resv))
>>>>>> +            locked = true;
>>>>>> +        else if (bo)
>>>>>> + ww_mutex_unlock(&first_bo->resv->lock);
>>>>>> +        if (!bo) {
>>>>>> +            spin_unlock(&glob->lru_lock);
>>>>>> +            ttm_bo_put(first_bo);
>>>>>> +            return -EBUSY;
>>>>>> +        }
>>>>>>         }
>>>>>>            kref_get(&bo->list_kref);
>>>>>> @@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct
>>>>>> ttm_bo_device *bdev,
>>>>>>             ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>>>>>>                           ctx->no_wait_gpu, locked);
>>>>>>             kref_put(&bo->list_kref, ttm_bo_release_list);
>>>>>> +        if (first_bo)
>>>>>> +            ttm_bo_put(first_bo);
>>>>>>             return ret;
>>>>>>         }
>>>>>>            ttm_bo_del_from_lru(bo);
>>>>>>         spin_unlock(&glob->lru_lock);
>>>>>> +    if (first_bo)
>>>>>> +        ttm_bo_put(first_bo);
>>>>>>            ret = ttm_bo_evict(bo, ctx);
>>>>>>         if (locked) {
>>>>>> @@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct
>>>>>> ttm_buffer_object *bo,
>>>>>>     {
>>>>>>         struct ttm_bo_device *bdev = bo->bdev;
>>>>>>         struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>> +    struct ttm_operation_ctx native_ctx = {
>>>>>> +        .interruptible = false,
>>>>>> +        .no_wait_gpu = false,
>>>>>> +        .resv = bo->resv,
>>>>>> +        .flags = 0
>>>>>> +    };
>>>>>> +    struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
>>>>> I thought we made the ctx parameter mandatory, didn't we? Could be 
>>>>> that
>>>>> I remember that incorrectly.
>>>> Prike said he see ctx->resv is null, in that case, code doesn't run
>>>> into busy path.
>>>> Oh, as you mentioned here, we need add .resv=bo->resv for every
>>>> ttm_operation_ctx. That's a huge change which will cross all vendor
>>>> drivers.
>>>>
>>>> Can we just force to evaluate evict_ctx->resv = bo->resv? That means
>>>> we just add one extra line: evict_ctx->resv = bo->resv. How about 
>>>> that?
>>> Well only if ctx->resv is NULL, otherwise we would overwrite some
>>> reservation context given by the driver.
>>>
>>>> Probably better to give the acquire_ctx as a separate parameter to
>>>> ttm_mem_evict_first().
>> still put acquire_ctx into ttm_operation_ctx? Then that's same 
>> ctx->resv.
>> Current problem is we don't pass resv anywhere except ALLOW_EVICT case.
>> If you have concern for overwritten, we have to do ".resv = bo->resv" 
>> in every ttm_operation_ctx definitions.
>
> No, what I mean is to add the acquire_ctx as separate parameter to 
> ttm_mem_evict_first().
>
> E.g. we only need it in this function and it is actually not related 
> to the ttm operation context filled in by the driver.


FWIW, I think it would be nice at some point to have a reservation 
context be part of the ttm operation context, so that validate and 
evict could do sleeping reservations, and have BOs remain on the LRU 
even when reserved...

/Thomas


>
> Christian.
>
>>
>> -David
>>>
>>> Christian.
>>>
>>>> -David
>>>>> Christian.
>>>>>
>>>>>>         int ret;
>>>>>>            do {
>>>>>> @@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct
>>>>>> ttm_buffer_object *bo,
>>>>>>                 return ret;
>>>>>>             if (mem->mm_node)
>>>>>>                 break;
>>>>>> -        ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
>>>>>> +        ret = ttm_mem_evict_first(bdev, mem_type, place, 
>>>>>> evict_ctx);
>>>>>>             if (unlikely(ret != 0))
>>>>>>                 return ret;
>>>>>>         } while (1);
>>>>>> @@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
>>>>>> struct ttm_operation_ctx *ctx)
>>>>>>         spin_lock(&glob->lru_lock);
>>>>>>         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>>>             list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>>>>> -            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
>>>>>> +            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>>>>>> +                               NULL)) {
>>>>>>                     ret = 0;
>>>>>>                     break;
>>>>>>                 }
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6
       [not found]                 ` <93fbb994-d305-dfc4-f8e5-502647d7386f-4+hqylr40dJg9hUCZPvPmw@public.gmane.org>
@ 2019-05-07 11:42                   ` Koenig, Christian
       [not found]                     ` <fe4a6a5e-b075-b1cc-a24c-af6c3126145b-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Koenig, Christian @ 2019-05-07 11:42 UTC (permalink / raw)
  To: Thomas Hellstrom, Zhou, David(ChunMing)

Am 07.05.19 um 13:37 schrieb Thomas Hellstrom:
> [CAUTION: External Email]
>
> On 5/7/19 1:24 PM, Christian König wrote:
>> Am 07.05.19 um 13:22 schrieb zhoucm1:
>>>
>>>
>>> On 2019年05月07日 19:13, Koenig, Christian wrote:
>>>> Am 07.05.19 um 13:08 schrieb zhoucm1:
>>>>>
>>>>> On 2019年05月07日 18:53, Koenig, Christian wrote:
>>>>>> Am 07.05.19 um 11:36 schrieb Chunming Zhou:
>>>>>>> heavy gpu job could occupy memory long time, which lead other user
>>>>>>> fail to get memory.
>>>>>>>
>>>>>>> basically pick up Christian idea:
>>>>>>>
>>>>>>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>>>>>>> 2. If we then run into this EBUSY condition in TTM check if the BO
>>>>>>> we need memory for (or rather the ww_mutex of its reservation
>>>>>>> object) has a ticket assigned.
>>>>>>> 3. If we have a ticket we grab a reference to the first BO on the
>>>>>>> LRU, drop the LRU lock and try to grab the reservation lock with 
>>>>>>> the
>>>>>>> ticket.
>>>>>>> 4. If getting the reservation lock with the ticket succeeded we
>>>>>>> check if the BO is still the first one on the LRU in question (the
>>>>>>> BO could have moved).
>>>>>>> 5. If the BO is still the first one on the LRU in question we 
>>>>>>> try to
>>>>>>> evict it as we would evict any other BO.
>>>>>>> 6. If any of the "If's" above fail we just back off and return
>>>>>>> -EBUSY.
>>>>>>>
>>>>>>> v2: fix some minor check
>>>>>>> v3: address Christian v2 comments.
>>>>>>> v4: fix some missing
>>>>>>> v5: handle first_bo unlock and bo_get/put
>>>>>>> v6: abstract unified iterate function, and handle all possible
>>>>>>> usecase not only pinned bo.
>>>>>>>
>>>>>>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>>>>>>> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
>>>>>>> ---
>>>>>>>     drivers/gpu/drm/ttm/ttm_bo.c | 113
>>>>>>> ++++++++++++++++++++++++++++++-----
>>>>>>>     1 file changed, 97 insertions(+), 16 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>> index 8502b3ed2d88..bbf1d14d00a7 100644
>>>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>>>>>      * b. Otherwise, trylock it.
>>>>>>>      */
>>>>>>>     static bool ttm_bo_evict_swapout_allowable(struct
>>>>>>> ttm_buffer_object *bo,
>>>>>>> -            struct ttm_operation_ctx *ctx, bool *locked)
>>>>>>> +            struct ttm_operation_ctx *ctx, bool *locked, bool
>>>>>>> *busy)
>>>>>>>     {
>>>>>>>         bool ret = false;
>>>>>>>            *locked = false;
>>>>>>> +    if (busy)
>>>>>>> +        *busy = false;
>>>>>>>         if (bo->resv == ctx->resv) {
>>>>>>> reservation_object_assert_held(bo->resv);
>>>>>>>             if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>>>>>> @@ -779,35 +781,45 @@ static bool
>>>>>>> ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>>>>>>>         } else {
>>>>>>>             *locked = reservation_object_trylock(bo->resv);
>>>>>>>             ret = *locked;
>>>>>>> +        if (!ret && busy)
>>>>>>> +            *busy = true;
>>>>>>>         }
>>>>>>>            return ret;
>>>>>>>     }
>>>>>>>     -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>>>>> -                   uint32_t mem_type,
>>>>>>> -                   const struct ttm_place *place,
>>>>>>> -                   struct ttm_operation_ctx *ctx)
>>>>>>> +static struct ttm_buffer_object*
>>>>>>> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
>>>>>>> +             struct ttm_mem_type_manager *man,
>>>>>>> +             const struct ttm_place *place,
>>>>>>> +             struct ttm_operation_ctx *ctx,
>>>>>>> +             struct ttm_buffer_object **first_bo,
>>>>>>> +             bool *locked)
>>>>>>>     {
>>>>>>> -    struct ttm_bo_global *glob = bdev->glob;
>>>>>>> -    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>>>         struct ttm_buffer_object *bo = NULL;
>>>>>>> -    bool locked = false;
>>>>>>> -    unsigned i;
>>>>>>> -    int ret;
>>>>>>> +    int i;
>>>>>>>     -    spin_lock(&glob->lru_lock);
>>>>>>> +    if (first_bo)
>>>>>>> +        *first_bo = NULL;
>>>>>>>         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>>>>             list_for_each_entry(bo, &man->lru[i], lru) {
>>>>>>> -            if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>>>>>>> +            bool busy = false;
>>>>>>> +            if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
>>>>>>> +                                &busy)) {
>>>>>> A newline between declaration and code please.
>>>>>>
>>>>>>> +                if (first_bo && !(*first_bo) && busy) {
>>>>>>> +                    ttm_bo_get(bo);
>>>>>>> +                    *first_bo = bo;
>>>>>>> +                }
>>>>>>>                     continue;
>>>>>>> +            }
>>>>>>>                    if (place && 
>>>>>>> !bdev->driver->eviction_valuable(bo,
>>>>>>>                                           place)) {
>>>>>>> -                if (locked)
>>>>>>> +                if (*locked)
>>>>>>> reservation_object_unlock(bo->resv);
>>>>>>>                     continue;
>>>>>>>                 }
>>>>>>> +
>>>>>>>                 break;
>>>>>>>             }
>>>>>>>     @@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct
>>>>>>> ttm_bo_device *bdev,
>>>>>>>             bo = NULL;
>>>>>>>         }
>>>>>>>     +    return bo;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>>>>> +                   uint32_t mem_type,
>>>>>>> +                   const struct ttm_place *place,
>>>>>>> +                   struct ttm_operation_ctx *ctx)
>>>>>>> +{
>>>>>>> +    struct ttm_bo_global *glob = bdev->glob;
>>>>>>> +    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>>> +    struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>>>>>>> +    bool locked = false;
>>>>>>> +    int ret;
>>>>>>> +
>>>>>>> +    spin_lock(&glob->lru_lock);
>>>>>>> +    bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, 
>>>>>>> &first_bo,
>>>>>>> +                      &locked);
>>>>>>>         if (!bo) {
>>>>>>> +        struct ttm_operation_ctx busy_ctx;
>>>>>>> +
>>>>>>>             spin_unlock(&glob->lru_lock);
>>>>>>> -        return -EBUSY;
>>>>>>> +        /* check if other user occupy memory too long time */
>>>>>>> +        if (!first_bo || !ctx || !ctx->resv ||
>>>>>>> !ctx->resv->lock.ctx) {
>>>>>>> +            if (first_bo)
>>>>>>> +                ttm_bo_put(first_bo);
>>>>>>> +            return -EBUSY;
>>>>>>> +        }
>>>>>>> +        if (first_bo->resv == ctx->resv) {
>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>> +            return -EBUSY;
>>>>>>> +        }
>>>>>>> +        if (ctx->interruptible)
>>>>>>> +            ret =
>>>>>>> ww_mutex_lock_interruptible(&first_bo->resv->lock,
>>>>>>> + ctx->resv->lock.ctx);
>>>>>>> +        else
>>>>>>> +            ret = ww_mutex_lock(&first_bo->resv->lock,
>>>>>>> ctx->resv->lock.ctx);
>>>>>>> +        if (ret) {
>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>> +            return ret;
>>>>>>> +        }
>>>>>>> +        spin_lock(&glob->lru_lock);
>>>>>>> +        /* previous busy resv lock is held by above, idle now,
>>>>>>> +         * so let them evictable.
>>>>>>> +         */
>>>>>>> +        busy_ctx.interruptible = ctx->interruptible;
>>>>>>> +        busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
>>>>>>> +        busy_ctx.resv           = first_bo->resv;
>>>>>>> +        busy_ctx.flags           = TTM_OPT_FLAG_ALLOW_RES_EVICT;
>>>>>>> +
>>>>>>> +        bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx,
>>>>>>> NULL,
>>>>>>> +                          &locked);
>>>>>>> +        if (bo && (bo->resv == first_bo->resv))
>>>>>>> +            locked = true;
>>>>>>> +        else if (bo)
>>>>>>> + ww_mutex_unlock(&first_bo->resv->lock);
>>>>>>> +        if (!bo) {
>>>>>>> +            spin_unlock(&glob->lru_lock);
>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>> +            return -EBUSY;
>>>>>>> +        }
>>>>>>>         }
>>>>>>>            kref_get(&bo->list_kref);
>>>>>>> @@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct
>>>>>>> ttm_bo_device *bdev,
>>>>>>>             ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>>>>>>>                           ctx->no_wait_gpu, locked);
>>>>>>>             kref_put(&bo->list_kref, ttm_bo_release_list);
>>>>>>> +        if (first_bo)
>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>>             return ret;
>>>>>>>         }
>>>>>>>            ttm_bo_del_from_lru(bo);
>>>>>>>         spin_unlock(&glob->lru_lock);
>>>>>>> +    if (first_bo)
>>>>>>> +        ttm_bo_put(first_bo);
>>>>>>>            ret = ttm_bo_evict(bo, ctx);
>>>>>>>         if (locked) {
>>>>>>> @@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct
>>>>>>> ttm_buffer_object *bo,
>>>>>>>     {
>>>>>>>         struct ttm_bo_device *bdev = bo->bdev;
>>>>>>>         struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>>> +    struct ttm_operation_ctx native_ctx = {
>>>>>>> +        .interruptible = false,
>>>>>>> +        .no_wait_gpu = false,
>>>>>>> +        .resv = bo->resv,
>>>>>>> +        .flags = 0
>>>>>>> +    };
>>>>>>> +    struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
>>>>>> I thought we made the ctx parameter mandatory, didn't we? Could be
>>>>>> that
>>>>>> I remember that incorrectly.
>>>>> Prike said he see ctx->resv is null, in that case, code doesn't run
>>>>> into busy path.
>>>>> Oh, as you mentioned here, we need add .resv=bo->resv for every
>>>>> ttm_operation_ctx. That's a huge change which will cross all vendor
>>>>> drivers.
>>>>>
>>>>> Can we just force to evaluate evict_ctx->resv = bo->resv? That means
>>>>> we just add one extra line: evict_ctx->resv = bo->resv. How about
>>>>> that?
>>>> Well only if ctx->resv is NULL, otherwise we would overwrite some
>>>> reservation context given by the driver.
>>>>
>>>> Probably better to give the acquire_ctx as a separate parameter to
>>>> ttm_mem_evict_first().
>>> still put acquire_ctx into ttm_operation_ctx? Then that's same
>>> ctx->resv.
>>> Current problem is we don't pass resv anywhere except ALLOW_EVICT case.
>>> If you have concern for overwritten, we have to do ".resv = bo->resv"
>>> in every ttm_operation_ctx definitions.
>>
>> No, what I mean is to add the acquire_ctx as separate parameter to
>> ttm_mem_evict_first().
>>
>> E.g. we only need it in this function and it is actually not related
>> to the ttm operation context filled in by the driver.
>
>
> FWIW, I think it would be nice at some point to have a reservation
> context be part of the ttm operation context, so that validate and
> evict could do sleeping reservations, and have BOs remain on the LRU
> even when reserved...

Yeah, well that's exactly what the ctx->resv parameter is good for :)

And yes, we do keep the BOs on the LRU even when they are reserved.

Christian.

>
> /Thomas
>
>
>>
>> Christian.
>>
>>>
>>> -David
>>>>
>>>> Christian.
>>>>
>>>>> -David
>>>>>> Christian.
>>>>>>
>>>>>>>         int ret;
>>>>>>>            do {
>>>>>>> @@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct
>>>>>>> ttm_buffer_object *bo,
>>>>>>>                 return ret;
>>>>>>>             if (mem->mm_node)
>>>>>>>                 break;
>>>>>>> -        ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
>>>>>>> +        ret = ttm_mem_evict_first(bdev, mem_type, place,
>>>>>>> evict_ctx);
>>>>>>>             if (unlikely(ret != 0))
>>>>>>>                 return ret;
>>>>>>>         } while (1);
>>>>>>> @@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global 
>>>>>>> *glob,
>>>>>>> struct ttm_operation_ctx *ctx)
>>>>>>>         spin_lock(&glob->lru_lock);
>>>>>>>         for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>>>>             list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>>>>>> -            if (ttm_bo_evict_swapout_allowable(bo, ctx, 
>>>>>>> &locked)) {
>>>>>>> +            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>>>>>>> +                               NULL)) {
>>>>>>>                     ret = 0;
>>>>>>>                     break;
>>>>>>>                 }
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6
       [not found]                     ` <fe4a6a5e-b075-b1cc-a24c-af6c3126145b-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-08  8:34                       ` Thomas Hellstrom
  2019-05-08  9:03                         ` Koenig, Christian
  0 siblings, 1 reply; 14+ messages in thread
From: Thomas Hellstrom @ 2019-05-08  8:34 UTC (permalink / raw)
  To: Koenig, Christian, Zhou, David(ChunMing),
	Liang, Prike, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 5/7/19 1:42 PM, Koenig, Christian wrote:
> Am 07.05.19 um 13:37 schrieb Thomas Hellstrom:
>> [CAUTION: External Email]
>>
>> On 5/7/19 1:24 PM, Christian König wrote:
>>> Am 07.05.19 um 13:22 schrieb zhoucm1:
>>>>
>>>> On 2019年05月07日 19:13, Koenig, Christian wrote:
>>>>> Am 07.05.19 um 13:08 schrieb zhoucm1:
>>>>>> On 2019年05月07日 18:53, Koenig, Christian wrote:
>>>>>>> Am 07.05.19 um 11:36 schrieb Chunming Zhou:
>>>>>>>> heavy gpu job could occupy memory long time, which lead other user
>>>>>>>> fail to get memory.
>>>>>>>>
>>>>>>>> basically pick up Christian idea:
>>>>>>>>
>>>>>>>> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
>>>>>>>> 2. If we then run into this EBUSY condition in TTM check if the BO
>>>>>>>> we need memory for (or rather the ww_mutex of its reservation
>>>>>>>> object) has a ticket assigned.
>>>>>>>> 3. If we have a ticket we grab a reference to the first BO on the
>>>>>>>> LRU, drop the LRU lock and try to grab the reservation lock with
>>>>>>>> the
>>>>>>>> ticket.
>>>>>>>> 4. If getting the reservation lock with the ticket succeeded we
>>>>>>>> check if the BO is still the first one on the LRU in question (the
>>>>>>>> BO could have moved).
>>>>>>>> 5. If the BO is still the first one on the LRU in question we
>>>>>>>> try to
>>>>>>>> evict it as we would evict any other BO.
>>>>>>>> 6. If any of the "If's" above fail we just back off and return
>>>>>>>> -EBUSY.
>>>>>>>>
>>>>>>>> v2: fix some minor check
>>>>>>>> v3: address Christian v2 comments.
>>>>>>>> v4: fix some missing
>>>>>>>> v5: handle first_bo unlock and bo_get/put
>>>>>>>> v6: abstract unified iterate function, and handle all possible
>>>>>>>> usecase not only pinned bo.
>>>>>>>>
>>>>>>>> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
>>>>>>>> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/ttm/ttm_bo.c | 113
>>>>>>>> ++++++++++++++++++++++++++++++-----
>>>>>>>>      1 file changed, 97 insertions(+), 16 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>>> index 8502b3ed2d88..bbf1d14d00a7 100644
>>>>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>>> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>>>>>>       * b. Otherwise, trylock it.
>>>>>>>>       */
>>>>>>>>      static bool ttm_bo_evict_swapout_allowable(struct
>>>>>>>> ttm_buffer_object *bo,
>>>>>>>> -            struct ttm_operation_ctx *ctx, bool *locked)
>>>>>>>> +            struct ttm_operation_ctx *ctx, bool *locked, bool
>>>>>>>> *busy)
>>>>>>>>      {
>>>>>>>>          bool ret = false;
>>>>>>>>             *locked = false;
>>>>>>>> +    if (busy)
>>>>>>>> +        *busy = false;
>>>>>>>>          if (bo->resv == ctx->resv) {
>>>>>>>> reservation_object_assert_held(bo->resv);
>>>>>>>>              if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>>>>>>> @@ -779,35 +781,45 @@ static bool
>>>>>>>> ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>>>>>>>>          } else {
>>>>>>>>              *locked = reservation_object_trylock(bo->resv);
>>>>>>>>              ret = *locked;
>>>>>>>> +        if (!ret && busy)
>>>>>>>> +            *busy = true;
>>>>>>>>          }
>>>>>>>>             return ret;
>>>>>>>>      }
>>>>>>>>      -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>>>>>> -                   uint32_t mem_type,
>>>>>>>> -                   const struct ttm_place *place,
>>>>>>>> -                   struct ttm_operation_ctx *ctx)
>>>>>>>> +static struct ttm_buffer_object*
>>>>>>>> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
>>>>>>>> +             struct ttm_mem_type_manager *man,
>>>>>>>> +             const struct ttm_place *place,
>>>>>>>> +             struct ttm_operation_ctx *ctx,
>>>>>>>> +             struct ttm_buffer_object **first_bo,
>>>>>>>> +             bool *locked)
>>>>>>>>      {
>>>>>>>> -    struct ttm_bo_global *glob = bdev->glob;
>>>>>>>> -    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>>>>          struct ttm_buffer_object *bo = NULL;
>>>>>>>> -    bool locked = false;
>>>>>>>> -    unsigned i;
>>>>>>>> -    int ret;
>>>>>>>> +    int i;
>>>>>>>>      -    spin_lock(&glob->lru_lock);
>>>>>>>> +    if (first_bo)
>>>>>>>> +        *first_bo = NULL;
>>>>>>>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>>>>>              list_for_each_entry(bo, &man->lru[i], lru) {
>>>>>>>> -            if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
>>>>>>>> +            bool busy = false;
>>>>>>>> +            if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
>>>>>>>> +                                &busy)) {
>>>>>>> A newline between declaration and code please.
>>>>>>>
>>>>>>>> +                if (first_bo && !(*first_bo) && busy) {
>>>>>>>> +                    ttm_bo_get(bo);
>>>>>>>> +                    *first_bo = bo;
>>>>>>>> +                }
>>>>>>>>                      continue;
>>>>>>>> +            }
>>>>>>>>                     if (place &&
>>>>>>>> !bdev->driver->eviction_valuable(bo,
>>>>>>>>                                            place)) {
>>>>>>>> -                if (locked)
>>>>>>>> +                if (*locked)
>>>>>>>> reservation_object_unlock(bo->resv);
>>>>>>>>                      continue;
>>>>>>>>                  }
>>>>>>>> +
>>>>>>>>                  break;
>>>>>>>>              }
>>>>>>>>      @@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct
>>>>>>>> ttm_bo_device *bdev,
>>>>>>>>              bo = NULL;
>>>>>>>>          }
>>>>>>>>      +    return bo;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>>>>>> +                   uint32_t mem_type,
>>>>>>>> +                   const struct ttm_place *place,
>>>>>>>> +                   struct ttm_operation_ctx *ctx)
>>>>>>>> +{
>>>>>>>> +    struct ttm_bo_global *glob = bdev->glob;
>>>>>>>> +    struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>>>> +    struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
>>>>>>>> +    bool locked = false;
>>>>>>>> +    int ret;
>>>>>>>> +
>>>>>>>> +    spin_lock(&glob->lru_lock);
>>>>>>>> +    bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx,
>>>>>>>> &first_bo,
>>>>>>>> +                      &locked);
>>>>>>>>          if (!bo) {
>>>>>>>> +        struct ttm_operation_ctx busy_ctx;
>>>>>>>> +
>>>>>>>>              spin_unlock(&glob->lru_lock);
>>>>>>>> -        return -EBUSY;
>>>>>>>> +        /* check if other user occupy memory too long time */
>>>>>>>> +        if (!first_bo || !ctx || !ctx->resv ||
>>>>>>>> !ctx->resv->lock.ctx) {
>>>>>>>> +            if (first_bo)
>>>>>>>> +                ttm_bo_put(first_bo);
>>>>>>>> +            return -EBUSY;
>>>>>>>> +        }
>>>>>>>> +        if (first_bo->resv == ctx->resv) {
>>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>>> +            return -EBUSY;
>>>>>>>> +        }
>>>>>>>> +        if (ctx->interruptible)
>>>>>>>> +            ret =
>>>>>>>> ww_mutex_lock_interruptible(&first_bo->resv->lock,
>>>>>>>> + ctx->resv->lock.ctx);
>>>>>>>> +        else
>>>>>>>> +            ret = ww_mutex_lock(&first_bo->resv->lock,
>>>>>>>> ctx->resv->lock.ctx);
>>>>>>>> +        if (ret) {
>>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>>> +            return ret;
>>>>>>>> +        }
>>>>>>>> +        spin_lock(&glob->lru_lock);
>>>>>>>> +        /* previous busy resv lock is held by above, idle now,
>>>>>>>> +         * so let them evictable.
>>>>>>>> +         */
>>>>>>>> +        busy_ctx.interruptible = ctx->interruptible;
>>>>>>>> +        busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
>>>>>>>> +        busy_ctx.resv           = first_bo->resv;
>>>>>>>> +        busy_ctx.flags           = TTM_OPT_FLAG_ALLOW_RES_EVICT;
>>>>>>>> +
>>>>>>>> +        bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx,
>>>>>>>> NULL,
>>>>>>>> +                          &locked);
>>>>>>>> +        if (bo && (bo->resv == first_bo->resv))
>>>>>>>> +            locked = true;
>>>>>>>> +        else if (bo)
>>>>>>>> + ww_mutex_unlock(&first_bo->resv->lock);
>>>>>>>> +        if (!bo) {
>>>>>>>> +            spin_unlock(&glob->lru_lock);
>>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>>> +            return -EBUSY;
>>>>>>>> +        }
>>>>>>>>          }
>>>>>>>>             kref_get(&bo->list_kref);
>>>>>>>> @@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct
>>>>>>>> ttm_bo_device *bdev,
>>>>>>>>              ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>>>>>>>>                            ctx->no_wait_gpu, locked);
>>>>>>>>              kref_put(&bo->list_kref, ttm_bo_release_list);
>>>>>>>> +        if (first_bo)
>>>>>>>> +            ttm_bo_put(first_bo);
>>>>>>>>              return ret;
>>>>>>>>          }
>>>>>>>>             ttm_bo_del_from_lru(bo);
>>>>>>>>          spin_unlock(&glob->lru_lock);
>>>>>>>> +    if (first_bo)
>>>>>>>> +        ttm_bo_put(first_bo);
>>>>>>>>             ret = ttm_bo_evict(bo, ctx);
>>>>>>>>          if (locked) {
>>>>>>>> @@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct
>>>>>>>> ttm_buffer_object *bo,
>>>>>>>>      {
>>>>>>>>          struct ttm_bo_device *bdev = bo->bdev;
>>>>>>>>          struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>>>>>> +    struct ttm_operation_ctx native_ctx = {
>>>>>>>> +        .interruptible = false,
>>>>>>>> +        .no_wait_gpu = false,
>>>>>>>> +        .resv = bo->resv,
>>>>>>>> +        .flags = 0
>>>>>>>> +    };
>>>>>>>> +    struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
>>>>>>> I thought we made the ctx parameter mandatory, didn't we? Could be
>>>>>>> that
>>>>>>> I remember that incorrectly.
>>>>>> Prike said he see ctx->resv is null, in that case, code doesn't run
>>>>>> into busy path.
>>>>>> Oh, as you mentioned here, we need add .resv=bo->resv for every
>>>>>> ttm_operation_ctx. That's a huge change which will cross all vendor
>>>>>> drivers.
>>>>>>
>>>>>> Can we just force to evaluate evict_ctx->resv = bo->resv? That means
>>>>>> we just add one extra line: evict_ctx->resv = bo->resv. How about
>>>>>> that?
>>>>> Well only if ctx->resv is NULL, otherwise we would overwrite some
>>>>> reservation context given by the driver.
>>>>>
>>>>> Probably better to give the acquire_ctx as a separate parameter to
>>>>> ttm_mem_evict_first().
>>>> still put acquire_ctx into ttm_operation_ctx? Then that's same
>>>> ctx->resv.
>>>> Current problem is we don't pass resv anywhere except ALLOW_EVICT case.
>>>> If you have concern for overwritten, we have to do ".resv = bo->resv"
>>>> in every ttm_operation_ctx definitions.
>>> No, what I mean is to add the acquire_ctx as separate parameter to
>>> ttm_mem_evict_first().
>>>
>>> E.g. we only need it in this function and it is actually not related
>>> to the ttm operation context filled in by the driver.
>>
>> FWIW, I think it would be nice at some point to have a reservation
>> context being part of the ttm operation context, so that validate and
>> evict could do sleeping reservations, and have bos remain on the lru
>> even when reserved...
> Yeah, well that's exactly what the ctx->resv parameter is good for :)

Hmm. I don't quite follow? It looks to me like ctx->resv is there to 
work around recursive reservations?

What I'm after is being able to do sleeping reservations within validate 
and evict and open up for returning -EDEADLK. One benefit would be to 
scan over the LRU lists, reserving exactly those bos we want to evict, 
and when all are reserved, we evict them. If we hit an -EDEADLK while 
evicting we need to restart. Then we need an acquire_ctx in the 
ttm_operation_ctx.

>
> And yes, we do keep the BOs on the LRU even when they are reserved.

static inline int ttm_bo_reserve(struct ttm_buffer_object *bo,
                  bool interruptible, bool no_wait,
                  struct ww_acquire_ctx *ticket)
{
     int ret;

     WARN_ON(!kref_read(&bo->kref));

     ret = __ttm_bo_reserve(bo, interruptible, no_wait, ticket);
     if (likely(ret == 0))
         ttm_bo_del_sub_from_lru(bo);

     return ret;
}

/Thomas


>
> Christian.
>
>> /Thomas
>>
>>
>>> Christian.
>>>
>>>> -David
>>>>> Christian.
>>>>>
>>>>>> -David
>>>>>>> Christian.
>>>>>>>
>>>>>>>>          int ret;
>>>>>>>>             do {
>>>>>>>> @@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct
>>>>>>>> ttm_buffer_object *bo,
>>>>>>>>                  return ret;
>>>>>>>>              if (mem->mm_node)
>>>>>>>>                  break;
>>>>>>>> -        ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
>>>>>>>> +        ret = ttm_mem_evict_first(bdev, mem_type, place,
>>>>>>>> evict_ctx);
>>>>>>>>              if (unlikely(ret != 0))
>>>>>>>>                  return ret;
>>>>>>>>          } while (1);
>>>>>>>> @@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global
>>>>>>>> *glob,
>>>>>>>> struct ttm_operation_ctx *ctx)
>>>>>>>>          spin_lock(&glob->lru_lock);
>>>>>>>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>>>>>              list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>>>>>>> -            if (ttm_bo_evict_swapout_allowable(bo, ctx,
>>>>>>>> &locked)) {
>>>>>>>> +            if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
>>>>>>>> +                               NULL)) {
>>>>>>>>                      ret = 0;
>>>>>>>>                      break;
>>>>>>>>                  }
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> dri-devel mailing list
>>> dri-devel@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6
  2019-05-08  8:34                       ` Thomas Hellstrom
@ 2019-05-08  9:03                         ` Koenig, Christian
  0 siblings, 0 replies; 14+ messages in thread
From: Koenig, Christian @ 2019-05-08  9:03 UTC (permalink / raw)
  To: Thomas Hellstrom, Zhou, David(ChunMing),
	Liang, Prike, dri-devel, amd-gfx

Am 08.05.19 um 10:34 schrieb Thomas Hellstrom:
> [SNIP]
>>>> No, what I mean is to add the acquire_ctx as separate parameter to
>>>> ttm_mem_evict_first().
>>>>
>>>> E.g. we only need it in this function and it is actually not related
>>>> to the ttm operation context filled in by the driver.
>>>
>>> FWIW, I think it would be nice at some point to have a reservation
>>> context being part of the ttm operation context, so that validate and
>>> evict could do sleeping reservations, and have bos remain on the lru
>>> even when reserved...
>> Yeah, well that's exactly what the ctx->resv parameter is good for :)
>
> Hmm. I don't quite follow? It looks to me like ctx->resv is there to
> work around recursive reservations?

Well yes and no, this is to allow eviction of BOs which share the same 
reservation object.

>
>
> What I'm after is being able to do sleeping reservations within validate
> and evict and open up for returning -EDEADLK. One benefit would be to
> scan over the LRU lists, reserving exactly those bos we want to evict,
> and when all are reserved, we evict them. If we hit an -EDEADLK while
> evicting we need to restart. Then we need an acquire_ctx in the
> ttm_operation_ctx.

The acquire_ctx is available from the BO you try to find space for.

But we already tried this approach and it doesn't work. We have a lot of 
BOs which now share the same reservation object and so would cause an 
-EDEADLK.

>> And yes, we do keep the BOs on the LRU even when they are reserved.
>
> static inline int ttm_bo_reserve(struct ttm_buffer_object *bo,
>                  bool interruptible, bool no_wait,
>                  struct ww_acquire_ctx *ticket)

ttm_bo_reserve() is not always used any more outside of TTM. For 
DMA-buf as well as the amdgpu VM code, the reservation object is locked 
without calling ttm_bo_reserve() now.

Regards,
Christian.

>
> /Thomas

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve
  2019-05-07 11:45 ` [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve Chunming Zhou
@ 2019-05-13  2:45   ` Liang, Prike
  0 siblings, 0 replies; 14+ messages in thread
From: Liang, Prike @ 2019-05-13  2:45 UTC (permalink / raw)
  To: Zhou, David(ChunMing), Koenig, Christian, dri-devel

Acked-by: Prike Liang <Prike.Liang@amd.com>


-----Original Message-----
From: Chunming Zhou <david1.zhou@amd.com> 
Sent: Tuesday, May 07, 2019 7:46 PM
To: Koenig, Christian <Christian.Koenig@amd.com>; Liang, Prike <Prike.Liang@amd.com>; dri-devel@lists.freedesktop.org
Cc: Zhou, David(ChunMing) <David1.Zhou@amd.com>
Subject: [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve

add ticket for display bo, so that it can preempt busy bo.

Change-Id: I9f031cdcc8267de00e819ae303baa0a52df8ebb9
Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index ac22f7351a42..3f36770946ab 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4176,6 +4176,9 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	struct amdgpu_device *adev;
 	struct amdgpu_bo *rbo;
 	struct dm_plane_state *dm_plane_state_new, *dm_plane_state_old;
+	struct list_head list;
+	struct ttm_validate_buffer tv;
+	struct ww_acquire_ctx ticket;
 	uint64_t tiling_flags;
 	uint32_t domain;
 	int r;
@@ -4192,9 +4195,17 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	obj = new_state->fb->obj[0];
 	rbo = gem_to_amdgpu_bo(obj);
 	adev = amdgpu_ttm_adev(rbo->tbo.bdev);
-	r = amdgpu_bo_reserve(rbo, false);
-	if (unlikely(r != 0))
+	INIT_LIST_HEAD(&list);
+
+	tv.bo = &rbo->tbo;
+	tv.num_shared = 1;
+	list_add(&tv.head, &list);
+
+	r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL);
+	if (r) {
+		dev_err(adev->dev, "fail to reserve bo (%d)\n", r);
 		return r;
+	}
 
 	if (plane->type != DRM_PLANE_TYPE_CURSOR)
 		domain = amdgpu_display_supported_domains(adev);
@@ -4205,21 +4216,21 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	if (unlikely(r != 0)) {
 		if (r != -ERESTARTSYS)
 			DRM_ERROR("Failed to pin framebuffer with error %d\n", r);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		return r;
 	}
 
 	r = amdgpu_ttm_alloc_gart(&rbo->tbo);
 	if (unlikely(r != 0)) {
 		amdgpu_bo_unpin(rbo);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		DRM_ERROR("%p bind failed\n", rbo);
 		return r;
 	}
 
 	amdgpu_bo_get_tiling_flags(rbo, &tiling_flags);
 
-	amdgpu_bo_unreserve(rbo);
+	ttm_eu_backoff_reservation(&ticket, &list);
 
 	afb->address = amdgpu_bo_gpu_offset(rbo);
 
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve
  2019-05-07 11:45 [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7 Chunming Zhou
@ 2019-05-07 11:45 ` Chunming Zhou
  2019-05-13  2:45   ` Liang, Prike
  0 siblings, 1 reply; 14+ messages in thread
From: Chunming Zhou @ 2019-05-07 11:45 UTC (permalink / raw)
  To: Christian.Koenig, Prike.Liang, dri-devel

add ticket for display bo, so that it can preempt busy bo.

Change-Id: I9f031cdcc8267de00e819ae303baa0a52df8ebb9
Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index ac22f7351a42..3f36770946ab 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4176,6 +4176,9 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	struct amdgpu_device *adev;
 	struct amdgpu_bo *rbo;
 	struct dm_plane_state *dm_plane_state_new, *dm_plane_state_old;
+	struct list_head list;
+	struct ttm_validate_buffer tv;
+	struct ww_acquire_ctx ticket;
 	uint64_t tiling_flags;
 	uint32_t domain;
 	int r;
@@ -4192,9 +4195,17 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	obj = new_state->fb->obj[0];
 	rbo = gem_to_amdgpu_bo(obj);
 	adev = amdgpu_ttm_adev(rbo->tbo.bdev);
-	r = amdgpu_bo_reserve(rbo, false);
-	if (unlikely(r != 0))
+	INIT_LIST_HEAD(&list);
+
+	tv.bo = &rbo->tbo;
+	tv.num_shared = 1;
+	list_add(&tv.head, &list);
+
+	r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL);
+	if (r) {
+		dev_err(adev->dev, "fail to reserve bo (%d)\n", r);
 		return r;
+	}
 
 	if (plane->type != DRM_PLANE_TYPE_CURSOR)
 		domain = amdgpu_display_supported_domains(adev);
@@ -4205,21 +4216,21 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	if (unlikely(r != 0)) {
 		if (r != -ERESTARTSYS)
 			DRM_ERROR("Failed to pin framebuffer with error %d\n", r);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		return r;
 	}
 
 	r = amdgpu_ttm_alloc_gart(&rbo->tbo);
 	if (unlikely(r != 0)) {
 		amdgpu_bo_unpin(rbo);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		DRM_ERROR("%p bind failed\n", rbo);
 		return r;
 	}
 
 	amdgpu_bo_get_tiling_flags(rbo, &tiling_flags);
 
-	amdgpu_bo_unreserve(rbo);
+	ttm_eu_backoff_reservation(&ticket, &list);
 
 	afb->address = amdgpu_bo_gpu_offset(rbo);
 
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2019-05-13  2:45 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-07  9:36 [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6 Chunming Zhou
     [not found] ` <20190507093642.7859-1-david1.zhou-5C7GfCeVMHo@public.gmane.org>
2019-05-07  9:36   ` [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve Chunming Zhou
     [not found]     ` <20190507093642.7859-2-david1.zhou-5C7GfCeVMHo@public.gmane.org>
2019-05-07 10:46       ` Koenig, Christian
2019-05-07 10:53 ` [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6 Koenig, Christian
     [not found]   ` <f4b1ddf2-b80b-260e-54c9-b0e62ecbe90b-5C7GfCeVMHo@public.gmane.org>
2019-05-07 11:08     ` zhoucm1
2019-05-07 11:13       ` Koenig, Christian
2019-05-07 11:22         ` zhoucm1
2019-05-07 11:24           ` Christian König
     [not found]             ` <968487eb-f78e-9922-a073-8ed08111e307-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-05-07 11:37               ` Thomas Hellstrom
     [not found]                 ` <93fbb994-d305-dfc4-f8e5-502647d7386f-4+hqylr40dJg9hUCZPvPmw@public.gmane.org>
2019-05-07 11:42                   ` Koenig, Christian
     [not found]                     ` <fe4a6a5e-b075-b1cc-a24c-af6c3126145b-5C7GfCeVMHo@public.gmane.org>
2019-05-08  8:34                       ` Thomas Hellstrom
2019-05-08  9:03                         ` Koenig, Christian
2019-05-07 11:45 [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7 Chunming Zhou
2019-05-07 11:45 ` [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve Chunming Zhou
2019-05-13  2:45   ` Liang, Prike

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.