All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7
@ 2019-05-07 11:45 Chunming Zhou
  2019-05-07 11:45 ` [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve Chunming Zhou
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Chunming Zhou @ 2019-05-07 11:45 UTC (permalink / raw)
  To: Christian.Koenig, Prike.Liang, dri-devel

A heavy gpu job can occupy memory for a long time, which leads other users to fail to get memory.

basically pick up Christian idea:

1. Reserve the BO in DC using a ww_mutex ticket (trivial).
2. If we then run into this EBUSY condition in TTM check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
3. If we have a ticket we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
4. If getting the reservation lock with the ticket succeeded we check if the BO is still the first one on the LRU in question (the BO could have moved).
5. If the BO is still the first one on the LRU in question we try to evict it as we would evict any other BO.
6. If any of the "If's" above fail we just back off and return -EBUSY.

v2: fix some minor check
v3: address Christian v2 comments.
v4: fix some missing
v5: handle first_bo unlock and bo_get/put
v6: abstract unified iterate function, and handle all possible usecase not only pinned bo.
v7: pass request bo->resv to ttm_bo_evict_first

Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 111 +++++++++++++++++++++++++++++------
 1 file changed, 94 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 8502b3ed2d88..f5e6328e4a57 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
  * b. Otherwise, trylock it.
  */
 static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
-			struct ttm_operation_ctx *ctx, bool *locked)
+			struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
 {
 	bool ret = false;
 
 	*locked = false;
+	if (busy)
+		*busy = false;
 	if (bo->resv == ctx->resv) {
 		reservation_object_assert_held(bo->resv);
 		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
@@ -779,35 +781,46 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
 	} else {
 		*locked = reservation_object_trylock(bo->resv);
 		ret = *locked;
+		if (!ret && busy)
+			*busy = true;
 	}
 
 	return ret;
 }
 
-static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
-			       uint32_t mem_type,
-			       const struct ttm_place *place,
-			       struct ttm_operation_ctx *ctx)
+static struct ttm_buffer_object*
+ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
+			 struct ttm_mem_type_manager *man,
+			 const struct ttm_place *place,
+			 struct ttm_operation_ctx *ctx,
+			 struct ttm_buffer_object **first_bo,
+			 bool *locked)
 {
-	struct ttm_bo_global *glob = bdev->glob;
-	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
 	struct ttm_buffer_object *bo = NULL;
-	bool locked = false;
-	unsigned i;
-	int ret;
+	int i;
 
-	spin_lock(&glob->lru_lock);
+	if (first_bo)
+		*first_bo = NULL;
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &man->lru[i], lru) {
-			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
+			bool busy = false;
+
+			if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
+							    &busy)) {
+				if (first_bo && !(*first_bo) && busy) {
+					ttm_bo_get(bo);
+					*first_bo = bo;
+				}
 				continue;
+			}
 
 			if (place && !bdev->driver->eviction_valuable(bo,
 								      place)) {
-				if (locked)
+				if (*locked)
 					reservation_object_unlock(bo->resv);
 				continue;
 			}
+
 			break;
 		}
 
@@ -818,9 +831,67 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 		bo = NULL;
 	}
 
+	return bo;
+}
+
+static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
+			       uint32_t mem_type,
+			       const struct ttm_place *place,
+			       struct ttm_operation_ctx *ctx,
+			       struct reservation_object *request_resv)
+{
+	struct ttm_bo_global *glob = bdev->glob;
+	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
+	struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
+	bool locked = false;
+	int ret;
+
+	spin_lock(&glob->lru_lock);
+	bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
+				      &locked);
 	if (!bo) {
+		struct ttm_operation_ctx busy_ctx;
+
 		spin_unlock(&glob->lru_lock);
-		return -EBUSY;
+		/* check if other user occupy memory too long time */
+		if (!first_bo || !request_resv || !request_resv->lock.ctx) {
+			if (first_bo)
+				ttm_bo_put(first_bo);
+			return -EBUSY;
+		}
+		if (first_bo->resv == request_resv) {
+			ttm_bo_put(first_bo);
+			return -EBUSY;
+		}
+		if (ctx->interruptible)
+			ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
+							  request_resv->lock.ctx);
+		else
+			ret = ww_mutex_lock(&first_bo->resv->lock, request_resv->lock.ctx);
+		if (ret) {
+			ttm_bo_put(first_bo);
+			return ret;
+		}
+		spin_lock(&glob->lru_lock);
+		/* previous busy resv lock is held by above, idle now,
+		 * so let them evictable.
+		 */
+		busy_ctx.interruptible = ctx->interruptible;
+		busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
+		busy_ctx.resv	       = first_bo->resv;
+		busy_ctx.flags	       = TTM_OPT_FLAG_ALLOW_RES_EVICT;
+
+		bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, NULL,
+					      &locked);
+		if (bo && (bo->resv == first_bo->resv))
+			locked = true;
+		else if (bo)
+			ww_mutex_unlock(&first_bo->resv->lock);
+		if (!bo) {
+			spin_unlock(&glob->lru_lock);
+			ttm_bo_put(first_bo);
+			return -EBUSY;
+		}
 	}
 
 	kref_get(&bo->list_kref);
@@ -829,11 +900,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 		ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
 					  ctx->no_wait_gpu, locked);
 		kref_put(&bo->list_kref, ttm_bo_release_list);
+		if (first_bo)
+			ttm_bo_put(first_bo);
 		return ret;
 	}
 
 	ttm_bo_del_from_lru(bo);
 	spin_unlock(&glob->lru_lock);
+	if (first_bo)
+		ttm_bo_put(first_bo);
 
 	ret = ttm_bo_evict(bo, ctx);
 	if (locked) {
@@ -907,7 +982,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 			return ret;
 		if (mem->mm_node)
 			break;
-		ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
+		ret = ttm_mem_evict_first(bdev, mem_type, place, ctx, bo->resv);
 		if (unlikely(ret != 0))
 			return ret;
 	} while (1);
@@ -1413,7 +1488,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		while (!list_empty(&man->lru[i])) {
 			spin_unlock(&glob->lru_lock);
-			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
+			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
+						  NULL);
 			if (ret)
 				return ret;
 			spin_lock(&glob->lru_lock);
@@ -1784,7 +1860,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
 	spin_lock(&glob->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
-			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
+			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
+							   NULL)) {
 				ret = 0;
 				break;
 			}
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve
  2019-05-07 11:45 [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7 Chunming Zhou
@ 2019-05-07 11:45 ` Chunming Zhou
  2019-05-13  2:45   ` Liang, Prike
  2019-05-07 11:51 ` [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7 Christian König
  2019-05-09 14:28 ` Koenig, Christian
  2 siblings, 1 reply; 9+ messages in thread
From: Chunming Zhou @ 2019-05-07 11:45 UTC (permalink / raw)
  To: Christian.Koenig, Prike.Liang, dri-devel

add ticket for display bo, so that it can preempt busy bo.

Change-Id: I9f031cdcc8267de00e819ae303baa0a52df8ebb9
Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index ac22f7351a42..3f36770946ab 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4176,6 +4176,9 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	struct amdgpu_device *adev;
 	struct amdgpu_bo *rbo;
 	struct dm_plane_state *dm_plane_state_new, *dm_plane_state_old;
+	struct list_head list;
+	struct ttm_validate_buffer tv;
+	struct ww_acquire_ctx ticket;
 	uint64_t tiling_flags;
 	uint32_t domain;
 	int r;
@@ -4192,9 +4195,17 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	obj = new_state->fb->obj[0];
 	rbo = gem_to_amdgpu_bo(obj);
 	adev = amdgpu_ttm_adev(rbo->tbo.bdev);
-	r = amdgpu_bo_reserve(rbo, false);
-	if (unlikely(r != 0))
+	INIT_LIST_HEAD(&list);
+
+	tv.bo = &rbo->tbo;
+	tv.num_shared = 1;
+	list_add(&tv.head, &list);
+
+	r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL);
+	if (r) {
+		dev_err(adev->dev, "fail to reserve bo (%d)\n", r);
 		return r;
+	}
 
 	if (plane->type != DRM_PLANE_TYPE_CURSOR)
 		domain = amdgpu_display_supported_domains(adev);
@@ -4205,21 +4216,21 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	if (unlikely(r != 0)) {
 		if (r != -ERESTARTSYS)
 			DRM_ERROR("Failed to pin framebuffer with error %d\n", r);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		return r;
 	}
 
 	r = amdgpu_ttm_alloc_gart(&rbo->tbo);
 	if (unlikely(r != 0)) {
 		amdgpu_bo_unpin(rbo);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		DRM_ERROR("%p bind failed\n", rbo);
 		return r;
 	}
 
 	amdgpu_bo_get_tiling_flags(rbo, &tiling_flags);
 
-	amdgpu_bo_unreserve(rbo);
+	ttm_eu_backoff_reservation(&ticket, &list);
 
 	afb->address = amdgpu_bo_gpu_offset(rbo);
 
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7
  2019-05-07 11:45 [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7 Chunming Zhou
  2019-05-07 11:45 ` [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve Chunming Zhou
@ 2019-05-07 11:51 ` Christian König
  2019-05-09 14:28 ` Koenig, Christian
  2 siblings, 0 replies; 9+ messages in thread
From: Christian König @ 2019-05-07 11:51 UTC (permalink / raw)
  To: Chunming Zhou, Christian.Koenig, Prike.Liang, dri-devel

Am 07.05.19 um 13:45 schrieb Chunming Zhou:
> heavy gpu job could occupy memory long time, which lead other user fail to get memory.
>
> basically pick up Christian idea:
>
> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
> 2. If we then run into this EBUSY condition in TTM check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
> 3. If we have a ticket we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
> 4. If getting the reservation lock with the ticket succeeded we check if the BO is still the first one on the LRU in question (the BO could have moved).
> 5. If the BO is still the first one on the LRU in question we try to evict it as we would evict any other BO.
> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>
> v2: fix some minor check
> v3: address Christian v2 comments.
> v4: fix some missing
> v5: handle first_bo unlock and bo_get/put
> v6: abstract unified iterate function, and handle all possible usecase not only pinned bo.
> v7: pass request bo->resv to ttm_bo_evict_first
>
> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

Please leave me a note when this hits amd-staging-drm-next, because I need 
to test why it didn't help with Marek's problem.

Christian.

> ---
>   drivers/gpu/drm/ttm/ttm_bo.c | 111 +++++++++++++++++++++++++++++------
>   1 file changed, 94 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 8502b3ed2d88..f5e6328e4a57 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>    * b. Otherwise, trylock it.
>    */
>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> -			struct ttm_operation_ctx *ctx, bool *locked)
> +			struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>   {
>   	bool ret = false;
>   
>   	*locked = false;
> +	if (busy)
> +		*busy = false;
>   	if (bo->resv == ctx->resv) {
>   		reservation_object_assert_held(bo->resv);
>   		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
> @@ -779,35 +781,46 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>   	} else {
>   		*locked = reservation_object_trylock(bo->resv);
>   		ret = *locked;
> +		if (!ret && busy)
> +			*busy = true;
>   	}
>   
>   	return ret;
>   }
>   
> -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> -			       uint32_t mem_type,
> -			       const struct ttm_place *place,
> -			       struct ttm_operation_ctx *ctx)
> +static struct ttm_buffer_object*
> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
> +			 struct ttm_mem_type_manager *man,
> +			 const struct ttm_place *place,
> +			 struct ttm_operation_ctx *ctx,
> +			 struct ttm_buffer_object **first_bo,
> +			 bool *locked)
>   {
> -	struct ttm_bo_global *glob = bdev->glob;
> -	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>   	struct ttm_buffer_object *bo = NULL;
> -	bool locked = false;
> -	unsigned i;
> -	int ret;
> +	int i;
>   
> -	spin_lock(&glob->lru_lock);
> +	if (first_bo)
> +		*first_bo = NULL;
>   	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>   		list_for_each_entry(bo, &man->lru[i], lru) {
> -			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
> +			bool busy = false;
> +
> +			if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
> +							    &busy)) {
> +				if (first_bo && !(*first_bo) && busy) {
> +					ttm_bo_get(bo);
> +					*first_bo = bo;
> +				}
>   				continue;
> +			}
>   
>   			if (place && !bdev->driver->eviction_valuable(bo,
>   								      place)) {
> -				if (locked)
> +				if (*locked)
>   					reservation_object_unlock(bo->resv);
>   				continue;
>   			}
> +
>   			break;
>   		}
>   
> @@ -818,9 +831,67 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>   		bo = NULL;
>   	}
>   
> +	return bo;
> +}
> +
> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> +			       uint32_t mem_type,
> +			       const struct ttm_place *place,
> +			       struct ttm_operation_ctx *ctx,
> +			       struct reservation_object *request_resv)
> +{
> +	struct ttm_bo_global *glob = bdev->glob;
> +	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> +	struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
> +	bool locked = false;
> +	int ret;
> +
> +	spin_lock(&glob->lru_lock);
> +	bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
> +				      &locked);
>   	if (!bo) {
> +		struct ttm_operation_ctx busy_ctx;
> +
>   		spin_unlock(&glob->lru_lock);
> -		return -EBUSY;
> +		/* check if other user occupy memory too long time */
> +		if (!first_bo || !request_resv || !request_resv->lock.ctx) {
> +			if (first_bo)
> +				ttm_bo_put(first_bo);
> +			return -EBUSY;
> +		}
> +		if (first_bo->resv == request_resv) {
> +			ttm_bo_put(first_bo);
> +			return -EBUSY;
> +		}
> +		if (ctx->interruptible)
> +			ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
> +							  request_resv->lock.ctx);
> +		else
> +			ret = ww_mutex_lock(&first_bo->resv->lock, request_resv->lock.ctx);
> +		if (ret) {
> +			ttm_bo_put(first_bo);
> +			return ret;
> +		}
> +		spin_lock(&glob->lru_lock);
> +		/* previous busy resv lock is held by above, idle now,
> +		 * so let them evictable.
> +		 */
> +		busy_ctx.interruptible = ctx->interruptible;
> +		busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
> +		busy_ctx.resv	       = first_bo->resv;
> +		busy_ctx.flags	       = TTM_OPT_FLAG_ALLOW_RES_EVICT;
> +
> +		bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, NULL,
> +					      &locked);
> +		if (bo && (bo->resv == first_bo->resv))
> +			locked = true;
> +		else if (bo)
> +			ww_mutex_unlock(&first_bo->resv->lock);
> +		if (!bo) {
> +			spin_unlock(&glob->lru_lock);
> +			ttm_bo_put(first_bo);
> +			return -EBUSY;
> +		}
>   	}
>   
>   	kref_get(&bo->list_kref);
> @@ -829,11 +900,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>   		ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>   					  ctx->no_wait_gpu, locked);
>   		kref_put(&bo->list_kref, ttm_bo_release_list);
> +		if (first_bo)
> +			ttm_bo_put(first_bo);
>   		return ret;
>   	}
>   
>   	ttm_bo_del_from_lru(bo);
>   	spin_unlock(&glob->lru_lock);
> +	if (first_bo)
> +		ttm_bo_put(first_bo);
>   
>   	ret = ttm_bo_evict(bo, ctx);
>   	if (locked) {
> @@ -907,7 +982,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>   			return ret;
>   		if (mem->mm_node)
>   			break;
> -		ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
> +		ret = ttm_mem_evict_first(bdev, mem_type, place, ctx, bo->resv);
>   		if (unlikely(ret != 0))
>   			return ret;
>   	} while (1);
> @@ -1413,7 +1488,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
>   	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>   		while (!list_empty(&man->lru[i])) {
>   			spin_unlock(&glob->lru_lock);
> -			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
> +			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
> +						  NULL);
>   			if (ret)
>   				return ret;
>   			spin_lock(&glob->lru_lock);
> @@ -1784,7 +1860,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>   	spin_lock(&glob->lru_lock);
>   	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>   		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> -			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
> +			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +							   NULL)) {
>   				ret = 0;
>   				break;
>   			}

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7
  2019-05-07 11:45 [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7 Chunming Zhou
  2019-05-07 11:45 ` [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve Chunming Zhou
  2019-05-07 11:51 ` [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7 Christian König
@ 2019-05-09 14:28 ` Koenig, Christian
  2019-05-09 14:46   ` Zhou, David(ChunMing)
  2 siblings, 1 reply; 9+ messages in thread
From: Koenig, Christian @ 2019-05-09 14:28 UTC (permalink / raw)
  To: Zhou, David(ChunMing), Liang, Prike, dri-devel

I've found one more problem with this.

With lockdep enabled I get a warning because ttm_eu_reserve_buffers() 
has called ww_acquire_done() on the ticket (which essentially means we 
are done, no more locking with that ticket).

The simplest solution is probably to just remove the call to 
ww_acquire_done() from ttm_eu_reserve_buffers().

Christian.

Am 07.05.19 um 13:45 schrieb Chunming Zhou:
> heavy gpu job could occupy memory long time, which lead other user fail to get memory.
>
> basically pick up Christian idea:
>
> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
> 2. If we then run into this EBUSY condition in TTM check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
> 3. If we have a ticket we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
> 4. If getting the reservation lock with the ticket succeeded we check if the BO is still the first one on the LRU in question (the BO could have moved).
> 5. If the BO is still the first one on the LRU in question we try to evict it as we would evict any other BO.
> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>
> v2: fix some minor check
> v3: address Christian v2 comments.
> v4: fix some missing
> v5: handle first_bo unlock and bo_get/put
> v6: abstract unified iterate function, and handle all possible usecase not only pinned bo.
> v7: pass request bo->resv to ttm_bo_evict_first
>
> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c | 111 +++++++++++++++++++++++++++++------
>   1 file changed, 94 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 8502b3ed2d88..f5e6328e4a57 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>    * b. Otherwise, trylock it.
>    */
>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> -			struct ttm_operation_ctx *ctx, bool *locked)
> +			struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>   {
>   	bool ret = false;
>   
>   	*locked = false;
> +	if (busy)
> +		*busy = false;
>   	if (bo->resv == ctx->resv) {
>   		reservation_object_assert_held(bo->resv);
>   		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
> @@ -779,35 +781,46 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>   	} else {
>   		*locked = reservation_object_trylock(bo->resv);
>   		ret = *locked;
> +		if (!ret && busy)
> +			*busy = true;
>   	}
>   
>   	return ret;
>   }
>   
> -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> -			       uint32_t mem_type,
> -			       const struct ttm_place *place,
> -			       struct ttm_operation_ctx *ctx)
> +static struct ttm_buffer_object*
> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
> +			 struct ttm_mem_type_manager *man,
> +			 const struct ttm_place *place,
> +			 struct ttm_operation_ctx *ctx,
> +			 struct ttm_buffer_object **first_bo,
> +			 bool *locked)
>   {
> -	struct ttm_bo_global *glob = bdev->glob;
> -	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>   	struct ttm_buffer_object *bo = NULL;
> -	bool locked = false;
> -	unsigned i;
> -	int ret;
> +	int i;
>   
> -	spin_lock(&glob->lru_lock);
> +	if (first_bo)
> +		*first_bo = NULL;
>   	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>   		list_for_each_entry(bo, &man->lru[i], lru) {
> -			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
> +			bool busy = false;
> +
> +			if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
> +							    &busy)) {
> +				if (first_bo && !(*first_bo) && busy) {
> +					ttm_bo_get(bo);
> +					*first_bo = bo;
> +				}
>   				continue;
> +			}
>   
>   			if (place && !bdev->driver->eviction_valuable(bo,
>   								      place)) {
> -				if (locked)
> +				if (*locked)
>   					reservation_object_unlock(bo->resv);
>   				continue;
>   			}
> +
>   			break;
>   		}
>   
> @@ -818,9 +831,67 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>   		bo = NULL;
>   	}
>   
> +	return bo;
> +}
> +
> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> +			       uint32_t mem_type,
> +			       const struct ttm_place *place,
> +			       struct ttm_operation_ctx *ctx,
> +			       struct reservation_object *request_resv)
> +{
> +	struct ttm_bo_global *glob = bdev->glob;
> +	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> +	struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
> +	bool locked = false;
> +	int ret;
> +
> +	spin_lock(&glob->lru_lock);
> +	bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
> +				      &locked);
>   	if (!bo) {
> +		struct ttm_operation_ctx busy_ctx;
> +
>   		spin_unlock(&glob->lru_lock);
> -		return -EBUSY;
> +		/* check if other user occupy memory too long time */
> +		if (!first_bo || !request_resv || !request_resv->lock.ctx) {
> +			if (first_bo)
> +				ttm_bo_put(first_bo);
> +			return -EBUSY;
> +		}
> +		if (first_bo->resv == request_resv) {
> +			ttm_bo_put(first_bo);
> +			return -EBUSY;
> +		}
> +		if (ctx->interruptible)
> +			ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
> +							  request_resv->lock.ctx);
> +		else
> +			ret = ww_mutex_lock(&first_bo->resv->lock, request_resv->lock.ctx);
> +		if (ret) {
> +			ttm_bo_put(first_bo);
> +			return ret;
> +		}
> +		spin_lock(&glob->lru_lock);
> +		/* previous busy resv lock is held by above, idle now,
> +		 * so let them evictable.
> +		 */
> +		busy_ctx.interruptible = ctx->interruptible;
> +		busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
> +		busy_ctx.resv	       = first_bo->resv;
> +		busy_ctx.flags	       = TTM_OPT_FLAG_ALLOW_RES_EVICT;
> +
> +		bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, NULL,
> +					      &locked);
> +		if (bo && (bo->resv == first_bo->resv))
> +			locked = true;
> +		else if (bo)
> +			ww_mutex_unlock(&first_bo->resv->lock);
> +		if (!bo) {
> +			spin_unlock(&glob->lru_lock);
> +			ttm_bo_put(first_bo);
> +			return -EBUSY;
> +		}
>   	}
>   
>   	kref_get(&bo->list_kref);
> @@ -829,11 +900,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>   		ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>   					  ctx->no_wait_gpu, locked);
>   		kref_put(&bo->list_kref, ttm_bo_release_list);
> +		if (first_bo)
> +			ttm_bo_put(first_bo);
>   		return ret;
>   	}
>   
>   	ttm_bo_del_from_lru(bo);
>   	spin_unlock(&glob->lru_lock);
> +	if (first_bo)
> +		ttm_bo_put(first_bo);
>   
>   	ret = ttm_bo_evict(bo, ctx);
>   	if (locked) {
> @@ -907,7 +982,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>   			return ret;
>   		if (mem->mm_node)
>   			break;
> -		ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
> +		ret = ttm_mem_evict_first(bdev, mem_type, place, ctx, bo->resv);
>   		if (unlikely(ret != 0))
>   			return ret;
>   	} while (1);
> @@ -1413,7 +1488,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
>   	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>   		while (!list_empty(&man->lru[i])) {
>   			spin_unlock(&glob->lru_lock);
> -			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
> +			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
> +						  NULL);
>   			if (ret)
>   				return ret;
>   			spin_lock(&glob->lru_lock);
> @@ -1784,7 +1860,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>   	spin_lock(&glob->lru_lock);
>   	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>   		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> -			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
> +			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +							   NULL)) {
>   				ret = 0;
>   				break;
>   			}

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re:[PATCH 1/2] drm/ttm: fix busy memory to fail other user v7
  2019-05-09 14:28 ` Koenig, Christian
@ 2019-05-09 14:46   ` Zhou, David(ChunMing)
  2019-05-09 14:59     ` [PATCH " Christian König
  0 siblings, 1 reply; 9+ messages in thread
From: Zhou, David(ChunMing) @ 2019-05-09 14:46 UTC (permalink / raw)
  To: Koenig, Christian, Zhou, David(ChunMing), Liang, Prike, dri-devel


[-- Attachment #1.1: Type: text/plain, Size: 10860 bytes --]

I know that before, it will issue warning only when debug option is enabled. Removing that is ok to me.
I only helped Prike draft your idea, and Prike is trying this patch on his side. The latest feedback he gave me is that first_bo is always null and the code doesn't run into the busy path, which is very confusing to me, and he said he is debugging that.

-David


-------- Original Message --------
Subject: Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7
From: "Koenig, Christian"
To: "Zhou, David(ChunMing)" ,"Liang, Prike" ,dri-devel@lists.freedesktop.org
CC:

I've found one more problem with this.

With lockdep enabled I get a warning because ttm_eu_reserve_buffers()
has called ww_acquire_done() on the ticket (which essentially means we
are done, no more locking with that ticket).

The simplest solution is probably to just remove the call to
ww_acquire_done() from ttm_eu_reserve_buffers().

Christian.

Am 07.05.19 um 13:45 schrieb Chunming Zhou:
> heavy gpu job could occupy memory long time, which lead other user fail to get memory.
>
> basically pick up Christian idea:
>
> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
> 2. If we then run into this EBUSY condition in TTM check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
> 3. If we have a ticket we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
> 4. If getting the reservation lock with the ticket succeeded we check if the BO is still the first one on the LRU in question (the BO could have moved).
> 5. If the BO is still the first one on the LRU in question we try to evict it as we would evict any other BO.
> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>
> v2: fix some minor check
> v3: address Christian v2 comments.
> v4: fix some missing
> v5: handle first_bo unlock and bo_get/put
> v6: abstract unified iterate function, and handle all possible usecase not only pinned bo.
> v7: pass request bo->resv to ttm_bo_evict_first
>
> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
> Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c | 111 +++++++++++++++++++++++++++++------
>   1 file changed, 94 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 8502b3ed2d88..f5e6328e4a57 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>    * b. Otherwise, trylock it.
>    */
>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> -                     struct ttm_operation_ctx *ctx, bool *locked)
> +                     struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>   {
>        bool ret = false;
>
>        *locked = false;
> +     if (busy)
> +             *busy = false;
>        if (bo->resv == ctx->resv) {
>                reservation_object_assert_held(bo->resv);
>                if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
> @@ -779,35 +781,46 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>        } else {
>                *locked = reservation_object_trylock(bo->resv);
>                ret = *locked;
> +             if (!ret && busy)
> +                     *busy = true;
>        }
>
>        return ret;
>   }
>
> -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> -                            uint32_t mem_type,
> -                            const struct ttm_place *place,
> -                            struct ttm_operation_ctx *ctx)
> +static struct ttm_buffer_object*
> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
> +                      struct ttm_mem_type_manager *man,
> +                      const struct ttm_place *place,
> +                      struct ttm_operation_ctx *ctx,
> +                      struct ttm_buffer_object **first_bo,
> +                      bool *locked)
>   {
> -     struct ttm_bo_global *glob = bdev->glob;
> -     struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>        struct ttm_buffer_object *bo = NULL;
> -     bool locked = false;
> -     unsigned i;
> -     int ret;
> +     int i;
>
> -     spin_lock(&glob->lru_lock);
> +     if (first_bo)
> +             *first_bo = NULL;
>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                list_for_each_entry(bo, &man->lru[i], lru) {
> -                     if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
> +                     bool busy = false;
> +
> +                     if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
> +                                                         &busy)) {
> +                             if (first_bo && !(*first_bo) && busy) {
> +                                     ttm_bo_get(bo);
> +                                     *first_bo = bo;
> +                             }
>                                continue;
> +                     }
>
>                        if (place && !bdev->driver->eviction_valuable(bo,
>                                                                      place)) {
> -                             if (locked)
> +                             if (*locked)
>                                        reservation_object_unlock(bo->resv);
>                                continue;
>                        }
> +
>                        break;
>                }
>
> @@ -818,9 +831,67 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>                bo = NULL;
>        }
>
> +     return bo;
> +}
> +
> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> +                            uint32_t mem_type,
> +                            const struct ttm_place *place,
> +                            struct ttm_operation_ctx *ctx,
> +                            struct reservation_object *request_resv)
> +{
> +     struct ttm_bo_global *glob = bdev->glob;
> +     struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> +     struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
> +     bool locked = false;
> +     int ret;
> +
> +     spin_lock(&glob->lru_lock);
> +     bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
> +                                   &locked);
>        if (!bo) {
> +             struct ttm_operation_ctx busy_ctx;
> +
>                spin_unlock(&glob->lru_lock);
> -             return -EBUSY;
> +             /* check if other user occupy memory too long time */
> +             if (!first_bo || !request_resv || !request_resv->lock.ctx) {
> +                     if (first_bo)
> +                             ttm_bo_put(first_bo);
> +                     return -EBUSY;
> +             }
> +             if (first_bo->resv == request_resv) {
> +                     ttm_bo_put(first_bo);
> +                     return -EBUSY;
> +             }
> +             if (ctx->interruptible)
> +                     ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
> +                                                       request_resv->lock.ctx);
> +             else
> +                     ret = ww_mutex_lock(&first_bo->resv->lock, request_resv->lock.ctx);
> +             if (ret) {
> +                     ttm_bo_put(first_bo);
> +                     return ret;
> +             }
> +             spin_lock(&glob->lru_lock);
> +             /* previous busy resv lock is held by above, idle now,
> +              * so let them evictable.
> +              */
> +             busy_ctx.interruptible = ctx->interruptible;
> +             busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
> +             busy_ctx.resv          = first_bo->resv;
> +             busy_ctx.flags         = TTM_OPT_FLAG_ALLOW_RES_EVICT;
> +
> +             bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, NULL,
> +                                           &locked);
> +             if (bo && (bo->resv == first_bo->resv))
> +                     locked = true;
> +             else if (bo)
> +                     ww_mutex_unlock(&first_bo->resv->lock);
> +             if (!bo) {
> +                     spin_unlock(&glob->lru_lock);
> +                     ttm_bo_put(first_bo);
> +                     return -EBUSY;
> +             }
>        }
>
>        kref_get(&bo->list_kref);
> @@ -829,11 +900,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>                ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>                                          ctx->no_wait_gpu, locked);
>                kref_put(&bo->list_kref, ttm_bo_release_list);
> +             if (first_bo)
> +                     ttm_bo_put(first_bo);
>                return ret;
>        }
>
>        ttm_bo_del_from_lru(bo);
>        spin_unlock(&glob->lru_lock);
> +     if (first_bo)
> +             ttm_bo_put(first_bo);
>
>        ret = ttm_bo_evict(bo, ctx);
>        if (locked) {
> @@ -907,7 +982,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>                        return ret;
>                if (mem->mm_node)
>                        break;
> -             ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
> +             ret = ttm_mem_evict_first(bdev, mem_type, place, ctx, bo->resv);
>                if (unlikely(ret != 0))
>                        return ret;
>        } while (1);
> @@ -1413,7 +1488,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                while (!list_empty(&man->lru[i])) {
>                        spin_unlock(&glob->lru_lock);
> -                     ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
> +                     ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
> +                                               NULL);
>                        if (ret)
>                                return ret;
>                        spin_lock(&glob->lru_lock);
> @@ -1784,7 +1860,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>        spin_lock(&glob->lru_lock);
>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> -                     if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
> +                     if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +                                                        NULL)) {
>                                ret = 0;
>                                break;
>                        }


[-- Attachment #1.2: Type: text/html, Size: 27372 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7
  2019-05-09 14:46   ` Zhou, David(ChunMing)
@ 2019-05-09 14:59     ` Christian König
  2019-05-10  5:39       ` Liang, Prike
  0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2019-05-09 14:59 UTC (permalink / raw)
  To: Zhou, David(ChunMing), Koenig, Christian, Liang, Prike, dri-devel


[-- Attachment #1.1: Type: text/plain, Size: 11244 bytes --]

Oh, I know where this is coming from.

The problem is that we remove the BOs from the LRU during CS and so we 
can't wait for the CS to finish up.

Already working on this problem for Marek's similar issue,
Christian.

Am 09.05.19 um 16:46 schrieb Zhou, David(ChunMing):
> I know that before, it will issue warning only when debug option is 
> enabled. Removing that is ok to me.
> I only help Prike draft your idea, and Prike is trying this patch on 
> his side. The latest feedback he gave me is first_bo is always null, 
> code doesn't run into busy path, which is very confusing me, and he 
> said  he is debugging  that.
>
> -David
>
>
> -------- Original Message --------
> Subject: Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7
> From: "Koenig, Christian"
> To: "Zhou, David(ChunMing)" ,"Liang, Prike" 
> ,dri-devel@lists.freedesktop.org
> CC:
>
> I've found one more problem with this.
>
> With lockdep enabled I get a warning because ttm_eu_reserve_buffers()
> has called ww_acquire_done() on the ticket (which essentially means we
> are done, no more locking with that ticket).
>
> The simplest solution is probably to just remove the call to
> ww_acquire_done() from ttm_eu_reserve_buffers().
>
> Christian.
>
> Am 07.05.19 um 13:45 schrieb Chunming Zhou:
> > heavy gpu job could occupy memory long time, which lead other user 
> fail to get memory.
> >
> > basically pick up Christian idea:
> >
> > 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
> > 2. If we then run into this EBUSY condition in TTM check if the BO 
> we need memory for (or rather the ww_mutex of its reservation object) 
> has a ticket assigned.
> > 3. If we have a ticket we grab a reference to the first BO on the 
> LRU, drop the LRU lock and try to grab the reservation lock with the 
> ticket.
> > 4. If getting the reservation lock with the ticket succeeded we 
> check if the BO is still the first one on the LRU in question (the BO 
> could have moved).
> > 5. If the BO is still the first one on the LRU in question we try to 
> evict it as we would evict any other BO.
> > 6. If any of the "If's" above fail we just back off and return -EBUSY.
> >
> > v2: fix some minor check
> > v3: address Christian v2 comments.
> > v4: fix some missing
> > v5: handle first_bo unlock and bo_get/put
> > v6: abstract unified iterate function, and handle all possible 
> usecase not only pinned bo.
> > v7: pass request bo->resv to ttm_bo_evict_first
> >
> > Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
> > Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
> > ---
> >   drivers/gpu/drm/ttm/ttm_bo.c | 111 +++++++++++++++++++++++++++++------
> >   1 file changed, 94 insertions(+), 17 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> > index 8502b3ed2d88..f5e6328e4a57 100644
> > --- a/drivers/gpu/drm/ttm/ttm_bo.c
> > +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> > @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
> >    * b. Otherwise, trylock it.
> >    */
> >   static bool ttm_bo_evict_swapout_allowable(struct 
> ttm_buffer_object *bo,
> > -                     struct ttm_operation_ctx *ctx, bool *locked)
> > +                     struct ttm_operation_ctx *ctx, bool *locked, 
> bool *busy)
> >   {
> >        bool ret = false;
> >
> >        *locked = false;
> > +     if (busy)
> > +             *busy = false;
> >        if (bo->resv == ctx->resv) {
> > reservation_object_assert_held(bo->resv);
> >                if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
> > @@ -779,35 +781,46 @@ static bool 
> ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> >        } else {
> >                *locked = reservation_object_trylock(bo->resv);
> >                ret = *locked;
> > +             if (!ret && busy)
> > +                     *busy = true;
> >        }
> >
> >        return ret;
> >   }
> >
> > -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> > -                            uint32_t mem_type,
> > -                            const struct ttm_place *place,
> > -                            struct ttm_operation_ctx *ctx)
> > +static struct ttm_buffer_object*
> > +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
> > +                      struct ttm_mem_type_manager *man,
> > +                      const struct ttm_place *place,
> > +                      struct ttm_operation_ctx *ctx,
> > +                      struct ttm_buffer_object **first_bo,
> > +                      bool *locked)
> >   {
> > -     struct ttm_bo_global *glob = bdev->glob;
> > -     struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> >        struct ttm_buffer_object *bo = NULL;
> > -     bool locked = false;
> > -     unsigned i;
> > -     int ret;
> > +     int i;
> >
> > -     spin_lock(&glob->lru_lock);
> > +     if (first_bo)
> > +             *first_bo = NULL;
> >        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> >                list_for_each_entry(bo, &man->lru[i], lru) {
> > -                     if (!ttm_bo_evict_swapout_allowable(bo, ctx, 
> &locked))
> > +                     bool busy = false;
> > +
> > +                     if (!ttm_bo_evict_swapout_allowable(bo, ctx, 
> locked,
> > + &busy)) {
> > +                             if (first_bo && !(*first_bo) && busy) {
> > +                                     ttm_bo_get(bo);
> > +                                     *first_bo = bo;
> > +                             }
> >                                continue;
> > +                     }
> >
> >                        if (place && !bdev->driver->eviction_valuable(bo,
> >                                                                     place)) {
> > -                             if (locked)
> > +                             if (*locked)
> > reservation_object_unlock(bo->resv);
> >                                continue;
> >                        }
> > +
> >                        break;
> >                }
> >
> > @@ -818,9 +831,67 @@ static int ttm_mem_evict_first(struct 
> ttm_bo_device *bdev,
> >                bo = NULL;
> >        }
> >
> > +     return bo;
> > +}
> > +
> > +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> > +                            uint32_t mem_type,
> > +                            const struct ttm_place *place,
> > +                            struct ttm_operation_ctx *ctx,
> > +                            struct reservation_object *request_resv)
> > +{
> > +     struct ttm_bo_global *glob = bdev->glob;
> > +     struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> > +     struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
> > +     bool locked = false;
> > +     int ret;
> > +
> > +     spin_lock(&glob->lru_lock);
> > +     bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
> > +                                   &locked);
> >        if (!bo) {
> > +             struct ttm_operation_ctx busy_ctx;
> > +
> >                spin_unlock(&glob->lru_lock);
> > -             return -EBUSY;
> > +             /* check if other user occupy memory too long time */
> > +             if (!first_bo || !request_resv || 
> !request_resv->lock.ctx) {
> > +                     if (first_bo)
> > +                             ttm_bo_put(first_bo);
> > +                     return -EBUSY;
> > +             }
> > +             if (first_bo->resv == request_resv) {
> > +                     ttm_bo_put(first_bo);
> > +                     return -EBUSY;
> > +             }
> > +             if (ctx->interruptible)
> > +                     ret = 
> ww_mutex_lock_interruptible(&first_bo->resv->lock,
> > + request_resv->lock.ctx);
> > +             else
> > +                     ret = ww_mutex_lock(&first_bo->resv->lock, 
> request_resv->lock.ctx);
> > +             if (ret) {
> > +                     ttm_bo_put(first_bo);
> > +                     return ret;
> > +             }
> > +             spin_lock(&glob->lru_lock);
> > +             /* previous busy resv lock is held by above, idle now,
> > +              * so let them evictable.
> > +              */
> > +             busy_ctx.interruptible = ctx->interruptible;
> > +             busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
> > +             busy_ctx.resv          = first_bo->resv;
> > +             busy_ctx.flags         = TTM_OPT_FLAG_ALLOW_RES_EVICT;
> > +
> > +             bo = ttm_mem_find_evitable_bo(bdev, man, place, 
> &busy_ctx, NULL,
> > + &locked);
> > +             if (bo && (bo->resv == first_bo->resv))
> > +                     locked = true;
> > +             else if (bo)
> > + ww_mutex_unlock(&first_bo->resv->lock);
> > +             if (!bo) {
> > + spin_unlock(&glob->lru_lock);
> > +                     ttm_bo_put(first_bo);
> > +                     return -EBUSY;
> > +             }
> >        }
> >
> >        kref_get(&bo->list_kref);
> > @@ -829,11 +900,15 @@ static int ttm_mem_evict_first(struct 
> ttm_bo_device *bdev,
> >                ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
> > ctx->no_wait_gpu, locked);
> >                kref_put(&bo->list_kref, ttm_bo_release_list);
> > +             if (first_bo)
> > +                     ttm_bo_put(first_bo);
> >                return ret;
> >        }
> >
> >        ttm_bo_del_from_lru(bo);
> >        spin_unlock(&glob->lru_lock);
> > +     if (first_bo)
> > +             ttm_bo_put(first_bo);
> >
> >        ret = ttm_bo_evict(bo, ctx);
> >        if (locked) {
> > @@ -907,7 +982,7 @@ static int ttm_bo_mem_force_space(struct 
> ttm_buffer_object *bo,
> >                        return ret;
> >                if (mem->mm_node)
> >                        break;
> > -             ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
> > +             ret = ttm_mem_evict_first(bdev, mem_type, place, ctx, 
> bo->resv);
> >                if (unlikely(ret != 0))
> >                        return ret;
> >        } while (1);
> > @@ -1413,7 +1488,8 @@ static int ttm_bo_force_list_clean(struct 
> ttm_bo_device *bdev,
> >        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> >                while (!list_empty(&man->lru[i])) {
> > spin_unlock(&glob->lru_lock);
> > -                     ret = ttm_mem_evict_first(bdev, mem_type, 
> NULL, &ctx);
> > +                     ret = ttm_mem_evict_first(bdev, mem_type, 
> NULL, &ctx,
> > +                                               NULL);
> >                        if (ret)
> >                                return ret;
> > spin_lock(&glob->lru_lock);
> > @@ -1784,7 +1860,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, 
> struct ttm_operation_ctx *ctx)
> >        spin_lock(&glob->lru_lock);
> >        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> >                list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> > -                     if (ttm_bo_evict_swapout_allowable(bo, ctx, 
> &locked)) {
> > +                     if (ttm_bo_evict_swapout_allowable(bo, ctx, 
> &locked,
> > + NULL)) {
> >                                ret = 0;
> >                                break;
> >                        }
>
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel


[-- Attachment #1.2: Type: text/html, Size: 18334 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7
  2019-05-09 14:59     ` [PATCH " Christian König
@ 2019-05-10  5:39       ` Liang, Prike
  2019-05-13  3:00         ` Liang, Prike
  0 siblings, 1 reply; 9+ messages in thread
From: Liang, Prike @ 2019-05-10  5:39 UTC (permalink / raw)
  To: Koenig, Christian, Zhou, David(ChunMing), dri-devel


[-- Attachment #1.1: Type: text/plain, Size: 14081 bytes --]

Thanks to Christian for the proposal and to David for drafting the solution implementation.

The pinned-BO failure was not observed from prepare_fb, but the Abaqus job couldn't finish through the whole night.
Regarding the NULL first-BO EBUSY error case, it comes from the amdgpu_cs_bo_validate path, as the call stack below shows. Now the NULL first-BO debug error message pops up endlessly while Abaqus is running; it seems the function amdgpu_cs_validate runs into an amdgpu_cs_bo_validate dead loop.

lxj ttm_mem_evict_first first_bo=          (null),request_resv=ffff929d47b33218,request_resv->lock.ctx=ffff929b8d6bfbd8
[ 2703.091731] CPU: 3 PID: 10739 Comm: standard Kdump: loaded Tainted: G           OE  ------------   3.10.0-957.el7.x86_64 #1
[ 2703.103046] Hardware name: MSI MS-7984/Z170 KRAIT GAMING (MS-7984), BIOS B.80 05/11/2016
[ 2703.111181] Call Trace:
[ 2703.113745]  [<ffffffff81961dc1>] dump_stack+0x19/0x1b
[ 2703.118979]  [<ffffffffc055cd19>] ttm_mem_evict_first+0x3a9/0x400 [amdttm]
[ 2703.125974]  [<ffffffffc055d05b>] amdttm_bo_mem_space+0x2eb/0x4a0 [amdttm]
[ 2703.132967]  [<ffffffffc055d6e4>] amdttm_bo_validate+0xc4/0x140 [amdttm]
[ 2703.139827]  [<ffffffffc059fed5>] amdgpu_cs_bo_validate+0xa5/0x220 [amdgpu]
[ 2703.146879]  [<ffffffffc05a0097>] amdgpu_cs_validate+0x47/0x2e0 [amdgpu]
[ 2703.153776]  [<ffffffffc05b41a2>] ? amdgpu_vm_del_from_lru_notify+0x12/0x80 [amdgpu]
[ 2703.161707]  [<ffffffffc05a0050>] ? amdgpu_cs_bo_validate+0x220/0x220 [amdgpu]
[ 2703.169018]  [<ffffffffc05b4452>] amdgpu_vm_validate_pt_bos+0x92/0x140 [amdgpu]
[ 2703.176512]  [<ffffffffc05a23e5>] amdgpu_cs_ioctl+0x18a5/0x1d40 [amdgpu]
[ 2703.183372]  [<ffffffffc05a0b40>] ? amdgpu_cs_find_mapping+0x120/0x120 [amdgpu]
[ 2703.190815]  [<ffffffffc042df2c>] drm_ioctl_kernel+0x6c/0xb0 [drm]
[ 2703.197109]  [<ffffffffc042e647>] drm_ioctl+0x1e7/0x420 [drm]
[ 2703.202995]  [<ffffffffc05a0b40>] ? amdgpu_cs_find_mapping+0x120/0x120 [amdgpu]
[ 2703.210471]  [<ffffffffc058004b>] amdgpu_drm_ioctl+0x4b/0x80 [amdgpu]
[ 2703.217019]  [<ffffffff81456210>] do_vfs_ioctl+0x3a0/0x5a0
[ 2703.222596]  [<ffffffff8196744a>] ? __schedule+0x13a/0x890
[ 2703.228172]  [<ffffffff814564b1>] SyS_ioctl+0xa1/0xc0
[ 2703.233308]  [<ffffffff81974ddb>] system_call_fastpath+0x22/0x27

Thanks,
Prike
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Thursday, May 09, 2019 10:59 PM
To: Zhou, David(ChunMing) <David1.Zhou@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Liang, Prike <Prike.Liang@amd.com>; dri-devel@lists.freedesktop.org
Subject: Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7

[CAUTION: External Email]
Oh, I know where this is coming from.

The problem is that we remove the BOs from the LRU during CS and so we can't wait for the CS to finish up.

Already working on this problem for Marek's similar issue,
Christian.

Am 09.05.19 um 16:46 schrieb Zhou, David(ChunMing):
I know that before, it will issue warning only when debug option is enabled. Removing that is ok to me.
I only help Prike draft your idea, and Prike is trying this patch on his side. The latest feedback he gave me is first_bo is always null, code doesn't run into busy path, which is very confusing me, and he said  he is debugging  that.

-David


-------- Original Message --------
Subject: Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7
From: "Koenig, Christian"
To: "Zhou, David(ChunMing)" ,"Liang, Prike" ,dri-devel@lists.freedesktop.org<mailto:dri-devel@lists.freedesktop.org>
CC:
I've found one more problem with this.

With lockdep enabled I get a warning because ttm_eu_reserve_buffers()
has called ww_acquire_done() on the ticket (which essentially means we
are done, no more locking with that ticket).

The simplest solution is probably to just remove the call to
ww_acquire_done() from ttm_eu_reserve_buffers().

Christian.

Am 07.05.19 um 13:45 schrieb Chunming Zhou:
> heavy gpu job could occupy memory long time, which lead other user fail to get memory.
>
> basically pick up Christian idea:
>
> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
> 2. If we then run into this EBUSY condition in TTM check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
> 3. If we have a ticket we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
> 4. If getting the reservation lock with the ticket succeeded we check if the BO is still the first one on the LRU in question (the BO could have moved).
> 5. If the BO is still the first one on the LRU in question we try to evict it as we would evict any other BO.
> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>
> v2: fix some minor check
> v3: address Christian v2 comments.
> v4: fix some missing
> v5: handle first_bo unlock and bo_get/put
> v6: abstract unified iterate function, and handle all possible usecase not only pinned bo.
> v7: pass request bo->resv to ttm_bo_evict_first
>
> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
> Signed-off-by: Chunming Zhou <david1.zhou@amd.com><mailto:david1.zhou@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c | 111 +++++++++++++++++++++++++++++------
>   1 file changed, 94 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 8502b3ed2d88..f5e6328e4a57 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>    * b. Otherwise, trylock it.
>    */
>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> -                     struct ttm_operation_ctx *ctx, bool *locked)
> +                     struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>   {
>        bool ret = false;
>
>        *locked = false;
> +     if (busy)
> +             *busy = false;
>        if (bo->resv == ctx->resv) {
>                reservation_object_assert_held(bo->resv);
>                if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
> @@ -779,35 +781,46 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>        } else {
>                *locked = reservation_object_trylock(bo->resv);
>                ret = *locked;
> +             if (!ret && busy)
> +                     *busy = true;
>        }
>
>        return ret;
>   }
>
> -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> -                            uint32_t mem_type,
> -                            const struct ttm_place *place,
> -                            struct ttm_operation_ctx *ctx)
> +static struct ttm_buffer_object*
> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
> +                      struct ttm_mem_type_manager *man,
> +                      const struct ttm_place *place,
> +                      struct ttm_operation_ctx *ctx,
> +                      struct ttm_buffer_object **first_bo,
> +                      bool *locked)
>   {
> -     struct ttm_bo_global *glob = bdev->glob;
> -     struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>        struct ttm_buffer_object *bo = NULL;
> -     bool locked = false;
> -     unsigned i;
> -     int ret;
> +     int i;
>
> -     spin_lock(&glob->lru_lock);
> +     if (first_bo)
> +             *first_bo = NULL;
>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                list_for_each_entry(bo, &man->lru[i], lru) {
> -                     if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
> +                     bool busy = false;
> +
> +                     if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
> +                                                         &busy)) {
> +                             if (first_bo && !(*first_bo) && busy) {
> +                                     ttm_bo_get(bo);
> +                                     *first_bo = bo;
> +                             }
>                                continue;
> +                     }
>
>                        if (place && !bdev->driver->eviction_valuable(bo,
>                                                                      place)) {
> -                             if (locked)
> +                             if (*locked)
>                                        reservation_object_unlock(bo->resv);
>                                continue;
>                        }
> +
>                        break;
>                }
>
> @@ -818,9 +831,67 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>                bo = NULL;
>        }
>
> +     return bo;
> +}
> +
> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> +                            uint32_t mem_type,
> +                            const struct ttm_place *place,
> +                            struct ttm_operation_ctx *ctx,
> +                            struct reservation_object *request_resv)
> +{
> +     struct ttm_bo_global *glob = bdev->glob;
> +     struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> +     struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
> +     bool locked = false;
> +     int ret;
> +
> +     spin_lock(&glob->lru_lock);
> +     bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
> +                                   &locked);
>        if (!bo) {
> +             struct ttm_operation_ctx busy_ctx;
> +
>                spin_unlock(&glob->lru_lock);
> -             return -EBUSY;
> +             /* check if other user occupy memory too long time */
> +             if (!first_bo || !request_resv || !request_resv->lock.ctx) {
> +                     if (first_bo)
> +                             ttm_bo_put(first_bo);
> +                     return -EBUSY;
> +             }
> +             if (first_bo->resv == request_resv) {
> +                     ttm_bo_put(first_bo);
> +                     return -EBUSY;
> +             }
> +             if (ctx->interruptible)
> +                     ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
> +                                                       request_resv->lock.ctx);
> +             else
> +                     ret = ww_mutex_lock(&first_bo->resv->lock, request_resv->lock.ctx);
> +             if (ret) {
> +                     ttm_bo_put(first_bo);
> +                     return ret;
> +             }
> +             spin_lock(&glob->lru_lock);
> +             /* previous busy resv lock is held by above, idle now,
> +              * so let them evictable.
> +              */
> +             busy_ctx.interruptible = ctx->interruptible;
> +             busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
> +             busy_ctx.resv          = first_bo->resv;
> +             busy_ctx.flags         = TTM_OPT_FLAG_ALLOW_RES_EVICT;
> +
> +             bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, NULL,
> +                                           &locked);
> +             if (bo && (bo->resv == first_bo->resv))
> +                     locked = true;
> +             else if (bo)
> +                     ww_mutex_unlock(&first_bo->resv->lock);
> +             if (!bo) {
> +                     spin_unlock(&glob->lru_lock);
> +                     ttm_bo_put(first_bo);
> +                     return -EBUSY;
> +             }
>        }
>
>        kref_get(&bo->list_kref);
> @@ -829,11 +900,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>                ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>                                          ctx->no_wait_gpu, locked);
>                kref_put(&bo->list_kref, ttm_bo_release_list);
> +             if (first_bo)
> +                     ttm_bo_put(first_bo);
>                return ret;
>        }
>
>        ttm_bo_del_from_lru(bo);
>        spin_unlock(&glob->lru_lock);
> +     if (first_bo)
> +             ttm_bo_put(first_bo);
>
>        ret = ttm_bo_evict(bo, ctx);
>        if (locked) {
> @@ -907,7 +982,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>                        return ret;
>                if (mem->mm_node)
>                        break;
> -             ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
> +             ret = ttm_mem_evict_first(bdev, mem_type, place, ctx, bo->resv);
>                if (unlikely(ret != 0))
>                        return ret;
>        } while (1);
> @@ -1413,7 +1488,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                while (!list_empty(&man->lru[i])) {
>                        spin_unlock(&glob->lru_lock);
> -                     ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
> +                     ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
> +                                               NULL);
>                        if (ret)
>                                return ret;
>                        spin_lock(&glob->lru_lock);
> @@ -1784,7 +1860,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>        spin_lock(&glob->lru_lock);
>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> -                     if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
> +                     if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +                                                        NULL)) {
>                                ret = 0;
>                                break;
>                        }



_______________________________________________

dri-devel mailing list

dri-devel@lists.freedesktop.org<mailto:dri-devel@lists.freedesktop.org>

https://lists.freedesktop.org/mailman/listinfo/dri-devel


[-- Attachment #1.2: Type: text/html, Size: 36557 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve
  2019-05-07 11:45 ` [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve Chunming Zhou
@ 2019-05-13  2:45   ` Liang, Prike
  0 siblings, 0 replies; 9+ messages in thread
From: Liang, Prike @ 2019-05-13  2:45 UTC (permalink / raw)
  To: Zhou, David(ChunMing), Koenig, Christian, dri-devel

Acked-by: Prike Liang <Prike.Liang@amd.com>


-----Original Message-----
From: Chunming Zhou <david1.zhou@amd.com> 
Sent: Tuesday, May 07, 2019 7:46 PM
To: Koenig, Christian <Christian.Koenig@amd.com>; Liang, Prike <Prike.Liang@amd.com>; dri-devel@lists.freedesktop.org
Cc: Zhou, David(ChunMing) <David1.Zhou@amd.com>
Subject: [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve

add ticket for display bo, so that it can preempt busy bo.

Change-Id: I9f031cdcc8267de00e819ae303baa0a52df8ebb9
Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index ac22f7351a42..3f36770946ab 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4176,6 +4176,9 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	struct amdgpu_device *adev;
 	struct amdgpu_bo *rbo;
 	struct dm_plane_state *dm_plane_state_new, *dm_plane_state_old;
+	struct list_head list;
+	struct ttm_validate_buffer tv;
+	struct ww_acquire_ctx ticket;
 	uint64_t tiling_flags;
 	uint32_t domain;
 	int r;
@@ -4192,9 +4195,17 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	obj = new_state->fb->obj[0];
 	rbo = gem_to_amdgpu_bo(obj);
 	adev = amdgpu_ttm_adev(rbo->tbo.bdev);
-	r = amdgpu_bo_reserve(rbo, false);
-	if (unlikely(r != 0))
+	INIT_LIST_HEAD(&list);
+
+	tv.bo = &rbo->tbo;
+	tv.num_shared = 1;
+	list_add(&tv.head, &list);
+
+	r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL);
+	if (r) {
+		dev_err(adev->dev, "fail to reserve bo (%d)\n", r);
 		return r;
+	}
 
 	if (plane->type != DRM_PLANE_TYPE_CURSOR)
 		domain = amdgpu_display_supported_domains(adev);
@@ -4205,21 +4216,21 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	if (unlikely(r != 0)) {
 		if (r != -ERESTARTSYS)
 			DRM_ERROR("Failed to pin framebuffer with error %d\n", r);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		return r;
 	}
 
 	r = amdgpu_ttm_alloc_gart(&rbo->tbo);
 	if (unlikely(r != 0)) {
 		amdgpu_bo_unpin(rbo);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		DRM_ERROR("%p bind failed\n", rbo);
 		return r;
 	}
 
 	amdgpu_bo_get_tiling_flags(rbo, &tiling_flags);
 
-	amdgpu_bo_unreserve(rbo);
+	ttm_eu_backoff_reservation(&ticket, &list);
 
 	afb->address = amdgpu_bo_gpu_offset(rbo);
 
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* RE: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7
  2019-05-10  5:39       ` Liang, Prike
@ 2019-05-13  3:00         ` Liang, Prike
  0 siblings, 0 replies; 9+ messages in thread
From: Liang, Prike @ 2019-05-13  3:00 UTC (permalink / raw)
  To: Koenig, Christian, Zhou, David(ChunMing), dri-devel


[-- Attachment #1.1: Type: text/plain, Size: 14988 bytes --]

I have verified this solution again and the Abaqus case finished after about 27 hours of running.
But I am not sure whether retrieving the first busy BO and then retrying eviction of the LRU BOs results in
Abaqus's poor performance.

Anyway, this can fix the age-old issue. Are there any other concerns before pushing the following patches to the drm-next branch?

drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve
drm/ttm: fix busy memory to fail other user v7

Thanks,
Prike
From: Liang, Prike
Sent: Friday, May 10, 2019 1:40 PM
To: Koenig, Christian <Christian.Koenig@amd.com>; Zhou, David(ChunMing) <David1.Zhou@amd.com>; dri-devel@lists.freedesktop.org
Subject: RE: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7

Thanks Christian proposal and David draft the solution implement .

The pinned BO failures were not observed from prepare_fb, but the Abaqus job couldn't finish through the whole night.
Regarding the NULL first BO EBUSY error case, it comes from the amdgpu_cs_bo_validate execution path, as the call stack below shows. Now the NULL first BO debug error message pops up endlessly while Abaqus is running; it seems the function @amdgpu_cs_validate runs into a dead loop invoking amdgpu_cs_bo_validate.

lxj ttm_mem_evict_first first_bo=          (null),request_resv=ffff929d47b33218,request_resv->lock.ctx=ffff929b8d6bfbd8
[ 2703.091731] CPU: 3 PID: 10739 Comm: standard Kdump: loaded Tainted: G           OE  ------------   3.10.0-957.el7.x86_64 #1
[ 2703.103046] Hardware name: MSI MS-7984/Z170 KRAIT GAMING (MS-7984), BIOS B.80 05/11/2016
[ 2703.111181] Call Trace:
[ 2703.113745]  [<ffffffff81961dc1>] dump_stack+0x19/0x1b
[ 2703.118979]  [<ffffffffc055cd19>] ttm_mem_evict_first+0x3a9/0x400 [amdttm]
[ 2703.125974]  [<ffffffffc055d05b>] amdttm_bo_mem_space+0x2eb/0x4a0 [amdttm]
[ 2703.132967]  [<ffffffffc055d6e4>] amdttm_bo_validate+0xc4/0x140 [amdttm]
[ 2703.139827]  [<ffffffffc059fed5>] amdgpu_cs_bo_validate+0xa5/0x220 [amdgpu]
[ 2703.146879]  [<ffffffffc05a0097>] amdgpu_cs_validate+0x47/0x2e0 [amdgpu]
[ 2703.153776]  [<ffffffffc05b41a2>] ? amdgpu_vm_del_from_lru_notify+0x12/0x80 [amdgpu]
[ 2703.161707]  [<ffffffffc05a0050>] ? amdgpu_cs_bo_validate+0x220/0x220 [amdgpu]
[ 2703.169018]  [<ffffffffc05b4452>] amdgpu_vm_validate_pt_bos+0x92/0x140 [amdgpu]
[ 2703.176512]  [<ffffffffc05a23e5>] amdgpu_cs_ioctl+0x18a5/0x1d40 [amdgpu]
[ 2703.183372]  [<ffffffffc05a0b40>] ? amdgpu_cs_find_mapping+0x120/0x120 [amdgpu]
[ 2703.190815]  [<ffffffffc042df2c>] drm_ioctl_kernel+0x6c/0xb0 [drm]
[ 2703.197109]  [<ffffffffc042e647>] drm_ioctl+0x1e7/0x420 [drm]
[ 2703.202995]  [<ffffffffc05a0b40>] ? amdgpu_cs_find_mapping+0x120/0x120 [amdgpu]
[ 2703.210471]  [<ffffffffc058004b>] amdgpu_drm_ioctl+0x4b/0x80 [amdgpu]
[ 2703.217019]  [<ffffffff81456210>] do_vfs_ioctl+0x3a0/0x5a0
[ 2703.222596]  [<ffffffff8196744a>] ? __schedule+0x13a/0x890
[ 2703.228172]  [<ffffffff814564b1>] SyS_ioctl+0xa1/0xc0
[ 2703.233308]  [<ffffffff81974ddb>] system_call_fastpath+0x22/0x27

Thanks,
Prike
From: Christian König <ckoenig.leichtzumerken@gmail.com<mailto:ckoenig.leichtzumerken@gmail.com>>
Sent: Thursday, May 09, 2019 10:59 PM
To: Zhou, David(ChunMing) <David1.Zhou@amd.com<mailto:David1.Zhou@amd.com>>; Koenig, Christian <Christian.Koenig@amd.com<mailto:Christian.Koenig@amd.com>>; Liang, Prike <Prike.Liang@amd.com<mailto:Prike.Liang@amd.com>>; dri-devel@lists.freedesktop.org<mailto:dri-devel@lists.freedesktop.org>
Subject: Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7

[CAUTION: External Email]
Oh, I know where this is coming from.

The problem is that we remove the BOs from the LRU during CS and so we can't wait for the CS to finish up.

Already working on this problem for Marek's similar issue,
Christian.

Am 09.05.19 um 16:46 schrieb Zhou, David(ChunMing):
I knew that before; it will issue a warning only when the debug option is enabled. Removing that is OK with me.
I only helped Prike draft your idea, and Prike is trying this patch on his side. The latest feedback he gave me is that first_bo is always NULL and the code doesn't run into the busy path, which is very confusing to me; he said he is debugging that.

-David


-------- Original Message --------
Subject: Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7
From: "Koenig, Christian"
To: "Zhou, David(ChunMing)" ,"Liang, Prike" ,dri-devel@lists.freedesktop.org<mailto:dri-devel@lists.freedesktop.org>
CC:
I've found one more problem with this.

With lockdep enabled I get a warning because ttm_eu_reserve_buffers()
has called ww_acquire_done() on the ticket (which essentially means we
are done, no more locking with that ticket).

The simplest solution is probably to just remove the call to
ww_acquire_done() from ttm_eu_reserve_buffers().

Christian.

Am 07.05.19 um 13:45 schrieb Chunming Zhou:
> heavy gpu job could occupy memory long time, which lead other user fail to get memory.
>
> basically pick up Christian idea:
>
> 1. Reserve the BO in DC using a ww_mutex ticket (trivial).
> 2. If we then run into this EBUSY condition in TTM check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
> 3. If we have a ticket we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
> 4. If getting the reservation lock with the ticket succeeded we check if the BO is still the first one on the LRU in question (the BO could have moved).
> 5. If the BO is still the first one on the LRU in question we try to evict it as we would evict any other BO.
> 6. If any of the "If's" above fail we just back off and return -EBUSY.
>
> v2: fix some minor check
> v3: address Christian v2 comments.
> v4: fix some missing
> v5: handle first_bo unlock and bo_get/put
> v6: abstract unified iterate function, and handle all possible usecase not only pinned bo.
> v7: pass request bo->resv to ttm_bo_evict_first
>
> Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
> Signed-off-by: Chunming Zhou <david1.zhou@amd.com><mailto:david1.zhou@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c | 111 +++++++++++++++++++++++++++++------
>   1 file changed, 94 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 8502b3ed2d88..f5e6328e4a57 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>    * b. Otherwise, trylock it.
>    */
>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> -                     struct ttm_operation_ctx *ctx, bool *locked)
> +                     struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>   {
>        bool ret = false;
>
>        *locked = false;
> +     if (busy)
> +             *busy = false;
>        if (bo->resv == ctx->resv) {
>                reservation_object_assert_held(bo->resv);
>                if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
> @@ -779,35 +781,46 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
>        } else {
>                *locked = reservation_object_trylock(bo->resv);
>                ret = *locked;
> +             if (!ret && busy)
> +                     *busy = true;
>        }
>
>        return ret;
>   }
>
> -static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> -                            uint32_t mem_type,
> -                            const struct ttm_place *place,
> -                            struct ttm_operation_ctx *ctx)
> +static struct ttm_buffer_object*
> +ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
> +                      struct ttm_mem_type_manager *man,
> +                      const struct ttm_place *place,
> +                      struct ttm_operation_ctx *ctx,
> +                      struct ttm_buffer_object **first_bo,
> +                      bool *locked)
>   {
> -     struct ttm_bo_global *glob = bdev->glob;
> -     struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>        struct ttm_buffer_object *bo = NULL;
> -     bool locked = false;
> -     unsigned i;
> -     int ret;
> +     int i;
>
> -     spin_lock(&glob->lru_lock);
> +     if (first_bo)
> +             *first_bo = NULL;
>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                list_for_each_entry(bo, &man->lru[i], lru) {
> -                     if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
> +                     bool busy = false;
> +
> +                     if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
> +                                                         &busy)) {
> +                             if (first_bo && !(*first_bo) && busy) {
> +                                     ttm_bo_get(bo);
> +                                     *first_bo = bo;
> +                             }
>                                continue;
> +                     }
>
>                        if (place && !bdev->driver->eviction_valuable(bo,
>                                                                      place)) {
> -                             if (locked)
> +                             if (*locked)
>                                        reservation_object_unlock(bo->resv);
>                                continue;
>                        }
> +
>                        break;
>                }
>
> @@ -818,9 +831,67 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>                bo = NULL;
>        }
>
> +     return bo;
> +}
> +
> +static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
> +                            uint32_t mem_type,
> +                            const struct ttm_place *place,
> +                            struct ttm_operation_ctx *ctx,
> +                            struct reservation_object *request_resv)
> +{
> +     struct ttm_bo_global *glob = bdev->glob;
> +     struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> +     struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
> +     bool locked = false;
> +     int ret;
> +
> +     spin_lock(&glob->lru_lock);
> +     bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
> +                                   &locked);
>        if (!bo) {
> +             struct ttm_operation_ctx busy_ctx;
> +
>                spin_unlock(&glob->lru_lock);
> -             return -EBUSY;
> +             /* check if other user occupy memory too long time */
> +             if (!first_bo || !request_resv || !request_resv->lock.ctx) {
> +                     if (first_bo)
> +                             ttm_bo_put(first_bo);
> +                     return -EBUSY;
> +             }
> +             if (first_bo->resv == request_resv) {
> +                     ttm_bo_put(first_bo);
> +                     return -EBUSY;
> +             }
> +             if (ctx->interruptible)
> +                     ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
> +                                                       request_resv->lock.ctx);
> +             else
> +                     ret = ww_mutex_lock(&first_bo->resv->lock, request_resv->lock.ctx);
> +             if (ret) {
> +                     ttm_bo_put(first_bo);
> +                     return ret;
> +             }
> +             spin_lock(&glob->lru_lock);
> +             /* previous busy resv lock is held by above, idle now,
> +              * so let them evictable.
> +              */
> +             busy_ctx.interruptible = ctx->interruptible;
> +             busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
> +             busy_ctx.resv          = first_bo->resv;
> +             busy_ctx.flags         = TTM_OPT_FLAG_ALLOW_RES_EVICT;
> +
> +             bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, NULL,
> +                                           &locked);
> +             if (bo && (bo->resv == first_bo->resv))
> +                     locked = true;
> +             else if (bo)
> +                     ww_mutex_unlock(&first_bo->resv->lock);
> +             if (!bo) {
> +                     spin_unlock(&glob->lru_lock);
> +                     ttm_bo_put(first_bo);
> +                     return -EBUSY;
> +             }
>        }
>
>        kref_get(&bo->list_kref);
> @@ -829,11 +900,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>                ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
>                                          ctx->no_wait_gpu, locked);
>                kref_put(&bo->list_kref, ttm_bo_release_list);
> +             if (first_bo)
> +                     ttm_bo_put(first_bo);
>                return ret;
>        }
>
>        ttm_bo_del_from_lru(bo);
>        spin_unlock(&glob->lru_lock);
> +     if (first_bo)
> +             ttm_bo_put(first_bo);
>
>        ret = ttm_bo_evict(bo, ctx);
>        if (locked) {
> @@ -907,7 +982,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>                        return ret;
>                if (mem->mm_node)
>                        break;
> -             ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
> +             ret = ttm_mem_evict_first(bdev, mem_type, place, ctx, bo->resv);
>                if (unlikely(ret != 0))
>                        return ret;
>        } while (1);
> @@ -1413,7 +1488,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                while (!list_empty(&man->lru[i])) {
>                        spin_unlock(&glob->lru_lock);
> -                     ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
> +                     ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
> +                                               NULL);
>                        if (ret)
>                                return ret;
>                        spin_lock(&glob->lru_lock);
> @@ -1784,7 +1860,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>        spin_lock(&glob->lru_lock);
>        for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> -                     if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
> +                     if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +                                                        NULL)) {
>                                ret = 0;
>                                break;
>                        }


_______________________________________________

dri-devel mailing list

dri-devel@lists.freedesktop.org<mailto:dri-devel@lists.freedesktop.org>

https://lists.freedesktop.org/mailman/listinfo/dri-devel


[-- Attachment #1.2: Type: text/html, Size: 38780 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-05-13  3:00 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-07 11:45 [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7 Chunming Zhou
2019-05-07 11:45 ` [PATCH 2/2] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve Chunming Zhou
2019-05-13  2:45   ` Liang, Prike
2019-05-07 11:51 ` [PATCH 1/2] drm/ttm: fix busy memory to fail other user v7 Christian König
2019-05-09 14:28 ` Koenig, Christian
2019-05-09 14:46   ` Zhou, David(ChunMing)
2019-05-09 14:59     ` [PATCH " Christian König
2019-05-10  5:39       ` Liang, Prike
2019-05-13  3:00         ` Liang, Prike

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.