All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/10] drm/ttm: Make LRU removal optional.
@ 2019-05-22 12:59 Christian König
  2019-05-22 12:59 ` [PATCH 02/10] drm/ttm: return immediately in case of a signal Christian König
                   ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread
From: Christian König @ 2019-05-22 12:59 UTC (permalink / raw)
  To: Marek.Olsak-5C7GfCeVMHo, David1.Zhou-5C7GfCeVMHo,
	Prike.Liang-5C7GfCeVMHo,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

We are already doing this for DMA-buf imports and also for
amdgpu VM BOs for quite a while now.

If this doesn't run into any problems we are probably going
to stop removing BOs from the LRU altogether.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  9 +++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c       |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       |  4 ++--
 drivers/gpu/drm/qxl/qxl_release.c             |  2 +-
 drivers/gpu/drm/radeon/radeon_gem.c           |  2 +-
 drivers/gpu/drm/radeon/radeon_object.c        |  2 +-
 drivers/gpu/drm/ttm/ttm_execbuf_util.c        | 20 +++++++++++--------
 drivers/gpu/drm/virtio/virtgpu_ioctl.c        |  2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_resource.c      |  3 ++-
 drivers/gpu/drm/vmwgfx/vmwgfx_validation.h    |  2 +-
 include/drm/ttm/ttm_bo_driver.h               |  5 ++++-
 include/drm/ttm/ttm_execbuf_util.h            |  3 ++-
 13 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index e1cae4a37113..647e18f9e136 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -574,7 +574,7 @@ static int reserve_bo_and_vm(struct kgd_mem *mem,
 	amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]);
 
 	ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
-				     false, &ctx->duplicates);
+				     false, &ctx->duplicates, true);
 	if (!ret)
 		ctx->reserved = true;
 	else {
@@ -647,7 +647,7 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
 	}
 
 	ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
-				     false, &ctx->duplicates);
+				     false, &ctx->duplicates, true);
 	if (!ret)
 		ctx->reserved = true;
 	else
@@ -1800,7 +1800,8 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
 	}
 
 	/* Reserve all BOs and page tables for validation */
-	ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates);
+	ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates,
+				     true);
 	WARN(!list_empty(&duplicates), "Duplicates should be empty");
 	if (ret)
 		goto out_free;
@@ -2006,7 +2007,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
 	}
 
 	ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list,
-				     false, &duplicate_save);
+				     false, &duplicate_save, true);
 	if (ret) {
 		pr_debug("Memory eviction: TTM Reserve Failed. Try again\n");
 		goto ttm_reserve_fail;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index d72cc583ebd1..fff558cf385b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -648,7 +648,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	}
 
 	r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
-				   &duplicates);
+				   &duplicates, true);
 	if (unlikely(r != 0)) {
 		if (r != -ERESTARTSYS)
 			DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
index 54dd02a898b9..06f83cac0d3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
@@ -79,7 +79,7 @@ int amdgpu_map_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	list_add(&csa_tv.head, &list);
 	amdgpu_vm_get_pd_bo(vm, &list, &pd);
 
-	r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL);
+	r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
 	if (r) {
 		DRM_ERROR("failed to reserve CSA,PD BOs: err=%d\n", r);
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 7b840367004c..d513a5ad03dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -171,7 +171,7 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
 
 	amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);
 
-	r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates);
+	r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates, true);
 	if (r) {
 		dev_err(adev->dev, "leaking bo va because "
 			"we fail to reserve bo (%d)\n", r);
@@ -608,7 +608,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
 
 	amdgpu_vm_get_pd_bo(&fpriv->vm, &list, &vm_pd);
 
-	r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates);
+	r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates, true);
 	if (r)
 		goto error_unref;
 
diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c
index 30f85f0130cb..49f9a9385393 100644
--- a/drivers/gpu/drm/qxl/qxl_release.c
+++ b/drivers/gpu/drm/qxl/qxl_release.c
@@ -256,7 +256,7 @@ int qxl_release_reserve_list(struct qxl_release *release, bool no_intr)
 		return 0;
 
 	ret = ttm_eu_reserve_buffers(&release->ticket, &release->bos,
-				     !no_intr, NULL);
+				     !no_intr, NULL, true);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index 44617dec8183..7411e69e2712 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -559,7 +559,7 @@ static void radeon_gem_va_update_vm(struct radeon_device *rdev,
 	if (!vm_bos)
 		return;
 
-	r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL);
+	r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
 	if (r)
 		goto error_free;
 
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index 833e909706a9..36683de0300b 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -539,7 +539,7 @@ int radeon_bo_list_validate(struct radeon_device *rdev,
 	u64 bytes_moved_threshold = radeon_bo_get_threshold_for_moves(rdev);
 
 	INIT_LIST_HEAD(&duplicates);
-	r = ttm_eu_reserve_buffers(ticket, head, true, &duplicates);
+	r = ttm_eu_reserve_buffers(ticket, head, true, &duplicates, true);
 	if (unlikely(r != 0)) {
 		return r;
 	}
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index 0075eb9a0b52..957ec375a4ba 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -69,7 +69,8 @@ void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
 	list_for_each_entry(entry, list, head) {
 		struct ttm_buffer_object *bo = entry->bo;
 
-		ttm_bo_add_to_lru(bo);
+		if (list_empty(&bo->lru))
+			ttm_bo_add_to_lru(bo);
 		reservation_object_unlock(bo->resv);
 	}
 	spin_unlock(&glob->lru_lock);
@@ -93,7 +94,7 @@ EXPORT_SYMBOL(ttm_eu_backoff_reservation);
 
 int ttm_eu_reserve_buffers(struct ww_acquire_ctx *ticket,
 			   struct list_head *list, bool intr,
-			   struct list_head *dups)
+			   struct list_head *dups, bool del_lru)
 {
 	struct ttm_bo_global *glob;
 	struct ttm_validate_buffer *entry;
@@ -172,11 +173,11 @@ int ttm_eu_reserve_buffers(struct ww_acquire_ctx *ticket,
 		list_add(&entry->head, list);
 	}
 
-	if (ticket)
-		ww_acquire_done(ticket);
-	spin_lock(&glob->lru_lock);
-	ttm_eu_del_from_lru_locked(list);
-	spin_unlock(&glob->lru_lock);
+	if (del_lru) {
+		spin_lock(&glob->lru_lock);
+		ttm_eu_del_from_lru_locked(list);
+		spin_unlock(&glob->lru_lock);
+	}
 	return 0;
 }
 EXPORT_SYMBOL(ttm_eu_reserve_buffers);
@@ -203,7 +204,10 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket,
 			reservation_object_add_shared_fence(bo->resv, fence);
 		else
 			reservation_object_add_excl_fence(bo->resv, fence);
-		ttm_bo_add_to_lru(bo);
+		if (list_empty(&bo->lru))
+			ttm_bo_add_to_lru(bo);
+		else
+			ttm_bo_move_to_lru_tail(bo, NULL);
 		reservation_object_unlock(bo->resv);
 	}
 	spin_unlock(&glob->lru_lock);
diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
index 161b80fee492..5cffaa24259f 100644
--- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c
+++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
@@ -63,7 +63,7 @@ static int virtio_gpu_object_list_validate(struct ww_acquire_ctx *ticket,
 	struct virtio_gpu_object *qobj;
 	int ret;
 
-	ret = ttm_eu_reserve_buffers(ticket, head, true, NULL);
+	ret = ttm_eu_reserve_buffers(ticket, head, true, NULL, true);
 	if (ret != 0)
 		return ret;
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
index a7c30e567f09..d28cbedba0b5 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
@@ -465,7 +465,8 @@ vmw_resource_check_buffer(struct ww_acquire_ctx *ticket,
 	val_buf->bo = &res->backup->base;
 	val_buf->num_shared = 0;
 	list_add_tail(&val_buf->head, &val_list);
-	ret = ttm_eu_reserve_buffers(ticket, &val_list, interruptible, NULL);
+	ret = ttm_eu_reserve_buffers(ticket, &val_list, interruptible, NULL,
+				     true);
 	if (unlikely(ret != 0))
 		goto out_no_reserve;
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
index 3b396fea40d7..ac435b51f4eb 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
@@ -165,7 +165,7 @@ vmw_validation_bo_reserve(struct vmw_validation_context *ctx,
 			  bool intr)
 {
 	return ttm_eu_reserve_buffers(&ctx->ticket, &ctx->bo_list, intr,
-				      NULL);
+				      NULL, true);
 }
 
 /**
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index c008346c2401..fc0d995ac90d 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -769,7 +769,10 @@ static inline void ttm_bo_unreserve(struct ttm_buffer_object *bo)
 {
 	if (!(bo->mem.placement & TTM_PL_FLAG_NO_EVICT)) {
 		spin_lock(&bo->bdev->glob->lru_lock);
-		ttm_bo_add_to_lru(bo);
+		if (list_empty(&bo->lru))
+			ttm_bo_add_to_lru(bo);
+		else
+			ttm_bo_move_to_lru_tail(bo, NULL);
 		spin_unlock(&bo->bdev->glob->lru_lock);
 	}
 	reservation_object_unlock(bo->resv);
diff --git a/include/drm/ttm/ttm_execbuf_util.h b/include/drm/ttm/ttm_execbuf_util.h
index 621615fa7728..7e46cc678e7e 100644
--- a/include/drm/ttm/ttm_execbuf_util.h
+++ b/include/drm/ttm/ttm_execbuf_util.h
@@ -70,6 +70,7 @@ extern void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
  * @list:    thread private list of ttm_validate_buffer structs.
  * @intr:    should the wait be interruptible
  * @dups:    [out] optional list of duplicates.
+ * @del_lru: true if BOs should be removed from the LRU.
  *
  * Tries to reserve bos pointed to by the list entries for validation.
  * If the function returns 0, all buffers are marked as "unfenced",
@@ -98,7 +99,7 @@ extern void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
 
 extern int ttm_eu_reserve_buffers(struct ww_acquire_ctx *ticket,
 				  struct list_head *list, bool intr,
-				  struct list_head *dups);
+				  struct list_head *dups, bool del_lru);
 
 /**
  * function ttm_eu_fence_buffer_objects.
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 02/10] drm/ttm: return immediately in case of a signal
  2019-05-22 12:59 [PATCH 01/10] drm/ttm: Make LRU removal optional Christian König
@ 2019-05-22 12:59 ` Christian König
  2019-05-22 12:59 ` [PATCH 03/10] drm/ttm: remove manual placement preference Christian König
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 26+ messages in thread
From: Christian König @ 2019-05-22 12:59 UTC (permalink / raw)
  To: Marek.Olsak, David1.Zhou, Prike.Liang, dri-devel, amd-gfx

When a signal arrives we should return immediately to
handle it instead of trying other placements first.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 2845fceb2fbd..4336893cb35e 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -978,7 +978,6 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 	uint32_t cur_flags = 0;
 	bool type_found = false;
 	bool type_ok = false;
-	bool has_erestartsys = false;
 	int i, ret;
 
 	ret = reservation_object_reserve_shared(bo->resv, 1);
@@ -1069,8 +1068,8 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 			mem->placement = cur_flags;
 			return 0;
 		}
-		if (ret == -ERESTARTSYS)
-			has_erestartsys = true;
+		if (ret && ret != -EBUSY)
+			return ret;
 	}
 
 	if (!type_found) {
@@ -1078,7 +1077,7 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 		return -EINVAL;
 	}
 
-	return (has_erestartsys) ? -ERESTARTSYS : -ENOMEM;
+	return -ENOMEM;
 }
 EXPORT_SYMBOL(ttm_bo_mem_space);
 
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 03/10] drm/ttm: remove manual placement preference
  2019-05-22 12:59 [PATCH 01/10] drm/ttm: Make LRU removal optional Christian König
  2019-05-22 12:59 ` [PATCH 02/10] drm/ttm: return immediately in case of a signal Christian König
@ 2019-05-22 12:59 ` Christian König
       [not found] ` <20190522125947.4592-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  2019-05-22 12:59 ` [PATCH 10/10] drm/amdgpu: stop removing BOs from the LRU v3 Christian König
  3 siblings, 0 replies; 26+ messages in thread
From: Christian König @ 2019-05-22 12:59 UTC (permalink / raw)
  To: Marek.Olsak, David1.Zhou, Prike.Liang, dri-devel, amd-gfx

If drivers don't prefer a system memory placement
they should not put it into the placement list first.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 4336893cb35e..b29799aedb71 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1011,8 +1011,12 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 		ttm_flag_masked(&cur_flags, place->flags,
 				~TTM_PL_MASK_MEMTYPE);
 
-		if (mem_type == TTM_PL_SYSTEM)
-			break;
+		if (mem_type == TTM_PL_SYSTEM) {
+			mem->mem_type = mem_type;
+			mem->placement = cur_flags;
+			mem->mm_node = NULL;
+			return 0;
+		}
 
 		ret = (*man->func->get_node)(man, bo, place, mem);
 		if (unlikely(ret))
@@ -1024,16 +1028,12 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 				(*man->func->put_node)(man, mem);
 				return ret;
 			}
-			break;
+			mem->mem_type = mem_type;
+			mem->placement = cur_flags;
+			return 0;
 		}
 	}
 
-	if ((type_ok && (mem_type == TTM_PL_SYSTEM)) || mem->mm_node) {
-		mem->mem_type = mem_type;
-		mem->placement = cur_flags;
-		return 0;
-	}
-
 	for (i = 0; i < placement->num_busy_placement; ++i) {
 		const struct ttm_place *place = &placement->busy_placement[i];
 
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 04/10] drm/ttm: cleanup ttm_bo_mem_space
       [not found] ` <20190522125947.4592-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-22 12:59   ` Christian König
  2019-05-22 12:59   ` [PATCH 05/10] drm/ttm: immediately move BOs to the new LRU v2 Christian König
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 26+ messages in thread
From: Christian König @ 2019-05-22 12:59 UTC (permalink / raw)
  To: Marek.Olsak-5C7GfCeVMHo, David1.Zhou-5C7GfCeVMHo,
	Prike.Liang-5C7GfCeVMHo,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

We tried this once before, but that turned out to be more
complicated than thought. With all the right prerequisites
it looks like we can do this now.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 127 ++++++++++++++++++-----------------
 1 file changed, 66 insertions(+), 61 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index b29799aedb71..fd6dbebea430 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -892,13 +892,12 @@ static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo,
  * space, or we've evicted everything and there isn't enough space.
  */
 static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
-					uint32_t mem_type,
-					const struct ttm_place *place,
-					struct ttm_mem_reg *mem,
-					struct ttm_operation_ctx *ctx)
+				  const struct ttm_place *place,
+				  struct ttm_mem_reg *mem,
+				  struct ttm_operation_ctx *ctx)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
-	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
+	struct ttm_mem_type_manager *man = &bdev->man[mem->mem_type];
 	int ret;
 
 	do {
@@ -907,11 +906,11 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 			return ret;
 		if (mem->mm_node)
 			break;
-		ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
+		ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx);
 		if (unlikely(ret != 0))
 			return ret;
 	} while (1);
-	mem->mem_type = mem_type;
+
 	return ttm_bo_add_move_fence(bo, man, mem);
 }
 
@@ -959,6 +958,51 @@ static bool ttm_bo_mt_compatible(struct ttm_mem_type_manager *man,
 	return true;
 }
 
+/**
+ * ttm_bo_mem_placement - check if placement is compatible
+ * @bo: BO to find memory for
+ * @place: where to search
+ * @mem: the memory object to fill in
+ * @ctx: operation context
+ *
+ * Check if placement is compatible and fill in mem structure.
+ * Returns -EBUSY if placement won't work or negative error code.
+ * 0 when placement can be used.
+ */
+static int ttm_bo_mem_placement(struct ttm_buffer_object *bo,
+				const struct ttm_place *place,
+				struct ttm_mem_reg *mem,
+				struct ttm_operation_ctx *ctx)
+{
+	struct ttm_bo_device *bdev = bo->bdev;
+	uint32_t mem_type = TTM_PL_SYSTEM;
+	struct ttm_mem_type_manager *man;
+	uint32_t cur_flags = 0;
+	int ret;
+
+	ret = ttm_mem_type_from_place(place, &mem_type);
+	if (ret)
+		return ret;
+
+	man = &bdev->man[mem_type];
+	if (!man->has_type || !man->use_type)
+		return -EBUSY;
+
+	if (!ttm_bo_mt_compatible(man, mem_type, place, &cur_flags))
+		return -EBUSY;
+
+	cur_flags = ttm_bo_select_caching(man, bo->mem.placement, cur_flags);
+	/*
+	 * Use the access and other non-mapping-related flag bits from
+	 * the memory placement flags to the current flags
+	 */
+	ttm_flag_masked(&cur_flags, place->flags, ~TTM_PL_MASK_MEMTYPE);
+
+	mem->mem_type = mem_type;
+	mem->placement = cur_flags;
+	return 0;
+}
+
 /**
  * Creates space for memory region @mem according to its type.
  *
@@ -973,11 +1017,7 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 			struct ttm_operation_ctx *ctx)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
-	struct ttm_mem_type_manager *man;
-	uint32_t mem_type = TTM_PL_SYSTEM;
-	uint32_t cur_flags = 0;
 	bool type_found = false;
-	bool type_ok = false;
 	int i, ret;
 
 	ret = reservation_object_reserve_shared(bo->resv, 1);
@@ -987,37 +1027,20 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 	mem->mm_node = NULL;
 	for (i = 0; i < placement->num_placement; ++i) {
 		const struct ttm_place *place = &placement->placement[i];
+		struct ttm_mem_type_manager *man;
 
-		ret = ttm_mem_type_from_place(place, &mem_type);
+		ret = ttm_bo_mem_placement(bo, place, mem, ctx);
+		if (ret == -EBUSY)
+			continue;
 		if (ret)
 			return ret;
-		man = &bdev->man[mem_type];
-		if (!man->has_type || !man->use_type)
-			continue;
-
-		type_ok = ttm_bo_mt_compatible(man, mem_type, place,
-						&cur_flags);
-
-		if (!type_ok)
-			continue;
 
 		type_found = true;
-		cur_flags = ttm_bo_select_caching(man, bo->mem.placement,
-						  cur_flags);
-		/*
-		 * Use the access and other non-mapping-related flag bits from
-		 * the memory placement flags to the current flags
-		 */
-		ttm_flag_masked(&cur_flags, place->flags,
-				~TTM_PL_MASK_MEMTYPE);
-
-		if (mem_type == TTM_PL_SYSTEM) {
-			mem->mem_type = mem_type;
-			mem->placement = cur_flags;
-			mem->mm_node = NULL;
+		mem->mm_node = NULL;
+		if (mem->mem_type == TTM_PL_SYSTEM)
 			return 0;
-		}
 
+		man = &bdev->man[mem->mem_type];
 		ret = (*man->func->get_node)(man, bo, place, mem);
 		if (unlikely(ret))
 			return ret;
@@ -1028,8 +1051,6 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 				(*man->func->put_node)(man, mem);
 				return ret;
 			}
-			mem->mem_type = mem_type;
-			mem->placement = cur_flags;
 			return 0;
 		}
 	}
@@ -1037,37 +1058,21 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 	for (i = 0; i < placement->num_busy_placement; ++i) {
 		const struct ttm_place *place = &placement->busy_placement[i];
 
-		ret = ttm_mem_type_from_place(place, &mem_type);
+		ret = ttm_bo_mem_placement(bo, place, mem, ctx);
+		if (ret == -EBUSY)
+			continue;
 		if (ret)
 			return ret;
-		man = &bdev->man[mem_type];
-		if (!man->has_type || !man->use_type)
-			continue;
-		if (!ttm_bo_mt_compatible(man, mem_type, place, &cur_flags))
-			continue;
 
 		type_found = true;
-		cur_flags = ttm_bo_select_caching(man, bo->mem.placement,
-						  cur_flags);
-		/*
-		 * Use the access and other non-mapping-related flag bits from
-		 * the memory placement flags to the current flags
-		 */
-		ttm_flag_masked(&cur_flags, place->flags,
-				~TTM_PL_MASK_MEMTYPE);
-
-		if (mem_type == TTM_PL_SYSTEM) {
-			mem->mem_type = mem_type;
-			mem->placement = cur_flags;
-			mem->mm_node = NULL;
+		mem->mm_node = NULL;
+		if (mem->mem_type == TTM_PL_SYSTEM)
 			return 0;
-		}
 
-		ret = ttm_bo_mem_force_space(bo, mem_type, place, mem, ctx);
-		if (ret == 0 && mem->mm_node) {
-			mem->placement = cur_flags;
+		ret = ttm_bo_mem_force_space(bo, place, mem, ctx);
+		if (ret == 0 && mem->mm_node)
 			return 0;
-		}
+
 		if (ret && ret != -EBUSY)
 			return ret;
 	}
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 05/10] drm/ttm: immediately move BOs to the new LRU v2
       [not found] ` <20190522125947.4592-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  2019-05-22 12:59   ` [PATCH 04/10] drm/ttm: cleanup ttm_bo_mem_space Christian König
@ 2019-05-22 12:59   ` Christian König
  2019-05-22 12:59   ` [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10 Christian König
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 26+ messages in thread
From: Christian König @ 2019-05-22 12:59 UTC (permalink / raw)
  To: Marek.Olsak-5C7GfCeVMHo, David1.Zhou-5C7GfCeVMHo,
	Prike.Liang-5C7GfCeVMHo,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Move BOs which are currently in a lower domain to the new
LRU before allocating backing space while validating.

This makes sure that we always have enough entries on the
LRU to allow for other processes to wait for an operation
to complete.

v2: generalize the test

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 45 ++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index fd6dbebea430..4c6389d849ed 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -166,17 +166,17 @@ static void ttm_bo_release_list(struct kref *list_kref)
 	ttm_mem_global_free(bdev->glob->mem_glob, acc_size);
 }
 
-void ttm_bo_add_to_lru(struct ttm_buffer_object *bo)
+static void ttm_bo_add_mem_to_lru(struct ttm_buffer_object *bo,
+				  struct ttm_mem_reg *mem)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_mem_type_manager *man;
 
 	reservation_object_assert_held(bo->resv);
+	BUG_ON(!list_empty(&bo->lru));
 
-	if (!(bo->mem.placement & TTM_PL_FLAG_NO_EVICT)) {
-		BUG_ON(!list_empty(&bo->lru));
-
-		man = &bdev->man[bo->mem.mem_type];
+	if (!(mem->placement & TTM_PL_FLAG_NO_EVICT)) {
+		man = &bdev->man[mem->mem_type];
 		list_add_tail(&bo->lru, &man->lru[bo->priority]);
 		kref_get(&bo->list_kref);
 
@@ -188,6 +188,11 @@ void ttm_bo_add_to_lru(struct ttm_buffer_object *bo)
 		}
 	}
 }
+
+void ttm_bo_add_to_lru(struct ttm_buffer_object *bo)
+{
+	ttm_bo_add_mem_to_lru(bo, &bo->mem);
+}
 EXPORT_SYMBOL(ttm_bo_add_to_lru);
 
 static void ttm_bo_ref_bug(struct kref *list_kref)
@@ -1000,6 +1005,14 @@ static int ttm_bo_mem_placement(struct ttm_buffer_object *bo,
 
 	mem->mem_type = mem_type;
 	mem->placement = cur_flags;
+
+	if (bo->mem.mem_type < mem_type && !list_empty(&bo->lru)) {
+		spin_lock(&bo->bdev->glob->lru_lock);
+		ttm_bo_del_from_lru(bo);
+		ttm_bo_add_mem_to_lru(bo, mem);
+		spin_unlock(&bo->bdev->glob->lru_lock);
+	}
+
 	return 0;
 }
 
@@ -1033,7 +1046,7 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 		if (ret == -EBUSY)
 			continue;
 		if (ret)
-			return ret;
+			goto error;
 
 		type_found = true;
 		mem->mm_node = NULL;
@@ -1043,13 +1056,13 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 		man = &bdev->man[mem->mem_type];
 		ret = (*man->func->get_node)(man, bo, place, mem);
 		if (unlikely(ret))
-			return ret;
+			goto error;
 
 		if (mem->mm_node) {
 			ret = ttm_bo_add_move_fence(bo, man, mem);
 			if (unlikely(ret)) {
 				(*man->func->put_node)(man, mem);
-				return ret;
+				goto error;
 			}
 			return 0;
 		}
@@ -1062,7 +1075,7 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 		if (ret == -EBUSY)
 			continue;
 		if (ret)
-			return ret;
+			goto error;
 
 		type_found = true;
 		mem->mm_node = NULL;
@@ -1074,15 +1087,23 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 			return 0;
 
 		if (ret && ret != -EBUSY)
-			return ret;
+			goto error;
 	}
 
+	ret = -ENOMEM;
 	if (!type_found) {
 		pr_err(TTM_PFX "No compatible memory type found\n");
-		return -EINVAL;
+		ret = -EINVAL;
 	}
 
-	return -ENOMEM;
+error:
+	if (bo->mem.mem_type == TTM_PL_SYSTEM && !list_empty(&bo->lru)) {
+		spin_lock(&bo->bdev->glob->lru_lock);
+		ttm_bo_move_to_lru_tail(bo, NULL);
+		spin_unlock(&bo->bdev->glob->lru_lock);
+	}
+
+	return ret;
 }
 EXPORT_SYMBOL(ttm_bo_mem_space);
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
       [not found] ` <20190522125947.4592-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  2019-05-22 12:59   ` [PATCH 04/10] drm/ttm: cleanup ttm_bo_mem_space Christian König
  2019-05-22 12:59   ` [PATCH 05/10] drm/ttm: immediately move BOs to the new LRU v2 Christian König
@ 2019-05-22 12:59   ` Christian König
  2019-05-23 10:24     ` zhoucm1
       [not found]     ` <20190522125947.4592-6-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  2019-05-22 12:59   ` [PATCH 07/10] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve v2 Christian König
                     ` (3 subsequent siblings)
  6 siblings, 2 replies; 26+ messages in thread
From: Christian König @ 2019-05-22 12:59 UTC (permalink / raw)
  To: Marek.Olsak-5C7GfCeVMHo, David1.Zhou-5C7GfCeVMHo,
	Prike.Liang-5C7GfCeVMHo,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

BOs on the LRU might be blocked during command submission
and cause OOM situations.

Avoid this by blocking for the first busy BO not locked by
the same ticket as the BO we are searching space for.

v10: completely start over with the patch since we didn't
     handle a whole bunch of corner cases.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 4c6389d849ed..861facac33d4 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
  * b. Otherwise, trylock it.
  */
 static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
-			struct ttm_operation_ctx *ctx, bool *locked)
+			struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
 {
 	bool ret = false;
 
-	*locked = false;
 	if (bo->resv == ctx->resv) {
 		reservation_object_assert_held(bo->resv);
 		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
 		    || !list_empty(&bo->ddestroy))
 			ret = true;
+		*locked = false;
+		if (busy)
+			*busy = false;
 	} else {
-		*locked = reservation_object_trylock(bo->resv);
-		ret = *locked;
+		ret = reservation_object_trylock(bo->resv);
+		*locked = ret;
+		if (busy)
+			*busy = !ret;
 	}
 
 	return ret;
 }
 
+/**
+ * ttm_mem_evict_wait_busy - wait for a busy BO to become available
+ *
+ * @busy_bo: BO which couldn't be locked with trylock
+ * @ctx: operation context
+ * @ticket: acquire ticket
+ *
+ * Try to lock a busy buffer object to avoid failing eviction.
+ */
+static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
+				   struct ttm_operation_ctx *ctx,
+				   struct ww_acquire_ctx *ticket)
+{
+	int r;
+
+	if (!busy_bo || !ticket)
+		return -EBUSY;
+
+	if (ctx->interruptible)
+		r = reservation_object_lock_interruptible(busy_bo->resv,
+							  ticket);
+	else
+		r = reservation_object_lock(busy_bo->resv, ticket);
+
+	/*
+	 * TODO: It would be better to keep the BO locked until allocation is at
+	 * least tried one more time, but that would mean a much larger rework
+	 * of TTM.
+	 */
+	if (!r)
+		reservation_object_unlock(busy_bo->resv);
+
+	return r == -EDEADLK ? -EAGAIN : r;
+}
+
 static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 			       uint32_t mem_type,
 			       const struct ttm_place *place,
-			       struct ttm_operation_ctx *ctx)
+			       struct ttm_operation_ctx *ctx,
+			       struct ww_acquire_ctx *ticket)
 {
+	struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
 	struct ttm_bo_global *glob = bdev->glob;
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
-	struct ttm_buffer_object *bo = NULL;
 	bool locked = false;
 	unsigned i;
 	int ret;
@@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 	spin_lock(&glob->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &man->lru[i], lru) {
-			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
+			bool busy;
+
+			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
+							    &busy)) {
+				if (busy && !busy_bo &&
+				    bo->resv->lock.ctx != ticket)
+					busy_bo = bo;
 				continue;
+			}
 
 			if (place && !bdev->driver->eviction_valuable(bo,
 								      place)) {
@@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 	}
 
 	if (!bo) {
+		if (busy_bo)
+			ttm_bo_get(busy_bo);
 		spin_unlock(&glob->lru_lock);
-		return -EBUSY;
+		ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
+		if (busy_bo)
+			ttm_bo_put(busy_bo);
+		return ret;
 	}
 
 	kref_get(&bo->list_kref);
@@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 			return ret;
 		if (mem->mm_node)
 			break;
-		ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx);
+		ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx,
+					  bo->resv->lock.ctx);
 		if (unlikely(ret != 0))
 			return ret;
 	} while (1);
@@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		while (!list_empty(&man->lru[i])) {
 			spin_unlock(&glob->lru_lock);
-			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
+			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
+						  NULL);
 			if (ret)
 				return ret;
 			spin_lock(&glob->lru_lock);
@@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
 	spin_lock(&glob->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
-			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
+			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
+							   NULL)) {
 				ret = 0;
 				break;
 			}
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 07/10] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve v2
       [not found] ` <20190522125947.4592-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2019-05-22 12:59   ` [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10 Christian König
@ 2019-05-22 12:59   ` Christian König
  2019-05-22 12:59   ` [PATCH 08/10] drm/amdgpu: drop some validation failure messages Christian König
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 26+ messages in thread
From: Christian König @ 2019-05-22 12:59 UTC (permalink / raw)
  To: Marek.Olsak-5C7GfCeVMHo, David1.Zhou-5C7GfCeVMHo,
	Prike.Liang-5C7GfCeVMHo,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

From: Chunming Zhou <david1.zhou@amd.com>

add ticket for display bo, so that it can preempt busy bo.

v2: fix stupid rebase error

Change-Id: I9f031cdcc8267de00e819ae303baa0a52df8ebb9
Signed-off-by: Chunming Zhou <david1.zhou@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 4a1755bce96c..56f320f3fd72 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4182,6 +4182,9 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	struct amdgpu_device *adev;
 	struct amdgpu_bo *rbo;
 	struct dm_plane_state *dm_plane_state_new, *dm_plane_state_old;
+	struct list_head list;
+	struct ttm_validate_buffer tv;
+	struct ww_acquire_ctx ticket;
 	uint64_t tiling_flags;
 	uint32_t domain;
 	int r;
@@ -4198,9 +4201,17 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	obj = new_state->fb->obj[0];
 	rbo = gem_to_amdgpu_bo(obj);
 	adev = amdgpu_ttm_adev(rbo->tbo.bdev);
-	r = amdgpu_bo_reserve(rbo, false);
-	if (unlikely(r != 0))
+	INIT_LIST_HEAD(&list);
+
+	tv.bo = &rbo->tbo;
+	tv.num_shared = 1;
+	list_add(&tv.head, &list);
+
+	r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL, true);
+	if (r) {
+		dev_err(adev->dev, "fail to reserve bo (%d)\n", r);
 		return r;
+	}
 
 	if (plane->type != DRM_PLANE_TYPE_CURSOR)
 		domain = amdgpu_display_supported_domains(adev);
@@ -4211,21 +4222,21 @@ static int dm_plane_helper_prepare_fb(struct drm_plane *plane,
 	if (unlikely(r != 0)) {
 		if (r != -ERESTARTSYS)
 			DRM_ERROR("Failed to pin framebuffer with error %d\n", r);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		return r;
 	}
 
 	r = amdgpu_ttm_alloc_gart(&rbo->tbo);
 	if (unlikely(r != 0)) {
 		amdgpu_bo_unpin(rbo);
-		amdgpu_bo_unreserve(rbo);
+		ttm_eu_backoff_reservation(&ticket, &list);
 		DRM_ERROR("%p bind failed\n", rbo);
 		return r;
 	}
 
 	amdgpu_bo_get_tiling_flags(rbo, &tiling_flags);
 
-	amdgpu_bo_unreserve(rbo);
+	ttm_eu_backoff_reservation(&ticket, &list);
 
 	afb->address = amdgpu_bo_gpu_offset(rbo);
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 08/10] drm/amdgpu: drop some validation failure messages
       [not found] ` <20190522125947.4592-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
                     ` (3 preceding siblings ...)
  2019-05-22 12:59   ` [PATCH 07/10] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve v2 Christian König
@ 2019-05-22 12:59   ` Christian König
  2019-05-22 12:59   ` [PATCH 09/10] drm/amdgpu: create GDS, GWS and OA in system domain Christian König
  2019-05-23  9:15   ` [PATCH 01/10] drm/ttm: Make LRU removal optional zhoucm1
  6 siblings, 0 replies; 26+ messages in thread
From: Christian König @ 2019-05-22 12:59 UTC (permalink / raw)
  To: Marek.Olsak-5C7GfCeVMHo, David1.Zhou-5C7GfCeVMHo,
	Prike.Liang-5C7GfCeVMHo,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

The messages about amdgpu_cs_list_validate are duplicated because the
caller will complain into the logs as well and we can also get
interrupted by a signal here.

Also fix the caller to not report -EAGAIN from validation.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index fff558cf385b..20f2955d2a55 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -671,16 +671,12 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	}
 
 	r = amdgpu_cs_list_validate(p, &duplicates);
-	if (r) {
-		DRM_ERROR("amdgpu_cs_list_validate(duplicates) failed.\n");
+	if (r)
 		goto error_validate;
-	}
 
 	r = amdgpu_cs_list_validate(p, &p->validated);
-	if (r) {
-		DRM_ERROR("amdgpu_cs_list_validate(validated) failed.\n");
+	if (r)
 		goto error_validate;
-	}
 
 	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
 				     p->bytes_moved_vis);
@@ -1383,7 +1379,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 	if (r) {
 		if (r == -ENOMEM)
 			DRM_ERROR("Not enough memory for command submission!\n");
-		else if (r != -ERESTARTSYS)
+		else if (r != -ERESTARTSYS && r != -EAGAIN)
 			DRM_ERROR("Failed to process the buffer list %d!\n", r);
 		goto out;
 	}
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 09/10] drm/amdgpu: create GDS, GWS and OA in system domain
       [not found] ` <20190522125947.4592-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
                     ` (4 preceding siblings ...)
  2019-05-22 12:59   ` [PATCH 08/10] drm/amdgpu: drop some validation failure messages Christian König
@ 2019-05-22 12:59   ` Christian König
  2019-05-23  9:15   ` [PATCH 01/10] drm/ttm: Make LRU removal optional zhoucm1
  6 siblings, 0 replies; 26+ messages in thread
From: Christian König @ 2019-05-22 12:59 UTC (permalink / raw)
  To: Marek.Olsak-5C7GfCeVMHo, David1.Zhou-5C7GfCeVMHo,
	Prike.Liang-5C7GfCeVMHo,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

And only move them in on validation. This allows for better control
when multiple processes are fighting over those resources.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 93b2c5a48a71..30493429851e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -495,7 +495,11 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
 #endif
 
 	bo->tbo.bdev = &adev->mman.bdev;
-	amdgpu_bo_placement_from_domain(bo, bp->domain);
+	if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
+			  AMDGPU_GEM_DOMAIN_GDS))
+		amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
+	else
+		amdgpu_bo_placement_from_domain(bo, bp->domain);
 	if (bp->type == ttm_bo_type_kernel)
 		bo->tbo.priority = 1;
 
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 10/10] drm/amdgpu: stop removing BOs from the LRU v3
  2019-05-22 12:59 [PATCH 01/10] drm/ttm: Make LRU removal optional Christian König
                   ` (2 preceding siblings ...)
       [not found] ` <20190522125947.4592-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-22 12:59 ` Christian König
       [not found]   ` <20190522125947.4592-10-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  3 siblings, 1 reply; 26+ messages in thread
From: Christian König @ 2019-05-22 12:59 UTC (permalink / raw)
  To: Marek.Olsak, David1.Zhou, Prike.Liang, dri-devel, amd-gfx

This avoids OOM situations when we have lots of threads
submitting at the same time.

v3: apply this to the whole driver, not just CS

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c    | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    | 4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 20f2955d2a55..3e2da24cd17a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -648,7 +648,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	}
 
 	r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
-				   &duplicates, true);
+				   &duplicates, false);
 	if (unlikely(r != 0)) {
 		if (r != -ERESTARTSYS)
 			DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
index 06f83cac0d3a..f660628e6af9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
@@ -79,7 +79,7 @@ int amdgpu_map_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	list_add(&csa_tv.head, &list);
 	amdgpu_vm_get_pd_bo(vm, &list, &pd);
 
-	r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
+	r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, false);
 	if (r) {
 		DRM_ERROR("failed to reserve CSA,PD BOs: err=%d\n", r);
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index d513a5ad03dd..ed25a4e14404 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -171,7 +171,7 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
 
 	amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);
 
-	r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates, true);
+	r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates, false);
 	if (r) {
 		dev_err(adev->dev, "leaking bo va because "
 			"we fail to reserve bo (%d)\n", r);
@@ -608,7 +608,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
 
 	amdgpu_vm_get_pd_bo(&fpriv->vm, &list, &vm_pd);
 
-	r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates, true);
+	r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates, false);
 	if (r)
 		goto error_unref;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index c430e8259038..d60593cc436e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -155,7 +155,7 @@ static inline int amdgpu_bo_reserve(struct amdgpu_bo *bo, bool no_intr)
 	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
 	int r;
 
-	r = ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
+	r = __ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
 	if (unlikely(r != 0)) {
 		if (r != -ERESTARTSYS)
 			dev_err(adev->dev, "%p reserve failed\n", bo);
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 10/10] drm/amdgpu: stop removing BOs from the LRU v3
       [not found]   ` <20190522125947.4592-10-christian.koenig-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-22 19:43     ` Kuehling, Felix
       [not found]       ` <48ac98a8-de22-3549-5d63-078a0effab72-5C7GfCeVMHo@public.gmane.org>
  2019-05-23  8:27     ` Liang, Prike
  1 sibling, 1 reply; 26+ messages in thread
From: Kuehling, Felix @ 2019-05-22 19:43 UTC (permalink / raw)
  To: Christian König, Olsak, Marek, Zhou, David(ChunMing),
	Liang, Prike, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Can you explain how this avoids OOM situations? When is it safe to leave 
a reserved BO on the LRU list? Could we do the same thing in 
amdgpu_amdkfd_gpuvm.c? And if we did, what would be the expected side 
effects or consequences?

Thanks,
   Felix

On 2019-05-22 8:59 a.m., Christian König wrote:
> [CAUTION: External Email]
>
> This avoids OOM situations when we have lots of threads
> submitting at the same time.
>
> v3: apply this to the whole driver, not just CS
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c    | 2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 2 +-
>   4 files changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 20f2955d2a55..3e2da24cd17a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -648,7 +648,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>          }
>
>          r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
> -                                  &duplicates, true);
> +                                  &duplicates, false);
>          if (unlikely(r != 0)) {
>                  if (r != -ERESTARTSYS)
>                          DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
> index 06f83cac0d3a..f660628e6af9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
> @@ -79,7 +79,7 @@ int amdgpu_map_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>          list_add(&csa_tv.head, &list);
>          amdgpu_vm_get_pd_bo(vm, &list, &pd);
>
> -       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
> +       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, false);
>          if (r) {
>                  DRM_ERROR("failed to reserve CSA,PD BOs: err=%d\n", r);
>                  return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index d513a5ad03dd..ed25a4e14404 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -171,7 +171,7 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
>
>          amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);
>
> -       r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates, true);
> +       r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates, false);
>          if (r) {
>                  dev_err(adev->dev, "leaking bo va because "
>                          "we fail to reserve bo (%d)\n", r);
> @@ -608,7 +608,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
>
>          amdgpu_vm_get_pd_bo(&fpriv->vm, &list, &vm_pd);
>
> -       r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates, true);
> +       r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates, false);
>          if (r)
>                  goto error_unref;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index c430e8259038..d60593cc436e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -155,7 +155,7 @@ static inline int amdgpu_bo_reserve(struct amdgpu_bo *bo, bool no_intr)
>          struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>          int r;
>
> -       r = ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
> +       r = __ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
>          if (unlikely(r != 0)) {
>                  if (r != -ERESTARTSYS)
>                          dev_err(adev->dev, "%p reserve failed\n", bo);
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH 10/10] drm/amdgpu: stop removing BOs from the LRU v3
       [not found]   ` <20190522125947.4592-10-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  2019-05-22 19:43     ` Kuehling, Felix
@ 2019-05-23  8:27     ` Liang, Prike
  1 sibling, 0 replies; 26+ messages in thread
From: Liang, Prike @ 2019-05-23  8:27 UTC (permalink / raw)
  To: Christian König, Olsak, Marek, Zhou, David(ChunMing),
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi, Christian 

Thanks for your patch. Those patches can fix the amdgpu bo pinning failure issue during dm_plane_helper_prepare_fb,
and Abaqus performance seems improved.

But some error messages can still be observed. Do we need to drop the amdgpu_vm_validate_pt_bos() error message
and other warning debug info?

[ 1910.674541] Call Trace:
[ 1910.676944]  [<ffffffff8b361dc1>] dump_stack+0x19/0x1b
[ 1910.682236]  [<ffffffff8ac97648>] __warn+0xd8/0x100
[ 1910.687195]  [<ffffffff8ac9778d>] warn_slowpath_null+0x1d/0x20
[ 1910.693167]  [<ffffffffc0603619>] amdgpu_bo_move+0x169/0x1c0 [amdgpu]
[ 1910.699719]  [<ffffffffc05c82bb>] ttm_bo_handle_move_mem+0x26b/0x5d0 [amdttm]
[ 1910.706976]  [<ffffffffc05c8767>] ttm_bo_evict+0x147/0x3b0 [amdttm]
[ 1910.713358]  [<ffffffffc04e88d9>] ? drm_mm_insert_node_in_range+0x299/0x4d0 [drm]
[ 1910.720881]  [<ffffffffc057652e>] ? _kcl_reservation_object_reserve_shared+0xfe/0x1a0 [amdkcl]
[ 1910.729710]  [<ffffffffc05c8c6e>] ttm_mem_evict_first+0x29e/0x3a0 [amdttm]
[ 1910.736705]  [<ffffffffc05c8f1e>] amdttm_bo_mem_space+0x1ae/0x300 [amdttm]
[ 1910.743696]  [<ffffffffc05c9544>] amdttm_bo_validate+0xc4/0x140 [amdttm]
[ 1910.750529]  [<ffffffffc060c035>] amdgpu_cs_bo_validate+0xa5/0x220 [amdgpu]
[ 1910.757625]  [<ffffffffc060c1f7>] amdgpu_cs_validate+0x47/0x2e0 [amdgpu]
[ 1910.764463]  [<ffffffffc060c1b0>] ? amdgpu_cs_bo_validate+0x220/0x220 [amdgpu]
[ 1910.771736]  [<ffffffffc0620652>] amdgpu_vm_validate_pt_bos+0x92/0x140 [amdgpu]
[ 1910.779248]  [<ffffffffc060e547>] amdgpu_cs_ioctl+0x18a7/0x1d50 [amdgpu]
[ 1910.785992]  [<ffffffffc060cca0>] ? amdgpu_cs_find_mapping+0x120/0x120 [amdgpu]
[ 1910.793486]  [<ffffffffc04e3f2c>] drm_ioctl_kernel+0x6c/0xb0 [drm]
[ 1910.799777]  [<ffffffffc04e4647>] drm_ioctl+0x1e7/0x420 [drm]
[ 1910.805643]  [<ffffffffc060cca0>] ? amdgpu_cs_find_mapping+0x120/0x120 [amdgpu]
[ 1910.813090]  [<ffffffffc05ec04b>] amdgpu_drm_ioctl+0x4b/0x80 [amdgpu]
[ 1910.819639]  [<ffffffff8ae56210>] do_vfs_ioctl+0x3a0/0x5a0
[ 1910.825217]  [<ffffffff8b36744a>] ? __schedule+0x13a/0x890
[ 1910.830795]  [<ffffffff8ae564b1>] SyS_ioctl+0xa1/0xc0
[ 1910.835943]  [<ffffffff8b374ddb>] system_call_fastpath+0x22/0x27
[ 1910.842048] ---[ end trace a5c00b151c061d53 ]---
[ 1910.846814] [TTM] Buffer eviction failed
[ 1910.850838] [drm:amdgpu_cs_ioctl [amdgpu]] *ERROR* amdgpu_vm_validate_pt_bos() failed.
[ 1910.858905] [drm:amdgpu_cs_ioctl [amdgpu]] *ERROR* Failed to process the buffer list -22!
.......

Thanks,
Prike
-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Wednesday, May 22, 2019 9:00 PM
To: Olsak, Marek <Marek.Olsak@amd.com>; Zhou, David(ChunMing) <David1.Zhou@amd.com>; Liang, Prike <Prike.Liang@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Subject: [PATCH 10/10] drm/amdgpu: stop removing BOs from the LRU v3

[CAUTION: External Email]

This avoids OOM situations when we have lots of threads submitting at the same time.

v3: apply this to the whole driver, not just CS

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c    | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    | 4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 20f2955d2a55..3e2da24cd17a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -648,7 +648,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
        }

        r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
-                                  &duplicates, true);
+                                  &duplicates, false);
        if (unlikely(r != 0)) {
                if (r != -ERESTARTSYS)
                        DRM_ERROR("ttm_eu_reserve_buffers failed.\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
index 06f83cac0d3a..f660628e6af9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
@@ -79,7 +79,7 @@ int amdgpu_map_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm,
        list_add(&csa_tv.head, &list);
        amdgpu_vm_get_pd_bo(vm, &list, &pd);

-       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
+       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, false);
        if (r) {
                DRM_ERROR("failed to reserve CSA,PD BOs: err=%d\n", r);
                return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index d513a5ad03dd..ed25a4e14404 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -171,7 +171,7 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,

        amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);

-       r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates, true);
+       r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates, 
+ false);
        if (r) {
                dev_err(adev->dev, "leaking bo va because "
                        "we fail to reserve bo (%d)\n", r); @@ -608,7 +608,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,

        amdgpu_vm_get_pd_bo(&fpriv->vm, &list, &vm_pd);

-       r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates, true);
+       r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates, 
+ false);
        if (r)
                goto error_unref;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index c430e8259038..d60593cc436e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -155,7 +155,7 @@ static inline int amdgpu_bo_reserve(struct amdgpu_bo *bo, bool no_intr)
        struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
        int r;

-       r = ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
+       r = __ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
        if (unlikely(r != 0)) {
                if (r != -ERESTARTSYS)
                        dev_err(adev->dev, "%p reserve failed\n", bo);
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 10/10] drm/amdgpu: stop removing BOs from the LRU v3
       [not found]       ` <48ac98a8-de22-3549-5d63-078a0effab72-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-23  9:06         ` Christian König
       [not found]           ` <eea6245e-616d-eb16-8521-2f21ce5d6d25-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Christian König @ 2019-05-23  9:06 UTC (permalink / raw)
  To: Kuehling, Felix, Olsak, Marek, Zhou, David(ChunMing),
	Liang, Prike, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Leaving BOs on the LRU is harmless. We always did this for VM page table 
and per VM BOs.

The key point is that BOs which couldn't be reserved can't be evicted. 
So what happened is that an application used basically all of VRAM 
during CS and because of this X server couldn't pin a BO for scanout.

Now we keep the BOs on the LRU and modify TTM to block for the CS to 
complete, which in turn allows the X server to pin its BO for scanout.

Christian.

Am 22.05.19 um 21:43 schrieb Kuehling, Felix:
> Can you explain how this avoids OOM situations? When is it safe to leave
> a reserved BO on the LRU list? Could we do the same thing in
> amdgpu_amdkfd_gpuvm.c? And if we did, what would be the expected side
> effects or consequences?
>
> Thanks,
>     Felix
>
> On 2019-05-22 8:59 a.m., Christian König wrote:
>> [CAUTION: External Email]
>>
>> This avoids OOM situations when we have lots of threads
>> submitting at the same time.
>>
>> v3: apply this to the whole driver, not just CS
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 2 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c    | 2 +-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    | 4 ++--
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 2 +-
>>    4 files changed, 5 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 20f2955d2a55..3e2da24cd17a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -648,7 +648,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>>           }
>>
>>           r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
>> -                                  &duplicates, true);
>> +                                  &duplicates, false);
>>           if (unlikely(r != 0)) {
>>                   if (r != -ERESTARTSYS)
>>                           DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>> index 06f83cac0d3a..f660628e6af9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>> @@ -79,7 +79,7 @@ int amdgpu_map_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>           list_add(&csa_tv.head, &list);
>>           amdgpu_vm_get_pd_bo(vm, &list, &pd);
>>
>> -       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
>> +       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, false);
>>           if (r) {
>>                   DRM_ERROR("failed to reserve CSA,PD BOs: err=%d\n", r);
>>                   return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index d513a5ad03dd..ed25a4e14404 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -171,7 +171,7 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
>>
>>           amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);
>>
>> -       r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates, true);
>> +       r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates, false);
>>           if (r) {
>>                   dev_err(adev->dev, "leaking bo va because "
>>                           "we fail to reserve bo (%d)\n", r);
>> @@ -608,7 +608,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
>>
>>           amdgpu_vm_get_pd_bo(&fpriv->vm, &list, &vm_pd);
>>
>> -       r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates, true);
>> +       r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates, false);
>>           if (r)
>>                   goto error_unref;
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> index c430e8259038..d60593cc436e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> @@ -155,7 +155,7 @@ static inline int amdgpu_bo_reserve(struct amdgpu_bo *bo, bool no_intr)
>>           struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>           int r;
>>
>> -       r = ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
>> +       r = __ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
>>           if (unlikely(r != 0)) {
>>                   if (r != -ERESTARTSYS)
>>                           dev_err(adev->dev, "%p reserve failed\n", bo);
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 01/10] drm/ttm: Make LRU removal optional.
       [not found] ` <20190522125947.4592-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
                     ` (5 preceding siblings ...)
  2019-05-22 12:59   ` [PATCH 09/10] drm/amdgpu: create GDS, GWS and OA in system domain Christian König
@ 2019-05-23  9:15   ` zhoucm1
       [not found]     ` <fbb023f9-28e7-2ac8-994f-e262da597098-5C7GfCeVMHo@public.gmane.org>
  6 siblings, 1 reply; 26+ messages in thread
From: zhoucm1 @ 2019-05-23  9:15 UTC (permalink / raw)
  To: Christian König, Marek.Olsak-5C7GfCeVMHo,
	David1.Zhou-5C7GfCeVMHo, Prike.Liang-5C7GfCeVMHo,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2019年05月22日 20:59, Christian König wrote:
> [CAUTION: External Email]
>
> We are already doing this for DMA-buf imports and also for
> amdgpu VM BOs for quite a while now.
>
> If this doesn't run into any problems we are probably going
> to stop removing BOs from the LRU altogether.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  9 +++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c       |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       |  4 ++--
>   drivers/gpu/drm/qxl/qxl_release.c             |  2 +-
>   drivers/gpu/drm/radeon/radeon_gem.c           |  2 +-
>   drivers/gpu/drm/radeon/radeon_object.c        |  2 +-
>   drivers/gpu/drm/ttm/ttm_execbuf_util.c        | 20 +++++++++++--------
>   drivers/gpu/drm/virtio/virtgpu_ioctl.c        |  2 +-
>   drivers/gpu/drm/vmwgfx/vmwgfx_resource.c      |  3 ++-
>   drivers/gpu/drm/vmwgfx/vmwgfx_validation.h    |  2 +-
>   include/drm/ttm/ttm_bo_driver.h               |  5 ++++-
>   include/drm/ttm/ttm_execbuf_util.h            |  3 ++-
>   13 files changed, 34 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index e1cae4a37113..647e18f9e136 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -574,7 +574,7 @@ static int reserve_bo_and_vm(struct kgd_mem *mem,
>          amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]);
>
>          ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
> -                                    false, &ctx->duplicates);
> +                                    false, &ctx->duplicates, true);
>          if (!ret)
>                  ctx->reserved = true;
>          else {
> @@ -647,7 +647,7 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
>          }
>
>          ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
> -                                    false, &ctx->duplicates);
> +                                    false, &ctx->duplicates, true);
>          if (!ret)
>                  ctx->reserved = true;
>          else
> @@ -1800,7 +1800,8 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
>          }
>
>          /* Reserve all BOs and page tables for validation */
> -       ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates);
> +       ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates,
> +                                    true);
>          WARN(!list_empty(&duplicates), "Duplicates should be empty");
>          if (ret)
>                  goto out_free;
> @@ -2006,7 +2007,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
>          }
>
>          ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list,
> -                                    false, &duplicate_save);
> +                                    false, &duplicate_save, true);
>          if (ret) {
>                  pr_debug("Memory eviction: TTM Reserve Failed. Try again\n");
>                  goto ttm_reserve_fail;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index d72cc583ebd1..fff558cf385b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -648,7 +648,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>          }
>
>          r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
> -                                  &duplicates);
> +                                  &duplicates, true);
>          if (unlikely(r != 0)) {
>                  if (r != -ERESTARTSYS)
>                          DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
> index 54dd02a898b9..06f83cac0d3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
> @@ -79,7 +79,7 @@ int amdgpu_map_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>          list_add(&csa_tv.head, &list);
>          amdgpu_vm_get_pd_bo(vm, &list, &pd);
>
> -       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL);
> +       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
>          if (r) {
>                  DRM_ERROR("failed to reserve CSA,PD BOs: err=%d\n", r);
>                  return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 7b840367004c..d513a5ad03dd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -171,7 +171,7 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
>
>          amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);
>
> -       r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates);
> +       r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates, true);
>          if (r) {
>                  dev_err(adev->dev, "leaking bo va because "
>                          "we fail to reserve bo (%d)\n", r);
> @@ -608,7 +608,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
>
>          amdgpu_vm_get_pd_bo(&fpriv->vm, &list, &vm_pd);
>
> -       r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates);
> +       r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates, true);
>          if (r)
>                  goto error_unref;
>
> diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c
> index 30f85f0130cb..49f9a9385393 100644
> --- a/drivers/gpu/drm/qxl/qxl_release.c
> +++ b/drivers/gpu/drm/qxl/qxl_release.c
> @@ -256,7 +256,7 @@ int qxl_release_reserve_list(struct qxl_release *release, bool no_intr)
>                  return 0;
>
>          ret = ttm_eu_reserve_buffers(&release->ticket, &release->bos,
> -                                    !no_intr, NULL);
> +                                    !no_intr, NULL, true);
>          if (ret)
>                  return ret;
>
> diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
> index 44617dec8183..7411e69e2712 100644
> --- a/drivers/gpu/drm/radeon/radeon_gem.c
> +++ b/drivers/gpu/drm/radeon/radeon_gem.c
> @@ -559,7 +559,7 @@ static void radeon_gem_va_update_vm(struct radeon_device *rdev,
>          if (!vm_bos)
>                  return;
>
> -       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL);
> +       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
>          if (r)
>                  goto error_free;
>
> diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
> index 833e909706a9..36683de0300b 100644
> --- a/drivers/gpu/drm/radeon/radeon_object.c
> +++ b/drivers/gpu/drm/radeon/radeon_object.c
> @@ -539,7 +539,7 @@ int radeon_bo_list_validate(struct radeon_device *rdev,
>          u64 bytes_moved_threshold = radeon_bo_get_threshold_for_moves(rdev);
>
>          INIT_LIST_HEAD(&duplicates);
> -       r = ttm_eu_reserve_buffers(ticket, head, true, &duplicates);
> +       r = ttm_eu_reserve_buffers(ticket, head, true, &duplicates, true);
>          if (unlikely(r != 0)) {
>                  return r;
>          }
> diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> index 0075eb9a0b52..957ec375a4ba 100644
> --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> @@ -69,7 +69,8 @@ void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
>          list_for_each_entry(entry, list, head) {
>                  struct ttm_buffer_object *bo = entry->bo;
>
> -               ttm_bo_add_to_lru(bo);
> +               if (list_empty(&bo->lru))
> +                       ttm_bo_add_to_lru(bo);
>                  reservation_object_unlock(bo->resv);
>          }
>          spin_unlock(&glob->lru_lock);
> @@ -93,7 +94,7 @@ EXPORT_SYMBOL(ttm_eu_backoff_reservation);
>
>   int ttm_eu_reserve_buffers(struct ww_acquire_ctx *ticket,
>                             struct list_head *list, bool intr,
> -                          struct list_head *dups)
> +                          struct list_head *dups, bool del_lru)
>   {
>          struct ttm_bo_global *glob;
>          struct ttm_validate_buffer *entry;
> @@ -172,11 +173,11 @@ int ttm_eu_reserve_buffers(struct ww_acquire_ctx *ticket,
>                  list_add(&entry->head, list);
>          }
>
> -       if (ticket)
> -               ww_acquire_done(ticket);
> -       spin_lock(&glob->lru_lock);
> -       ttm_eu_del_from_lru_locked(list);
> -       spin_unlock(&glob->lru_lock);
> +       if (del_lru) {
> +               spin_lock(&glob->lru_lock);
> +               ttm_eu_del_from_lru_locked(list);
> +               spin_unlock(&glob->lru_lock);
> +       }
>          return 0;
>   }
>   EXPORT_SYMBOL(ttm_eu_reserve_buffers);
> @@ -203,7 +204,10 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket,
>                          reservation_object_add_shared_fence(bo->resv, fence);
>                  else
>                          reservation_object_add_excl_fence(bo->resv, fence);
> -               ttm_bo_add_to_lru(bo);
> +               if (list_empty(&bo->lru))
> +                       ttm_bo_add_to_lru(bo);
> +               else
> +                       ttm_bo_move_to_lru_tail(bo, NULL);
can ttm_bo_move_to_lru_tail be moved to ttm_eu_reserve_buffers when 
del_lru is false?

-David

>                  reservation_object_unlock(bo->resv);
>          }
>          spin_unlock(&glob->lru_lock);
> diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
> index 161b80fee492..5cffaa24259f 100644
> --- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c
> +++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c
> @@ -63,7 +63,7 @@ static int virtio_gpu_object_list_validate(struct ww_acquire_ctx *ticket,
>          struct virtio_gpu_object *qobj;
>          int ret;
>
> -       ret = ttm_eu_reserve_buffers(ticket, head, true, NULL);
> +       ret = ttm_eu_reserve_buffers(ticket, head, true, NULL, true);
>          if (ret != 0)
>                  return ret;
>
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> index a7c30e567f09..d28cbedba0b5 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> @@ -465,7 +465,8 @@ vmw_resource_check_buffer(struct ww_acquire_ctx *ticket,
>          val_buf->bo = &res->backup->base;
>          val_buf->num_shared = 0;
>          list_add_tail(&val_buf->head, &val_list);
> -       ret = ttm_eu_reserve_buffers(ticket, &val_list, interruptible, NULL);
> +       ret = ttm_eu_reserve_buffers(ticket, &val_list, interruptible, NULL,
> +                                    true);
>          if (unlikely(ret != 0))
>                  goto out_no_reserve;
>
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
> index 3b396fea40d7..ac435b51f4eb 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
> @@ -165,7 +165,7 @@ vmw_validation_bo_reserve(struct vmw_validation_context *ctx,
>                            bool intr)
>   {
>          return ttm_eu_reserve_buffers(&ctx->ticket, &ctx->bo_list, intr,
> -                                     NULL);
> +                                     NULL, true);
>   }
>
>   /**
> diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
> index c008346c2401..fc0d995ac90d 100644
> --- a/include/drm/ttm/ttm_bo_driver.h
> +++ b/include/drm/ttm/ttm_bo_driver.h
> @@ -769,7 +769,10 @@ static inline void ttm_bo_unreserve(struct ttm_buffer_object *bo)
>   {
>          if (!(bo->mem.placement & TTM_PL_FLAG_NO_EVICT)) {
>                  spin_lock(&bo->bdev->glob->lru_lock);
> -               ttm_bo_add_to_lru(bo);
> +               if (list_empty(&bo->lru))
> +                       ttm_bo_add_to_lru(bo);
> +               else
> +                       ttm_bo_move_to_lru_tail(bo, NULL);
>                  spin_unlock(&bo->bdev->glob->lru_lock);
>          }
>          reservation_object_unlock(bo->resv);
> diff --git a/include/drm/ttm/ttm_execbuf_util.h b/include/drm/ttm/ttm_execbuf_util.h
> index 621615fa7728..7e46cc678e7e 100644
> --- a/include/drm/ttm/ttm_execbuf_util.h
> +++ b/include/drm/ttm/ttm_execbuf_util.h
> @@ -70,6 +70,7 @@ extern void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
>    * @list:    thread private list of ttm_validate_buffer structs.
>    * @intr:    should the wait be interruptible
>    * @dups:    [out] optional list of duplicates.
> + * @del_lru: true if BOs should be removed from the LRU.
>    *
>    * Tries to reserve bos pointed to by the list entries for validation.
>    * If the function returns 0, all buffers are marked as "unfenced",
> @@ -98,7 +99,7 @@ extern void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
>
>   extern int ttm_eu_reserve_buffers(struct ww_acquire_ctx *ticket,
>                                    struct list_head *list, bool intr,
> -                                 struct list_head *dups);
> +                                 struct list_head *dups, bool del_lru);
>
>   /**
>    * function ttm_eu_fence_buffer_objects.
> --
> 2.17.1
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 01/10] drm/ttm: Make LRU removal optional.
       [not found]     ` <fbb023f9-28e7-2ac8-994f-e262da597098-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-23  9:39       ` Christian König
  0 siblings, 0 replies; 26+ messages in thread
From: Christian König @ 2019-05-23  9:39 UTC (permalink / raw)
  To: zhoucm1, Marek.Olsak-5C7GfCeVMHo, David1.Zhou-5C7GfCeVMHo,
	Prike.Liang-5C7GfCeVMHo,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 23.05.19 um 11:15 schrieb zhoucm1:
> On 2019年05月22日 20:59, Christian König wrote:
>> [SNIP]
>> @@ -203,7 +204,10 @@ void ttm_eu_fence_buffer_objects(struct 
>> ww_acquire_ctx *ticket,
>> reservation_object_add_shared_fence(bo->resv, fence);
>>                  else
>> reservation_object_add_excl_fence(bo->resv, fence);
>> -               ttm_bo_add_to_lru(bo);
>> +               if (list_empty(&bo->lru))
>> +                       ttm_bo_add_to_lru(bo);
>> +               else
>> +                       ttm_bo_move_to_lru_tail(bo, NULL);
> can ttm_bo_move_to_lru_tail be moved to ttm_eu_reserve_buffers when 
> del_lru is false?

No, that won't work.

The BO might have moved to another domain and when we have the 
ttm_bo_move_to_lru_tail() in the reservation we won't be handling this 
correctly.

Christian.

>
> -David

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
  2019-05-22 12:59   ` [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10 Christian König
@ 2019-05-23 10:24     ` zhoucm1
  2019-05-23 11:03       ` Christian König
       [not found]     ` <20190522125947.4592-6-christian.koenig-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 26+ messages in thread
From: zhoucm1 @ 2019-05-23 10:24 UTC (permalink / raw)
  To: Christian König, Marek.Olsak, David1.Zhou, Prike.Liang,
	dri-devel, amd-gfx



On 2019年05月22日 20:59, Christian König wrote:
> [CAUTION: External Email]
>
> BOs on the LRU might be blocked during command submission
> and cause OOM situations.
>
> Avoid this by blocking for the first busy BO not locked by
> the same ticket as the BO we are searching space for.
>
> v10: completely start over with the patch since we didn't
>       handle a whole bunch of corner cases.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
>   1 file changed, 66 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 4c6389d849ed..861facac33d4 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>    * b. Otherwise, trylock it.
>    */
>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> -                       struct ttm_operation_ctx *ctx, bool *locked)
> +                       struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>   {
>          bool ret = false;
>
> -       *locked = false;
>          if (bo->resv == ctx->resv) {
>                  reservation_object_assert_held(bo->resv);
>                  if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>                      || !list_empty(&bo->ddestroy))
>                          ret = true;
> +               *locked = false;
> +               if (busy)
> +                       *busy = false;
>          } else {
> -               *locked = reservation_object_trylock(bo->resv);
> -               ret = *locked;
> +               ret = reservation_object_trylock(bo->resv);
> +               *locked = ret;
> +               if (busy)
> +                       *busy = !ret;
>          }
>
>          return ret;
>   }
>
> +/**
> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
> + *
> + * @busy_bo: BO which couldn't be locked with trylock
> + * @ctx: operation context
> + * @ticket: acquire ticket
> + *
> + * Try to lock a busy buffer object to avoid failing eviction.
> + */
> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
> +                                  struct ttm_operation_ctx *ctx,
> +                                  struct ww_acquire_ctx *ticket)
> +{
> +       int r;
> +
> +       if (!busy_bo || !ticket)
> +               return -EBUSY;
> +
> +       if (ctx->interruptible)
> +               r = reservation_object_lock_interruptible(busy_bo->resv,
> +                                                         ticket);
> +       else
> +               r = reservation_object_lock(busy_bo->resv, ticket);
> +
> +       /*
> +        * TODO: It would be better to keep the BO locked until allocation is at
> +        * least tried one more time, but that would mean a much larger rework
> +        * of TTM.
> +        */
> +       if (!r)
> +               reservation_object_unlock(busy_bo->resv);
> +
> +       return r == -EDEADLK ? -EAGAIN : r;
> +}
> +
>   static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>                                 uint32_t mem_type,
>                                 const struct ttm_place *place,
> -                              struct ttm_operation_ctx *ctx)
> +                              struct ttm_operation_ctx *ctx,
> +                              struct ww_acquire_ctx *ticket)
>   {
> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>          struct ttm_bo_global *glob = bdev->glob;
>          struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> -       struct ttm_buffer_object *bo = NULL;
>          bool locked = false;
>          unsigned i;
>          int ret;
> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>          spin_lock(&glob->lru_lock);
>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                  list_for_each_entry(bo, &man->lru[i], lru) {
> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
> +                       bool busy;
> +
> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +                                                           &busy)) {
> +                               if (busy && !busy_bo &&
> +                                   bo->resv->lock.ctx != ticket)
> +                                       busy_bo = bo;
>                                  continue;
> +                       }
>
>                          if (place && !bdev->driver->eviction_valuable(bo,
>                                                                        place)) {
> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>          }
>
>          if (!bo) {
> +               if (busy_bo)
> +                       ttm_bo_get(busy_bo);
>                  spin_unlock(&glob->lru_lock);
> -               return -EBUSY;
> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
If you rely on EAGAIN, why do you still try to lock busy_bo? Any 
negative effect if we directly return EAGAIN without trying the lock?

-David
> +               if (busy_bo)
> +                       ttm_bo_put(busy_bo);
> +               return ret;
>          }
>
>          kref_get(&bo->list_kref);
> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>                          return ret;
>                  if (mem->mm_node)
>                          break;
> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx);
> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx,
> +                                         bo->resv->lock.ctx);
>                  if (unlikely(ret != 0))
>                          return ret;
>          } while (1);
> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                  while (!list_empty(&man->lru[i])) {
>                          spin_unlock(&glob->lru_lock);
> -                       ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
> +                       ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
> +                                                 NULL);
>                          if (ret)
>                                  return ret;
>                          spin_lock(&glob->lru_lock);
> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>          spin_lock(&glob->lru_lock);
>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                  list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +                                                          NULL)) {
>                                  ret = 0;
>                                  break;
>                          }
> --
> 2.17.1
>

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
  2019-05-23 10:24     ` zhoucm1
@ 2019-05-23 11:03       ` Christian König
       [not found]         ` <16918096-1430-d581-7284-a987aacb89da-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2019-05-24  5:35         ` Liang, Prike
  0 siblings, 2 replies; 26+ messages in thread
From: Christian König @ 2019-05-23 11:03 UTC (permalink / raw)
  To: zhoucm1, Marek.Olsak, David1.Zhou, Prike.Liang, dri-devel, amd-gfx

Am 23.05.19 um 12:24 schrieb zhoucm1:
>
>
> On 2019年05月22日 20:59, Christian König wrote:
>> [CAUTION: External Email]
>>
>> BOs on the LRU might be blocked during command submission
>> and cause OOM situations.
>>
>> Avoid this by blocking for the first busy BO not locked by
>> the same ticket as the BO we are searching space for.
>>
>> v10: completely start over with the patch since we didn't
>>       handle a whole bunch of corner cases.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
>>   1 file changed, 66 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>> index 4c6389d849ed..861facac33d4 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>    * b. Otherwise, trylock it.
>>    */
>>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object 
>> *bo,
>> -                       struct ttm_operation_ctx *ctx, bool *locked)
>> +                       struct ttm_operation_ctx *ctx, bool *locked, 
>> bool *busy)
>>   {
>>          bool ret = false;
>>
>> -       *locked = false;
>>          if (bo->resv == ctx->resv) {
>>                  reservation_object_assert_held(bo->resv);
>>                  if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>                      || !list_empty(&bo->ddestroy))
>>                          ret = true;
>> +               *locked = false;
>> +               if (busy)
>> +                       *busy = false;
>>          } else {
>> -               *locked = reservation_object_trylock(bo->resv);
>> -               ret = *locked;
>> +               ret = reservation_object_trylock(bo->resv);
>> +               *locked = ret;
>> +               if (busy)
>> +                       *busy = !ret;
>>          }
>>
>>          return ret;
>>   }
>>
>> +/**
>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
>> + *
>> + * @busy_bo: BO which couldn't be locked with trylock
>> + * @ctx: operation context
>> + * @ticket: acquire ticket
>> + *
>> + * Try to lock a busy buffer object to avoid failing eviction.
>> + */
>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
>> +                                  struct ttm_operation_ctx *ctx,
>> +                                  struct ww_acquire_ctx *ticket)
>> +{
>> +       int r;
>> +
>> +       if (!busy_bo || !ticket)
>> +               return -EBUSY;
>> +
>> +       if (ctx->interruptible)
>> +               r = reservation_object_lock_interruptible(busy_bo->resv,
>> + ticket);
>> +       else
>> +               r = reservation_object_lock(busy_bo->resv, ticket);
>> +
>> +       /*
>> +        * TODO: It would be better to keep the BO locked until 
>> allocation is at
>> +        * least tried one more time, but that would mean a much 
>> larger rework
>> +        * of TTM.
>> +        */
>> +       if (!r)
>> +               reservation_object_unlock(busy_bo->resv);
>> +
>> +       return r == -EDEADLK ? -EAGAIN : r;
>> +}
>> +
>>   static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>                                 uint32_t mem_type,
>>                                 const struct ttm_place *place,
>> -                              struct ttm_operation_ctx *ctx)
>> +                              struct ttm_operation_ctx *ctx,
>> +                              struct ww_acquire_ctx *ticket)
>>   {
>> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>>          struct ttm_bo_global *glob = bdev->glob;
>>          struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>> -       struct ttm_buffer_object *bo = NULL;
>>          bool locked = false;
>>          unsigned i;
>>          int ret;
>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct 
>> ttm_bo_device *bdev,
>>          spin_lock(&glob->lru_lock);
>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                  list_for_each_entry(bo, &man->lru[i], lru) {
>> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx, 
>> &locked))
>> +                       bool busy;
>> +
>> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx, 
>> &locked,
>> + &busy)) {
>> +                               if (busy && !busy_bo &&
>> +                                   bo->resv->lock.ctx != ticket)
>> +                                       busy_bo = bo;
>>                                  continue;
>> +                       }
>>
>>                          if (place && 
>> !bdev->driver->eviction_valuable(bo,
>> place)) {
>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct 
>> ttm_bo_device *bdev,
>>          }
>>
>>          if (!bo) {
>> +               if (busy_bo)
>> +                       ttm_bo_get(busy_bo);
>>                  spin_unlock(&glob->lru_lock);
>> -               return -EBUSY;
>> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
> If you rely on EAGAIN, why do you still try to lock busy_bo? Any 
> negative effect if we directly return EAGAIN without trying the lock?

Yeah, that would burn a lot of CPU cycles because we would essentially 
busy wait for the BO to become unlocked.

When we only return in case of a deadlock, the other thread can continue 
with its eviction while we reacquire all locks during EAGAIN handling.

Even directly unlocking the BO as I do here is a bit questionable. But I 
couldn't get the original logic of finding a new BO to evict to work 
correctly; that's why I have the TODO comment in the function itself as 
well.

Christian.

>
> -David
>> +               if (busy_bo)
>> +                       ttm_bo_put(busy_bo);
>> +               return ret;
>>          }
>>
>>          kref_get(&bo->list_kref);
>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct 
>> ttm_buffer_object *bo,
>>                          return ret;
>>                  if (mem->mm_node)
>>                          break;
>> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place, 
>> ctx);
>> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place, 
>> ctx,
>> + bo->resv->lock.ctx);
>>                  if (unlikely(ret != 0))
>>                          return ret;
>>          } while (1);
>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct 
>> ttm_bo_device *bdev,
>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                  while (!list_empty(&man->lru[i])) {
>>                          spin_unlock(&glob->lru_lock);
>> -                       ret = ttm_mem_evict_first(bdev, mem_type, 
>> NULL, &ctx);
>> +                       ret = ttm_mem_evict_first(bdev, mem_type, 
>> NULL, &ctx,
>> +                                                 NULL);
>>                          if (ret)
>>                                  return ret;
>>                          spin_lock(&glob->lru_lock);
>> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, 
>> struct ttm_operation_ctx *ctx)
>>          spin_lock(&glob->lru_lock);
>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                  list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx, 
>> &locked)) {
>> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx, 
>> &locked,
>> + NULL)) {
>>                                  ret = 0;
>>                                  break;
>>                          }
>> -- 
>> 2.17.1
>>
>

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
       [not found]         ` <16918096-1430-d581-7284-a987aacb89da-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2019-05-23 11:50           ` Chunming Zhou
       [not found]             ` <5d68ba04-250d-918e-3633-ec45e5b18904-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Chunming Zhou @ 2019-05-23 11:50 UTC (permalink / raw)
  To: Koenig, Christian, Olsak, Marek, Zhou, David(ChunMing),
	Liang, Prike, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


在 2019/5/23 19:03, Christian König 写道:
> [CAUTION: External Email]
>
> Am 23.05.19 um 12:24 schrieb zhoucm1:
>>
>>
>> On 2019年05月22日 20:59, Christian König wrote:
>>> [CAUTION: External Email]
>>>
>>> BOs on the LRU might be blocked during command submission
>>> and cause OOM situations.
>>>
>>> Avoid this by blocking for the first busy BO not locked by
>>> the same ticket as the BO we are searching space for.
>>>
>>> v10: completely start over with the patch since we didn't
>>>       handled a whole bunch of corner cases.
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> ---
>>>   drivers/gpu/drm/ttm/ttm_bo.c | 77 
>>> ++++++++++++++++++++++++++++++------
>>>   1 file changed, 66 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 
>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>> index 4c6389d849ed..861facac33d4 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>    * b. Otherwise, trylock it.
>>>    */
>>>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object
>>> *bo,
>>> -                       struct ttm_operation_ctx *ctx, bool *locked)
>>> +                       struct ttm_operation_ctx *ctx, bool *locked,
>>> bool *busy)
>>>   {
>>>          bool ret = false;
>>>
>>> -       *locked = false;
>>>          if (bo->resv == ctx->resv) {
>>>                  reservation_object_assert_held(bo->resv);
>>>                  if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>>                      || !list_empty(&bo->ddestroy))
>>>                          ret = true;
>>> +               *locked = false;
>>> +               if (busy)
>>> +                       *busy = false;
>>>          } else {
>>> -               *locked = reservation_object_trylock(bo->resv);
>>> -               ret = *locked;
>>> +               ret = reservation_object_trylock(bo->resv);
>>> +               *locked = ret;
>>> +               if (busy)
>>> +                       *busy = !ret;
>>>          }
>>>
>>>          return ret;
>>>   }
>>>
>>> +/**
>>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
>>> + *
>>> + * @busy_bo: BO which couldn't be locked with trylock
>>> + * @ctx: operation context
>>> + * @ticket: acquire ticket
>>> + *
>>> + * Try to lock a busy buffer object to avoid failing eviction.
>>> + */
>>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
>>> +                                  struct ttm_operation_ctx *ctx,
>>> +                                  struct ww_acquire_ctx *ticket)
>>> +{
>>> +       int r;
>>> +
>>> +       if (!busy_bo || !ticket)
>>> +               return -EBUSY;
>>> +
>>> +       if (ctx->interruptible)
>>> +               r = 
>>> reservation_object_lock_interruptible(busy_bo->resv,
>>> + ticket);
>>> +       else
>>> +               r = reservation_object_lock(busy_bo->resv, ticket);
>>> +
>>> +       /*
>>> +        * TODO: It would be better to keep the BO locked until
>>> allocation is at
>>> +        * least tried one more time, but that would mean a much
>>> larger rework
>>> +        * of TTM.
>>> +        */
>>> +       if (!r)
>>> +               reservation_object_unlock(busy_bo->resv);
>>> +
>>> +       return r == -EDEADLK ? -EAGAIN : r;
>>> +}
>>> +
>>>   static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>                                 uint32_t mem_type,
>>>                                 const struct ttm_place *place,
>>> -                              struct ttm_operation_ctx *ctx)
>>> +                              struct ttm_operation_ctx *ctx,
>>> +                              struct ww_acquire_ctx *ticket)
>>>   {
>>> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>>>          struct ttm_bo_global *glob = bdev->glob;
>>>          struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>> -       struct ttm_buffer_object *bo = NULL;
>>>          bool locked = false;
>>>          unsigned i;
>>>          int ret;
>>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct
>>> ttm_bo_device *bdev,
>>>          spin_lock(&glob->lru_lock);
>>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>                  list_for_each_entry(bo, &man->lru[i], lru) {
>>> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked))
>>> +                       bool busy;
>>> +
>>> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked,
>>> + &busy)) {
>>> +                               if (busy && !busy_bo &&
>>> +                                   bo->resv->lock.ctx != ticket)
>>> +                                       busy_bo = bo;
>>>                                  continue;
>>> +                       }
>>>
>>>                          if (place &&
>>> !bdev->driver->eviction_valuable(bo,
>>> place)) {
>>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct
>>> ttm_bo_device *bdev,
>>>          }
>>>
>>>          if (!bo) {
>>> +               if (busy_bo)
>>> +                       ttm_bo_get(busy_bo);
>>>                  spin_unlock(&glob->lru_lock);
>>> -               return -EBUSY;
>>> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
>> If you rely on EAGAIN, why do you still try to lock busy_bo? any
>> negative effect if directly return EAGAIN without tring lock?
>
> Yeah, that would burn a lot of CPU cycles because we would essentially
> busy wait for the BO to become unlocked.
>
> When we only return in case of a deadlock the other thread can continue
> with its eviction while we reacquire all locks during EAGAIN handling.
>
> Even directly unlocking the BO as I do here is a bit questionable. But I
> couldn't get the original logic with finding a new BO to evict to work
> correctly, that's why I have the TODO comment in the function itself as
> well.

Yes, it looks very weird.

The original logic should already work; it was verified by Prike. As a 
friendly suggestion: you need to go through the LRU lookup loop again, 
judge whether eviction is allowable, check whether busy_bo->resv and the 
required BO's resv are the same, and record evict-allowable in ctx for 
the same lock of busy_bo before looping again.

Or at least, if busy_bo is allowed to be evicted, you can directly evict 
busy_bo after locking it successfully.

-David

>
> Christian.
>
>>
>> -David
>>> +               if (busy_bo)
>>> +                       ttm_bo_put(busy_bo);
>>> +               return ret;
>>>          }
>>>
>>>          kref_get(&bo->list_kref);
>>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct
>>> ttm_buffer_object *bo,
>>>                          return ret;
>>>                  if (mem->mm_node)
>>>                          break;
>>> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>>> ctx);
>>> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>>> ctx,
>>> + bo->resv->lock.ctx);
>>>                  if (unlikely(ret != 0))
>>>                          return ret;
>>>          } while (1);
>>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct
>>> ttm_bo_device *bdev,
>>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>                  while (!list_empty(&man->lru[i])) {
>>>                          spin_unlock(&glob->lru_lock);
>>> -                       ret = ttm_mem_evict_first(bdev, mem_type,
>>> NULL, &ctx);
>>> +                       ret = ttm_mem_evict_first(bdev, mem_type,
>>> NULL, &ctx,
>>> +                                                 NULL);
>>>                          if (ret)
>>>                                  return ret;
>>>                          spin_lock(&glob->lru_lock);
>>> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
>>> struct ttm_operation_ctx *ctx)
>>>          spin_lock(&glob->lru_lock);
>>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>                  list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked)) {
>>> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked,
>>> + NULL)) {
>>>                                  ret = 0;
>>>                                  break;
>>>                          }
>>> -- 
>>> 2.17.1
>>>
>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
       [not found]             ` <5d68ba04-250d-918e-3633-ec45e5b18904-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-23 14:15               ` Koenig, Christian
  0 siblings, 0 replies; 26+ messages in thread
From: Koenig, Christian @ 2019-05-23 14:15 UTC (permalink / raw)
  To: Zhou, David(ChunMing),
	Olsak, Marek, Liang, Prike,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 23.05.19 um 13:50 schrieb Zhou, David(ChunMing):
> 在 2019/5/23 19:03, Christian König 写道:
>> [CAUTION: External Email]
>>
>> Am 23.05.19 um 12:24 schrieb zhoucm1:
>>>
>>> On 2019年05月22日 20:59, Christian König wrote:
>>>> [CAUTION: External Email]
>>>>
>>>> BOs on the LRU might be blocked during command submission
>>>> and cause OOM situations.
>>>>
>>>> Avoid this by blocking for the first busy BO not locked by
>>>> the same ticket as the BO we are searching space for.
>>>>
>>>> v10: completely start over with the patch since we didn't
>>>>        handled a whole bunch of corner cases.
>>>>
>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/ttm/ttm_bo.c | 77
>>>> ++++++++++++++++++++++++++++++------
>>>>    1 file changed, 66 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>> index 4c6389d849ed..861facac33d4 100644
>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>>     * b. Otherwise, trylock it.
>>>>     */
>>>>    static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object
>>>> *bo,
>>>> -                       struct ttm_operation_ctx *ctx, bool *locked)
>>>> +                       struct ttm_operation_ctx *ctx, bool *locked,
>>>> bool *busy)
>>>>    {
>>>>           bool ret = false;
>>>>
>>>> -       *locked = false;
>>>>           if (bo->resv == ctx->resv) {
>>>>                   reservation_object_assert_held(bo->resv);
>>>>                   if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>>>                       || !list_empty(&bo->ddestroy))
>>>>                           ret = true;
>>>> +               *locked = false;
>>>> +               if (busy)
>>>> +                       *busy = false;
>>>>           } else {
>>>> -               *locked = reservation_object_trylock(bo->resv);
>>>> -               ret = *locked;
>>>> +               ret = reservation_object_trylock(bo->resv);
>>>> +               *locked = ret;
>>>> +               if (busy)
>>>> +                       *busy = !ret;
>>>>           }
>>>>
>>>>           return ret;
>>>>    }
>>>>
>>>> +/**
>>>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
>>>> + *
>>>> + * @busy_bo: BO which couldn't be locked with trylock
>>>> + * @ctx: operation context
>>>> + * @ticket: acquire ticket
>>>> + *
>>>> + * Try to lock a busy buffer object to avoid failing eviction.
>>>> + */
>>>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
>>>> +                                  struct ttm_operation_ctx *ctx,
>>>> +                                  struct ww_acquire_ctx *ticket)
>>>> +{
>>>> +       int r;
>>>> +
>>>> +       if (!busy_bo || !ticket)
>>>> +               return -EBUSY;
>>>> +
>>>> +       if (ctx->interruptible)
>>>> +               r =
>>>> reservation_object_lock_interruptible(busy_bo->resv,
>>>> + ticket);
>>>> +       else
>>>> +               r = reservation_object_lock(busy_bo->resv, ticket);
>>>> +
>>>> +       /*
>>>> +        * TODO: It would be better to keep the BO locked until
>>>> allocation is at
>>>> +        * least tried one more time, but that would mean a much
>>>> larger rework
>>>> +        * of TTM.
>>>> +        */
>>>> +       if (!r)
>>>> +               reservation_object_unlock(busy_bo->resv);
>>>> +
>>>> +       return r == -EDEADLK ? -EAGAIN : r;
>>>> +}
>>>> +
>>>>    static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>>                                  uint32_t mem_type,
>>>>                                  const struct ttm_place *place,
>>>> -                              struct ttm_operation_ctx *ctx)
>>>> +                              struct ttm_operation_ctx *ctx,
>>>> +                              struct ww_acquire_ctx *ticket)
>>>>    {
>>>> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>>>>           struct ttm_bo_global *glob = bdev->glob;
>>>>           struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>>> -       struct ttm_buffer_object *bo = NULL;
>>>>           bool locked = false;
>>>>           unsigned i;
>>>>           int ret;
>>>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct
>>>> ttm_bo_device *bdev,
>>>>           spin_lock(&glob->lru_lock);
>>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>                   list_for_each_entry(bo, &man->lru[i], lru) {
>>>> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>>>> &locked))
>>>> +                       bool busy;
>>>> +
>>>> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>>>> &locked,
>>>> + &busy)) {
>>>> +                               if (busy && !busy_bo &&
>>>> +                                   bo->resv->lock.ctx != ticket)
>>>> +                                       busy_bo = bo;
>>>>                                   continue;
>>>> +                       }
>>>>
>>>>                           if (place &&
>>>> !bdev->driver->eviction_valuable(bo,
>>>> place)) {
>>>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct
>>>> ttm_bo_device *bdev,
>>>>           }
>>>>
>>>>           if (!bo) {
>>>> +               if (busy_bo)
>>>> +                       ttm_bo_get(busy_bo);
>>>>                   spin_unlock(&glob->lru_lock);
>>>> -               return -EBUSY;
>>>> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
>>> If you rely on EAGAIN, why do you still try to lock busy_bo? any
>>> negative effect if directly return EAGAIN without tring lock?
>> Yeah, that would burn a lot of CPU cycles because we would essentially
>> busy wait for the BO to become unlocked.
>>
>> When we only return in case of a deadlock the other thread can continue
>> with its eviction while we reacquire all locks during EAGAIN handling.
>>
>> Even directly unlocking the BO as I do here is a bit questionable. But I
>> couldn't get the original logic with finding a new BO to evict to work
>> correctly, that's why I have the TODO comment in the function itself as
>> well.
> Yes, it looks very weird.
>
> The original logic should already work; it was verified by Prike.

Unfortunately that didn't worked out correctly. Marek came up with a GDS 
related test case which showed that we corrupted the locking object somehow.

> Friendly, you
> need go through lookup lru loop again, judge allowable, and whether
> busy_bo->resv and requried_bo->resv are same, and make evict_allowable
> in ctx for same lock of busy_bo before loop again.

Actually we don't necessarily need to do this. See Mareks test case was 
that you not only have two tasks fighting for resources, but three.

So what happens is that task A is doing a command submission swapping 
things in and task B and C are waiting. When A unlocks  B got the lock 
and swapped things out again. When B then unlocked C got the lock and 
also tried to evict something and failed miserable.

In this failing path of C we somehow overwrote the memory of the lock 
(some pointer must have gone banana).

Anyway, the solution we have here now seems to work for both Prike's and 
Marek's test cases, but it is definitely not ideal. E.g. we rely on 
either being able to evict something sooner or later, or running out of 
BOs to wait on to become idle.

Apart from that Prike now reported an OOM in the VM code, but that is 
completely unrelated to this problem.

Regards,
Christian.

> Or at least, if busy_bo is evict allowable, you  can direclty evict
> busy_bo after locked successfully.
>
> -David
>
>> Christian.
>>
>>> -David
>>>> +               if (busy_bo)
>>>> +                       ttm_bo_put(busy_bo);
>>>> +               return ret;
>>>>           }
>>>>
>>>>           kref_get(&bo->list_kref);
>>>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct
>>>> ttm_buffer_object *bo,
>>>>                           return ret;
>>>>                   if (mem->mm_node)
>>>>                           break;
>>>> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>>>> ctx);
>>>> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>>>> ctx,
>>>> + bo->resv->lock.ctx);
>>>>                   if (unlikely(ret != 0))
>>>>                           return ret;
>>>>           } while (1);
>>>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct
>>>> ttm_bo_device *bdev,
>>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>                   while (!list_empty(&man->lru[i])) {
>>>>                           spin_unlock(&glob->lru_lock);
>>>> -                       ret = ttm_mem_evict_first(bdev, mem_type,
>>>> NULL, &ctx);
>>>> +                       ret = ttm_mem_evict_first(bdev, mem_type,
>>>> NULL, &ctx,
>>>> +                                                 NULL);
>>>>                           if (ret)
>>>>                                   return ret;
>>>>                           spin_lock(&glob->lru_lock);
>>>> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
>>>> struct ttm_operation_ctx *ctx)
>>>>           spin_lock(&glob->lru_lock);
>>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>>                   list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>>> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx,
>>>> &locked)) {
>>>> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx,
>>>> &locked,
>>>> + NULL)) {
>>>>                                   ret = 0;
>>>>                                   break;
>>>>                           }
>>>> -- 
>>>> 2.17.1
>>>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
  2019-05-23 11:03       ` Christian König
       [not found]         ` <16918096-1430-d581-7284-a987aacb89da-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2019-05-24  5:35         ` Liang, Prike
       [not found]           ` <MN2PR12MB35364235378F29899838CD80FB020-rweVpJHSKTovpq7YPKzLfQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  1 sibling, 1 reply; 26+ messages in thread
From: Liang, Prike @ 2019-05-24  5:35 UTC (permalink / raw)
  To: Koenig, Christian, Zhou, David(ChunMing),
	Olsak, Marek, dri-devel, amd-gfx

Using Abaqus to torture the amdgpu driver repeatedly will run into a deadlock when locking the first busy BO. The caller will then
return EAGAIN, and eventually dm_plane_helper_prepare_fb pops up a pin-failed message. For this case, in patch #7
 could we treat EAGAIN like ERESTARTSYS, which would filter out the annoying error message?

Thanks,
Prike
-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Thursday, May 23, 2019 7:04 PM
To: Zhou, David(ChunMing) <David1.Zhou@amd.com>; Olsak, Marek <Marek.Olsak@amd.com>; Zhou, David(ChunMing) <David1.Zhou@amd.com>; Liang, Prike <Prike.Liang@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10

[CAUTION: External Email]

Am 23.05.19 um 12:24 schrieb zhoucm1:
>
>
> On 2019年05月22日 20:59, Christian König wrote:
>> [CAUTION: External Email]
>>
>> BOs on the LRU might be blocked during command submission and cause 
>> OOM situations.
>>
>> Avoid this by blocking for the first busy BO not locked by the same 
>> ticket as the BO we are searching space for.
>>
>> v10: completely start over with the patch since we didn't
>>       handled a whole bunch of corner cases.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
>>   1 file changed, 66 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 
>> b/drivers/gpu/drm/ttm/ttm_bo.c index 4c6389d849ed..861facac33d4 
>> 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>    * b. Otherwise, trylock it.
>>    */
>>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object 
>> *bo,
>> -                       struct ttm_operation_ctx *ctx, bool *locked)
>> +                       struct ttm_operation_ctx *ctx, bool *locked,
>> bool *busy)
>>   {
>>          bool ret = false;
>>
>> -       *locked = false;
>>          if (bo->resv == ctx->resv) {
>>                  reservation_object_assert_held(bo->resv);
>>                  if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>                      || !list_empty(&bo->ddestroy))
>>                          ret = true;
>> +               *locked = false;
>> +               if (busy)
>> +                       *busy = false;
>>          } else {
>> -               *locked = reservation_object_trylock(bo->resv);
>> -               ret = *locked;
>> +               ret = reservation_object_trylock(bo->resv);
>> +               *locked = ret;
>> +               if (busy)
>> +                       *busy = !ret;
>>          }
>>
>>          return ret;
>>   }
>>
>> +/**
>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
>> + *
>> + * @busy_bo: BO which couldn't be locked with trylock
>> + * @ctx: operation context
>> + * @ticket: acquire ticket
>> + *
>> + * Try to lock a busy buffer object to avoid failing eviction.
>> + */
>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
>> +                                  struct ttm_operation_ctx *ctx,
>> +                                  struct ww_acquire_ctx *ticket) {
>> +       int r;
>> +
>> +       if (!busy_bo || !ticket)
>> +               return -EBUSY;
>> +
>> +       if (ctx->interruptible)
>> +               r = 
>> + reservation_object_lock_interruptible(busy_bo->resv,
>> + ticket);
>> +       else
>> +               r = reservation_object_lock(busy_bo->resv, ticket);
>> +
>> +       /*
>> +        * TODO: It would be better to keep the BO locked until
>> allocation is at
>> +        * least tried one more time, but that would mean a much
>> larger rework
>> +        * of TTM.
>> +        */
>> +       if (!r)
>> +               reservation_object_unlock(busy_bo->resv);
>> +
>> +       return r == -EDEADLK ? -EAGAIN : r; }
>> +
>>   static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>                                 uint32_t mem_type,
>>                                 const struct ttm_place *place,
>> -                              struct ttm_operation_ctx *ctx)
>> +                              struct ttm_operation_ctx *ctx,
>> +                              struct ww_acquire_ctx *ticket)
>>   {
>> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>>          struct ttm_bo_global *glob = bdev->glob;
>>          struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>> -       struct ttm_buffer_object *bo = NULL;
>>          bool locked = false;
>>          unsigned i;
>>          int ret;
>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct 
>> ttm_bo_device *bdev,
>>          spin_lock(&glob->lru_lock);
>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                  list_for_each_entry(bo, &man->lru[i], lru) {
>> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>> &locked))
>> +                       bool busy;
>> +
>> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>> &locked,
>> + &busy)) {
>> +                               if (busy && !busy_bo &&
>> +                                   bo->resv->lock.ctx != ticket)
>> +                                       busy_bo = bo;
>>                                  continue;
>> +                       }
>>
>>                          if (place && 
>> !bdev->driver->eviction_valuable(bo,
>> place)) {
>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct 
>> ttm_bo_device *bdev,
>>          }
>>
>>          if (!bo) {
>> +               if (busy_bo)
>> +                       ttm_bo_get(busy_bo);
>>                  spin_unlock(&glob->lru_lock);
>> -               return -EBUSY;
>> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
> If you rely on EAGAIN, why do you still try to lock busy_bo? any 
> negative effect if directly return EAGAIN without tring lock?

Yeah, that would burn a lot of CPU cycles because we would essentially busy wait for the BO to become unlocked.

When we only return in case of a deadlock the other thread can continue with its eviction while we reacquire all locks during EAGAIN handling.

Even directly unlocking the BO as I do here is a bit questionable. But I couldn't get the original logic with finding a new BO to evict to work correctly, that's why I have the TODO comment in the function itself as well.

Christian.

>
> -David
>> +               if (busy_bo)
>> +                       ttm_bo_put(busy_bo);
>> +               return ret;
>>          }
>>
>>          kref_get(&bo->list_kref);
>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct 
>> ttm_buffer_object *bo,
>>                          return ret;
>>                  if (mem->mm_node)
>>                          break;
>> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>> ctx);
>> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>> ctx,
>> + bo->resv->lock.ctx);
>>                  if (unlikely(ret != 0))
>>                          return ret;
>>          } while (1);
>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct 
>> ttm_bo_device *bdev,
>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                  while (!list_empty(&man->lru[i])) {
>>                          spin_unlock(&glob->lru_lock);
>> -                       ret = ttm_mem_evict_first(bdev, mem_type,
>> NULL, &ctx);
>> +                       ret = ttm_mem_evict_first(bdev, mem_type,
>> NULL, &ctx,
>> +                                                 NULL);
>>                          if (ret)
>>                                  return ret;
>>                          spin_lock(&glob->lru_lock); @@ -1797,7 
>> +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct 
>> ttm_operation_ctx *ctx)
>>          spin_lock(&glob->lru_lock);
>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                  list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx,
>> &locked)) {
>> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx,
>> &locked,
>> + NULL)) {
>>                                  ret = 0;
>>                                  break;
>>                          }
>> --
>> 2.17.1
>>
>

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
       [not found]           ` <MN2PR12MB35364235378F29899838CD80FB020-rweVpJHSKTovpq7YPKzLfQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2019-05-24  8:49             ` Christian König
  0 siblings, 0 replies; 26+ messages in thread
From: Christian König @ 2019-05-24  8:49 UTC (permalink / raw)
  To: Liang, Prike, Koenig, Christian, Zhou, David(ChunMing),
	Olsak, Marek, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Yeah, that shouldn't be a problem. We just need to make sure we don't 
busy wait for the BOs to become available.

Christian.

Am 24.05.19 um 07:35 schrieb Liang, Prike:
> Torturing the amdgpu driver with Abaqus repeatedly runs into a deadlock when locking the first busy BO. The caller then
> returns EAGAIN and eventually dm_plane_helper_prepare_fb pops up a "pin failed" message. For this case, regarding patch #7,
>   can we treat EAGAIN like ERESTARTSYS so that the annoying error message is filtered out?
>
> Thanks,
> Prike
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Thursday, May 23, 2019 7:04 PM
> To: Zhou, David(ChunMing) <David1.Zhou@amd.com>; Olsak, Marek <Marek.Olsak@amd.com>; Zhou, David(ChunMing) <David1.Zhou@amd.com>; Liang, Prike <Prike.Liang@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
>
> [CAUTION: External Email]
>
> Am 23.05.19 um 12:24 schrieb zhoucm1:
>>
>> On 2019年05月22日 20:59, Christian König wrote:
>>> [CAUTION: External Email]
>>>
>>> BOs on the LRU might be blocked during command submission and cause
>>> OOM situations.
>>>
>>> Avoid this by blocking for the first busy BO not locked by the same
>>> ticket as the BO we are searching space for.
>>>
>>> v10: completely start over with the patch since we didn't
>>>        handle a whole bunch of corner cases.
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> ---
>>>    drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
>>>    1 file changed, 66 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>>> b/drivers/gpu/drm/ttm/ttm_bo.c index 4c6389d849ed..861facac33d4
>>> 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>     * b. Otherwise, trylock it.
>>>     */
>>>    static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object
>>> *bo,
>>> -                       struct ttm_operation_ctx *ctx, bool *locked)
>>> +                       struct ttm_operation_ctx *ctx, bool *locked,
>>> bool *busy)
>>>    {
>>>           bool ret = false;
>>>
>>> -       *locked = false;
>>>           if (bo->resv == ctx->resv) {
>>>                   reservation_object_assert_held(bo->resv);
>>>                   if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>>                       || !list_empty(&bo->ddestroy))
>>>                           ret = true;
>>> +               *locked = false;
>>> +               if (busy)
>>> +                       *busy = false;
>>>           } else {
>>> -               *locked = reservation_object_trylock(bo->resv);
>>> -               ret = *locked;
>>> +               ret = reservation_object_trylock(bo->resv);
>>> +               *locked = ret;
>>> +               if (busy)
>>> +                       *busy = !ret;
>>>           }
>>>
>>>           return ret;
>>>    }
>>>
>>> +/**
>>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
>>> + *
>>> + * @busy_bo: BO which couldn't be locked with trylock
>>> + * @ctx: operation context
>>> + * @ticket: acquire ticket
>>> + *
>>> + * Try to lock a busy buffer object to avoid failing eviction.
>>> + */
>>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
>>> +                                  struct ttm_operation_ctx *ctx,
>>> +                                  struct ww_acquire_ctx *ticket) {
>>> +       int r;
>>> +
>>> +       if (!busy_bo || !ticket)
>>> +               return -EBUSY;
>>> +
>>> +       if (ctx->interruptible)
>>> +               r =
>>> + reservation_object_lock_interruptible(busy_bo->resv,
>>> + ticket);
>>> +       else
>>> +               r = reservation_object_lock(busy_bo->resv, ticket);
>>> +
>>> +       /*
>>> +        * TODO: It would be better to keep the BO locked until
>>> allocation is at
>>> +        * least tried one more time, but that would mean a much
>>> larger rework
>>> +        * of TTM.
>>> +        */
>>> +       if (!r)
>>> +               reservation_object_unlock(busy_bo->resv);
>>> +
>>> +       return r == -EDEADLK ? -EAGAIN : r; }
>>> +
>>>    static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>                                  uint32_t mem_type,
>>>                                  const struct ttm_place *place,
>>> -                              struct ttm_operation_ctx *ctx)
>>> +                              struct ttm_operation_ctx *ctx,
>>> +                              struct ww_acquire_ctx *ticket)
>>>    {
>>> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>>>           struct ttm_bo_global *glob = bdev->glob;
>>>           struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>> -       struct ttm_buffer_object *bo = NULL;
>>>           bool locked = false;
>>>           unsigned i;
>>>           int ret;
>>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct
>>> ttm_bo_device *bdev,
>>>           spin_lock(&glob->lru_lock);
>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>                   list_for_each_entry(bo, &man->lru[i], lru) {
>>> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked))
>>> +                       bool busy;
>>> +
>>> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked,
>>> + &busy)) {
>>> +                               if (busy && !busy_bo &&
>>> +                                   bo->resv->lock.ctx != ticket)
>>> +                                       busy_bo = bo;
>>>                                   continue;
>>> +                       }
>>>
>>>                           if (place &&
>>> !bdev->driver->eviction_valuable(bo,
>>> place)) {
>>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct
>>> ttm_bo_device *bdev,
>>>           }
>>>
>>>           if (!bo) {
>>> +               if (busy_bo)
>>> +                       ttm_bo_get(busy_bo);
>>>                   spin_unlock(&glob->lru_lock);
>>> -               return -EBUSY;
>>> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
>> If you rely on EAGAIN, why do you still try to lock busy_bo? any
>> negative effect if directly return EAGAIN without tring lock?
> Yeah, that would burn a lot of CPU cycles because we would essentially busy wait for the BO to become unlocked.
>
> When we only return in case of a deadlock, the other thread can continue with its eviction while we reacquire all locks during EAGAIN handling.
>
> Even directly unlocking the BO as I do here is a bit questionable. But I couldn't get the original logic with finding a new BO to evict to work correctly, that's why I have the TODO comment in the function itself as well.
>
> Christian.
>
>> -David
>>> +               if (busy_bo)
>>> +                       ttm_bo_put(busy_bo);
>>> +               return ret;
>>>           }
>>>
>>>           kref_get(&bo->list_kref);
>>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct
>>> ttm_buffer_object *bo,
>>>                           return ret;
>>>                   if (mem->mm_node)
>>>                           break;
>>> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>>> ctx);
>>> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>>> ctx,
>>> + bo->resv->lock.ctx);
>>>                   if (unlikely(ret != 0))
>>>                           return ret;
>>>           } while (1);
>>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct
>>> ttm_bo_device *bdev,
>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>                   while (!list_empty(&man->lru[i])) {
>>>                           spin_unlock(&glob->lru_lock);
>>> -                       ret = ttm_mem_evict_first(bdev, mem_type,
>>> NULL, &ctx);
>>> +                       ret = ttm_mem_evict_first(bdev, mem_type,
>>> NULL, &ctx,
>>> +                                                 NULL);
>>>                           if (ret)
>>>                                   return ret;
>>>                           spin_lock(&glob->lru_lock); @@ -1797,7
>>> +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct
>>> ttm_operation_ctx *ctx)
>>>           spin_lock(&glob->lru_lock);
>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>                   list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked)) {
>>> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked,
>>> + NULL)) {
>>>                                   ret = 0;
>>>                                   break;
>>>                           }
>>> --
>>> 2.17.1
>>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 10/10] drm/amdgpu: stop removing BOs from the LRU v3
       [not found]           ` <eea6245e-616d-eb16-8521-2f21ce5d6d25-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2019-05-24 21:34             ` Kuehling, Felix
       [not found]               ` <776d29df-428f-ad98-8e38-4b191b602abb-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Kuehling, Felix @ 2019-05-24 21:34 UTC (permalink / raw)
  To: Koenig, Christian, Olsak, Marek, Zhou, David(ChunMing),
	Liang, Prike, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2019-05-23 5:06 a.m., Christian König wrote:
> [CAUTION: External Email]
>
> Leaving BOs on the LRU is harmless. We always did this for VM page table
> and per VM BOs.
>
> The key point is that BOs which couldn't be reserved can't be evicted.
> So what happened is that an application used basically all of VRAM
> during CS and because of this X server couldn't pin a BO for scanout.
>
> Now we keep the BOs on the LRU and modify TTM to block for the CS to
> complete, which in turn allows the X server to pin its BO for scanout.


OK, let me rephrase that to make sure I understand it correctly. I think 
the point is that eviction candidates come from an LRU list, so leaving 
things on the LRU makes more BOs available for eviction and avoids OOM 
situations. To take advantage of that, patch 6 adds the ability to wait 
for reserved BOs when there is nothing easier to evict.

ROCm applications like to use lots of memory. So it probably makes sense 
for us to stop removing our BOs from the LRU as well while we 
mass-validate our BOs in amdgpu_amdkfd_gpuvm_restore_process_bos.

Regards,
   Felix


>
> Christian.
>
> Am 22.05.19 um 21:43 schrieb Kuehling, Felix:
>> Can you explain how this avoids OOM situations? When is it safe to leave
>> a reserved BO on the LRU list? Could we do the same thing in
>> amdgpu_amdkfd_gpuvm.c? And if we did, what would be the expected side
>> effects or consequences?
>>
>> Thanks,
>>     Felix
>>
>> On 2019-05-22 8:59 a.m., Christian König wrote:
>>> [CAUTION: External Email]
>>>
>>> This avoids OOM situations when we have lots of threads
>>> submitting at the same time.
>>>
>>> v3: apply this to the whole driver, not just CS
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 2 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c    | 2 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    | 4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 2 +-
>>>    4 files changed, 5 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 20f2955d2a55..3e2da24cd17a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -648,7 +648,7 @@ static int amdgpu_cs_parser_bos(struct 
>>> amdgpu_cs_parser *p,
>>>           }
>>>
>>>           r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
>>> -                                  &duplicates, true);
>>> +                                  &duplicates, false);
>>>           if (unlikely(r != 0)) {
>>>                   if (r != -ERESTARTSYS)
>>>                           DRM_ERROR("ttm_eu_reserve_buffers 
>>> failed.\n");
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>>> index 06f83cac0d3a..f660628e6af9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>>> @@ -79,7 +79,7 @@ int amdgpu_map_static_csa(struct amdgpu_device 
>>> *adev, struct amdgpu_vm *vm,
>>>           list_add(&csa_tv.head, &list);
>>>           amdgpu_vm_get_pd_bo(vm, &list, &pd);
>>>
>>> -       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
>>> +       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, false);
>>>           if (r) {
>>>                   DRM_ERROR("failed to reserve CSA,PD BOs: 
>>> err=%d\n", r);
>>>                   return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> index d513a5ad03dd..ed25a4e14404 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> @@ -171,7 +171,7 @@ void amdgpu_gem_object_close(struct 
>>> drm_gem_object *obj,
>>>
>>>           amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);
>>>
>>> -       r = ttm_eu_reserve_buffers(&ticket, &list, false, 
>>> &duplicates, true);
>>> +       r = ttm_eu_reserve_buffers(&ticket, &list, false, 
>>> &duplicates, false);
>>>           if (r) {
>>>                   dev_err(adev->dev, "leaking bo va because "
>>>                           "we fail to reserve bo (%d)\n", r);
>>> @@ -608,7 +608,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, 
>>> void *data,
>>>
>>>           amdgpu_vm_get_pd_bo(&fpriv->vm, &list, &vm_pd);
>>>
>>> -       r = ttm_eu_reserve_buffers(&ticket, &list, true, 
>>> &duplicates, true);
>>> +       r = ttm_eu_reserve_buffers(&ticket, &list, true, 
>>> &duplicates, false);
>>>           if (r)
>>>                   goto error_unref;
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> index c430e8259038..d60593cc436e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> @@ -155,7 +155,7 @@ static inline int amdgpu_bo_reserve(struct 
>>> amdgpu_bo *bo, bool no_intr)
>>>           struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>>           int r;
>>>
>>> -       r = ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
>>> +       r = __ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
>>>           if (unlikely(r != 0)) {
>>>                   if (r != -ERESTARTSYS)
>>>                           dev_err(adev->dev, "%p reserve failed\n", 
>>> bo);
>>> -- 
>>> 2.17.1
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 10/10] drm/amdgpu: stop removing BOs from the LRU v3
       [not found]               ` <776d29df-428f-ad98-8e38-4b191b602abb-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-27 10:51                 ` Koenig, Christian
  0 siblings, 0 replies; 26+ messages in thread
From: Koenig, Christian @ 2019-05-27 10:51 UTC (permalink / raw)
  To: Kuehling, Felix, Olsak, Marek, Zhou, David(ChunMing),
	Liang, Prike, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 24.05.19 um 23:34 schrieb Kuehling, Felix:
> On 2019-05-23 5:06 a.m., Christian König wrote:
>> [CAUTION: External Email]
>>
>> Leaving BOs on the LRU is harmless. We always did this for VM page table
>> and per VM BOs.
>>
>> The key point is that BOs which couldn't be reserved can't be evicted.
>> So what happened is that an application used basically all of VRAM
>> during CS and because of this X server couldn't pin a BO for scanout.
>>
>> Now we keep the BOs on the LRU and modify TTM to block for the CS to
>> complete, which in turn allows the X server to pin its BO for scanout.
>
> OK, let me rephrase that to make sure I understand it correctly. I think
> the point is that eviction candidates come from an LRU list, so leaving
> things on the LRU makes more BOs available for eviction and avoids OOM
> situations. To take advantage of that, patch 6 adds the ability to wait
> for reserved BOs when there is nothing easier to evict.
>
> ROCm applications like to use lots of memory. So it probably makes sense
> for us to stop removing our BOs from the LRU as well while we
> mass-validate our BOs in amdgpu_amdkfd_gpuvm_restore_process_bos.

Well that would allow concurrent calls of 
amdgpu_amdkfd_gpuvm_restore_process_bos() to wait for each other.

If that's what you want then yeah that certainly makes sense.

Regards,
Christian.

>
> Regards,
>     Felix
>
>
>> Christian.
>>
>> Am 22.05.19 um 21:43 schrieb Kuehling, Felix:
>>> Can you explain how this avoids OOM situations? When is it safe to leave
>>> a reserved BO on the LRU list? Could we do the same thing in
>>> amdgpu_amdkfd_gpuvm.c? And if we did, what would be the expected side
>>> effects or consequences?
>>>
>>> Thanks,
>>>      Felix
>>>
>>> On 2019-05-22 8:59 a.m., Christian König wrote:
>>>> [CAUTION: External Email]
>>>>
>>>> This avoids OOM situations when we have lots of threads
>>>> submitting at the same time.
>>>>
>>>> v3: apply this to the whole driver, not just CS
>>>>
>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>> ---
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 2 +-
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c    | 2 +-
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    | 4 ++--
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 2 +-
>>>>     4 files changed, 5 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> index 20f2955d2a55..3e2da24cd17a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> @@ -648,7 +648,7 @@ static int amdgpu_cs_parser_bos(struct
>>>> amdgpu_cs_parser *p,
>>>>            }
>>>>
>>>>            r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
>>>> -                                  &duplicates, true);
>>>> +                                  &duplicates, false);
>>>>            if (unlikely(r != 0)) {
>>>>                    if (r != -ERESTARTSYS)
>>>>                            DRM_ERROR("ttm_eu_reserve_buffers
>>>> failed.\n");
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>>>> index 06f83cac0d3a..f660628e6af9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>>>> @@ -79,7 +79,7 @@ int amdgpu_map_static_csa(struct amdgpu_device
>>>> *adev, struct amdgpu_vm *vm,
>>>>            list_add(&csa_tv.head, &list);
>>>>            amdgpu_vm_get_pd_bo(vm, &list, &pd);
>>>>
>>>> -       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, true);
>>>> +       r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL, false);
>>>>            if (r) {
>>>>                    DRM_ERROR("failed to reserve CSA,PD BOs:
>>>> err=%d\n", r);
>>>>                    return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>>> index d513a5ad03dd..ed25a4e14404 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>>> @@ -171,7 +171,7 @@ void amdgpu_gem_object_close(struct
>>>> drm_gem_object *obj,
>>>>
>>>>            amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);
>>>>
>>>> -       r = ttm_eu_reserve_buffers(&ticket, &list, false,
>>>> &duplicates, true);
>>>> +       r = ttm_eu_reserve_buffers(&ticket, &list, false,
>>>> &duplicates, false);
>>>>            if (r) {
>>>>                    dev_err(adev->dev, "leaking bo va because "
>>>>                            "we fail to reserve bo (%d)\n", r);
>>>> @@ -608,7 +608,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev,
>>>> void *data,
>>>>
>>>>            amdgpu_vm_get_pd_bo(&fpriv->vm, &list, &vm_pd);
>>>>
>>>> -       r = ttm_eu_reserve_buffers(&ticket, &list, true,
>>>> &duplicates, true);
>>>> +       r = ttm_eu_reserve_buffers(&ticket, &list, true,
>>>> &duplicates, false);
>>>>            if (r)
>>>>                    goto error_unref;
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> index c430e8259038..d60593cc436e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> @@ -155,7 +155,7 @@ static inline int amdgpu_bo_reserve(struct
>>>> amdgpu_bo *bo, bool no_intr)
>>>>            struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>>>            int r;
>>>>
>>>> -       r = ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
>>>> +       r = __ttm_bo_reserve(&bo->tbo, !no_intr, false, NULL);
>>>>            if (unlikely(r != 0)) {
>>>>                    if (r != -ERESTARTSYS)
>>>>                            dev_err(adev->dev, "%p reserve failed\n",
>>>> bo);
>>>> -- 
>>>> 2.17.1
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
       [not found]     ` <20190522125947.4592-6-christian.koenig-5C7GfCeVMHo@public.gmane.org>
@ 2019-06-26  6:36       ` Kuehling, Felix
  0 siblings, 0 replies; 26+ messages in thread
From: Kuehling, Felix @ 2019-06-26  6:36 UTC (permalink / raw)
  To: Christian König, Olsak, Marek, Zhou, David(ChunMing),
	Liang, Prike, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

I believe I found a live-lock due to this patch when running our KFD 
eviction test in a loop. It pretty reliably hangs on the second loop 
iteration. If I revert this patch, the problem disappears.

With some added instrumentation, I see that amdgpu_cs_list_validate in 
amdgpu_cs_parser_bos returns -EAGAIN repeatedly. User mode dutifully 
retries the command submission, but it never succeeds.

I think the problem is the fact that ttm_mem_evict_wait_busy returns 
-EAGAIN when it fails to lock the busy BO. That will cause 
ttm_bo_mem_space to give up instead of trying an alternate placement. A 
straight-forward fix is to modify ttm_mem_evict_wait_busy to return 
-EBUSY, which will maintain the old behaviour of trying alternate 
placements when eviction is not possible. I'll send out a patch for that.

See also inline.

On 2019-05-22 8:59 a.m., Christian König wrote:
> [CAUTION: External Email]
>
> BOs on the LRU might be blocked during command submission
> and cause OOM situations.
>
> Avoid this by blocking for the first busy BO not locked by
> the same ticket as the BO we are searching space for.
>
> v10: completely start over with the patch since we didn't
>       handled a whole bunch of corner cases.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
>   1 file changed, 66 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 4c6389d849ed..861facac33d4 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>    * b. Otherwise, trylock it.
>    */
>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> -                       struct ttm_operation_ctx *ctx, bool *locked)
> +                       struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>   {
>          bool ret = false;
>
> -       *locked = false;
>          if (bo->resv == ctx->resv) {
>                  reservation_object_assert_held(bo->resv);
>                  if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>                      || !list_empty(&bo->ddestroy))
>                          ret = true;
> +               *locked = false;
> +               if (busy)
> +                       *busy = false;
>          } else {
> -               *locked = reservation_object_trylock(bo->resv);
> -               ret = *locked;
> +               ret = reservation_object_trylock(bo->resv);
> +               *locked = ret;
> +               if (busy)
> +                       *busy = !ret;
>          }
>
>          return ret;
>   }
>
> +/**
> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
> + *
> + * @busy_bo: BO which couldn't be locked with trylock
> + * @ctx: operation context
> + * @ticket: acquire ticket
> + *
> + * Try to lock a busy buffer object to avoid failing eviction.
> + */
> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
> +                                  struct ttm_operation_ctx *ctx,
> +                                  struct ww_acquire_ctx *ticket)
> +{
> +       int r;
> +
> +       if (!busy_bo || !ticket)
> +               return -EBUSY;
> +
> +       if (ctx->interruptible)
> +               r = reservation_object_lock_interruptible(busy_bo->resv,
> +                                                         ticket);
> +       else
> +               r = reservation_object_lock(busy_bo->resv, ticket);
> +
> +       /*
> +        * TODO: It would be better to keep the BO locked until allocation is at
> +        * least tried one more time, but that would mean a much larger rework
> +        * of TTM.
> +        */
> +       if (!r)
> +               reservation_object_unlock(busy_bo->resv);
> +
> +       return r == -EDEADLK ? -EAGAIN : r;

If locking fails, this returns -EAGAIN.


> +}
> +
>   static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>                                 uint32_t mem_type,
>                                 const struct ttm_place *place,
> -                              struct ttm_operation_ctx *ctx)
> +                              struct ttm_operation_ctx *ctx,
> +                              struct ww_acquire_ctx *ticket)
>   {
> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>          struct ttm_bo_global *glob = bdev->glob;
>          struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> -       struct ttm_buffer_object *bo = NULL;
>          bool locked = false;
>          unsigned i;
>          int ret;
> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>          spin_lock(&glob->lru_lock);
>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                  list_for_each_entry(bo, &man->lru[i], lru) {
> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
> +                       bool busy;
> +
> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +                                                           &busy)) {
> +                               if (busy && !busy_bo &&
> +                                   bo->resv->lock.ctx != ticket)
> +                                       busy_bo = bo;
>                                  continue;
> +                       }
>
>                          if (place && !bdev->driver->eviction_valuable(bo,
>                                                                        place)) {
> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>          }
>
>          if (!bo) {
> +               if (busy_bo)
> +                       ttm_bo_get(busy_bo);
>                  spin_unlock(&glob->lru_lock);
> -               return -EBUSY;

The old behaviour was to always return -EBUSY if eviction was not possible.


> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
> +               if (busy_bo)
> +                       ttm_bo_put(busy_bo);
> +               return ret;

Now this can return -EAGAIN. This prevents ttm_bo_mem_space from trying 
alternate placements.

Regards,
   Felix


>          }
>
>          kref_get(&bo->list_kref);
> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>                          return ret;
>                  if (mem->mm_node)
>                          break;
> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx);
> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx,
> +                                         bo->resv->lock.ctx);
>                  if (unlikely(ret != 0))
>                          return ret;
>          } while (1);
> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                  while (!list_empty(&man->lru[i])) {
>                          spin_unlock(&glob->lru_lock);
> -                       ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
> +                       ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
> +                                                 NULL);
>                          if (ret)
>                                  return ret;
>                          spin_lock(&glob->lru_lock);
> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>          spin_lock(&glob->lru_lock);
>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                  list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +                                                          NULL)) {
>                                  ret = 0;
>                                  break;
>                          }
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
  2019-05-29 12:26 [PATCH 01/10] drm/ttm: Make LRU removal optional v2 Christian König
@ 2019-05-29 12:26 ` Christian König
  0 siblings, 0 replies; 26+ messages in thread
From: Christian König @ 2019-05-29 12:26 UTC (permalink / raw)
  To: dri-devel, amd-gfx

BOs on the LRU might be blocked during command submission
and cause OOM situations.

Avoid this by blocking for the first busy BO not locked by
the same ticket as the BO we are searching space for.

v10: completely start over with the patch since we didn't
     handle a whole bunch of corner cases.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 8a8859cf60e8..c7de667d482a 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -773,32 +773,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
  * b. Otherwise, trylock it.
  */
 static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
-			struct ttm_operation_ctx *ctx, bool *locked)
+			struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
 {
 	bool ret = false;
 
-	*locked = false;
 	if (bo->resv == ctx->resv) {
 		reservation_object_assert_held(bo->resv);
 		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
 		    || !list_empty(&bo->ddestroy))
 			ret = true;
+		*locked = false;
+		if (busy)
+			*busy = false;
 	} else {
-		*locked = reservation_object_trylock(bo->resv);
-		ret = *locked;
+		ret = reservation_object_trylock(bo->resv);
+		*locked = ret;
+		if (busy)
+			*busy = !ret;
 	}
 
 	return ret;
 }
 
+/**
+ * ttm_mem_evict_wait_busy - wait for a busy BO to become available
+ *
+ * @busy_bo: BO which couldn't be locked with trylock
+ * @ctx: operation context
+ * @ticket: acquire ticket
+ *
+ * Try to lock a busy buffer object to avoid failing eviction.
+ */
+static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
+				   struct ttm_operation_ctx *ctx,
+				   struct ww_acquire_ctx *ticket)
+{
+	int r;
+
+	if (!busy_bo || !ticket)
+		return -EBUSY;
+
+	if (ctx->interruptible)
+		r = reservation_object_lock_interruptible(busy_bo->resv,
+							  ticket);
+	else
+		r = reservation_object_lock(busy_bo->resv, ticket);
+
+	/*
+	 * TODO: It would be better to keep the BO locked until allocation is at
+	 * least tried one more time, but that would mean a much larger rework
+	 * of TTM.
+	 */
+	if (!r)
+		reservation_object_unlock(busy_bo->resv);
+
+	return r == -EDEADLK ? -EAGAIN : r;
+}
+
 static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 			       uint32_t mem_type,
 			       const struct ttm_place *place,
-			       struct ttm_operation_ctx *ctx)
+			       struct ttm_operation_ctx *ctx,
+			       struct ww_acquire_ctx *ticket)
 {
+	struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
 	struct ttm_bo_global *glob = bdev->glob;
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
-	struct ttm_buffer_object *bo = NULL;
 	bool locked = false;
 	unsigned i;
 	int ret;
@@ -806,8 +846,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 	spin_lock(&glob->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &man->lru[i], lru) {
-			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
+			bool busy;
+
+			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
+							    &busy)) {
+				if (busy && !busy_bo &&
+				    bo->resv->lock.ctx != ticket)
+					busy_bo = bo;
 				continue;
+			}
 
 			if (place && !bdev->driver->eviction_valuable(bo,
 								      place)) {
@@ -826,8 +873,13 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 	}
 
 	if (!bo) {
+		if (busy_bo)
+			ttm_bo_get(busy_bo);
 		spin_unlock(&glob->lru_lock);
-		return -EBUSY;
+		ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
+		if (busy_bo)
+			ttm_bo_put(busy_bo);
+		return ret;
 	}
 
 	kref_get(&bo->list_kref);
@@ -913,7 +965,8 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 			return ret;
 		if (mem->mm_node)
 			break;
-		ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx);
+		ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx,
+					  bo->resv->lock.ctx);
 		if (unlikely(ret != 0))
 			return ret;
 	} while (1);
@@ -1428,7 +1481,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		while (!list_empty(&man->lru[i])) {
 			spin_unlock(&glob->lru_lock);
-			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
+			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
+						  NULL);
 			if (ret)
 				return ret;
 			spin_lock(&glob->lru_lock);
@@ -1799,7 +1853,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
 	spin_lock(&glob->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
-			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
+			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
+							   NULL)) {
 				ret = 0;
 				break;
 			}
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
  2019-05-28 16:25 [PATCH 01/10] drm/ttm: Make LRU removal optional v2 Christian König
@ 2019-05-28 16:25 ` Christian König
  0 siblings, 0 replies; 26+ messages in thread
From: Christian König @ 2019-05-28 16:25 UTC (permalink / raw)
  To: David1.Zhou, Marek.Olsak, Prike.Liang, dri-devel, amd-gfx

BOs on the LRU might be blocked during command submission
and cause OOM situations.

Avoid this by blocking for the first busy BO not locked by
the same ticket as the BO we are searching space for.

v10: completely start over with the patch since we didn't
     handle a whole bunch of corner cases.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 3bc31da1df67..98aa90056807 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -774,32 +774,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
  * b. Otherwise, trylock it.
  */
 static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
-			struct ttm_operation_ctx *ctx, bool *locked)
+			struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
 {
 	bool ret = false;
 
-	*locked = false;
 	if (bo->resv == ctx->resv) {
 		reservation_object_assert_held(bo->resv);
 		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
 		    || !list_empty(&bo->ddestroy))
 			ret = true;
+		*locked = false;
+		if (busy)
+			*busy = false;
 	} else {
-		*locked = reservation_object_trylock(bo->resv);
-		ret = *locked;
+		ret = reservation_object_trylock(bo->resv);
+		*locked = ret;
+		if (busy)
+			*busy = !ret;
 	}
 
 	return ret;
 }
 
+/**
+ * ttm_mem_evict_wait_busy - wait for a busy BO to become available
+ *
+ * @busy_bo: BO which couldn't be locked with trylock
+ * @ctx: operation context
+ * @ticket: acquire ticket
+ *
+ * Try to lock a busy buffer object to avoid failing eviction.
+ */
+static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
+				   struct ttm_operation_ctx *ctx,
+				   struct ww_acquire_ctx *ticket)
+{
+	int r;
+
+	if (!busy_bo || !ticket)
+		return -EBUSY;
+
+	if (ctx->interruptible)
+		r = reservation_object_lock_interruptible(busy_bo->resv,
+							  ticket);
+	else
+		r = reservation_object_lock(busy_bo->resv, ticket);
+
+	/*
+	 * TODO: It would be better to keep the BO locked until allocation is at
+	 * least tried one more time, but that would mean a much larger rework
+	 * of TTM.
+	 */
+	if (!r)
+		reservation_object_unlock(busy_bo->resv);
+
+	return r == -EDEADLK ? -EAGAIN : r;
+}
+
 static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 			       uint32_t mem_type,
 			       const struct ttm_place *place,
-			       struct ttm_operation_ctx *ctx)
+			       struct ttm_operation_ctx *ctx,
+			       struct ww_acquire_ctx *ticket)
 {
+	struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
 	struct ttm_bo_global *glob = bdev->glob;
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
-	struct ttm_buffer_object *bo = NULL;
 	bool locked = false;
 	unsigned i;
 	int ret;
@@ -807,8 +847,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 	spin_lock(&glob->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &man->lru[i], lru) {
-			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
+			bool busy;
+
+			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
+							    &busy)) {
+				if (busy && !busy_bo &&
+				    bo->resv->lock.ctx != ticket)
+					busy_bo = bo;
 				continue;
+			}
 
 			if (place && !bdev->driver->eviction_valuable(bo,
 								      place)) {
@@ -827,8 +874,13 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 	}
 
 	if (!bo) {
+		if (busy_bo)
+			ttm_bo_get(busy_bo);
 		spin_unlock(&glob->lru_lock);
-		return -EBUSY;
+		ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
+		if (busy_bo)
+			ttm_bo_put(busy_bo);
+		return ret;
 	}
 
 	kref_get(&bo->list_kref);
@@ -914,7 +966,8 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 			return ret;
 		if (mem->mm_node)
 			break;
-		ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx);
+		ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx,
+					  bo->resv->lock.ctx);
 		if (unlikely(ret != 0))
 			return ret;
 	} while (1);
@@ -1429,7 +1482,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		while (!list_empty(&man->lru[i])) {
 			spin_unlock(&glob->lru_lock);
-			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
+			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
+						  NULL);
 			if (ret)
 				return ret;
 			spin_lock(&glob->lru_lock);
@@ -1800,7 +1854,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
 	spin_lock(&glob->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
-			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
+			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
+							   NULL)) {
 				ret = 0;
 				break;
 			}
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2019-06-26  6:36 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-22 12:59 [PATCH 01/10] drm/ttm: Make LRU removal optional Christian König
2019-05-22 12:59 ` [PATCH 02/10] drm/ttm: return immediately in case of a signal Christian König
2019-05-22 12:59 ` [PATCH 03/10] drm/ttm: remove manual placement preference Christian König
     [not found] ` <20190522125947.4592-1-christian.koenig-5C7GfCeVMHo@public.gmane.org>
2019-05-22 12:59   ` [PATCH 04/10] drm/ttm: cleanup ttm_bo_mem_space Christian König
2019-05-22 12:59   ` [PATCH 05/10] drm/ttm: immediately move BOs to the new LRU v2 Christian König
2019-05-22 12:59   ` [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10 Christian König
2019-05-23 10:24     ` zhoucm1
2019-05-23 11:03       ` Christian König
     [not found]         ` <16918096-1430-d581-7284-a987aacb89da-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-05-23 11:50           ` Chunming Zhou
     [not found]             ` <5d68ba04-250d-918e-3633-ec45e5b18904-5C7GfCeVMHo@public.gmane.org>
2019-05-23 14:15               ` Koenig, Christian
2019-05-24  5:35         ` Liang, Prike
     [not found]           ` <MN2PR12MB35364235378F29899838CD80FB020-rweVpJHSKTovpq7YPKzLfQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-05-24  8:49             ` Christian König
     [not found]     ` <20190522125947.4592-6-christian.koenig-5C7GfCeVMHo@public.gmane.org>
2019-06-26  6:36       ` Kuehling, Felix
2019-05-22 12:59   ` [PATCH 07/10] drm/amd/display: use ttm_eu_reserve_buffers instead of amdgpu_bo_reserve v2 Christian König
2019-05-22 12:59   ` [PATCH 08/10] drm/amdgpu: drop some validation failure messages Christian König
2019-05-22 12:59   ` [PATCH 09/10] drm/amdgpu: create GDS, GWS and OA in system domain Christian König
2019-05-23  9:15   ` [PATCH 01/10] drm/ttm: Make LRU removal optional zhoucm1
     [not found]     ` <fbb023f9-28e7-2ac8-994f-e262da597098-5C7GfCeVMHo@public.gmane.org>
2019-05-23  9:39       ` Christian König
2019-05-22 12:59 ` [PATCH 10/10] drm/amdgpu: stop removing BOs from the LRU v3 Christian König
     [not found]   ` <20190522125947.4592-10-christian.koenig-5C7GfCeVMHo@public.gmane.org>
2019-05-22 19:43     ` Kuehling, Felix
     [not found]       ` <48ac98a8-de22-3549-5d63-078a0effab72-5C7GfCeVMHo@public.gmane.org>
2019-05-23  9:06         ` Christian König
     [not found]           ` <eea6245e-616d-eb16-8521-2f21ce5d6d25-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2019-05-24 21:34             ` Kuehling, Felix
     [not found]               ` <776d29df-428f-ad98-8e38-4b191b602abb-5C7GfCeVMHo@public.gmane.org>
2019-05-27 10:51                 ` Koenig, Christian
2019-05-23  8:27     ` Liang, Prike
2019-05-28 16:25 [PATCH 01/10] drm/ttm: Make LRU removal optional v2 Christian König
2019-05-28 16:25 ` [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10 Christian König
2019-05-29 12:26 [PATCH 01/10] drm/ttm: Make LRU removal optional v2 Christian König
2019-05-29 12:26 ` [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10 Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.