* [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
@ 2020-12-16 14:04 Christian König
  2020-12-16 14:04 ` [PATCH 2/2] drm/ttm: move memory accounting into vmwgfx Christian König
  2020-12-16 15:09 ` [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2 Daniel Vetter
  0 siblings, 2 replies; 20+ messages in thread
From: Christian König @ 2020-12-16 14:04 UTC (permalink / raw)
  To: dri-devel, linux-graphics-maintainer; +Cc: airlied, ray.huang, sroland

TTM implements a rather extensive accounting of allocated memory.

There are two reasons for this:
1. It tries to block userspace from allocating a huge number of very small
   BOs without accounting for the kmalloced memory.

2. It makes sure we don't over-allocate and run into an OOM situation
   during swapout while trying to handle the memory shortage.

This is only partially a good idea. First of all, it is perfectly
valid for an application to use all of system memory; limiting it to
50% is not really acceptable.

What we need to take care of is that the application is held
accountable for the memory it allocated. This is what control
mechanisms like memcg and the normal Linux page accounting already do.

Making sure that we don't run into an OOM situation while trying to
cope with a memory shortage is still a good idea, but this is also
not very well implemented since it means another opportunity for
recursion from the driver back into TTM.

So start to rework all of this by implementing a shrinker callback which
allows TT objects to be swapped out if necessary.
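
For reference, the core MM shrinker interface this hooks into looks
roughly like the abridged sketch below (see include/linux/shrinker.h,
this is not part of the patch): count_objects() reports how many
objects could be freed and scan_objects() frees some of them and
returns how many it released.

	struct shrink_control {
		gfp_t gfp_mask;			/* allocation context of the reclaim */
		unsigned long nr_to_scan;	/* objects we are asked to free */
	};

	struct shrinker {
		unsigned long (*count_objects)(struct shrinker *shrink,
					       struct shrink_control *sc);
		unsigned long (*scan_objects)(struct shrinker *shrink,
					      struct shrink_control *sc);
		int seeks;	/* relative cost of recreating an object */
	};

	int register_shrinker(struct shrinker *shrinker);
	void unregister_shrinker(struct shrinker *shrinker);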

v2: Switch from limit to shrinker callback.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c        |  4 +-
 drivers/gpu/drm/ttm/ttm_memory.c    |  7 ++-
 drivers/gpu/drm/ttm/ttm_tt.c        | 82 +++++++++++++++++++++++++++--
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c |  2 +-
 include/drm/ttm/ttm_bo_api.h        |  2 +-
 include/drm/ttm/ttm_tt.h            |  6 ++-
 6 files changed, 91 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 31e8b3da5563..f1f3fd085465 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1412,7 +1412,7 @@ EXPORT_SYMBOL(ttm_bo_wait);
  * A buffer object shrink method that tries to swap out the first
  * buffer object on the bo_global::swap_lru list.
  */
-int ttm_bo_swapout(struct ttm_operation_ctx *ctx)
+int ttm_bo_swapout(struct ttm_operation_ctx *ctx, gfp_t gfp_flags)
 {
 	struct ttm_bo_global *glob = &ttm_bo_glob;
 	struct ttm_buffer_object *bo;
@@ -1495,7 +1495,7 @@ int ttm_bo_swapout(struct ttm_operation_ctx *ctx)
 	if (bo->bdev->driver->swap_notify)
 		bo->bdev->driver->swap_notify(bo);
 
-	ret = ttm_tt_swapout(bo->bdev, bo->ttm);
+	ret = ttm_tt_swapout(bo->bdev, bo->ttm, gfp_flags);
 out:
 
 	/**
diff --git a/drivers/gpu/drm/ttm/ttm_memory.c b/drivers/gpu/drm/ttm/ttm_memory.c
index a3bfbd9cea68..3d2479d0ce2c 100644
--- a/drivers/gpu/drm/ttm/ttm_memory.c
+++ b/drivers/gpu/drm/ttm/ttm_memory.c
@@ -37,6 +37,7 @@
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <drm/ttm/ttm_pool.h>
+#include <drm/ttm/ttm_tt.h>
 
 #include "ttm_module.h"
 
@@ -276,9 +277,9 @@ static void ttm_shrink(struct ttm_mem_global *glob, bool from_wq,
 
 	while (ttm_zones_above_swap_target(glob, from_wq, extra)) {
 		spin_unlock(&glob->lock);
-		ret = ttm_bo_swapout(ctx);
+		ret = ttm_bo_swapout(ctx, 0);
 		spin_lock(&glob->lock);
-		if (unlikely(ret != 0))
+		if (unlikely(ret < 0))
 			break;
 	}
 
@@ -453,6 +454,7 @@ int ttm_mem_global_init(struct ttm_mem_global *glob)
 			zone->name, (unsigned long long)zone->max_mem >> 10);
 	}
 	ttm_pool_mgr_init(glob->zone_kernel->max_mem/(2*PAGE_SIZE));
+	ttm_tt_mgr_init();
 	return 0;
 out_no_zone:
 	ttm_mem_global_release(glob);
@@ -466,6 +468,7 @@ void ttm_mem_global_release(struct ttm_mem_global *glob)
 
 	/* let the page allocator first stop the shrink work. */
 	ttm_pool_mgr_fini();
+	ttm_tt_mgr_fini();
 
 	flush_workqueue(glob->swap_queue);
 	destroy_workqueue(glob->swap_queue);
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 7f75a13163f0..d454c428c56a 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -38,6 +38,8 @@
 #include <drm/drm_cache.h>
 #include <drm/ttm/ttm_bo_driver.h>
 
+static struct shrinker mm_shrinker;
+
 /*
  * Allocates a ttm structure for the given BO.
  */
@@ -223,13 +225,23 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
 	return ret;
 }
 
-int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
+/**
+ * ttm_tt_swapout - swap out tt object
+ *
+ * @bdev: TTM device structure.
+ * @ttm: The struct ttm_tt.
+ * @gfp_flags: Flags to use for memory allocation.
+ *
+ * Swapout a TT object to a shmem_file, return number of pages swapped out or
+ * negative error code.
+ */
+int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm,
+		   gfp_t gfp_flags)
 {
 	struct address_space *swap_space;
 	struct file *swap_storage;
 	struct page *from_page;
 	struct page *to_page;
-	gfp_t gfp_mask;
 	int i, ret;
 
 	swap_storage = shmem_file_setup("ttm swap",
@@ -241,14 +253,14 @@ int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
 	}
 
 	swap_space = swap_storage->f_mapping;
-	gfp_mask = mapping_gfp_mask(swap_space);
+	gfp_flags |= mapping_gfp_mask(swap_space);
 
 	for (i = 0; i < ttm->num_pages; ++i) {
 		from_page = ttm->pages[i];
 		if (unlikely(from_page == NULL))
 			continue;
 
-		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
+		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_flags);
 		if (IS_ERR(to_page)) {
 			ret = PTR_ERR(to_page);
 			goto out_err;
@@ -263,7 +275,7 @@ int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
 	ttm->swap_storage = swap_storage;
 	ttm->page_flags |= TTM_PAGE_FLAG_SWAPPED;
 
-	return 0;
+	return ttm->num_pages;
 
 out_err:
 	fput(swap_storage);
@@ -341,3 +353,63 @@ void ttm_tt_unpopulate(struct ttm_bo_device *bdev,
 		ttm_pool_free(&bdev->pool, ttm);
 	ttm->page_flags &= ~TTM_PAGE_FLAG_PRIV_POPULATED;
 }
+
+/* As long as pages are available make sure to release at least one */
+static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
+					  struct shrink_control *sc)
+{
+	struct ttm_operation_ctx ctx = {
+		.no_wait_gpu = true
+	};
+	int ret;
+
+	if (sc->gfp_mask & GFP_NOFS)
+		return 0;
+
+	ret = ttm_bo_swapout(&ctx, GFP_NOFS);
+	return ret < 0 ? SHRINK_EMPTY : ret;
+}
+
+/* Return the number of pages available or SHRINK_EMPTY if we have none */
+static unsigned long ttm_tt_shrinker_count(struct shrinker *shrink,
+					   struct shrink_control *sc)
+{
+	struct ttm_buffer_object *bo;
+	unsigned long num_pages = 0;
+	unsigned int i;
+
+	if (sc->gfp_mask & GFP_NOFS)
+		return 0;
+
+	/* TODO: Is that too much overhead? */
+	spin_lock(&ttm_bo_glob.lru_lock);
+	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i)
+		list_for_each_entry(bo, &ttm_bo_glob.swap_lru[i], swap)
+			num_pages += bo->ttm->num_pages;
+	spin_unlock(&ttm_bo_glob.lru_lock);
+
+	return num_pages ? num_pages : SHRINK_EMPTY;
+}
+
+/**
+ * ttm_tt_mgr_init - register with the MM shrinker
+ *
+ * Register with the MM shrinker for swapping out BOs.
+ */
+int ttm_tt_mgr_init(void)
+{
+	mm_shrinker.count_objects = ttm_tt_shrinker_count;
+	mm_shrinker.scan_objects = ttm_tt_shrinker_scan;
+	mm_shrinker.seeks = 1;
+	return register_shrinker(&mm_shrinker);
+}
+
+/**
+ * ttm_tt_mgr_fini - unregister our MM shrinker
+ *
+ * Unregisters the MM shrinker.
+ */
+void ttm_tt_mgr_fini(void)
+{
+	unregister_shrinker(&mm_shrinker);
+}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 0008be02d31c..e141fa9f616d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -1389,7 +1389,7 @@ static int vmw_pm_freeze(struct device *kdev)
 	vmw_execbuf_release_pinned_bo(dev_priv);
 	vmw_resource_evict_all(dev_priv);
 	vmw_release_device_early(dev_priv);
-	while (ttm_bo_swapout(&ctx) == 0);
+	while (ttm_bo_swapout(&ctx, 0) > 0);
 	if (dev_priv->enable_fb)
 		vmw_fifo_resource_dec(dev_priv);
 	if (atomic_read(&dev_priv->num_fifo_resources) != 0) {
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index e17be324d95f..7c7539d76e1d 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -569,7 +569,7 @@ ssize_t ttm_bo_io(struct ttm_bo_device *bdev, struct file *filp,
 		  const char __user *wbuf, char __user *rbuf,
 		  size_t count, loff_t *f_pos, bool write);
 
-int ttm_bo_swapout(struct ttm_operation_ctx *ctx);
+int ttm_bo_swapout(struct ttm_operation_ctx *ctx, gfp_t gfp_flags);
 
 /**
  * ttm_bo_uses_embedded_gem_object - check if the given bo uses the
diff --git a/include/drm/ttm/ttm_tt.h b/include/drm/ttm/ttm_tt.h
index 6c8eb9a4de81..4bc3a08644fc 100644
--- a/include/drm/ttm/ttm_tt.h
+++ b/include/drm/ttm/ttm_tt.h
@@ -135,7 +135,8 @@ void ttm_tt_destroy_common(struct ttm_bo_device *bdev, struct ttm_tt *ttm);
  * Swap in a previously swap out ttm_tt.
  */
 int ttm_tt_swapin(struct ttm_tt *ttm);
-int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm);
+int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm,
+		   gfp_t gfp_flags);
 
 /**
  * ttm_tt_populate - allocate pages for a ttm
@@ -155,6 +156,9 @@ int ttm_tt_populate(struct ttm_bo_device *bdev, struct ttm_tt *ttm, struct ttm_o
  */
 void ttm_tt_unpopulate(struct ttm_bo_device *bdev, struct ttm_tt *ttm);
 
+int ttm_tt_mgr_init(void);
+void ttm_tt_mgr_fini(void);
+
 #if IS_ENABLED(CONFIG_AGP)
 #include <linux/agp_backend.h>
 
-- 
2.25.1


* [PATCH 2/2] drm/ttm: move memory accounting into vmwgfx
  2020-12-16 14:04 [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2 Christian König
@ 2020-12-16 14:04 ` Christian König
  2020-12-16 15:26   ` Daniel Vetter
                     ` (2 more replies)
  2020-12-16 15:09 ` [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2 Daniel Vetter
  1 sibling, 3 replies; 20+ messages in thread
From: Christian König @ 2020-12-16 14:04 UTC (permalink / raw)
  To: dri-devel, linux-graphics-maintainer; +Cc: airlied, ray.huang, sroland

This is just another feature which is only used by VMWGFX, so move
it into the driver instead.

I've tried to add the accounting sysfs file to the kobject of the drm
minor, but I'm not 100% sure if this works as expected.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 16 ++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c    |  8 +--
 drivers/gpu/drm/drm_gem_vram_helper.c         |  6 +--
 drivers/gpu/drm/nouveau/nouveau_bo.c          |  7 +--
 drivers/gpu/drm/nouveau/nouveau_drv.h         |  1 -
 drivers/gpu/drm/qxl/qxl_object.c              |  4 +-
 drivers/gpu/drm/radeon/radeon_object.c        |  8 +--
 drivers/gpu/drm/ttm/Makefile                  |  6 +--
 drivers/gpu/drm/ttm/ttm_bo.c                  | 52 ++++++-------------
 drivers/gpu/drm/ttm/ttm_bo_util.c             |  1 -
 drivers/gpu/drm/ttm/ttm_pool.c                | 13 +----
 drivers/gpu/drm/vmwgfx/Makefile               |  2 +-
 drivers/gpu/drm/{ttm => vmwgfx}/ttm_memory.c  | 19 +++----
 .../gpu/drm/vmwgfx}/ttm_memory.h              |  5 +-
 drivers/gpu/drm/vmwgfx/ttm_object.h           |  3 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_bo.c            | 22 ++++++--
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c           |  5 ++
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c    | 28 +++++++++-
 include/drm/ttm/ttm_bo_api.h                  | 13 +----
 include/drm/ttm/ttm_bo_driver.h               |  1 -
 include/drm/ttm/ttm_tt.h                      |  1 +
 21 files changed, 107 insertions(+), 114 deletions(-)
 rename drivers/gpu/drm/{ttm => vmwgfx}/ttm_memory.c (97%)
 rename {include/drm/ttm => drivers/gpu/drm/vmwgfx}/ttm_memory.h (97%)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index a9647e7f98a8..5ed1c88b8748 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -118,6 +118,16 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
  */
 #define ESTIMATE_PT_SIZE(mem_size) ((mem_size) >> 14)
 
+static size_t amdgpu_amdkfd_acc_size(uint64_t size)
+{
+	size >>= PAGE_SHIFT;
+	size *= sizeof(dma_addr_t) + sizeof(void *);
+
+	return __roundup_pow_of_two(sizeof(struct amdgpu_bo)) +
+		__roundup_pow_of_two(sizeof(struct ttm_tt)) +
+		PAGE_ALIGN(size);
+}
+
 static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
 		uint64_t size, u32 domain, bool sg)
 {
@@ -126,8 +136,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
 	size_t acc_size, system_mem_needed, ttm_mem_needed, vram_needed;
 	int ret = 0;
 
-	acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
-				       sizeof(struct amdgpu_bo));
+	acc_size = amdgpu_amdkfd_acc_size(size);
 
 	vram_needed = 0;
 	if (domain == AMDGPU_GEM_DOMAIN_GTT) {
@@ -174,8 +183,7 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
 {
 	size_t acc_size;
 
-	acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
-				       sizeof(struct amdgpu_bo));
+	acc_size = amdgpu_amdkfd_acc_size(size);
 
 	spin_lock(&kfd_mem_limit.mem_limit_lock);
 	if (domain == AMDGPU_GEM_DOMAIN_GTT) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 6cc9919b12cc..599c9a132eb6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -523,7 +523,6 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
 	};
 	struct amdgpu_bo *bo;
 	unsigned long page_align, size = bp->size;
-	size_t acc_size;
 	int r;
 
 	/* Note that GDS/GWS/OA allocates 1 page per byte/resource. */
@@ -546,9 +545,6 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
 
 	*bo_ptr = NULL;
 
-	acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
-				       sizeof(struct amdgpu_bo));
-
 	bo = kzalloc(sizeof(struct amdgpu_bo), GFP_KERNEL);
 	if (bo == NULL)
 		return -ENOMEM;
@@ -577,8 +573,8 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
 		bo->tbo.priority = 1;
 
 	r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, bp->type,
-				 &bo->placement, page_align, &ctx, acc_size,
-				 NULL, bp->resv, &amdgpu_bo_destroy);
+				 &bo->placement, page_align, &ctx,  NULL,
+				 bp->resv, &amdgpu_bo_destroy);
 	if (unlikely(r != 0))
 		return r;
 
diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index 02ca22e90290..5cf7797048e1 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -189,7 +189,6 @@ struct drm_gem_vram_object *drm_gem_vram_create(struct drm_device *dev,
 	struct drm_vram_mm *vmm = dev->vram_mm;
 	struct ttm_bo_device *bdev;
 	int ret;
-	size_t acc_size;
 
 	if (WARN_ONCE(!vmm, "VRAM MM not initialized"))
 		return ERR_PTR(-EINVAL);
@@ -216,7 +215,6 @@ struct drm_gem_vram_object *drm_gem_vram_create(struct drm_device *dev,
 	}
 
 	bdev = &vmm->bdev;
-	acc_size = ttm_bo_dma_acc_size(bdev, size, sizeof(*gbo));
 
 	gbo->bo.bdev = bdev;
 	drm_gem_vram_placement(gbo, DRM_GEM_VRAM_PL_FLAG_SYSTEM);
@@ -226,8 +224,8 @@ struct drm_gem_vram_object *drm_gem_vram_create(struct drm_device *dev,
 	 * to release gbo->bo.base and kfree gbo.
 	 */
 	ret = ttm_bo_init(bdev, &gbo->bo, size, ttm_bo_type_device,
-			  &gbo->placement, pg_align, false, acc_size,
-			  NULL, NULL, ttm_buffer_object_destroy);
+			  &gbo->placement, pg_align, false, NULL, NULL,
+			  ttm_buffer_object_destroy);
 	if (ret)
 		return ERR_PTR(ret);
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 02d86f8311a9..64cb1ed0f6ea 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -300,18 +300,15 @@ nouveau_bo_init(struct nouveau_bo *nvbo, u64 size, int align, u32 domain,
 		struct sg_table *sg, struct dma_resv *robj)
 {
 	int type = sg ? ttm_bo_type_sg : ttm_bo_type_device;
-	size_t acc_size;
 	int ret;
 
-	acc_size = ttm_bo_dma_acc_size(nvbo->bo.bdev, size, sizeof(*nvbo));
-
 	nvbo->bo.mem.num_pages = size >> PAGE_SHIFT;
 	nouveau_bo_placement_set(nvbo, domain, 0);
 	INIT_LIST_HEAD(&nvbo->io_reserve_lru);
 
 	ret = ttm_bo_init(nvbo->bo.bdev, &nvbo->bo, size, type,
-			  &nvbo->placement, align >> PAGE_SHIFT, false,
-			  acc_size, sg, robj, nouveau_bo_del_ttm);
+			  &nvbo->placement, align >> PAGE_SHIFT, false, sg,
+			  robj, nouveau_bo_del_ttm);
 	if (ret) {
 		/* ttm will call nouveau_bo_del_ttm if it fails.. */
 		return ret;
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index c802d3d1ba39..700fa0979d14 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -54,7 +54,6 @@
 #include <drm/ttm/ttm_bo_api.h>
 #include <drm/ttm/ttm_bo_driver.h>
 #include <drm/ttm/ttm_placement.h>
-#include <drm/ttm/ttm_memory.h>
 
 #include <drm/drm_audio_component.h>
 
diff --git a/drivers/gpu/drm/qxl/qxl_object.c b/drivers/gpu/drm/qxl/qxl_object.c
index ceebc5881f68..705b51535492 100644
--- a/drivers/gpu/drm/qxl/qxl_object.c
+++ b/drivers/gpu/drm/qxl/qxl_object.c
@@ -138,8 +138,8 @@ int qxl_bo_create(struct qxl_device *qdev,
 	qxl_ttm_placement_from_domain(bo, domain);
 
 	r = ttm_bo_init_reserved(&qdev->mman.bdev, &bo->tbo, size, type,
-				 &bo->placement, 0, &ctx, size,
-				 NULL, NULL, &qxl_ttm_bo_destroy);
+				 &bo->placement, 0, &ctx, NULL, NULL,
+				 &qxl_ttm_bo_destroy);
 	if (unlikely(r != 0)) {
 		if (r != -ERESTARTSYS)
 			dev_err(qdev->ddev.dev,
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index 9b81786782de..5cd44f2a77cd 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -159,7 +159,6 @@ int radeon_bo_create(struct radeon_device *rdev,
 	struct radeon_bo *bo;
 	enum ttm_bo_type type;
 	unsigned long page_align = roundup(byte_align, PAGE_SIZE) >> PAGE_SHIFT;
-	size_t acc_size;
 	int r;
 
 	size = ALIGN(size, PAGE_SIZE);
@@ -173,9 +172,6 @@ int radeon_bo_create(struct radeon_device *rdev,
 	}
 	*bo_ptr = NULL;
 
-	acc_size = ttm_bo_dma_acc_size(&rdev->mman.bdev, size,
-				       sizeof(struct radeon_bo));
-
 	bo = kzalloc(sizeof(struct radeon_bo), GFP_KERNEL);
 	if (bo == NULL)
 		return -ENOMEM;
@@ -230,8 +226,8 @@ int radeon_bo_create(struct radeon_device *rdev,
 	/* Kernel allocation are uninterruptible */
 	down_read(&rdev->pm.mclk_lock);
 	r = ttm_bo_init(&rdev->mman.bdev, &bo->tbo, size, type,
-			&bo->placement, page_align, !kernel, acc_size,
-			sg, resv, &radeon_ttm_bo_destroy);
+			&bo->placement, page_align, !kernel, sg, resv,
+			&radeon_ttm_bo_destroy);
 	up_read(&rdev->pm.mclk_lock);
 	if (unlikely(r != 0)) {
 		return r;
diff --git a/drivers/gpu/drm/ttm/Makefile b/drivers/gpu/drm/ttm/Makefile
index b6f5f87b270f..7e8dc85caac3 100644
--- a/drivers/gpu/drm/ttm/Makefile
+++ b/drivers/gpu/drm/ttm/Makefile
@@ -2,10 +2,8 @@
 #
 # Makefile for the drm device driver.  This driver provides support for the
 
-ttm-y := ttm_memory.o ttm_tt.o ttm_bo.o \
-	ttm_bo_util.o ttm_bo_vm.o ttm_module.o \
-	ttm_execbuf_util.o ttm_range_manager.o \
-	ttm_resource.o ttm_pool.o
+ttm-y := ttm_tt.o ttm_bo.o ttm_bo_util.o ttm_bo_vm.o ttm_module.o \
+	ttm_execbuf_util.o ttm_range_manager.o ttm_resource.o ttm_pool.o
 ttm-$(CONFIG_AGP) += ttm_agp_backend.o
 
 obj-$(CONFIG_DRM_TTM) += ttm.o
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index f1f3fd085465..cbb4d67c9288 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -474,7 +474,6 @@ static void ttm_bo_release(struct kref *kref)
 	struct ttm_buffer_object *bo =
 	    container_of(kref, struct ttm_buffer_object, kref);
 	struct ttm_bo_device *bdev = bo->bdev;
-	size_t acc_size = bo->acc_size;
 	int ret;
 
 	if (!bo->deleted) {
@@ -534,7 +533,6 @@ static void ttm_bo_release(struct kref *kref)
 	if (!ttm_bo_uses_embedded_gem_object(bo))
 		dma_resv_fini(&bo->base._resv);
 	bo->destroy(bo);
-	ttm_mem_global_free(&ttm_mem_glob, acc_size);
 }
 
 void ttm_bo_put(struct ttm_buffer_object *bo)
@@ -1095,25 +1093,13 @@ int ttm_bo_init_reserved(struct ttm_bo_device *bdev,
 			 struct ttm_placement *placement,
 			 uint32_t page_alignment,
 			 struct ttm_operation_ctx *ctx,
-			 size_t acc_size,
 			 struct sg_table *sg,
 			 struct dma_resv *resv,
 			 void (*destroy) (struct ttm_buffer_object *))
 {
-	struct ttm_mem_global *mem_glob = &ttm_mem_glob;
 	bool locked;
 	int ret = 0;
 
-	ret = ttm_mem_global_alloc(mem_glob, acc_size, ctx);
-	if (ret) {
-		pr_err("Out of kernel memory\n");
-		if (destroy)
-			(*destroy)(bo);
-		else
-			kfree(bo);
-		return -ENOMEM;
-	}
-
 	bo->destroy = destroy ? destroy : ttm_bo_default_destroy;
 
 	kref_init(&bo->kref);
@@ -1130,7 +1116,6 @@ int ttm_bo_init_reserved(struct ttm_bo_device *bdev,
 	bo->mem.bus.addr = NULL;
 	bo->moving = NULL;
 	bo->mem.placement = 0;
-	bo->acc_size = acc_size;
 	bo->pin_count = 0;
 	bo->sg = sg;
 	if (resv) {
@@ -1191,7 +1176,6 @@ int ttm_bo_init(struct ttm_bo_device *bdev,
 		struct ttm_placement *placement,
 		uint32_t page_alignment,
 		bool interruptible,
-		size_t acc_size,
 		struct sg_table *sg,
 		struct dma_resv *resv,
 		void (*destroy) (struct ttm_buffer_object *))
@@ -1200,8 +1184,7 @@ int ttm_bo_init(struct ttm_bo_device *bdev,
 	int ret;
 
 	ret = ttm_bo_init_reserved(bdev, bo, size, type, placement,
-				   page_alignment, &ctx, acc_size,
-				   sg, resv, destroy);
+				   page_alignment, &ctx, sg, resv, destroy);
 	if (ret)
 		return ret;
 
@@ -1212,20 +1195,6 @@ int ttm_bo_init(struct ttm_bo_device *bdev,
 }
 EXPORT_SYMBOL(ttm_bo_init);
 
-size_t ttm_bo_dma_acc_size(struct ttm_bo_device *bdev,
-			   unsigned long bo_size,
-			   unsigned struct_size)
-{
-	unsigned npages = (PAGE_ALIGN(bo_size)) >> PAGE_SHIFT;
-	size_t size = 0;
-
-	size += ttm_round_pot(struct_size);
-	size += ttm_round_pot(npages * (2*sizeof(void *) + sizeof(dma_addr_t)));
-	size += ttm_round_pot(sizeof(struct ttm_tt));
-	return size;
-}
-EXPORT_SYMBOL(ttm_bo_dma_acc_size);
-
 static void ttm_bo_global_kobj_release(struct kobject *kobj)
 {
 	struct ttm_bo_global *glob =
@@ -1242,9 +1211,11 @@ static void ttm_bo_global_release(void)
 	if (--ttm_bo_glob_use_count > 0)
 		goto out;
 
+	ttm_tt_mgr_fini();
+	ttm_pool_mgr_fini();
+
 	kobject_del(&glob->kobj);
 	kobject_put(&glob->kobj);
-	ttm_mem_global_release(&ttm_mem_glob);
 	memset(glob, 0, sizeof(*glob));
 out:
 	mutex_unlock(&ttm_global_mutex);
@@ -1253,6 +1224,8 @@ static void ttm_bo_global_release(void)
 static int ttm_bo_global_init(void)
 {
 	struct ttm_bo_global *glob = &ttm_bo_glob;
+	uint64_t num_pages;
+	struct sysinfo si;
 	int ret = 0;
 	unsigned i;
 
@@ -1260,9 +1233,16 @@ static int ttm_bo_global_init(void)
 	if (++ttm_bo_glob_use_count > 1)
 		goto out;
 
-	ret = ttm_mem_global_init(&ttm_mem_glob);
-	if (ret)
-		goto out;
+	si_meminfo(&si);
+
+	/* Limit the number of pages in the pool to about 50% of the total
+	 * system memory.
+	 */
+	num_pages = (u64)si.totalram * si.mem_unit;
+	num_pages = (num_pages * 50 / 100) >> PAGE_SHIFT;
+
+	ttm_pool_mgr_init(num_pages);
+	ttm_tt_mgr_init();
 
 	spin_lock_init(&glob->lru_lock);
 	glob->dummy_read_page = alloc_page(__GFP_ZERO | GFP_DMA32);
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index 398d5013fc39..03e9f6f6a611 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -309,7 +309,6 @@ static int ttm_buffer_object_transfer(struct ttm_buffer_object *bo,
 
 	kref_init(&fbo->base.kref);
 	fbo->base.destroy = &ttm_transfered_destroy;
-	fbo->base.acc_size = 0;
 	fbo->base.pin_count = 0;
 	if (bo->type != ttm_bo_type_sg)
 		fbo->base.base.resv = &fbo->base.base._resv;
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index 5455b2044759..24ae7c23cfd0 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -416,16 +416,10 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
 			caching = pages + (1 << order);
 		}
 
-		r = ttm_mem_global_alloc_page(&ttm_mem_glob, p,
-					      (1 << order) * PAGE_SIZE,
-					      ctx);
-		if (r)
-			goto error_free_page;
-
 		if (dma_addr) {
 			r = ttm_pool_map(pool, order, p, &dma_addr);
 			if (r)
-				goto error_global_free;
+				goto error_free_page;
 		}
 
 		num_pages -= 1 << order;
@@ -439,9 +433,6 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
 
 	return 0;
 
-error_global_free:
-	ttm_mem_global_free_page(&ttm_mem_glob, p, (1 << order) * PAGE_SIZE);
-
 error_free_page:
 	ttm_pool_free_page(pool, tt->caching, order, p);
 
@@ -476,8 +467,6 @@ void ttm_pool_free(struct ttm_pool *pool, struct ttm_tt *tt)
 
 		order = ttm_pool_page_order(pool, p);
 		num_pages = 1ULL << order;
-		ttm_mem_global_free_page(&ttm_mem_glob, p,
-					 num_pages * PAGE_SIZE);
 		if (tt->dma_address)
 			ttm_pool_unmap(pool, tt->dma_address[i], num_pages);
 
diff --git a/drivers/gpu/drm/vmwgfx/Makefile b/drivers/gpu/drm/vmwgfx/Makefile
index 31f85f09f1fc..86af5d7b0242 100644
--- a/drivers/gpu/drm/vmwgfx/Makefile
+++ b/drivers/gpu/drm/vmwgfx/Makefile
@@ -9,7 +9,7 @@ vmwgfx-y := vmwgfx_execbuf.o vmwgfx_gmr.o vmwgfx_kms.o vmwgfx_drv.o \
 	    vmwgfx_cotable.o vmwgfx_so.o vmwgfx_binding.o vmwgfx_msg.o \
 	    vmwgfx_simple_resource.o vmwgfx_va.o vmwgfx_blit.o \
 	    vmwgfx_validation.o vmwgfx_page_dirty.o vmwgfx_streamoutput.o \
-	    ttm_object.o ttm_lock.o
+	    ttm_object.o ttm_lock.o ttm_memory.o
 
 vmwgfx-$(CONFIG_TRANSPARENT_HUGEPAGE) += vmwgfx_thp.o
 obj-$(CONFIG_DRM_VMWGFX) := vmwgfx.o
diff --git a/drivers/gpu/drm/ttm/ttm_memory.c b/drivers/gpu/drm/vmwgfx/ttm_memory.c
similarity index 97%
rename from drivers/gpu/drm/ttm/ttm_memory.c
rename to drivers/gpu/drm/vmwgfx/ttm_memory.c
index 3d2479d0ce2c..bfec1e845345 100644
--- a/drivers/gpu/drm/ttm/ttm_memory.c
+++ b/drivers/gpu/drm/vmwgfx/ttm_memory.c
@@ -28,7 +28,6 @@
 
 #define pr_fmt(fmt) "[TTM] " fmt
 
-#include <drm/ttm/ttm_memory.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
@@ -36,10 +35,11 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
-#include <drm/ttm/ttm_pool.h>
-#include <drm/ttm/ttm_tt.h>
 
-#include "ttm_module.h"
+#include <drm/drm_device.h>
+#include <drm/drm_file.h>
+
+#include "ttm_memory.h"
 
 #define TTM_MEMORY_ALLOC_RETRIES 4
 
@@ -414,7 +414,7 @@ static int ttm_mem_init_dma32_zone(struct ttm_mem_global *glob,
 }
 #endif
 
-int ttm_mem_global_init(struct ttm_mem_global *glob)
+int ttm_mem_global_init(struct ttm_mem_global *glob, struct drm_device *dev)
 {
 	struct sysinfo si;
 	int ret;
@@ -425,7 +425,8 @@ int ttm_mem_global_init(struct ttm_mem_global *glob)
 	glob->swap_queue = create_singlethread_workqueue("ttm_swap");
 	INIT_WORK(&glob->work, ttm_shrink_work);
 	ret = kobject_init_and_add(
-		&glob->kobj, &ttm_mem_glob_kobj_type, ttm_get_kobj(), "memory_accounting");
+		&glob->kobj, &ttm_mem_glob_kobj_type, &dev->primary->kdev->kobj,
+		"memory_accounting");
 	if (unlikely(ret != 0)) {
 		kobject_put(&glob->kobj);
 		return ret;
@@ -453,8 +454,6 @@ int ttm_mem_global_init(struct ttm_mem_global *glob)
 		pr_info("Zone %7s: Available graphics memory: %llu KiB\n",
 			zone->name, (unsigned long long)zone->max_mem >> 10);
 	}
-	ttm_pool_mgr_init(glob->zone_kernel->max_mem/(2*PAGE_SIZE));
-	ttm_tt_mgr_init();
 	return 0;
 out_no_zone:
 	ttm_mem_global_release(glob);
@@ -466,10 +465,6 @@ void ttm_mem_global_release(struct ttm_mem_global *glob)
 	struct ttm_mem_zone *zone;
 	unsigned int i;
 
-	/* let the page allocator first stop the shrink work. */
-	ttm_pool_mgr_fini();
-	ttm_tt_mgr_fini();
-
 	flush_workqueue(glob->swap_queue);
 	destroy_workqueue(glob->swap_queue);
 	glob->swap_queue = NULL;
diff --git a/include/drm/ttm/ttm_memory.h b/drivers/gpu/drm/vmwgfx/ttm_memory.h
similarity index 97%
rename from include/drm/ttm/ttm_memory.h
rename to drivers/gpu/drm/vmwgfx/ttm_memory.h
index c1f167881e33..850ee6c867da 100644
--- a/include/drm/ttm/ttm_memory.h
+++ b/drivers/gpu/drm/vmwgfx/ttm_memory.h
@@ -35,7 +35,8 @@
 #include <linux/errno.h>
 #include <linux/kobject.h>
 #include <linux/mm.h>
-#include "ttm_bo_api.h"
+
+#include <drm/ttm/ttm_bo_api.h>
 
 /**
  * struct ttm_mem_global - Global memory accounting structure.
@@ -79,7 +80,7 @@ extern struct ttm_mem_global {
 #endif
 } ttm_mem_glob;
 
-int ttm_mem_global_init(struct ttm_mem_global *glob);
+int ttm_mem_global_init(struct ttm_mem_global *glob, struct drm_device *dev);
 void ttm_mem_global_release(struct ttm_mem_global *glob);
 int ttm_mem_global_alloc(struct ttm_mem_global *glob, uint64_t memory,
 			 struct ttm_operation_ctx *ctx);
diff --git a/drivers/gpu/drm/vmwgfx/ttm_object.h b/drivers/gpu/drm/vmwgfx/ttm_object.h
index ede26df87c93..49b064f0cb19 100644
--- a/drivers/gpu/drm/vmwgfx/ttm_object.h
+++ b/drivers/gpu/drm/vmwgfx/ttm_object.h
@@ -43,7 +43,8 @@
 #include <linux/rcupdate.h>
 
 #include <drm/drm_hashtab.h>
-#include <drm/ttm/ttm_memory.h>
+
+#include "ttm_memory.h"
 
 /**
  * enum ttm_ref_type
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
index 63dbc44eebe0..f7abe2992178 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
@@ -508,11 +508,16 @@ int vmw_bo_create_kernel(struct vmw_private *dev_priv, unsigned long size,
 	acc_size = ttm_round_pot(sizeof(*bo));
 	acc_size += ttm_round_pot(npages * sizeof(void *));
 	acc_size += ttm_round_pot(sizeof(struct ttm_tt));
+
+	ret = ttm_mem_global_alloc(&ttm_mem_glob, acc_size, &ctx);
+	if (unlikely(ret))
+		goto error_free;
+
 	ret = ttm_bo_init_reserved(&dev_priv->bdev, bo, size,
 				   ttm_bo_type_device, placement, 0,
-				   &ctx, acc_size, NULL, NULL, NULL);
+				   &ctx, NULL, NULL, NULL);
 	if (unlikely(ret))
-		goto error_free;
+		goto error_account;
 
 	ttm_bo_pin(bo);
 	ttm_bo_unreserve(bo);
@@ -520,6 +525,9 @@ int vmw_bo_create_kernel(struct vmw_private *dev_priv, unsigned long size,
 
 	return 0;
 
+error_account:
+	ttm_mem_global_free(&ttm_mem_glob, acc_size);
+
 error_free:
 	kfree(bo);
 	return ret;
@@ -559,11 +567,17 @@ int vmw_bo_init(struct vmw_private *dev_priv,
 	vmw_bo->base.priority = 3;
 	vmw_bo->res_tree = RB_ROOT;
 
+	ret = ttm_mem_global_alloc(&ttm_mem_glob, acc_size, &ctx);
+	if (unlikely(ret))
+		return ret;
+
 	ret = ttm_bo_init_reserved(bdev, &vmw_bo->base, size,
 				   ttm_bo_type_device, placement,
-				   0, &ctx, acc_size, NULL, NULL, bo_free);
-	if (unlikely(ret))
+				   0, &ctx, NULL, NULL, bo_free);
+	if (unlikely(ret)) {
+		ttm_mem_global_free(&ttm_mem_glob, acc_size);
 		return ret;
+	}
 
 	if (pin)
 		ttm_bo_pin(&vmw_bo->base);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index e141fa9f616d..ca66eaa2e376 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -1272,6 +1272,7 @@ static void vmw_remove(struct pci_dev *pdev)
 {
 	struct drm_device *dev = pci_get_drvdata(pdev);
 
+	ttm_mem_global_release(&ttm_mem_glob);
 	drm_dev_unregister(dev);
 	vmw_driver_unload(dev);
 	drm_dev_put(dev);
@@ -1519,6 +1520,10 @@ static int vmw_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_pci_disable_device;
 	}
 
+	ret = ttm_mem_global_init(&ttm_mem_glob, dev);
+	if (ret)
+		return ret;
+
 	dev->pdev = pdev;
 	pci_set_drvdata(pdev, dev);
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index 1c75f73538c0..efb567da749d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
@@ -575,11 +575,31 @@ static void vmw_ttm_destroy(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
 static int vmw_ttm_populate(struct ttm_bo_device *bdev,
 			    struct ttm_tt *ttm, struct ttm_operation_ctx *ctx)
 {
+	unsigned int i;
+	int ret;
+
 	/* TODO: maybe completely drop this ? */
 	if (ttm_tt_is_populated(ttm))
 		return 0;
 
-	return ttm_pool_alloc(&bdev->pool, ttm, ctx);
+	ret = ttm_pool_alloc(&bdev->pool, ttm, ctx);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ttm->num_pages; ++i) {
+		ret = ttm_mem_global_alloc_page(&ttm_mem_glob, ttm->pages[i],
+						PAGE_SIZE, ctx);
+		if (ret)
+			goto error;
+	}
+	return 0;
+
+error:
+	while (i--)
+		ttm_mem_global_free_page(&ttm_mem_glob, ttm->pages[i],
+					 PAGE_SIZE);
+	ttm_pool_free(&bdev->pool, ttm);
+	return ret;
 }
 
 static void vmw_ttm_unpopulate(struct ttm_bo_device *bdev,
@@ -587,6 +607,7 @@ static void vmw_ttm_unpopulate(struct ttm_bo_device *bdev,
 {
 	struct vmw_ttm_tt *vmw_tt = container_of(ttm, struct vmw_ttm_tt,
 						 dma_ttm);
+	unsigned int i;
 
 	if (vmw_tt->mob) {
 		vmw_mob_destroy(vmw_tt->mob);
@@ -594,6 +615,11 @@ static void vmw_ttm_unpopulate(struct ttm_bo_device *bdev,
 	}
 
 	vmw_ttm_unmap_dma(vmw_tt);
+
+	for (i = 0; i < ttm->num_pages; ++i)
+		ttm_mem_global_free_page(&ttm_mem_glob, ttm->pages[i],
+					 PAGE_SIZE);
+
 	ttm_pool_free(&bdev->pool, ttm);
 }
 
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 7c7539d76e1d..30254d869db6 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -88,7 +88,6 @@ struct ttm_tt;
  * @type: The bo type.
  * @destroy: Destruction function. If NULL, kfree is used.
  * @num_pages: Actual number of pages.
- * @acc_size: Accounted size for this object.
  * @kref: Reference count of this buffer object. When this refcount reaches
  * zero, the object is destroyed or put on the delayed delete list.
  * @mem: structure describing current placement.
@@ -125,7 +124,6 @@ struct ttm_buffer_object {
 	struct ttm_bo_device *bdev;
 	enum ttm_bo_type type;
 	void (*destroy) (struct ttm_buffer_object *);
-	size_t acc_size;
 
 	/**
 	* Members not needing protection.
@@ -357,10 +355,6 @@ void ttm_bo_unlock_delayed_workqueue(struct ttm_bo_device *bdev, int resched);
 bool ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
 			      const struct ttm_place *place);
 
-size_t ttm_bo_dma_acc_size(struct ttm_bo_device *bdev,
-			   unsigned long bo_size,
-			   unsigned struct_size);
-
 /**
  * ttm_bo_init_reserved
  *
@@ -371,7 +365,6 @@ size_t ttm_bo_dma_acc_size(struct ttm_bo_device *bdev,
  * @flags: Initial placement flags.
  * @page_alignment: Data alignment in pages.
  * @ctx: TTM operation context for memory allocation.
- * @acc_size: Accounted size for this object.
  * @resv: Pointer to a dma_resv, or NULL to let ttm allocate one.
  * @destroy: Destroy function. Use NULL for kfree().
  *
@@ -402,8 +395,7 @@ int ttm_bo_init_reserved(struct ttm_bo_device *bdev,
 			 struct ttm_placement *placement,
 			 uint32_t page_alignment,
 			 struct ttm_operation_ctx *ctx,
-			 size_t acc_size, struct sg_table *sg,
-			 struct dma_resv *resv,
+			 struct sg_table *sg, struct dma_resv *resv,
 			 void (*destroy) (struct ttm_buffer_object *));
 
 /**
@@ -421,7 +413,6 @@ int ttm_bo_init_reserved(struct ttm_bo_device *bdev,
  * holds a pointer to a persistent shmem object. Typically, this would
  * point to the shmem object backing a GEM object if TTM is used to back a
  * GEM user interface.
- * @acc_size: Accounted size for this object.
  * @resv: Pointer to a dma_resv, or NULL to let ttm allocate one.
  * @destroy: Destroy function. Use NULL for kfree().
  *
@@ -446,7 +437,7 @@ int ttm_bo_init_reserved(struct ttm_bo_device *bdev,
 int ttm_bo_init(struct ttm_bo_device *bdev, struct ttm_buffer_object *bo,
 		size_t size, enum ttm_bo_type type,
 		struct ttm_placement *placement,
-		uint32_t page_alignment, bool interrubtible, size_t acc_size,
+		uint32_t page_alignment, bool interrubtible,
 		struct sg_table *sg, struct dma_resv *resv,
 		void (*destroy) (struct ttm_buffer_object *));
 
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 423348414c59..547857697776 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -38,7 +38,6 @@
 #include <linux/dma-resv.h>
 
 #include "ttm_bo_api.h"
-#include "ttm_memory.h"
 #include "ttm_placement.h"
 #include "ttm_tt.h"
 #include "ttm_pool.h"
diff --git a/include/drm/ttm/ttm_tt.h b/include/drm/ttm/ttm_tt.h
index 4bc3a08644fc..499338680660 100644
--- a/include/drm/ttm/ttm_tt.h
+++ b/include/drm/ttm/ttm_tt.h
@@ -30,6 +30,7 @@
 #include <linux/types.h>
 #include <drm/ttm/ttm_caching.h>
 
+struct ttm_bo_device;
 struct ttm_tt;
 struct ttm_resource;
 struct ttm_buffer_object;
-- 
2.25.1


* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-16 14:04 [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2 Christian König
  2020-12-16 14:04 ` [PATCH 2/2] drm/ttm: move memory accounting into vmwgfx Christian König
@ 2020-12-16 15:09 ` Daniel Vetter
  2020-12-17 13:46   ` Christian König
  1 sibling, 1 reply; 20+ messages in thread
From: Daniel Vetter @ 2020-12-16 15:09 UTC (permalink / raw)
  To: Christian König
  Cc: airlied, sroland, dri-devel, ray.huang, linux-graphics-maintainer

On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
> TTM implements a rather extensive accounting of allocated memory.
> 
> There are two reasons for this:
> 1. It tries to block userspace from allocating a huge number of very small
>    BOs without accounting for the kmalloced memory.
> 
> 2. It makes sure we don't over-allocate and run into an OOM situation
>    during swapout while trying to handle the memory shortage.
> 
> This is only partially a good idea. First of all, it is perfectly
> valid for an application to use all of system memory; limiting it to
> 50% is not really acceptable.
> 
> What we need to take care of is that the application is held
> accountable for the memory it allocated. This is what control
> mechanisms like memcg and the normal Linux page accounting already do.
> 
> Making sure that we don't run into an OOM situation while trying to
> cope with a memory shortage is still a good idea, but this is also
> not very well implemented since it means another opportunity for
> recursion from the driver back into TTM.
> 
> So start to rework all of this by implementing a shrinker callback which
> allows TT objects to be swapped out if necessary.
> 
> v2: Switch from limit to shrinker callback.
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/ttm/ttm_bo.c        |  4 +-
>  drivers/gpu/drm/ttm/ttm_memory.c    |  7 ++-
>  drivers/gpu/drm/ttm/ttm_tt.c        | 82 +++++++++++++++++++++++++++--
>  drivers/gpu/drm/vmwgfx/vmwgfx_drv.c |  2 +-
>  include/drm/ttm/ttm_bo_api.h        |  2 +-
>  include/drm/ttm/ttm_tt.h            |  6 ++-
>  6 files changed, 91 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 31e8b3da5563..f1f3fd085465 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -1412,7 +1412,7 @@ EXPORT_SYMBOL(ttm_bo_wait);
>   * A buffer object shrink method that tries to swap out the first
>   * buffer object on the bo_global::swap_lru list.
>   */
> -int ttm_bo_swapout(struct ttm_operation_ctx *ctx)
> +int ttm_bo_swapout(struct ttm_operation_ctx *ctx, gfp_t gfp_flags)
>  {
>  	struct ttm_bo_global *glob = &ttm_bo_glob;
>  	struct ttm_buffer_object *bo;
> @@ -1495,7 +1495,7 @@ int ttm_bo_swapout(struct ttm_operation_ctx *ctx)
>  	if (bo->bdev->driver->swap_notify)
>  		bo->bdev->driver->swap_notify(bo);
>  
> -	ret = ttm_tt_swapout(bo->bdev, bo->ttm);
> +	ret = ttm_tt_swapout(bo->bdev, bo->ttm, gfp_flags);
>  out:
>  
>  	/**
> diff --git a/drivers/gpu/drm/ttm/ttm_memory.c b/drivers/gpu/drm/ttm/ttm_memory.c
> index a3bfbd9cea68..3d2479d0ce2c 100644
> --- a/drivers/gpu/drm/ttm/ttm_memory.c
> +++ b/drivers/gpu/drm/ttm/ttm_memory.c
> @@ -37,6 +37,7 @@
>  #include <linux/slab.h>
>  #include <linux/swap.h>
>  #include <drm/ttm/ttm_pool.h>
> +#include <drm/ttm/ttm_tt.h>
>  
>  #include "ttm_module.h"
>  
> @@ -276,9 +277,9 @@ static void ttm_shrink(struct ttm_mem_global *glob, bool from_wq,
>  
>  	while (ttm_zones_above_swap_target(glob, from_wq, extra)) {
>  		spin_unlock(&glob->lock);
> -		ret = ttm_bo_swapout(ctx);
> +		ret = ttm_bo_swapout(ctx, 0);

Generally we don't treat gfp_mask as a set of additional flags, but as the
full thing. So here we should have GFP_KERNEL.
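
I.e. the call here would then simply be (sketch):

	ret = ttm_bo_swapout(ctx, GFP_KERNEL);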

Also, having both the shrinker and the ttm_shrink_work is a bit much; the
shrink work should be deleted completely, I think.

>  		spin_lock(&glob->lock);
> -		if (unlikely(ret != 0))
> +		if (unlikely(ret < 0))
>  			break;
>  	}
>  
> @@ -453,6 +454,7 @@ int ttm_mem_global_init(struct ttm_mem_global *glob)
>  			zone->name, (unsigned long long)zone->max_mem >> 10);
>  	}
>  	ttm_pool_mgr_init(glob->zone_kernel->max_mem/(2*PAGE_SIZE));
> +	ttm_tt_mgr_init();
>  	return 0;
>  out_no_zone:
>  	ttm_mem_global_release(glob);
> @@ -466,6 +468,7 @@ void ttm_mem_global_release(struct ttm_mem_global *glob)
>  
>  	/* let the page allocator first stop the shrink work. */
>  	ttm_pool_mgr_fini();
> +	ttm_tt_mgr_fini();
>  
>  	flush_workqueue(glob->swap_queue);
>  	destroy_workqueue(glob->swap_queue);
> diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
> index 7f75a13163f0..d454c428c56a 100644
> --- a/drivers/gpu/drm/ttm/ttm_tt.c
> +++ b/drivers/gpu/drm/ttm/ttm_tt.c
> @@ -38,6 +38,8 @@
>  #include <drm/drm_cache.h>
>  #include <drm/ttm/ttm_bo_driver.h>
>  
> +static struct shrinker mm_shrinker;
> +
>  /*
>   * Allocates a ttm structure for the given BO.
>   */
> @@ -223,13 +225,23 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
>  	return ret;
>  }
>  
> -int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
> +/**
> + * ttm_tt_swapout - swap out tt object
> + *
> + * @bdev: TTM device structure.
> + * @ttm: The struct ttm_tt.
> + * @gfp_flags: Flags to use for memory allocation.
> + *
> + * Swapout a TT object to a shmem_file, return number of pages swapped out or
> + * negative error code.
> + */
> +int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm,
> +		   gfp_t gfp_flags)
>  {
>  	struct address_space *swap_space;
>  	struct file *swap_storage;
>  	struct page *from_page;
>  	struct page *to_page;
> -	gfp_t gfp_mask;
>  	int i, ret;
>  
>  	swap_storage = shmem_file_setup("ttm swap",
> @@ -241,14 +253,14 @@ int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
>  	}
>  
>  	swap_space = swap_storage->f_mapping;
> -	gfp_mask = mapping_gfp_mask(swap_space);
> +	gfp_flags |= mapping_gfp_mask(swap_space);

I don't think this combines flags correctly. mapping_gfp_mask is most
likely GFP_KERNEL or something like that, so __GFP_FS (the thing you want
to check for to avoid recursion) is always set.

I think we need an & here, and maybe screaming if the gfp flags for the
swapout space are funny.
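
Roughly this, I think (untested sketch):

	/* Restrict the caller's flags by what the mapping allows, and
	 * complain if the shmem mapping has unexpected restrictions of
	 * its own.
	 */
	WARN_ON_ONCE((mapping_gfp_mask(swap_space) & GFP_KERNEL) != GFP_KERNEL);
	gfp_flags &= mapping_gfp_mask(swap_space);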

>  
>  	for (i = 0; i < ttm->num_pages; ++i) {
>  		from_page = ttm->pages[i];
>  		if (unlikely(from_page == NULL))
>  			continue;
>  
> -		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
> +		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_flags);
>  		if (IS_ERR(to_page)) {
>  			ret = PTR_ERR(to_page);
>  			goto out_err;
> @@ -263,7 +275,7 @@ int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
>  	ttm->swap_storage = swap_storage;
>  	ttm->page_flags |= TTM_PAGE_FLAG_SWAPPED;
>  
> -	return 0;
> +	return ttm->num_pages;
>  
>  out_err:
>  	fput(swap_storage);
> @@ -341,3 +353,63 @@ void ttm_tt_unpopulate(struct ttm_bo_device *bdev,
>  		ttm_pool_free(&bdev->pool, ttm);
>  	ttm->page_flags &= ~TTM_PAGE_FLAG_PRIV_POPULATED;
>  }
> +
> +/* As long as pages are available make sure to release at least one */
> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
> +					  struct shrink_control *sc)
> +{
> +	struct ttm_operation_ctx ctx = {
> +		.no_wait_gpu = true

Iirc there's an eventual shrinker limit where it gets desperate. I think
once we hit that, we should allow gpu waits. But it's not passed to
shrinkers for reasons, so maybe we should have a second round that tries
to more actively shrink objects if we fell substantially short of what
reclaim expected us to do?

Also don't we have a trylock_only flag here to make sure drivers don't do
something stupid?

> +	};
> +	int ret;
> +
> +	if (sc->gfp_mask & GFP_NOFS)
> +		return 0;
> +
> +	ret = ttm_bo_swapout(&ctx, GFP_NOFS);
> +	return ret < 0 ? SHRINK_EMPTY : ret;
> +}
> +
> +/* Return the number of pages available or SHRINK_EMPTY if we have none */
> +static unsigned long ttm_tt_shrinker_count(struct shrinker *shrink,
> +					   struct shrink_control *sc)
> +{
> +	struct ttm_buffer_object *bo;
> +	unsigned long num_pages = 0;
> +	unsigned int i;
> +
> +	if (sc->gfp_mask & GFP_NOFS)
> +		return 0;

The count function should always count, and I'm not seeing a reason why
you couldn't do that here ... Also my understanding is that GFP_NOFS never
goes into shrinkers (the NOFS comes from shrinkers originally only being
used for filesystem objects), so this is doubly redundant.

My understanding is that gfp_mask is just to convey the right zones and
stuff, so that your shrinker can try to shrink objects in the right zones.
Hence I think the check in the _scan() function should also be removed.

Also the non __ prefixed flags are the combinations callers are supposed
to look at. Memory reclaim code needs to look at the __GFP flags, see e.g. 
gfpflags_allow_blocking() or fs_reclaim_acquire().
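
To illustrate the difference (and to be clear, I think these checks
should simply be dropped): GFP_NOFS is itself a combination of bits
(__GFP_RECLAIM | __GFP_IO), so "sc->gfp_mask & GFP_NOFS" is non-zero
for practically every reclaim context, including plain GFP_KERNEL.
Code that really needs to know whether it may recurse into the
filesystem checks the individual bit, e.g. (sketch):

	if (!(sc->gfp_mask & __GFP_FS))
		return 0;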

> +
> +	/* TODO: Is that too much overhead? */
> +	spin_lock(&ttm_bo_glob.lru_lock);
> +	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i)
> +		list_for_each_entry(bo, &ttm_bo_glob.swap_lru[i], swap)
> +			num_pages += bo->ttm->num_pages;
> +	spin_unlock(&ttm_bo_glob.lru_lock);
> +
> +	return num_pages ? num_pages : SHRINK_EMPTY;
> +}
> +
> +/**
> + * ttm_tt_mgr_init - register with the MM shrinker
> + *
> + * Register with the MM shrinker for swapping out BOs.
> + */
> +int ttm_tt_mgr_init(void)
> +{
> +	mm_shrinker.count_objects = ttm_tt_shrinker_count;
> +	mm_shrinker.scan_objects = ttm_tt_shrinker_scan;
> +	mm_shrinker.seeks = 1;
> +	return register_shrinker(&mm_shrinker);
> +}
> +
> +/**
> + * ttm_tt_mgr_fini - unregister our MM shrinker
> + *
> + * Unregisters the MM shrinker.
> + */
> +void ttm_tt_mgr_fini(void)
> +{
> +	unregister_shrinker(&mm_shrinker);
> +}
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> index 0008be02d31c..e141fa9f616d 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> @@ -1389,7 +1389,7 @@ static int vmw_pm_freeze(struct device *kdev)
>  	vmw_execbuf_release_pinned_bo(dev_priv);
>  	vmw_resource_evict_all(dev_priv);
>  	vmw_release_device_early(dev_priv);
> -	while (ttm_bo_swapout(&ctx) == 0);
> +	while (ttm_bo_swapout(&ctx, 0) > 0);

I think again GFP_KERNEL.

>  	if (dev_priv->enable_fb)
>  		vmw_fifo_resource_dec(dev_priv);
>  	if (atomic_read(&dev_priv->num_fifo_resources) != 0) {
> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
> index e17be324d95f..7c7539d76e1d 100644
> --- a/include/drm/ttm/ttm_bo_api.h
> +++ b/include/drm/ttm/ttm_bo_api.h
> @@ -569,7 +569,7 @@ ssize_t ttm_bo_io(struct ttm_bo_device *bdev, struct file *filp,
>  		  const char __user *wbuf, char __user *rbuf,
>  		  size_t count, loff_t *f_pos, bool write);
>  
> -int ttm_bo_swapout(struct ttm_operation_ctx *ctx);
> +int ttm_bo_swapout(struct ttm_operation_ctx *ctx, gfp_t gfp_flags);
>  
>  /**
>   * ttm_bo_uses_embedded_gem_object - check if the given bo uses the
> diff --git a/include/drm/ttm/ttm_tt.h b/include/drm/ttm/ttm_tt.h
> index 6c8eb9a4de81..4bc3a08644fc 100644
> --- a/include/drm/ttm/ttm_tt.h
> +++ b/include/drm/ttm/ttm_tt.h
> @@ -135,7 +135,8 @@ void ttm_tt_destroy_common(struct ttm_bo_device *bdev, struct ttm_tt *ttm);
>   * Swap in a previously swap out ttm_tt.
>   */
>  int ttm_tt_swapin(struct ttm_tt *ttm);
> -int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm);
> +int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm,
> +		   gfp_t gfp_flags);
>  
>  /**
>   * ttm_tt_populate - allocate pages for a ttm
> @@ -155,6 +156,9 @@ int ttm_tt_populate(struct ttm_bo_device *bdev, struct ttm_tt *ttm, struct ttm_o
>   */
>  void ttm_tt_unpopulate(struct ttm_bo_device *bdev, struct ttm_tt *ttm);
>  
> +int ttm_tt_mgr_init(void);
> +void ttm_tt_mgr_fini(void);
> +
>  #if IS_ENABLED(CONFIG_AGP)
>  #include <linux/agp_backend.h>

For testing I strongly recommend a debugfs file to trigger this shrinker
completely, from the right lockdep context (i.e. using
fs_reclaim_acquire/release()). Much easier to test that way. See
i915_drop_caches_set() in i915_debugfs.c.

That way you can fully test it all without hitting anything remotely
resembling actual OOM, which tends to kill all kinds of things.
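
Rough sketch of what I mean, with made-up names and completely
untested:

	#include <linux/debugfs.h>
	#include <linux/sched/mm.h>
	#include <drm/ttm/ttm_bo_api.h>

	/* Walk the swap LRU until it is empty, under the reclaim lockdep map. */
	static int ttm_shrink_all_set(void *data, u64 val)
	{
		struct ttm_operation_ctx ctx = { .no_wait_gpu = true };
		int ret;

		fs_reclaim_acquire(GFP_KERNEL);
		do {
			ret = ttm_bo_swapout(&ctx, GFP_NOFS);
		} while (ret > 0);
		fs_reclaim_release(GFP_KERNEL);

		return 0;
	}
	DEFINE_DEBUGFS_ATTRIBUTE(ttm_shrink_all_fops, NULL, ttm_shrink_all_set,
				 "%llu\n");

	/* and hooked up from wherever the driver creates its debugfs dir:
	 *	debugfs_create_file("ttm_shrink_all", 0200, root, NULL,
	 *			    &ttm_shrink_all_fops);
	 */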

Aside from the detail work I think this is going in the right direction.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH 2/2] drm/ttm: move memory accounting into vmwgfx
  2020-12-16 14:04 ` [PATCH 2/2] drm/ttm: move memory accounting into vmwgfx Christian König
@ 2020-12-16 15:26   ` Daniel Vetter
  2020-12-16 16:55     ` kernel test robot
  2020-12-16 18:19     ` kernel test robot
  2 siblings, 0 replies; 20+ messages in thread
From: Daniel Vetter @ 2020-12-16 15:26 UTC (permalink / raw)
  To: Christian König
  Cc: airlied, sroland, dri-devel, ray.huang, linux-graphics-maintainer

On Wed, Dec 16, 2020 at 03:04:27PM +0100, Christian König wrote:
> This is just another feature which is only used by VMWGFX, so move
> it into the driver instead.
> 
> I've tried to add the accounting sysfs file to the kobject of the drm
> minor, but I'm not 100% sure if this works as expected.
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>

tbh, delete it all?

From looking at callers in ttm, all this does is explicitly call your own
shrinker instead of having it invoked via kmalloc. If we need this (and
i915 history somewhat suggests it might be needed, but that was with the
global dev->struct_mutex lock) then we should have this in the main ttm
allocation paths. Or more properly, in the core mm paths.

But reinventing half of the shrinker infra in a subsystem, much less in a
single driver doesn't sound like a good idea.

That means the sysfs interface goes belly up for everyone, but the real
solution for gpu memory limiting is cgroups, not something in drm hidden
away. Since you're deleting that from all other drivers I think we're
already working under the assumption that no one is using that little bit
of uapi anyway. If not, then this won't work anyway.
-Daniel

> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 16 ++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c    |  8 +--
>  drivers/gpu/drm/drm_gem_vram_helper.c         |  6 +--
>  drivers/gpu/drm/nouveau/nouveau_bo.c          |  7 +--
>  drivers/gpu/drm/nouveau/nouveau_drv.h         |  1 -
>  drivers/gpu/drm/qxl/qxl_object.c              |  4 +-
>  drivers/gpu/drm/radeon/radeon_object.c        |  8 +--
>  drivers/gpu/drm/ttm/Makefile                  |  6 +--
>  drivers/gpu/drm/ttm/ttm_bo.c                  | 52 ++++++-------------
>  drivers/gpu/drm/ttm/ttm_bo_util.c             |  1 -
>  drivers/gpu/drm/ttm/ttm_pool.c                | 13 +----
>  drivers/gpu/drm/vmwgfx/Makefile               |  2 +-
>  drivers/gpu/drm/{ttm => vmwgfx}/ttm_memory.c  | 19 +++----
>  .../gpu/drm/vmwgfx}/ttm_memory.h              |  5 +-
>  drivers/gpu/drm/vmwgfx/ttm_object.h           |  3 +-
>  drivers/gpu/drm/vmwgfx/vmwgfx_bo.c            | 22 ++++++--
>  drivers/gpu/drm/vmwgfx/vmwgfx_drv.c           |  5 ++
>  drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c    | 28 +++++++++-
>  include/drm/ttm/ttm_bo_api.h                  | 13 +----
>  include/drm/ttm/ttm_bo_driver.h               |  1 -
>  include/drm/ttm/ttm_tt.h                      |  1 +
>  21 files changed, 107 insertions(+), 114 deletions(-)
>  rename drivers/gpu/drm/{ttm => vmwgfx}/ttm_memory.c (97%)
>  rename {include/drm/ttm => drivers/gpu/drm/vmwgfx}/ttm_memory.h (97%)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index a9647e7f98a8..5ed1c88b8748 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -118,6 +118,16 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
>   */
>  #define ESTIMATE_PT_SIZE(mem_size) ((mem_size) >> 14)
>  
> +static size_t amdgpu_amdkfd_acc_size(uint64_t size)
> +{
> +	size >>= PAGE_SHIFT;
> +	size *= sizeof(dma_addr_t) + sizeof(void *);
> +
> +	return __roundup_pow_of_two(sizeof(struct amdgpu_bo)) +
> +		__roundup_pow_of_two(sizeof(struct ttm_tt)) +
> +		PAGE_ALIGN(size);
> +}
> +
>  static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
>  		uint64_t size, u32 domain, bool sg)
>  {
> @@ -126,8 +136,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
>  	size_t acc_size, system_mem_needed, ttm_mem_needed, vram_needed;
>  	int ret = 0;
>  
> -	acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
> -				       sizeof(struct amdgpu_bo));
> +	acc_size = amdgpu_amdkfd_acc_size(size);
>  
>  	vram_needed = 0;
>  	if (domain == AMDGPU_GEM_DOMAIN_GTT) {
> @@ -174,8 +183,7 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
>  {
>  	size_t acc_size;
>  
> -	acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
> -				       sizeof(struct amdgpu_bo));
> +	acc_size = amdgpu_amdkfd_acc_size(size);
>  
>  	spin_lock(&kfd_mem_limit.mem_limit_lock);
>  	if (domain == AMDGPU_GEM_DOMAIN_GTT) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 6cc9919b12cc..599c9a132eb6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -523,7 +523,6 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>  	};
>  	struct amdgpu_bo *bo;
>  	unsigned long page_align, size = bp->size;
> -	size_t acc_size;
>  	int r;
>  
>  	/* Note that GDS/GWS/OA allocates 1 page per byte/resource. */
> @@ -546,9 +545,6 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>  
>  	*bo_ptr = NULL;
>  
> -	acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
> -				       sizeof(struct amdgpu_bo));
> -
>  	bo = kzalloc(sizeof(struct amdgpu_bo), GFP_KERNEL);
>  	if (bo == NULL)
>  		return -ENOMEM;
> @@ -577,8 +573,8 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>  		bo->tbo.priority = 1;
>  
>  	r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, bp->type,
> -				 &bo->placement, page_align, &ctx, acc_size,
> -				 NULL, bp->resv, &amdgpu_bo_destroy);
> +				 &bo->placement, page_align, &ctx,  NULL,
> +				 bp->resv, &amdgpu_bo_destroy);
>  	if (unlikely(r != 0))
>  		return r;
>  
> diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
> index 02ca22e90290..5cf7797048e1 100644
> --- a/drivers/gpu/drm/drm_gem_vram_helper.c
> +++ b/drivers/gpu/drm/drm_gem_vram_helper.c
> @@ -189,7 +189,6 @@ struct drm_gem_vram_object *drm_gem_vram_create(struct drm_device *dev,
>  	struct drm_vram_mm *vmm = dev->vram_mm;
>  	struct ttm_bo_device *bdev;
>  	int ret;
> -	size_t acc_size;
>  
>  	if (WARN_ONCE(!vmm, "VRAM MM not initialized"))
>  		return ERR_PTR(-EINVAL);
> @@ -216,7 +215,6 @@ struct drm_gem_vram_object *drm_gem_vram_create(struct drm_device *dev,
>  	}
>  
>  	bdev = &vmm->bdev;
> -	acc_size = ttm_bo_dma_acc_size(bdev, size, sizeof(*gbo));
>  
>  	gbo->bo.bdev = bdev;
>  	drm_gem_vram_placement(gbo, DRM_GEM_VRAM_PL_FLAG_SYSTEM);
> @@ -226,8 +224,8 @@ struct drm_gem_vram_object *drm_gem_vram_create(struct drm_device *dev,
>  	 * to release gbo->bo.base and kfree gbo.
>  	 */
>  	ret = ttm_bo_init(bdev, &gbo->bo, size, ttm_bo_type_device,
> -			  &gbo->placement, pg_align, false, acc_size,
> -			  NULL, NULL, ttm_buffer_object_destroy);
> +			  &gbo->placement, pg_align, false, NULL, NULL,
> +			  ttm_buffer_object_destroy);
>  	if (ret)
>  		return ERR_PTR(ret);
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index 02d86f8311a9..64cb1ed0f6ea 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -300,18 +300,15 @@ nouveau_bo_init(struct nouveau_bo *nvbo, u64 size, int align, u32 domain,
>  		struct sg_table *sg, struct dma_resv *robj)
>  {
>  	int type = sg ? ttm_bo_type_sg : ttm_bo_type_device;
> -	size_t acc_size;
>  	int ret;
>  
> -	acc_size = ttm_bo_dma_acc_size(nvbo->bo.bdev, size, sizeof(*nvbo));
> -
>  	nvbo->bo.mem.num_pages = size >> PAGE_SHIFT;
>  	nouveau_bo_placement_set(nvbo, domain, 0);
>  	INIT_LIST_HEAD(&nvbo->io_reserve_lru);
>  
>  	ret = ttm_bo_init(nvbo->bo.bdev, &nvbo->bo, size, type,
> -			  &nvbo->placement, align >> PAGE_SHIFT, false,
> -			  acc_size, sg, robj, nouveau_bo_del_ttm);
> +			  &nvbo->placement, align >> PAGE_SHIFT, false, sg,
> +			  robj, nouveau_bo_del_ttm);
>  	if (ret) {
>  		/* ttm will call nouveau_bo_del_ttm if it fails.. */
>  		return ret;
> diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
> index c802d3d1ba39..700fa0979d14 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_drv.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
> @@ -54,7 +54,6 @@
>  #include <drm/ttm/ttm_bo_api.h>
>  #include <drm/ttm/ttm_bo_driver.h>
>  #include <drm/ttm/ttm_placement.h>
> -#include <drm/ttm/ttm_memory.h>
>  
>  #include <drm/drm_audio_component.h>
>  
> diff --git a/drivers/gpu/drm/qxl/qxl_object.c b/drivers/gpu/drm/qxl/qxl_object.c
> index ceebc5881f68..705b51535492 100644
> --- a/drivers/gpu/drm/qxl/qxl_object.c
> +++ b/drivers/gpu/drm/qxl/qxl_object.c
> @@ -138,8 +138,8 @@ int qxl_bo_create(struct qxl_device *qdev,
>  	qxl_ttm_placement_from_domain(bo, domain);
>  
>  	r = ttm_bo_init_reserved(&qdev->mman.bdev, &bo->tbo, size, type,
> -				 &bo->placement, 0, &ctx, size,
> -				 NULL, NULL, &qxl_ttm_bo_destroy);
> +				 &bo->placement, 0, &ctx, NULL, NULL,
> +				 &qxl_ttm_bo_destroy);
>  	if (unlikely(r != 0)) {
>  		if (r != -ERESTARTSYS)
>  			dev_err(qdev->ddev.dev,
> diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
> index 9b81786782de..5cd44f2a77cd 100644
> --- a/drivers/gpu/drm/radeon/radeon_object.c
> +++ b/drivers/gpu/drm/radeon/radeon_object.c
> @@ -159,7 +159,6 @@ int radeon_bo_create(struct radeon_device *rdev,
>  	struct radeon_bo *bo;
>  	enum ttm_bo_type type;
>  	unsigned long page_align = roundup(byte_align, PAGE_SIZE) >> PAGE_SHIFT;
> -	size_t acc_size;
>  	int r;
>  
>  	size = ALIGN(size, PAGE_SIZE);
> @@ -173,9 +172,6 @@ int radeon_bo_create(struct radeon_device *rdev,
>  	}
>  	*bo_ptr = NULL;
>  
> -	acc_size = ttm_bo_dma_acc_size(&rdev->mman.bdev, size,
> -				       sizeof(struct radeon_bo));
> -
>  	bo = kzalloc(sizeof(struct radeon_bo), GFP_KERNEL);
>  	if (bo == NULL)
>  		return -ENOMEM;
> @@ -230,8 +226,8 @@ int radeon_bo_create(struct radeon_device *rdev,
>  	/* Kernel allocation are uninterruptible */
>  	down_read(&rdev->pm.mclk_lock);
>  	r = ttm_bo_init(&rdev->mman.bdev, &bo->tbo, size, type,
> -			&bo->placement, page_align, !kernel, acc_size,
> -			sg, resv, &radeon_ttm_bo_destroy);
> +			&bo->placement, page_align, !kernel, sg, resv,
> +			&radeon_ttm_bo_destroy);
>  	up_read(&rdev->pm.mclk_lock);
>  	if (unlikely(r != 0)) {
>  		return r;
> diff --git a/drivers/gpu/drm/ttm/Makefile b/drivers/gpu/drm/ttm/Makefile
> index b6f5f87b270f..7e8dc85caac3 100644
> --- a/drivers/gpu/drm/ttm/Makefile
> +++ b/drivers/gpu/drm/ttm/Makefile
> @@ -2,10 +2,8 @@
>  #
>  # Makefile for the drm device driver.  This driver provides support for the
>  
> -ttm-y := ttm_memory.o ttm_tt.o ttm_bo.o \
> -	ttm_bo_util.o ttm_bo_vm.o ttm_module.o \
> -	ttm_execbuf_util.o ttm_range_manager.o \
> -	ttm_resource.o ttm_pool.o
> +ttm-y := ttm_tt.o ttm_bo.o ttm_bo_util.o ttm_bo_vm.o ttm_module.o \
> +	ttm_execbuf_util.o ttm_range_manager.o ttm_resource.o ttm_pool.o
>  ttm-$(CONFIG_AGP) += ttm_agp_backend.o
>  
>  obj-$(CONFIG_DRM_TTM) += ttm.o
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index f1f3fd085465..cbb4d67c9288 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -474,7 +474,6 @@ static void ttm_bo_release(struct kref *kref)
>  	struct ttm_buffer_object *bo =
>  	    container_of(kref, struct ttm_buffer_object, kref);
>  	struct ttm_bo_device *bdev = bo->bdev;
> -	size_t acc_size = bo->acc_size;
>  	int ret;
>  
>  	if (!bo->deleted) {
> @@ -534,7 +533,6 @@ static void ttm_bo_release(struct kref *kref)
>  	if (!ttm_bo_uses_embedded_gem_object(bo))
>  		dma_resv_fini(&bo->base._resv);
>  	bo->destroy(bo);
> -	ttm_mem_global_free(&ttm_mem_glob, acc_size);
>  }
>  
>  void ttm_bo_put(struct ttm_buffer_object *bo)
> @@ -1095,25 +1093,13 @@ int ttm_bo_init_reserved(struct ttm_bo_device *bdev,
>  			 struct ttm_placement *placement,
>  			 uint32_t page_alignment,
>  			 struct ttm_operation_ctx *ctx,
> -			 size_t acc_size,
>  			 struct sg_table *sg,
>  			 struct dma_resv *resv,
>  			 void (*destroy) (struct ttm_buffer_object *))
>  {
> -	struct ttm_mem_global *mem_glob = &ttm_mem_glob;
>  	bool locked;
>  	int ret = 0;
>  
> -	ret = ttm_mem_global_alloc(mem_glob, acc_size, ctx);
> -	if (ret) {
> -		pr_err("Out of kernel memory\n");
> -		if (destroy)
> -			(*destroy)(bo);
> -		else
> -			kfree(bo);
> -		return -ENOMEM;
> -	}
> -
>  	bo->destroy = destroy ? destroy : ttm_bo_default_destroy;
>  
>  	kref_init(&bo->kref);
> @@ -1130,7 +1116,6 @@ int ttm_bo_init_reserved(struct ttm_bo_device *bdev,
>  	bo->mem.bus.addr = NULL;
>  	bo->moving = NULL;
>  	bo->mem.placement = 0;
> -	bo->acc_size = acc_size;
>  	bo->pin_count = 0;
>  	bo->sg = sg;
>  	if (resv) {
> @@ -1191,7 +1176,6 @@ int ttm_bo_init(struct ttm_bo_device *bdev,
>  		struct ttm_placement *placement,
>  		uint32_t page_alignment,
>  		bool interruptible,
> -		size_t acc_size,
>  		struct sg_table *sg,
>  		struct dma_resv *resv,
>  		void (*destroy) (struct ttm_buffer_object *))
> @@ -1200,8 +1184,7 @@ int ttm_bo_init(struct ttm_bo_device *bdev,
>  	int ret;
>  
>  	ret = ttm_bo_init_reserved(bdev, bo, size, type, placement,
> -				   page_alignment, &ctx, acc_size,
> -				   sg, resv, destroy);
> +				   page_alignment, &ctx, sg, resv, destroy);
>  	if (ret)
>  		return ret;
>  
> @@ -1212,20 +1195,6 @@ int ttm_bo_init(struct ttm_bo_device *bdev,
>  }
>  EXPORT_SYMBOL(ttm_bo_init);
>  
> -size_t ttm_bo_dma_acc_size(struct ttm_bo_device *bdev,
> -			   unsigned long bo_size,
> -			   unsigned struct_size)
> -{
> -	unsigned npages = (PAGE_ALIGN(bo_size)) >> PAGE_SHIFT;
> -	size_t size = 0;
> -
> -	size += ttm_round_pot(struct_size);
> -	size += ttm_round_pot(npages * (2*sizeof(void *) + sizeof(dma_addr_t)));
> -	size += ttm_round_pot(sizeof(struct ttm_tt));
> -	return size;
> -}
> -EXPORT_SYMBOL(ttm_bo_dma_acc_size);
> -
>  static void ttm_bo_global_kobj_release(struct kobject *kobj)
>  {
>  	struct ttm_bo_global *glob =
> @@ -1242,9 +1211,11 @@ static void ttm_bo_global_release(void)
>  	if (--ttm_bo_glob_use_count > 0)
>  		goto out;
>  
> +	ttm_tt_mgr_fini();
> +	ttm_pool_mgr_fini();
> +
>  	kobject_del(&glob->kobj);
>  	kobject_put(&glob->kobj);
> -	ttm_mem_global_release(&ttm_mem_glob);
>  	memset(glob, 0, sizeof(*glob));
>  out:
>  	mutex_unlock(&ttm_global_mutex);
> @@ -1253,6 +1224,8 @@ static void ttm_bo_global_release(void)
>  static int ttm_bo_global_init(void)
>  {
>  	struct ttm_bo_global *glob = &ttm_bo_glob;
> +	uint64_t num_pages;
> +	struct sysinfo si;
>  	int ret = 0;
>  	unsigned i;
>  
> @@ -1260,9 +1233,16 @@ static int ttm_bo_global_init(void)
>  	if (++ttm_bo_glob_use_count > 1)
>  		goto out;
>  
> -	ret = ttm_mem_global_init(&ttm_mem_glob);
> -	if (ret)
> -		goto out;
> +	si_meminfo(&si);
> +
> +	/* Limit the number of pages in the pool to about 50% of the total
> +	 * system memory.
> +	 */
> +	num_pages = (u64)si.totalram * si.mem_unit;
> +	num_pages = (num_pages * 50 / 100) >> PAGE_SHIFT;
> +
> +	ttm_pool_mgr_init(num_pages);
> +	ttm_tt_mgr_init();
>  
>  	spin_lock_init(&glob->lru_lock);
>  	glob->dummy_read_page = alloc_page(__GFP_ZERO | GFP_DMA32);
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
> index 398d5013fc39..03e9f6f6a611 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> @@ -309,7 +309,6 @@ static int ttm_buffer_object_transfer(struct ttm_buffer_object *bo,
>  
>  	kref_init(&fbo->base.kref);
>  	fbo->base.destroy = &ttm_transfered_destroy;
> -	fbo->base.acc_size = 0;
>  	fbo->base.pin_count = 0;
>  	if (bo->type != ttm_bo_type_sg)
>  		fbo->base.base.resv = &fbo->base.base._resv;
> diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
> index 5455b2044759..24ae7c23cfd0 100644
> --- a/drivers/gpu/drm/ttm/ttm_pool.c
> +++ b/drivers/gpu/drm/ttm/ttm_pool.c
> @@ -416,16 +416,10 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>  			caching = pages + (1 << order);
>  		}
>  
> -		r = ttm_mem_global_alloc_page(&ttm_mem_glob, p,
> -					      (1 << order) * PAGE_SIZE,
> -					      ctx);
> -		if (r)
> -			goto error_free_page;
> -
>  		if (dma_addr) {
>  			r = ttm_pool_map(pool, order, p, &dma_addr);
>  			if (r)
> -				goto error_global_free;
> +				goto error_free_page;
>  		}
>  
>  		num_pages -= 1 << order;
> @@ -439,9 +433,6 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>  
>  	return 0;
>  
> -error_global_free:
> -	ttm_mem_global_free_page(&ttm_mem_glob, p, (1 << order) * PAGE_SIZE);
> -
>  error_free_page:
>  	ttm_pool_free_page(pool, tt->caching, order, p);
>  
> @@ -476,8 +467,6 @@ void ttm_pool_free(struct ttm_pool *pool, struct ttm_tt *tt)
>  
>  		order = ttm_pool_page_order(pool, p);
>  		num_pages = 1ULL << order;
> -		ttm_mem_global_free_page(&ttm_mem_glob, p,
> -					 num_pages * PAGE_SIZE);
>  		if (tt->dma_address)
>  			ttm_pool_unmap(pool, tt->dma_address[i], num_pages);
>  
> diff --git a/drivers/gpu/drm/vmwgfx/Makefile b/drivers/gpu/drm/vmwgfx/Makefile
> index 31f85f09f1fc..86af5d7b0242 100644
> --- a/drivers/gpu/drm/vmwgfx/Makefile
> +++ b/drivers/gpu/drm/vmwgfx/Makefile
> @@ -9,7 +9,7 @@ vmwgfx-y := vmwgfx_execbuf.o vmwgfx_gmr.o vmwgfx_kms.o vmwgfx_drv.o \
>  	    vmwgfx_cotable.o vmwgfx_so.o vmwgfx_binding.o vmwgfx_msg.o \
>  	    vmwgfx_simple_resource.o vmwgfx_va.o vmwgfx_blit.o \
>  	    vmwgfx_validation.o vmwgfx_page_dirty.o vmwgfx_streamoutput.o \
> -	    ttm_object.o ttm_lock.o
> +	    ttm_object.o ttm_lock.o ttm_memory.o
>  
>  vmwgfx-$(CONFIG_TRANSPARENT_HUGEPAGE) += vmwgfx_thp.o
>  obj-$(CONFIG_DRM_VMWGFX) := vmwgfx.o
> diff --git a/drivers/gpu/drm/ttm/ttm_memory.c b/drivers/gpu/drm/vmwgfx/ttm_memory.c
> similarity index 97%
> rename from drivers/gpu/drm/ttm/ttm_memory.c
> rename to drivers/gpu/drm/vmwgfx/ttm_memory.c
> index 3d2479d0ce2c..bfec1e845345 100644
> --- a/drivers/gpu/drm/ttm/ttm_memory.c
> +++ b/drivers/gpu/drm/vmwgfx/ttm_memory.c
> @@ -28,7 +28,6 @@
>  
>  #define pr_fmt(fmt) "[TTM] " fmt
>  
> -#include <drm/ttm/ttm_memory.h>
>  #include <linux/spinlock.h>
>  #include <linux/sched.h>
>  #include <linux/wait.h>
> @@ -36,10 +35,11 @@
>  #include <linux/module.h>
>  #include <linux/slab.h>
>  #include <linux/swap.h>
> -#include <drm/ttm/ttm_pool.h>
> -#include <drm/ttm/ttm_tt.h>
>  
> -#include "ttm_module.h"
> +#include <drm/drm_device.h>
> +#include <drm/drm_file.h>
> +
> +#include "ttm_memory.h"
>  
>  #define TTM_MEMORY_ALLOC_RETRIES 4
>  
> @@ -414,7 +414,7 @@ static int ttm_mem_init_dma32_zone(struct ttm_mem_global *glob,
>  }
>  #endif
>  
> -int ttm_mem_global_init(struct ttm_mem_global *glob)
> +int ttm_mem_global_init(struct ttm_mem_global *glob, struct drm_device *dev)
>  {
>  	struct sysinfo si;
>  	int ret;
> @@ -425,7 +425,8 @@ int ttm_mem_global_init(struct ttm_mem_global *glob)
>  	glob->swap_queue = create_singlethread_workqueue("ttm_swap");
>  	INIT_WORK(&glob->work, ttm_shrink_work);
>  	ret = kobject_init_and_add(
> -		&glob->kobj, &ttm_mem_glob_kobj_type, ttm_get_kobj(), "memory_accounting");
> +		&glob->kobj, &ttm_mem_glob_kobj_type, &dev->primary->kdev->kobj,
> +		"memory_accounting");
>  	if (unlikely(ret != 0)) {
>  		kobject_put(&glob->kobj);
>  		return ret;
> @@ -453,8 +454,6 @@ int ttm_mem_global_init(struct ttm_mem_global *glob)
>  		pr_info("Zone %7s: Available graphics memory: %llu KiB\n",
>  			zone->name, (unsigned long long)zone->max_mem >> 10);
>  	}
> -	ttm_pool_mgr_init(glob->zone_kernel->max_mem/(2*PAGE_SIZE));
> -	ttm_tt_mgr_init();
>  	return 0;
>  out_no_zone:
>  	ttm_mem_global_release(glob);
> @@ -466,10 +465,6 @@ void ttm_mem_global_release(struct ttm_mem_global *glob)
>  	struct ttm_mem_zone *zone;
>  	unsigned int i;
>  
> -	/* let the page allocator first stop the shrink work. */
> -	ttm_pool_mgr_fini();
> -	ttm_tt_mgr_fini();
> -
>  	flush_workqueue(glob->swap_queue);
>  	destroy_workqueue(glob->swap_queue);
>  	glob->swap_queue = NULL;
> diff --git a/include/drm/ttm/ttm_memory.h b/drivers/gpu/drm/vmwgfx/ttm_memory.h
> similarity index 97%
> rename from include/drm/ttm/ttm_memory.h
> rename to drivers/gpu/drm/vmwgfx/ttm_memory.h
> index c1f167881e33..850ee6c867da 100644
> --- a/include/drm/ttm/ttm_memory.h
> +++ b/drivers/gpu/drm/vmwgfx/ttm_memory.h
> @@ -35,7 +35,8 @@
>  #include <linux/errno.h>
>  #include <linux/kobject.h>
>  #include <linux/mm.h>
> -#include "ttm_bo_api.h"
> +
> +#include <drm/ttm/ttm_bo_api.h>
>  
>  /**
>   * struct ttm_mem_global - Global memory accounting structure.
> @@ -79,7 +80,7 @@ extern struct ttm_mem_global {
>  #endif
>  } ttm_mem_glob;
>  
> -int ttm_mem_global_init(struct ttm_mem_global *glob);
> +int ttm_mem_global_init(struct ttm_mem_global *glob, struct drm_device *dev);
>  void ttm_mem_global_release(struct ttm_mem_global *glob);
>  int ttm_mem_global_alloc(struct ttm_mem_global *glob, uint64_t memory,
>  			 struct ttm_operation_ctx *ctx);
> diff --git a/drivers/gpu/drm/vmwgfx/ttm_object.h b/drivers/gpu/drm/vmwgfx/ttm_object.h
> index ede26df87c93..49b064f0cb19 100644
> --- a/drivers/gpu/drm/vmwgfx/ttm_object.h
> +++ b/drivers/gpu/drm/vmwgfx/ttm_object.h
> @@ -43,7 +43,8 @@
>  #include <linux/rcupdate.h>
>  
>  #include <drm/drm_hashtab.h>
> -#include <drm/ttm/ttm_memory.h>
> +
> +#include "ttm_memory.h"
>  
>  /**
>   * enum ttm_ref_type
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> index 63dbc44eebe0..f7abe2992178 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> @@ -508,11 +508,16 @@ int vmw_bo_create_kernel(struct vmw_private *dev_priv, unsigned long size,
>  	acc_size = ttm_round_pot(sizeof(*bo));
>  	acc_size += ttm_round_pot(npages * sizeof(void *));
>  	acc_size += ttm_round_pot(sizeof(struct ttm_tt));
> +
> +	ret = ttm_mem_global_alloc(&ttm_mem_glob, acc_size, &ctx);
> +	if (unlikely(ret))
> +		goto error_free;
> +
>  	ret = ttm_bo_init_reserved(&dev_priv->bdev, bo, size,
>  				   ttm_bo_type_device, placement, 0,
> -				   &ctx, acc_size, NULL, NULL, NULL);
> +				   &ctx, NULL, NULL, NULL);
>  	if (unlikely(ret))
> -		goto error_free;
> +		goto error_account;
>  
>  	ttm_bo_pin(bo);
>  	ttm_bo_unreserve(bo);
> @@ -520,6 +525,9 @@ int vmw_bo_create_kernel(struct vmw_private *dev_priv, unsigned long size,
>  
>  	return 0;
>  
> +error_account:
> +	ttm_mem_global_free(&ttm_mem_glob, acc_size);
> +
>  error_free:
>  	kfree(bo);
>  	return ret;
> @@ -559,11 +567,17 @@ int vmw_bo_init(struct vmw_private *dev_priv,
>  	vmw_bo->base.priority = 3;
>  	vmw_bo->res_tree = RB_ROOT;
>  
> +	ret = ttm_mem_global_alloc(&ttm_mem_glob, acc_size, &ctx);
> +	if (unlikely(ret))
> +		return ret;
> +
>  	ret = ttm_bo_init_reserved(bdev, &vmw_bo->base, size,
>  				   ttm_bo_type_device, placement,
> -				   0, &ctx, acc_size, NULL, NULL, bo_free);
> -	if (unlikely(ret))
> +				   0, &ctx, NULL, NULL, bo_free);
> +	if (unlikely(ret)) {
> +		ttm_mem_global_free(&ttm_mem_glob, acc_size);
>  		return ret;
> +	}
>  
>  	if (pin)
>  		ttm_bo_pin(&vmw_bo->base);
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> index e141fa9f616d..ca66eaa2e376 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> @@ -1272,6 +1272,7 @@ static void vmw_remove(struct pci_dev *pdev)
>  {
>  	struct drm_device *dev = pci_get_drvdata(pdev);
>  
> +	ttm_mem_global_release(&ttm_mem_glob);
>  	drm_dev_unregister(dev);
>  	vmw_driver_unload(dev);
>  	drm_dev_put(dev);
> @@ -1519,6 +1520,10 @@ static int vmw_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>  		goto err_pci_disable_device;
>  	}
>  
> +	ret = ttm_mem_global_init(&ttm_mem_glob, dev);
> +	if (ret)
> +		return ret;
> +
>  	dev->pdev = pdev;
>  	pci_set_drvdata(pdev, dev);
>  
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> index 1c75f73538c0..efb567da749d 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> @@ -575,11 +575,31 @@ static void vmw_ttm_destroy(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
>  static int vmw_ttm_populate(struct ttm_bo_device *bdev,
>  			    struct ttm_tt *ttm, struct ttm_operation_ctx *ctx)
>  {
> +	unsigned int i;
> +	int ret;
> +
>  	/* TODO: maybe completely drop this ? */
>  	if (ttm_tt_is_populated(ttm))
>  		return 0;
>  
> -	return ttm_pool_alloc(&bdev->pool, ttm, ctx);
> +	ret = ttm_pool_alloc(&bdev->pool, ttm, ctx);
> +	if (ret)
> +		return ret;
> +
> +	for (i = 0; i < ttm->num_pages; ++i) {
> +		ret = ttm_mem_global_alloc_page(&ttm_mem_glob, ttm->pages[i],
> +						PAGE_SIZE, ctx);
> +		if (ret)
> +			goto error;
> +	}
> +	return 0;
> +
> +error:
> +	while (i--)
> +		ttm_mem_global_free_page(&ttm_mem_glob, ttm->pages[i],
> +					 PAGE_SIZE);
> +	ttm_pool_free(&bdev->pool, ttm);
> +	return ret;
>  }
>  
>  static void vmw_ttm_unpopulate(struct ttm_bo_device *bdev,
> @@ -587,6 +607,7 @@ static void vmw_ttm_unpopulate(struct ttm_bo_device *bdev,
>  {
>  	struct vmw_ttm_tt *vmw_tt = container_of(ttm, struct vmw_ttm_tt,
>  						 dma_ttm);
> +	unsigned int i;
>  
>  	if (vmw_tt->mob) {
>  		vmw_mob_destroy(vmw_tt->mob);
> @@ -594,6 +615,11 @@ static void vmw_ttm_unpopulate(struct ttm_bo_device *bdev,
>  	}
>  
>  	vmw_ttm_unmap_dma(vmw_tt);
> +
> +	for (i = 0; i < ttm->num_pages; ++i)
> +		ttm_mem_global_free_page(&ttm_mem_glob, ttm->pages[i],
> +					 PAGE_SIZE);
> +
>  	ttm_pool_free(&bdev->pool, ttm);
>  }
>  
> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
> index 7c7539d76e1d..30254d869db6 100644
> --- a/include/drm/ttm/ttm_bo_api.h
> +++ b/include/drm/ttm/ttm_bo_api.h
> @@ -88,7 +88,6 @@ struct ttm_tt;
>   * @type: The bo type.
>   * @destroy: Destruction function. If NULL, kfree is used.
>   * @num_pages: Actual number of pages.
> - * @acc_size: Accounted size for this object.
>   * @kref: Reference count of this buffer object. When this refcount reaches
>   * zero, the object is destroyed or put on the delayed delete list.
>   * @mem: structure describing current placement.
> @@ -125,7 +124,6 @@ struct ttm_buffer_object {
>  	struct ttm_bo_device *bdev;
>  	enum ttm_bo_type type;
>  	void (*destroy) (struct ttm_buffer_object *);
> -	size_t acc_size;
>  
>  	/**
>  	* Members not needing protection.
> @@ -357,10 +355,6 @@ void ttm_bo_unlock_delayed_workqueue(struct ttm_bo_device *bdev, int resched);
>  bool ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
>  			      const struct ttm_place *place);
>  
> -size_t ttm_bo_dma_acc_size(struct ttm_bo_device *bdev,
> -			   unsigned long bo_size,
> -			   unsigned struct_size);
> -
>  /**
>   * ttm_bo_init_reserved
>   *
> @@ -371,7 +365,6 @@ size_t ttm_bo_dma_acc_size(struct ttm_bo_device *bdev,
>   * @flags: Initial placement flags.
>   * @page_alignment: Data alignment in pages.
>   * @ctx: TTM operation context for memory allocation.
> - * @acc_size: Accounted size for this object.
>   * @resv: Pointer to a dma_resv, or NULL to let ttm allocate one.
>   * @destroy: Destroy function. Use NULL for kfree().
>   *
> @@ -402,8 +395,7 @@ int ttm_bo_init_reserved(struct ttm_bo_device *bdev,
>  			 struct ttm_placement *placement,
>  			 uint32_t page_alignment,
>  			 struct ttm_operation_ctx *ctx,
> -			 size_t acc_size, struct sg_table *sg,
> -			 struct dma_resv *resv,
> +			 struct sg_table *sg, struct dma_resv *resv,
>  			 void (*destroy) (struct ttm_buffer_object *));
>  
>  /**
> @@ -421,7 +413,6 @@ int ttm_bo_init_reserved(struct ttm_bo_device *bdev,
>   * holds a pointer to a persistent shmem object. Typically, this would
>   * point to the shmem object backing a GEM object if TTM is used to back a
>   * GEM user interface.
> - * @acc_size: Accounted size for this object.
>   * @resv: Pointer to a dma_resv, or NULL to let ttm allocate one.
>   * @destroy: Destroy function. Use NULL for kfree().
>   *
> @@ -446,7 +437,7 @@ int ttm_bo_init_reserved(struct ttm_bo_device *bdev,
>  int ttm_bo_init(struct ttm_bo_device *bdev, struct ttm_buffer_object *bo,
>  		size_t size, enum ttm_bo_type type,
>  		struct ttm_placement *placement,
> -		uint32_t page_alignment, bool interrubtible, size_t acc_size,
> +		uint32_t page_alignment, bool interruptible,
>  		struct sg_table *sg, struct dma_resv *resv,
>  		void (*destroy) (struct ttm_buffer_object *));
>  
> diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
> index 423348414c59..547857697776 100644
> --- a/include/drm/ttm/ttm_bo_driver.h
> +++ b/include/drm/ttm/ttm_bo_driver.h
> @@ -38,7 +38,6 @@
>  #include <linux/dma-resv.h>
>  
>  #include "ttm_bo_api.h"
> -#include "ttm_memory.h"
>  #include "ttm_placement.h"
>  #include "ttm_tt.h"
>  #include "ttm_pool.h"
> diff --git a/include/drm/ttm/ttm_tt.h b/include/drm/ttm/ttm_tt.h
> index 4bc3a08644fc..499338680660 100644
> --- a/include/drm/ttm/ttm_tt.h
> +++ b/include/drm/ttm/ttm_tt.h
> @@ -30,6 +30,7 @@
>  #include <linux/types.h>
>  #include <drm/ttm/ttm_caching.h>
>  
> +struct ttm_bo_device;
>  struct ttm_tt;
>  struct ttm_resource;
>  struct ttm_buffer_object;
> -- 
> 2.25.1
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/2] drm/ttm: move memory accounting into vmwgfx
  2020-12-16 14:04 ` [PATCH 2/2] drm/ttm: move memory accounting into vmwgfx Christian König
@ 2020-12-16 16:55     ` kernel test robot
  2020-12-16 16:55     ` kernel test robot
  2020-12-16 18:19     ` kernel test robot
  2 siblings, 0 replies; 20+ messages in thread
From: kernel test robot @ 2020-12-16 16:55 UTC (permalink / raw)
  To: Christian König, dri-devel, linux-graphics-maintainer
  Cc: airlied, sroland, ray.huang, kbuild-all

[-- Attachment #1: Type: text/plain, Size: 1744 bytes --]

Hi "Christian,

I love your patch! Yet something to improve:

[auto build test ERROR on drm-tip/drm-tip]
[cannot apply to drm-intel/for-linux-next drm-exynos/exynos-drm-next tegra-drm/drm/tegra/for-next linus/master drm/drm-next v5.10 next-20201215]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Christian-K-nig/drm-ttm-rework-ttm_tt-page-limit-v2/20201216-221614
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
config: riscv-rv32_defconfig (attached as .config)
compiler: riscv32-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/b613e371433208f88816be875b9d46b6d24cf830
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Christian-K-nig/drm-ttm-rework-ttm_tt-page-limit-v2/20201216-221614
        git checkout b613e371433208f88816be875b9d46b6d24cf830
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=riscv 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   riscv32-linux-ld: drivers/gpu/drm/ttm/ttm_bo.o: in function `.L114':
>> ttm_bo.c:(.text+0x62c): undefined reference to `__udivdi3'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 20335 bytes --]

[-- Attachment #3: Type: text/plain, Size: 160 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/2] drm/ttm: move memory accounting into vmwgfx
  2020-12-16 14:04 ` [PATCH 2/2] drm/ttm: move memory accounting into vmwgfx Christian König
@ 2020-12-16 18:19     ` kernel test robot
  2020-12-16 16:55     ` kernel test robot
  2020-12-16 18:19     ` kernel test robot
  2 siblings, 0 replies; 20+ messages in thread
From: kernel test robot @ 2020-12-16 18:19 UTC (permalink / raw)
  To: Christian König, dri-devel, linux-graphics-maintainer
  Cc: airlied, sroland, ray.huang, kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2963 bytes --]

Hi "Christian,

I love your patch! Yet something to improve:

[auto build test ERROR on drm-tip/drm-tip]
[cannot apply to drm-intel/for-linux-next drm-exynos/exynos-drm-next tegra-drm/drm/tegra/for-next linus/master drm/drm-next v5.10 next-20201215]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Christian-K-nig/drm-ttm-rework-ttm_tt-page-limit-v2/20201216-221614
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
config: i386-randconfig-c001-20201216 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/b613e371433208f88816be875b9d46b6d24cf830
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Christian-K-nig/drm-ttm-rework-ttm_tt-page-limit-v2/20201216-221614
        git checkout b613e371433208f88816be875b9d46b6d24cf830
        # save the attached .config to linux build tree
        make W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   ld: drivers/gpu/drm/ttm/ttm_bo.o: in function `ttm_bo_global_init':
>> drivers/gpu/drm/ttm/ttm_bo.c:1242: undefined reference to `__udivdi3'


vim +1242 drivers/gpu/drm/ttm/ttm_bo.c

  1223	
  1224	static int ttm_bo_global_init(void)
  1225	{
  1226		struct ttm_bo_global *glob = &ttm_bo_glob;
  1227		uint64_t num_pages;
  1228		struct sysinfo si;
  1229		int ret = 0;
  1230		unsigned i;
  1231	
  1232		mutex_lock(&ttm_global_mutex);
  1233		if (++ttm_bo_glob_use_count > 1)
  1234			goto out;
  1235	
  1236		si_meminfo(&si);
  1237	
  1238		/* Limit the number of pages in the pool to about 50% of the total
  1239		 * system memory.
  1240		 */
  1241		num_pages = (u64)si.totalram * si.mem_unit;
> 1242		num_pages = (num_pages * 50 / 100) >> PAGE_SHIFT;
  1243	
  1244		ttm_pool_mgr_init(num_pages);
  1245		ttm_tt_mgr_init();
  1246	
  1247		spin_lock_init(&glob->lru_lock);
  1248		glob->dummy_read_page = alloc_page(__GFP_ZERO | GFP_DMA32);
  1249	
  1250		if (unlikely(glob->dummy_read_page == NULL)) {
  1251			ret = -ENOMEM;
  1252			goto out;
  1253		}
  1254	
  1255		for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i)
  1256			INIT_LIST_HEAD(&glob->swap_lru[i]);
  1257		INIT_LIST_HEAD(&glob->device_list);
  1258		atomic_set(&glob->bo_count, 0);
  1259	
  1260		ret = kobject_init_and_add(
  1261			&glob->kobj, &ttm_bo_glob_kobj_type, ttm_get_kobj(), "buffer_objects");
  1262		if (unlikely(ret != 0))
  1263			kobject_put(&glob->kobj);
  1264	out:
  1265		mutex_unlock(&ttm_global_mutex);
  1266		return ret;
  1267	}
  1268	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 35139 bytes --]

[-- Attachment #3: Type: text/plain, Size: 160 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-16 15:09 ` [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2 Daniel Vetter
@ 2020-12-17 13:46   ` Christian König
  2020-12-17 14:36     ` Daniel Vetter
  0 siblings, 1 reply; 20+ messages in thread
From: Christian König @ 2020-12-17 13:46 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: airlied, sroland, dri-devel, ray.huang, linux-graphics-maintainer

Am 16.12.20 um 16:09 schrieb Daniel Vetter:
> On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
>> [SNIP]
>> @@ -276,9 +277,9 @@ static void ttm_shrink(struct ttm_mem_global *glob, bool from_wq,
>>   
>>   	while (ttm_zones_above_swap_target(glob, from_wq, extra)) {
>>   		spin_unlock(&glob->lock);
>> -		ret = ttm_bo_swapout(ctx);
>> +		ret = ttm_bo_swapout(ctx, 0);
> Generally we don't treat gfp_mask as a set of additional flags, but the full
> thing. So here we should have GFP_KERNEL.
>
> Also having both the shrinker and the ttm_shrink_work is a bit much, the
> shrink work should get deleted completely I think.

That's why I'm moving the shrinker work into VMWGFX with patch #2 :)

>>   		spin_lock(&glob->lock);
>> -		if (unlikely(ret != 0))
>> +		if (unlikely(ret < 0))
>>   			break;
>>   	}
>>   
>> @@ -453,6 +454,7 @@ int ttm_mem_global_init(struct ttm_mem_global *glob)
>>   			zone->name, (unsigned long long)zone->max_mem >> 10);
>>   	}
>>   	ttm_pool_mgr_init(glob->zone_kernel->max_mem/(2*PAGE_SIZE));
>> +	ttm_tt_mgr_init();
>>   	return 0;
>>   out_no_zone:
>>   	ttm_mem_global_release(glob);
>> @@ -466,6 +468,7 @@ void ttm_mem_global_release(struct ttm_mem_global *glob)
>>   
>>   	/* let the page allocator first stop the shrink work. */
>>   	ttm_pool_mgr_fini();
>> +	ttm_tt_mgr_fini();
>>   
>>   	flush_workqueue(glob->swap_queue);
>>   	destroy_workqueue(glob->swap_queue);
>> diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
>> index 7f75a13163f0..d454c428c56a 100644
>> --- a/drivers/gpu/drm/ttm/ttm_tt.c
>> +++ b/drivers/gpu/drm/ttm/ttm_tt.c
>> @@ -38,6 +38,8 @@
>>   #include <drm/drm_cache.h>
>>   #include <drm/ttm/ttm_bo_driver.h>
>>   
>> +static struct shrinker mm_shrinker;
>> +
>>   /*
>>    * Allocates a ttm structure for the given BO.
>>    */
>> @@ -223,13 +225,23 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
>>   	return ret;
>>   }
>>   
>> -int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
>> +/**
>> + * ttm_tt_swapout - swap out tt object
>> + *
>> + * @bdev: TTM device structure.
>> + * @ttm: The struct ttm_tt.
>> + * @gfp_flags: Flags to use for memory allocation.
>> + *
>> + * Swapout a TT object to a shmem_file, return number of pages swapped out or
>> + * negative error code.
>> + */
>> +int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm,
>> +		   gfp_t gfp_flags)
>>   {
>>   	struct address_space *swap_space;
>>   	struct file *swap_storage;
>>   	struct page *from_page;
>>   	struct page *to_page;
>> -	gfp_t gfp_mask;
>>   	int i, ret;
>>   
>>   	swap_storage = shmem_file_setup("ttm swap",
>> @@ -241,14 +253,14 @@ int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
>>   	}
>>   
>>   	swap_space = swap_storage->f_mapping;
>> -	gfp_mask = mapping_gfp_mask(swap_space);
>> +	gfp_flags |= mapping_gfp_mask(swap_space);
> I don't think this combines flags correctly. mapping_gfp_mask is most
> likely GFP_KERNEL or something like that, so __GFP_FS (the thing you want
> to check for to avoid recursion) is always set.

Ah, ok. Thanks for the information, because of GFP_NOFS I was assuming 
that the mask works just the other way around.
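
So presumably it should just be an "&" instead, i.e. something like this
(untested sketch, names as in the patch):

	swap_space = swap_storage->f_mapping;
	gfp_flags &= mapping_gfp_mask(swap_space);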

> I think we need an & here, and maybe screaming if the gfp flags for the
> swapout space are funny.
>
>>   
>>   	for (i = 0; i < ttm->num_pages; ++i) {
>>   		from_page = ttm->pages[i];
>>   		if (unlikely(from_page == NULL))
>>   			continue;
>>   
>> -		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
>> +		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_flags);
>>   		if (IS_ERR(to_page)) {
>>   			ret = PTR_ERR(to_page);
>>   			goto out_err;
>> @@ -263,7 +275,7 @@ int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
>>   	ttm->swap_storage = swap_storage;
>>   	ttm->page_flags |= TTM_PAGE_FLAG_SWAPPED;
>>   
>> -	return 0;
>> +	return ttm->num_pages;
>>   
>>   out_err:
>>   	fput(swap_storage);
>> @@ -341,3 +353,63 @@ void ttm_tt_unpopulate(struct ttm_bo_device *bdev,
>>   		ttm_pool_free(&bdev->pool, ttm);
>>   	ttm->page_flags &= ~TTM_PAGE_FLAG_PRIV_POPULATED;
>>   }
>> +
>> +/* As long as pages are available make sure to release at least one */
>> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
>> +					  struct shrink_control *sc)
>> +{
>> +	struct ttm_operation_ctx ctx = {
>> +		.no_wait_gpu = true
> Iirc there's an eventual shrinker limit where it gets desperate. I think
> once we hit that, we should allow gpu waits. But it's not passed to
> shrinkers for reasons, so maybe we should have a second round that tries
> to more actively shrink objects if we fell substantially short of what
> reclaim expected us to do?

I think we should try to avoid waiting for the GPU in the shrinker callback.

When we get HMM we will have cases where the shrinker is called from 
there and we can't wait for the GPU then without causing deadlocks.


> Also don't we have a trylock_only flag here to make sure drivers don't do
> something stupid?

Mhm, I'm pretty sure drivers should only be minimally involved.

>> +	};
>> +	int ret;
>> +
>> +	if (sc->gfp_mask & GFP_NOFS)
>> +		return 0;
>> +
>> +	ret = ttm_bo_swapout(&ctx, GFP_NOFS);
>> +	return ret < 0 ? SHRINK_EMPTY : ret;
>> +}
>> +
>> +/* Return the number of pages available or SHRINK_EMPTY if we have none */
>> +static unsigned long ttm_tt_shrinker_count(struct shrinker *shrink,
>> +					   struct shrink_control *sc)
>> +{
>> +	struct ttm_buffer_object *bo;
>> +	unsigned long num_pages = 0;
>> +	unsigned int i;
>> +
>> +	if (sc->gfp_mask & GFP_NOFS)
>> +		return 0;
> The count function should always count, and I'm not seeing a reason why
> you couldn't do that here ... Also my understanding is that GFP_NOFS never
> goes into shrinkers (the NOFS comes from shrinkers originally only being
> used for filesystem objects), so this is double redundant.
>
> My understanding is that gfp_mask is just to convey the right zones and
> stuff, so that your shrinker can try to shrink objects in the right zones.
> Hence I think the check in the _scan() function should also be removed.
>
> Also the non __ prefixed flags are the combinations callers are supposed
> to look at. Memory reclaim code needs to look at the __GFP flags, see e.g.
> gfpflags_allow_blocking() or fs_reclaim_acquire().

Ok got it. But don't we need to somehow avoid recursion here?
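
In that case the count callback would just unconditionally report how many
pages we could swap out, roughly like this (the counter is made up here,
just to illustrate):

	static unsigned long ttm_tt_shrinker_count(struct shrinker *shrink,
						   struct shrink_control *sc)
	{
		/* report what is swapable, no gfp_mask filtering */
		unsigned long num_pages = atomic_long_read(&swapable_pages);

		return num_pages ? num_pages : SHRINK_EMPTY;
	}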

> [SNIP]
>> +int ttm_tt_mgr_init(void);
>> +void ttm_tt_mgr_fini(void);
>> +
>>   #if IS_ENABLED(CONFIG_AGP)
>>   #include <linux/agp_backend.h>
> For testing I strongly recommend a debugfs file to trigger this shrinker
> completely, from the right lockdep context (i.e. using
> fs_reclaim_acquire/release()). Much easier to test that way. See
> i915_drop_caches_set() in i915_debugfs.c.
>
> That way you can fully test it all without hitting anything remotely
> resembling actual OOM, which tends to kill all kinds of things.

That's exactly the reason I was switching from sysfs to debugfs in the 
other patch set.

Ok, in this case I'm going to reorder all that stuff and send out the 
debugfs patches first.
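
Rough sketch of what I have in mind for the debugfs hook (function name made
up, assuming the usual DEFINE_SHOW_ATTRIBUTE() boilerplate around it):

	static int ttm_tt_debugfs_shrink_show(struct seq_file *m, void *data)
	{
		struct ttm_operation_ctx ctx = {
			.no_wait_gpu = false
		};
		int ret;

		/* take the reclaim lockdep context like i915 does */
		fs_reclaim_acquire(GFP_KERNEL);
		ret = ttm_bo_swapout(&ctx, GFP_NOFS);
		fs_reclaim_release(GFP_KERNEL);
		if (ret < 0)
			return ret;

		seq_printf(m, "swapped out %d pages\n", ret);
		return 0;
	}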

> Aside from the detail work I think this is going in the right direction.
> -Daniel

Thanks,
Christian.
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-17 13:46   ` Christian König
@ 2020-12-17 14:36     ` Daniel Vetter
  2020-12-17 15:10       ` Christian König
  0 siblings, 1 reply; 20+ messages in thread
From: Daniel Vetter @ 2020-12-17 14:36 UTC (permalink / raw)
  To: Christian König
  Cc: Dave Airlie, Roland Scheidegger, dri-devel, Huang Rui, VMware Graphics

On Thu, Dec 17, 2020 at 2:46 PM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 16.12.20 um 16:09 schrieb Daniel Vetter:
> > On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
> >> [SNIP]
> >> @@ -276,9 +277,9 @@ static void ttm_shrink(struct ttm_mem_global *glob, bool from_wq,
> >>
> >>      while (ttm_zones_above_swap_target(glob, from_wq, extra)) {
> >>              spin_unlock(&glob->lock);
> >> -            ret = ttm_bo_swapout(ctx);
> >> +            ret = ttm_bo_swapout(ctx, 0);
> > Generally we don't treat gfp_mask as a set of additional flags, but the full
> > thing. So here we should have GFP_KERNEL.
> >
> > Also having both the shrinker and the ttm_shrink_work is a bit much, the
> > shrink work should get deleted completely I think.
>
> That's why I'm moving the shrinker work into VMWGFX with patch #2 :)
>
> >>              spin_lock(&glob->lock);
> >> -            if (unlikely(ret != 0))
> >> +            if (unlikely(ret < 0))
> >>                      break;
> >>      }
> >>
> >> @@ -453,6 +454,7 @@ int ttm_mem_global_init(struct ttm_mem_global *glob)
> >>                      zone->name, (unsigned long long)zone->max_mem >> 10);
> >>      }
> >>      ttm_pool_mgr_init(glob->zone_kernel->max_mem/(2*PAGE_SIZE));
> >> +    ttm_tt_mgr_init();
> >>      return 0;
> >>   out_no_zone:
> >>      ttm_mem_global_release(glob);
> >> @@ -466,6 +468,7 @@ void ttm_mem_global_release(struct ttm_mem_global *glob)
> >>
> >>      /* let the page allocator first stop the shrink work. */
> >>      ttm_pool_mgr_fini();
> >> +    ttm_tt_mgr_fini();
> >>
> >>      flush_workqueue(glob->swap_queue);
> >>      destroy_workqueue(glob->swap_queue);
> >> diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
> >> index 7f75a13163f0..d454c428c56a 100644
> >> --- a/drivers/gpu/drm/ttm/ttm_tt.c
> >> +++ b/drivers/gpu/drm/ttm/ttm_tt.c
> >> @@ -38,6 +38,8 @@
> >>   #include <drm/drm_cache.h>
> >>   #include <drm/ttm/ttm_bo_driver.h>
> >>
> >> +static struct shrinker mm_shrinker;
> >> +
> >>   /*
> >>    * Allocates a ttm structure for the given BO.
> >>    */
> >> @@ -223,13 +225,23 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
> >>      return ret;
> >>   }
> >>
> >> -int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
> >> +/**
> >> + * ttm_tt_swapout - swap out tt object
> >> + *
> >> + * @bdev: TTM device structure.
> >> + * @ttm: The struct ttm_tt.
> >> + * @gfp_flags: Flags to use for memory allocation.
> >> + *
> >> + * Swapout a TT object to a shmem_file, return number of pages swapped out or
> >> + * negative error code.
> >> + */
> >> +int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm,
> >> +               gfp_t gfp_flags)
> >>   {
> >>      struct address_space *swap_space;
> >>      struct file *swap_storage;
> >>      struct page *from_page;
> >>      struct page *to_page;
> >> -    gfp_t gfp_mask;
> >>      int i, ret;
> >>
> >>      swap_storage = shmem_file_setup("ttm swap",
> >> @@ -241,14 +253,14 @@ int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
> >>      }
> >>
> >>      swap_space = swap_storage->f_mapping;
> >> -    gfp_mask = mapping_gfp_mask(swap_space);
> >> +    gfp_flags |= mapping_gfp_mask(swap_space);
> > I don't think this combines flags correctly. mapping_gfp_mask is most
> > likely GFP_KERNEL or something like that, so __GFP_FS (the thing you want
> > to check for to avoid recursion) is always set.
>
> Ah, ok. Thanks for the information, because of GFP_NOFS I was assuming
> that the mask works just the other way around.
>
> > I think we need an & here, and maybe screaming if the gfp flags for the
> > swapout space are funny.
> >
> >>
> >>      for (i = 0; i < ttm->num_pages; ++i) {
> >>              from_page = ttm->pages[i];
> >>              if (unlikely(from_page == NULL))
> >>                      continue;
> >>
> >> -            to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
> >> +            to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_flags);
> >>              if (IS_ERR(to_page)) {
> >>                      ret = PTR_ERR(to_page);
> >>                      goto out_err;
> >> @@ -263,7 +275,7 @@ int ttm_tt_swapout(struct ttm_bo_device *bdev, struct ttm_tt *ttm)
> >>      ttm->swap_storage = swap_storage;
> >>      ttm->page_flags |= TTM_PAGE_FLAG_SWAPPED;
> >>
> >> -    return 0;
> >> +    return ttm->num_pages;
> >>
> >>   out_err:
> >>      fput(swap_storage);
> >> @@ -341,3 +353,63 @@ void ttm_tt_unpopulate(struct ttm_bo_device *bdev,
> >>              ttm_pool_free(&bdev->pool, ttm);
> >>      ttm->page_flags &= ~TTM_PAGE_FLAG_PRIV_POPULATED;
> >>   }
> >> +
> >> +/* As long as pages are available make sure to release at least one */
> >> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
> >> +                                      struct shrink_control *sc)
> >> +{
> >> +    struct ttm_operation_ctx ctx = {
> >> +            .no_wait_gpu = true
> > Iirc there's an eventual shrinker limit where it gets desperate. I think
> > once we hit that, we should allow gpu waits. But it's not passed to
> > shrinkers for reasons, so maybe we should have a second round that tries
> > to more actively shrink objects if we fell substantially short of what
> > reclaim expected us to do?
>
> I think we should try to avoid waiting for the GPU in the shrinker callback.
>
> When we get HMM we will have cases where the shrinker is called from
> there and we can't wait for the GPU then without causing deadlocks.

Uh that doesn't work. Also, the current rules are that you are allowed
to call dma_fence_wait from shrinker callbacks, so that ship sailed
already. This is because shrinkers are a less restrictive context than
mmu notifier invalidation, and we wait in there too.

So if you can't wait in shrinkers, you also can't wait in mmu
notifiers (and also not in HMM, which is the same thing). Why do you
need this?

> > Also don't we have a trylock_only flag here to make sure drivers don't do
> > something stupid?
>
> Mhm, I'm pretty sure drivers should only be minimally involved.

In the move callback they might try to acquire other locks. Or is that
a driver bug?

Just kinda feels wrong if we have this and don't set it.

> >> +    };
> >> +    int ret;
> >> +
> >> +    if (sc->gfp_mask & GFP_NOFS)
> >> +            return 0;
> >> +
> >> +    ret = ttm_bo_swapout(&ctx, GFP_NOFS);
> >> +    return ret < 0 ? SHRINK_EMPTY : ret;
> >> +}
> >> +
> >> +/* Return the number of pages available or SHRINK_EMPTY if we have none */
> >> +static unsigned long ttm_tt_shrinker_count(struct shrinker *shrink,
> >> +                                       struct shrink_control *sc)
> >> +{
> >> +    struct ttm_buffer_object *bo;
> >> +    unsigned long num_pages = 0;
> >> +    unsigned int i;
> >> +
> >> +    if (sc->gfp_mask & GFP_NOFS)
> >> +            return 0;
> > The count function should always count, and I'm not seeing a reason why
> > you couldn't do that here ... Also my understanding is that GFP_NOFS never
> > goes into shrinkers (the NOFS comes from shrinkers originally only being
> > used for filesystem objects), so this is double redundant.
> >
> > My understanding is that gfp_mask is just to convey the right zones and
> > stuff, so that your shrinker can try to shrink objects in the right zones.
> > Hence I think the check in the _scan() function should also be removed.
> >
> > Also the non __ prefixed flags are the combinations callers are supposed
> > to look at. Memory reclaim code needs to look at the __GFP flags, see e.g.
> > gfpflags_allow_blocking() or fs_reclaim_acquire().
>
> Ok got it. But don't we need to somehow avoid recursion here?

How can you recurse?

GFP_KERNEL includes the __GFP_FS flag, which allows calling into
shrinkers. GFP_NOFS (which we're using when we're in the shrinker)
does not have the __GFP_FS flag, and so calling into shrinkers isn't
allowed. It's still allowed to clean out page cache and do io in
general (which is the __GFP_IO flag), and it's also allowed to do
memory reclaim of userspace ptes, which can involve calling into mmu
notifiers (the __GFP_RECLAIM flag is for that). So rules are:
- GFP_KERNEL for everyone who's not special
- GFP_NOFS from shrinkers.
- GFP_NOIO from block io handlers (not relevant for gpus)
- GFP_ATOMIC only from mmu notifier callbacks. This is the
reason why nothing in dma-fence signalling critical path is allowed to
allocate memory with anything else than GFP_ATOMIC. I guess I need to
resurrect my patches, but I thought when we discussed this it's clear
that at least in theory all these allocations in scheduler code are
bugs and need to be replaced by mempool allocations.
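
For reference, the relevant combinations in gfp.h boil down to roughly this
(simplified):

	#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
	#define GFP_NOFS	(__GFP_RECLAIM | __GFP_IO)
	#define GFP_NOIO	(__GFP_RECLAIM)

i.e. GFP_NOFS is just GFP_KERNEL without __GFP_FS, and that is the bit that
matters for the recursion question.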

So where do you want to recurse here?

Cheers, Daniel

> > [SNIP]
> >> +int ttm_tt_mgr_init(void);
> >> +void ttm_tt_mgr_fini(void);
> >> +
> >>   #if IS_ENABLED(CONFIG_AGP)
> >>   #include <linux/agp_backend.h>
> > For testing I strongly recommend a debugfs file to trigger this shrinker
> > completely, from the right lockdep context (i.e. using
> > fs_reclaim_acquire/release()). Much easier to test that way. See
> > i915_drop_caches_set() in i915_debugfs.c.
> >
> > That way you can fully test it all without hitting anything remotely
> > resembling actual OOM, which tends to kill all kinds of things.
>
> That's exactly the reason I was switching from sysfs to debugfs in the
> other patch set.
>
> Ok, in this case I'm going to reorder all that stuff and send out the
> debugfs patches first.
>
> > Aside from the detail work I think this is going in the right direction.
> > -Daniel
>
> Thanks,
> Christian.



-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-17 14:36     ` Daniel Vetter
@ 2020-12-17 15:10       ` Christian König
  2020-12-17 15:26         ` Daniel Vetter
  0 siblings, 1 reply; 20+ messages in thread
From: Christian König @ 2020-12-17 15:10 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Dave Airlie, Roland Scheidegger, dri-devel, Huang Rui, VMware Graphics

Am 17.12.20 um 15:36 schrieb Daniel Vetter:
> On Thu, Dec 17, 2020 at 2:46 PM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Am 16.12.20 um 16:09 schrieb Daniel Vetter:
>>> On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
>>> [SNIP]
>>>> +
>>>> +/* As long as pages are available make sure to release at least one */
>>>> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
>>>> +                                      struct shrink_control *sc)
>>>> +{
>>>> +    struct ttm_operation_ctx ctx = {
>>>> +            .no_wait_gpu = true
>>> Iirc there's an eventual shrinker limit where it gets desperate. I think
>>> once we hit that, we should allow gpu waits. But it's not passed to
>>> shrinkers for reasons, so maybe we should have a second round that tries
>>> to more actively shrink objects if we fell substantially short of what
>>> reclaim expected us to do?
>> I think we should try to avoid waiting for the GPU in the shrinker callback.
>>
>> When we get HMM we will have cases where the shrinker is called from
>> there and we can't wait for the GPU then without causing deadlocks.
> Uh that doesn't work. Also, the current rules are that you are allowed
> to call dma_fence_wait from shrinker callbacks, so that ship sailed
> already. This is because shrinkers are a less restrictive context than
> mmu notifier invalidation, and we wait in there too.
>
> So if you can't wait in shrinkers, you also can't wait in mmu
> notifiers (and also not in HMM, which is the same thing). Why do you
> need this?

The core concept of HMM is that pages are faulted in on demand and it is 
perfectly valid for one of those pages to be on disk.

So when a page fault happens we might need to be able to allocate memory 
and fetch something from disk to handle that.

When this memory allocation then in turn waits for the GPU which is 
running the HMM process we are pretty much busted.

>>> Also don't we have a trylock_only flag here to make sure drivers don't do
>>> something stupid?
>> Mhm, I'm pretty sure drivers should only be minimally involved.
> In the move callback they might try to acquire other locks. Or is that
> a driver bug?

That would be a rather serious driver bug at the moment.

> Just kinda feels wrong if we have this and don't set it.
>
>>>> +    };
>>>> +    int ret;
>>>> +
>>>> +    if (sc->gfp_mask & GFP_NOFS)
>>>> +            return 0;
>>>> +
>>>> +    ret = ttm_bo_swapout(&ctx, GFP_NOFS);
>>>> +    return ret < 0 ? SHRINK_EMPTY : ret;
>>>> +}
>>>> +
>>>> +/* Return the number of pages available or SHRINK_EMPTY if we have none */
>>>> +static unsigned long ttm_tt_shrinker_count(struct shrinker *shrink,
>>>> +                                       struct shrink_control *sc)
>>>> +{
>>>> +    struct ttm_buffer_object *bo;
>>>> +    unsigned long num_pages = 0;
>>>> +    unsigned int i;
>>>> +
>>>> +    if (sc->gfp_mask & GFP_NOFS)
>>>> +            return 0;
>>> The count function should always count, and I'm not seeing a reason why
>>> you couldn't do that here ... Also my understanding is that GFP_NOFS never
>>> goes into shrinkers (the NOFS comes from shrinkers originally only being
>>> used for filesystem objects), so this is double redundant.
>>>
>>> My understanding is that gfp_mask is just to convey the right zones and
>>> stuff, so that your shrinker can try to shrink objects in the right zones.
>>> Hence I think the check in the _scan() function should also be removed.
>>>
>>> Also the non __ prefixed flags are the combinations callers are supposed
>>> to look at. Memory reclaim code needs to look at the __GFP flags, see e.g.
>>> gfpflags_allow_blocking() or fs_reclaim_acquire().
>> Ok got it. But don't we need to somehow avoid recursion here?
> How can you recurse?
>
> GFP_KERNEL includes the __GFP_FS flag, which allows calling into
> shrinkers. GFP_NOFS (which we're using when we're in the shrinker)
> does not have the __GFP_FS flag, and so calling into shrinkers isn't
> allowed. It's still allowed to clean out page cache and do io in
> general (which is the __GFP_IO flag), and it's also allowed to do
> memory reclaim of userspace ptes, which can involve calling into mmu
> notifiers (the __GFP_RECLAIM flag is for that). So rules are:
> - GFP_KERNEL for everyone who's not special
> - GFP_NOFS from shrinkers.
> - GFP_NOIO from block io handlers (not relevant for gpus)
> - GFP_ATOMIC only from mmu notifier callbacks. That this is  the
> reason why nothing in dma-fence signalling critical path is allowed to
> allocate memory with anything else than GFP_ATOMIC. I guess I need to
> resurrect my patches, but I thought when we discussed this it's clear
> that at least in theory all these allocations in scheduler code are
> bugs and need to be replaced by mempool allocations.
>
> So where do you want to recurse here?

I wasn't aware that shrinkers are not called without __GFP_FS.

Thanks for clearing that up,
Christian.

>
> Cheers, Daniel
>
>>> [SNIP]
>>>> +int ttm_tt_mgr_init(void);
>>>> +void ttm_tt_mgr_fini(void);
>>>> +
>>>>    #if IS_ENABLED(CONFIG_AGP)
>>>>    #include <linux/agp_backend.h>
>>> For testing I strongly recommend a debugfs file to trigger this shrinker
>>> completely, from the right lockdep context (i.e. using
>>> fs_reclaim_acquire/release()). Much easier to test that way. See
>>> i915_drop_caches_set() in i915_debugfs.c.
>>>
>>> That way you can fully test it all without hitting anything remotely
>>> resembling actual OOM, which tends to kill all kinds of things.
>> That's exactly the reason I was switching from sysfs to debugfs in the
>> other patch set.
>>
>> Ok, in this case I'm going to reorder all that stuff and send out the
>> debugfs patches first.
>>
>>> Aside from the detail work I think this is going in the right direction.
>>> -Daniel
>> Thanks,
>> Christian.
>
>


* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-17 15:10       ` Christian König
@ 2020-12-17 15:26         ` Daniel Vetter
  2020-12-17 15:35           ` Christian König
  0 siblings, 1 reply; 20+ messages in thread
From: Daniel Vetter @ 2020-12-17 15:26 UTC (permalink / raw)
  To: Christian König
  Cc: Dave Airlie, Roland Scheidegger, dri-devel, Huang Rui, VMware Graphics

On Thu, Dec 17, 2020 at 4:10 PM Christian König
<christian.koenig@amd.com> wrote:
>
> Am 17.12.20 um 15:36 schrieb Daniel Vetter:
> > On Thu, Dec 17, 2020 at 2:46 PM Christian König
> > <ckoenig.leichtzumerken@gmail.com> wrote:
> >> Am 16.12.20 um 16:09 schrieb Daniel Vetter:
> >>> On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
> >>> [SNIP]
> >>>> +
> >>>> +/* As long as pages are available make sure to release at least one */
> >>>> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
> >>>> +                                      struct shrink_control *sc)
> >>>> +{
> >>>> +    struct ttm_operation_ctx ctx = {
> >>>> +            .no_wait_gpu = true
> >>> Iirc there's an eventual shrinker limit where it gets desperate. I think
> >>> once we hit that, we should allow gpu waits. But it's not passed to
> >>> shrinkers for reasons, so maybe we should have a second round that tries
> >>> to more actively shrink objects if we fell substantially short of what
> >>> reclaim expected us to do?
> >> I think we should try to avoid waiting for the GPU in the shrinker callback.
> >>
> >> When we get HMM we will have cases where the shrinker is called from
> >> there and we can't wait for the GPU then without causing deadlocks.
> > Uh that doesn't work. Also, the current rules are that you are allowed
> > to call dma_fence_wait from shrinker callbacks, so that shipped sailed
> > already. This is because shrinkers are a less restrictive context than
> > mmu notifier invalidation, and we wait in there too.
> >
> > So if you can't wait in shrinkers, you also can't wait in mmu
> > notifiers (and also not in HMM, wĥich is the same thing). Why do you
> > need this?
>
> The core concept of HMM is that pages are faulted in on demand and it is
> perfectly valid for one of those pages to be on disk.
>
> So when a page fault happens we might need to be able to allocate memory
> and fetch something from disk to handle that.
>
> When this memory allocation then in turn waits for the GPU which is
> running the HMM process we are pretty much busted.

Yeah, you can't do that. That's the entire infinite-fences discussion.
For HMM to work we need to stop using dma_fence for userspace sync,
and you can only use the amdkfd-style preempt fences. And preempting
while the pagefault is pending is, I thought, something we require.

Iow, the HMM page fault handler must not be a dma-fence critical
section, i.e. it's not allowed to hold up any dma_fence, ever.

One consequence of this is that you can use HMM for compute, but until
we've revamped all the Linux winsys layers, not for gl/vk. Or at least
I'm not seeing how.

Also like I said, dma_fence_wait is already allowed in mmu notifiers,
so we've already locked down these semantics even more. Due to the
nesting of gfp allocation contexts, allowing dma_fence_wait in mmu
notifiers (i.e. __GFP_ALLOW_RECLAIM or whatever the flag is exactly)
implies it's allowed in shrinkers. And only if you forbid it from
all allocation contexts (which makes all buffer-object-managed gpu
memory essentially pinned, exactly what you're trying to lift here) do
you get what you want.

The other option is to make HMM and dma-buf completely disjoint worlds
with no overlap, and gang scheduling on the gpu (to guarantee that
there's never any dma_fence in pending state while an HMM task might
cause a fault).

> >>> Also don't we have a trylock_only flag here to make sure drivers don't do
> >>> something stupid?
> >> Mhm, I'm pretty sure drivers should only be minimal involved.
> > In the move callback they might try to acquire other locks. Or is that
> > a driver bug?
>
> That would be a rather serious driver bug at the moment.
>
> > Just kinda feels wrong if we have this and don't set it.
> >
> >>>> +    };
> >>>> +    int ret;
> >>>> +
> >>>> +    if (sc->gfp_mask & GFP_NOFS)
> >>>> +            return 0;
> >>>> +
> >>>> +    ret = ttm_bo_swapout(&ctx, GFP_NOFS);
> >>>> +    return ret < 0 ? SHRINK_EMPTY : ret;
> >>>> +}
> >>>> +
> >>>> +/* Return the number of pages available or SHRINK_EMPTY if we have none */
> >>>> +static unsigned long ttm_tt_shrinker_count(struct shrinker *shrink,
> >>>> +                                       struct shrink_control *sc)
> >>>> +{
> >>>> +    struct ttm_buffer_object *bo;
> >>>> +    unsigned long num_pages = 0;
> >>>> +    unsigned int i;
> >>>> +
> >>>> +    if (sc->gfp_mask & GFP_NOFS)
> >>>> +            return 0;
> >>> The count function should always count, and I'm not seeing a reason why
> >>> you couldn't do that here ... Also my understanding is that GFP_NOFS never
> >>> goes into shrinkers (the NOFS comes from shrinkers originally only being
> >>> used for filesystem objects), so this is double redundant.
> >>>
> >>> My understanding is that gfp_mask is just to convey the right zones and
> >>> stuff, so that your shrinker can try to shrink objects in the right zones.
> >>> Hence I think the check in the _scan() function should also be removed.
> >>>
> >>> Also the non __ prefixed flags are the combinations callers are supposed
> >>> to look at. Memory reclaim code needs to look at the __GFP flags, see e.g.
> >>> gfpflags_allow_blocking() or fs_reclaim_acquire().
> >> Ok got it. But don't we need to somehow avoid recursion here?
> > How can you recurse?
> >
> > GFP_KERNEL includes the __GFP_FS flag, which allows calling into
> > shrinkers. GFP_NOFS (which we're using when we're in the shrinker)
> > does not have the __GFP_FS flag, and so calling into shrinkers isn't
> > allowed. It's still allowed to clean out page cache and do io in
> > general (which is the __GFP_IO flag), and it's also allowed to do
> > memory reclaim of userspace ptes, which can involve calling into mmu
> > notifiers (the __GFP_RECLAIM flag is for that). So rules are:
> > - GFP_KERNEL for everyone who's not special
> > - GFP_NOFS from shrinkers.
> > - GFP_NOIO from block io handlers (not relevant for gpus)
> > - GFP_ATOMIC only from mmu notifier callbacks. That this is  the
> > reason why nothing in dma-fence signalling critical path is allowed to
> > allocate memory with anything else than GFP_ATOMIC. I guess I need to
> > resurrect my patches, but I thought when we discussed this it's clear
> > that at least in theory all these allocations in scheduler code are
> > bugs and need to be replaced by mempool allocations.
> >
> > So where do you want to recurse here?
>
> I wasn't aware that without __GFP_FS shrinkers are not called.

Maybe double check, but that's at least my understanding. GFP flags
are flags, but in reality it's a strictly nesting hierarchy:
GFP_KERNEL > GFP_NOFS > GFP_NOIO > GFP_RECLAIM > GFP_ATOMIC (ok, atomic
is special, since it's allowed to dip into the emergency reserves).
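
As a rough illustration of the above (count always counts; scan, if it checks
anything, looks at the __GFP_* bits rather than the GFP_* combinations), a
sketch; the my_tt_* helpers are placeholders, and whether the scan-side check
is needed at all is exactly what's being discussed here:

#include <linux/gfp.h>
#include <linux/shrinker.h>

/* Placeholder helpers, assumed to exist only for the sake of the sketch. */
static unsigned long my_tt_swappable_pages(void);
static unsigned long my_tt_swapout(unsigned long nr_to_scan);

static unsigned long my_tt_shrinker_count(struct shrinker *shrink,
                                          struct shrink_control *sc)
{
        unsigned long num_pages = my_tt_swappable_pages();

        /* Always report what could be freed, no gfp_mask check here. */
        return num_pages ? num_pages : SHRINK_EMPTY;
}

static unsigned long my_tt_shrinker_scan(struct shrinker *shrink,
                                         struct shrink_control *sc)
{
        /* Look at the __GFP bits, not at the GFP_* combinations. */
        if (!(sc->gfp_mask & __GFP_FS) ||
            !gfpflags_allow_blocking(sc->gfp_mask))
                return SHRINK_STOP;

        return my_tt_swapout(sc->nr_to_scan);
}

static struct shrinker my_tt_shrinker = {
        .count_objects  = my_tt_shrinker_count,
        .scan_objects   = my_tt_shrinker_scan,
        .seeks          = DEFAULT_SEEKS,
};

The shrinker would be registered once with register_shrinker(&my_tt_shrinker)
and torn down again with unregister_shrinker().
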
-Daniel

> Thanks for clearing that up,
> Christian.
>
> >
> > Cheers, Daniel
> >
> >>> [SNIP]
> >>>> +int ttm_tt_mgr_init(void);
> >>>> +void ttm_tt_mgr_fini(void);
> >>>> +
> >>>>    #if IS_ENABLED(CONFIG_AGP)
> >>>>    #include <linux/agp_backend.h>
> >>> For testing I strongly recommend a debugfs file to trigger this shrinker
> >>> completely, from the right lockdep context (i.e. using
> >>> fs_reclaim_acquire/release()). Much easier to test that way. See
> >>> i915_drop_caches_set() in i915_debugfs.c.
> >>>
> >>> That way you can fully test it all without hitting anything remotely
> >>> resembling actual OOM, which tends to kill all kinds of things.
> >> That's exactly the reason I was switching from sysfs to debugfs in the
> >> other patch set.
> >>
> >> Ok, in this case I'm going to reorder all that stuff and send out the
> >> debugfs patches first.
> >>
> >>> Aside from the detail work I think this is going in the right direction.
> >>> -Daniel
> >> Thanks,
> >> Christian.
> >
> >
>


-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-17 15:26         ` Daniel Vetter
@ 2020-12-17 15:35           ` Christian König
  2020-12-17 15:45             ` Daniel Vetter
  0 siblings, 1 reply; 20+ messages in thread
From: Christian König @ 2020-12-17 15:35 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Dave Airlie, Roland Scheidegger, dri-devel, Huang Rui, VMware Graphics

Am 17.12.20 um 16:26 schrieb Daniel Vetter:
> On Thu, Dec 17, 2020 at 4:10 PM Christian König
> <christian.koenig@amd.com> wrote:
>> Am 17.12.20 um 15:36 schrieb Daniel Vetter:
>>> On Thu, Dec 17, 2020 at 2:46 PM Christian König
>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>> Am 16.12.20 um 16:09 schrieb Daniel Vetter:
>>>>> On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
>>>>> [SNIP]
>>>>>> +
>>>>>> +/* As long as pages are available make sure to release at least one */
>>>>>> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
>>>>>> +                                      struct shrink_control *sc)
>>>>>> +{
>>>>>> +    struct ttm_operation_ctx ctx = {
>>>>>> +            .no_wait_gpu = true
>>>>> Iirc there's an eventual shrinker limit where it gets desperate. I think
>>>>> once we hit that, we should allow gpu waits. But it's not passed to
>>>>> shrinkers for reasons, so maybe we should have a second round that tries
>>>>> to more actively shrink objects if we fell substantially short of what
>>>>> reclaim expected us to do?
>>>> I think we should try to avoid waiting for the GPU in the shrinker callback.
>>>>
>>>> When we get HMM we will have cases where the shrinker is called from
>>>> there and we can't wait for the GPU then without causing deadlocks.
>>> Uh that doesn't work. Also, the current rules are that you are allowed
>>> to call dma_fence_wait from shrinker callbacks, so that shipped sailed
>>> already. This is because shrinkers are a less restrictive context than
>>> mmu notifier invalidation, and we wait in there too.
>>>
>>> So if you can't wait in shrinkers, you also can't wait in mmu
>>> notifiers (and also not in HMM, wĥich is the same thing). Why do you
>>> need this?
>> The core concept of HMM is that pages are faulted in on demand and it is
>> perfectly valid for one of those pages to be on disk.
>>
>> So when a page fault happens we might need to be able to allocate memory
>> and fetch something from disk to handle that.
>>
>> When this memory allocation then in turn waits for the GPU which is
>> running the HMM process we are pretty much busted.
> Yeah you can't do that. That's the entire infinite fences discussions.

Yes, exactly.

> For HMM to work, we need to stop using dma_fence for userspace sync,

I was considering separating that into a dma_fence and an hmm_fence, 
or something like this.

> and you can only use the amdkfd style preempt fences. And preempting
> while the pagefault is pending is I thought something we require.

Yeah, the problem is that most hardware can't do that :)

Getting page faults to work is hard enough; preempting while waiting for 
a fault to return is not something that was anticipated :)

> Iow, the HMM page fault handler must not be a dma-fence critical
> section, i.e. it's not allowed to hold up any dma_fence, ever.

What do you mean by that?

> One consequence of this is that you can use HMM for compute, but until
> we've revamped all the linux winsys layers, not for gl/vk. Or at least
> I'm not seeing how.
>
> Also like I said, dma_fence_wait is already allowed in mmu notifiers,
> so we've already locked down these semantics even more. Due to the
> nesting of gfp allocation contexts allowing dma_fence_wait in mmu
> notifiers (i.e. __GFP_ALLOW_RECLAIM or whatever the flag is exactly)
> implies it's allowed in shrinkers. And only if you forbid it from from
> all allocations contexts (which makes all buffer object managed gpu
> memory essentially pinned, exactly what you're trying to lift here) do
> you get what you want.
>
> The other option is to make HMM and dma-buf completely disjoint worlds
> with no overlap, and gang scheduling on the gpu (to guarantee that
> there's never any dma_fence in pending state while an HMM task might
> cause a fault).
>
>> [SNIP]
>>> So where do you want to recurse here?
>> I wasn't aware that without __GFP_FS shrinkers are not called.
> Maybe double check, but that's at least my understanding. GFP flags
> are flags, but in reality it's a strictly nesting hierarchy:
> GFP_KERNEL > GFP_NOFS > GFP_NOIO > GFP_RELCAIM > GFP_ATOMIC (ok atomic
> is special, since it's allowed to dip into emergency reserve).

I'm going to read up on that over the holidays.

Christian.

* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-17 15:35           ` Christian König
@ 2020-12-17 15:45             ` Daniel Vetter
  2020-12-17 18:09               ` Jerome Glisse
  0 siblings, 1 reply; 20+ messages in thread
From: Daniel Vetter @ 2020-12-17 15:45 UTC (permalink / raw)
  To: Christian König
  Cc: Dave Airlie, Roland Scheidegger, dri-devel, Huang Rui, VMware Graphics

On Thu, Dec 17, 2020 at 4:36 PM Christian König
<christian.koenig@amd.com> wrote:
> Am 17.12.20 um 16:26 schrieb Daniel Vetter:
> > On Thu, Dec 17, 2020 at 4:10 PM Christian König
> > <christian.koenig@amd.com> wrote:
> >> Am 17.12.20 um 15:36 schrieb Daniel Vetter:
> >>> On Thu, Dec 17, 2020 at 2:46 PM Christian König
> >>> <ckoenig.leichtzumerken@gmail.com> wrote:
> >>>> Am 16.12.20 um 16:09 schrieb Daniel Vetter:
> >>>>> On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
> >>>>> [SNIP]
> >>>>>> +
> >>>>>> +/* As long as pages are available make sure to release at least one */
> >>>>>> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
> >>>>>> +                                      struct shrink_control *sc)
> >>>>>> +{
> >>>>>> +    struct ttm_operation_ctx ctx = {
> >>>>>> +            .no_wait_gpu = true
> >>>>> Iirc there's an eventual shrinker limit where it gets desperate. I think
> >>>>> once we hit that, we should allow gpu waits. But it's not passed to
> >>>>> shrinkers for reasons, so maybe we should have a second round that tries
> >>>>> to more actively shrink objects if we fell substantially short of what
> >>>>> reclaim expected us to do?
> >>>> I think we should try to avoid waiting for the GPU in the shrinker callback.
> >>>>
> >>>> When we get HMM we will have cases where the shrinker is called from
> >>>> there and we can't wait for the GPU then without causing deadlocks.
> >>> Uh that doesn't work. Also, the current rules are that you are allowed
> >>> to call dma_fence_wait from shrinker callbacks, so that shipped sailed
> >>> already. This is because shrinkers are a less restrictive context than
> >>> mmu notifier invalidation, and we wait in there too.
> >>>
> >>> So if you can't wait in shrinkers, you also can't wait in mmu
> >>> notifiers (and also not in HMM, wĥich is the same thing). Why do you
> >>> need this?
> >> The core concept of HMM is that pages are faulted in on demand and it is
> >> perfectly valid for one of those pages to be on disk.
> >>
> >> So when a page fault happens we might need to be able to allocate memory
> >> and fetch something from disk to handle that.
> >>
> >> When this memory allocation then in turn waits for the GPU which is
> >> running the HMM process we are pretty much busted.
> > Yeah you can't do that. That's the entire infinite fences discussions.
>
> Yes, exactly.
>
> > For HMM to work, we need to stop using dma_fence for userspace sync,
>
> I was considering of separating that into a dma_fence and a hmm_fence.
> Or something like this.

The trouble is that dma_fence in all its forms is uapi. And on gpus
without page fault support dma_fence_wait is still required in
allocation contexts. So creating a new kernel structure doesn't really
solve anything I think; it needs an entirely new uapi completely
decoupled from memory management. The last time we did new uapi was
probably modifiers, and that's still not rolled out years later.

> > and you can only use the amdkfd style preempt fences. And preempting
> > while the pagefault is pending is I thought something we require.
>
> Yeah, problem is that most hardware can't do that :)
>
> Getting page faults to work is hard enough, preempting while waiting for
> a fault to return is not something which was anticipated :)

Hm, last summer in a thread you said you've blocked that because it
doesn't work. I agreed; page faults without preempt are rather tough to
make work.

> > Iow, the HMM page fault handler must not be a dma-fence critical
> > section, i.e. it's not allowed to hold up any dma_fence, ever.
>
> What do you mean with that?

The dma_fence_begin_signalling()/dma_fence_end_signalling() annotations,
essentially, i.e. cross-release dependencies. Or, the other way round: if
you want to be able to allocate memory, you have to guarantee that you're
never holding up a dma_fence.
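
For reference, a minimal sketch of what such an annotation can look like,
assuming the dma_fence_begin_signalling()/dma_fence_end_signalling() helpers
from the dma-fence annotation work; my_driver_run_job() is a placeholder:

#include <linux/dma-fence.h>

/*
 * Everything between begin/end must eventually lead to the fence being
 * signalled, so it must not allocate memory in a way that can recurse
 * into reclaim (and from there into dma_fence_wait()).
 */
static void my_driver_run_job(struct dma_fence *done_fence)
{
        bool cookie = dma_fence_begin_signalling();

        /*
         * ... push the job to the hardware; only mempool/GFP_ATOMIC
         * allocations are acceptable in here ...
         */

        dma_fence_end_signalling(cookie);
        /* done_fence is signalled later from the job's completion irq. */
}
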
-Daniel

> > One consequence of this is that you can use HMM for compute, but until
> > we've revamped all the linux winsys layers, not for gl/vk. Or at least
> > I'm not seeing how.
> >
> > Also like I said, dma_fence_wait is already allowed in mmu notifiers,
> > so we've already locked down these semantics even more. Due to the
> > nesting of gfp allocation contexts allowing dma_fence_wait in mmu
> > notifiers (i.e. __GFP_ALLOW_RECLAIM or whatever the flag is exactly)
> > implies it's allowed in shrinkers. And only if you forbid it from from
> > all allocations contexts (which makes all buffer object managed gpu
> > memory essentially pinned, exactly what you're trying to lift here) do
> > you get what you want.
> >
> > The other option is to make HMM and dma-buf completely disjoint worlds
> > with no overlap, and gang scheduling on the gpu (to guarantee that
> > there's never any dma_fence in pending state while an HMM task might
> > cause a fault).
> >
> >> [SNIP]
> >>> So where do you want to recurse here?
> >> I wasn't aware that without __GFP_FS shrinkers are not called.
> > Maybe double check, but that's at least my understanding. GFP flags
> > are flags, but in reality it's a strictly nesting hierarchy:
> > GFP_KERNEL > GFP_NOFS > GFP_NOIO > GFP_RELCAIM > GFP_ATOMIC (ok atomic
> > is special, since it's allowed to dip into emergency reserve).
>
> Going to read myself into that over the holidays.
>
> Christian.



-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-17 15:45             ` Daniel Vetter
@ 2020-12-17 18:09               ` Jerome Glisse
  2020-12-17 18:19                 ` Daniel Vetter
  0 siblings, 1 reply; 20+ messages in thread
From: Jerome Glisse @ 2020-12-17 18:09 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Ralph Campbell, Dave Airlie, John Hubbard, Roland Scheidegger,
	dri-devel, Huang Rui, VMware Graphics, Ben Skeggs,
	Christian König

Adding a few folks on cc just to raise awareness and so that I
can get corrected if I said anything wrong.

On Thu, Dec 17, 2020 at 04:45:55PM +0100, Daniel Vetter wrote:
> On Thu, Dec 17, 2020 at 4:36 PM Christian König
> <christian.koenig@amd.com> wrote:
> > Am 17.12.20 um 16:26 schrieb Daniel Vetter:
> > > On Thu, Dec 17, 2020 at 4:10 PM Christian König
> > > <christian.koenig@amd.com> wrote:
> > >> Am 17.12.20 um 15:36 schrieb Daniel Vetter:
> > >>> On Thu, Dec 17, 2020 at 2:46 PM Christian König
> > >>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > >>>> Am 16.12.20 um 16:09 schrieb Daniel Vetter:
> > >>>>> On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
> > >>>>> [SNIP]
> > >>>>>> +
> > >>>>>> +/* As long as pages are available make sure to release at least one */
> > >>>>>> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
> > >>>>>> +                                      struct shrink_control *sc)
> > >>>>>> +{
> > >>>>>> +    struct ttm_operation_ctx ctx = {
> > >>>>>> +            .no_wait_gpu = true
> > >>>>> Iirc there's an eventual shrinker limit where it gets desperate. I think
> > >>>>> once we hit that, we should allow gpu waits. But it's not passed to
> > >>>>> shrinkers for reasons, so maybe we should have a second round that tries
> > >>>>> to more actively shrink objects if we fell substantially short of what
> > >>>>> reclaim expected us to do?
> > >>>> I think we should try to avoid waiting for the GPU in the shrinker callback.
> > >>>>
> > >>>> When we get HMM we will have cases where the shrinker is called from
> > >>>> there and we can't wait for the GPU then without causing deadlocks.
> > >>> Uh that doesn't work. Also, the current rules are that you are allowed
> > >>> to call dma_fence_wait from shrinker callbacks, so that shipped sailed
> > >>> already. This is because shrinkers are a less restrictive context than
> > >>> mmu notifier invalidation, and we wait in there too.
> > >>>
> > >>> So if you can't wait in shrinkers, you also can't wait in mmu
> > >>> notifiers (and also not in HMM, wĥich is the same thing). Why do you
> > >>> need this?
> > >> The core concept of HMM is that pages are faulted in on demand and it is
> > >> perfectly valid for one of those pages to be on disk.
> > >>
> > >> So when a page fault happens we might need to be able to allocate memory
> > >> and fetch something from disk to handle that.
> > >>
> > >> When this memory allocation then in turn waits for the GPU which is
> > >> running the HMM process we are pretty much busted.
> > > Yeah you can't do that. That's the entire infinite fences discussions.
> >
> > Yes, exactly.
> >
> > > For HMM to work, we need to stop using dma_fence for userspace sync,
> >
> > I was considering of separating that into a dma_fence and a hmm_fence.
> > Or something like this.
> 
> The trouble is that dma_fence it all its forms is uapi. And on gpus
> without page fault support dma_fence_wait is still required in
> allocation contexts. So creating a new kernel structure doesn't really
> solve anything I think, it needs entire new uapi completely decoupled
> from memory management. Last time we've done new uapi was probably
> modifiers, and that's still not rolled out years later.

With HMM there should not be any fences! You do not need them.
If you feel you need them, then you are doing something horribly
wrong. See below for what HMM needs and what that means.


> > > and you can only use the amdkfd style preempt fences. And preempting
> > > while the pagefault is pending is I thought something we require.
> >
> > Yeah, problem is that most hardware can't do that :)
> >
> > Getting page faults to work is hard enough, preempting while waiting for
> > a fault to return is not something which was anticipated :)
> 
> Hm last summer in a thread you said you've blocked that because it
> doesn't work. I agreed, page fault without preempt is rather tough to
> make work.
> 
> > > Iow, the HMM page fault handler must not be a dma-fence critical
> > > section, i.e. it's not allowed to hold up any dma_fence, ever.
> >
> > What do you mean with that?
> 
> dma_fence_signalling_begin/end() annotations essentially, i.e.
> cross-release dependencies. Or the other way round, if you want to be
> able to allocate memory you have to guarantee that you're never
> holding up a dma_fence.

Correct, nothing regarding dma/ttm/gem should creep into the HMM code
path.


For HMM, what you want when handling a GPU fault is to do it without
holding any GPU driver locks, so that the regular page fault handler
code path can go back into the GPU driver (through the shrinker) without
worrying about it.

This is how nouveau does it:
    - get event about page fault (might hold some GPU lock)
    - walk the event buffer to get all faulting addresses
      (might hold some GPU lock)

    ! DROP ALL GPU/DRIVER LOCK !

    - try to coalesce faults together (adjacent addresses
      trigger a fault for a single range)
    - call in HMM/mmu notifier helpers to handle the fault
    - take GPU lock (svmm->mutex for nouveau)
    - from the fault result update GPU page table

    - Tell GPU it can resume (this can be done after releasing
      the lock below, it is just up to the device driver folks
      to decide when to do that).

    - unlock GPU (svmm->mutex for nouveau)

(see nouveau_svm.c look for range_fault)
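
A rough sketch of that pattern, using the hmm_range_fault()/mmu interval
notifier helpers; the my_gpu_* pieces are placeholders and the locking is
simplified compared to what nouveau actually does:

#include <linux/hmm.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

struct my_gpu_svmm {                            /* placeholder driver state */
        struct mmu_interval_notifier notifier;
        struct mutex mutex;                     /* the "GPU lock" above */
        struct mm_struct *mm;
};

/* Placeholder: write range->hmm_pfns into the GPU page table. */
static void my_gpu_map_pfns(struct my_gpu_svmm *svmm, struct hmm_range *range);

static int my_gpu_fault_range(struct my_gpu_svmm *svmm, unsigned long start,
                              unsigned long npages, unsigned long *pfns)
{
        struct hmm_range range = {
                .notifier       = &svmm->notifier,
                .start          = start,
                .end            = start + (npages << PAGE_SHIFT),
                .hmm_pfns       = pfns,
                .default_flags  = HMM_PFN_REQ_FAULT,
        };
        int ret;

        /* All GPU/driver locks must already have been dropped here. */
        do {
                range.notifier_seq = mmu_interval_read_begin(range.notifier);

                mmap_read_lock(svmm->mm);
                ret = hmm_range_fault(&range);
                mmap_read_unlock(svmm->mm);
                if (ret) {
                        if (ret == -EBUSY)
                                continue;       /* raced, fault again */
                        return ret;
                }

                mutex_lock(&svmm->mutex);
                if (!mmu_interval_read_retry(range.notifier,
                                             range.notifier_seq))
                        break;
                mutex_unlock(&svmm->mutex);     /* invalidated, retry */
        } while (1);

        my_gpu_map_pfns(svmm, &range);          /* update the GPU page table */
        mutex_unlock(&svmm->mutex);

        /* Telling the GPU to resume can happen after dropping the lock. */
        return 0;
}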

A GPU page fault should never need to rely on the shrinker succeeding
in reclaiming memory. The rationale here is that regular memory
reclaim (which includes regular anonymous or mmap file-backed
memory) will be able to make forward progress even during a GPU
page fault. This of course means that while servicing a GPU page
fault you might free up memory that was also in use by the GPU and
which will re-fault again, but the lru list should make that very
unlikely.


Note that for regular memory reclaim to make forward progress,
the GPU _must_ support asynchronous GPU page table updates, i.e.
being able to unmap things from the GPU page table and flush the
GPU TLB without having to wait for anything running on the GPU.
This is where not all hardware might work. Only things populated
through HMM need to be unmapped (dma/ttm/gem should not be
considered unless they too can be unmapped without deadlock).

For nouveau this is done through register writes (see
gf100_vmm_invalidate and gp100_vmm_invalidate_pdb; the
GPU page table itself is updated through the CPU).

I think the issue for AMD here is that you rely on updating the
GPU page table through the GPU command processor, and this
will not work with HMM. Unless you have a CP channel that
can always make forward progress no matter what, or you update
the GPU page table with the CPU and flush the GPU TLB through
some register write (iirc this should be doable on AMD, but
I'm forgetting things now).


There is no need for preemption, even if it is a nice-to-have
feature (lazy GPU architects and hardware designers! ;)).


Now if dma/ttm/gem GPU memory allocations use all the system
memory then we are in trouble. This was the whole reason for
limiting ourselves, once upon a time, to not using over 50% of
memory for dma/ttm/gem. I do not have any good way to solve
that. If, to make forward progress, the GPU needs to allocate
more memory but it has already exhausted all memory through
dma/ttm/gem, then there is just nothing we can do.

I still believe we should limit dma/ttm/gem memory usage so
that we never end up pinning all system memory in GPU realms.
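
As a back-of-the-envelope sketch of such a cap (not how TTM implements its
accounting; the my_gpu_* counter and helpers are made up for illustration):

#include <linux/atomic.h>
#include <linux/mm.h>

static atomic_long_t my_gpu_pinned_pages = ATOMIC_LONG_INIT(0);

/* Refuse to pin more than half of system memory in GPU allocations. */
static bool my_gpu_may_pin(unsigned long npages)
{
        unsigned long limit = totalram_pages() / 2;

        if ((unsigned long)atomic_long_add_return(npages,
                                                  &my_gpu_pinned_pages) > limit) {
                atomic_long_sub(npages, &my_gpu_pinned_pages);
                return false;
        }
        return true;
}

static void my_gpu_unpin(unsigned long npages)
{
        atomic_long_sub(npages, &my_gpu_pinned_pages);
}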


Hope this helps clarify the requirements and expectations.

Cheers,
Jérôme


* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-17 18:09               ` Jerome Glisse
@ 2020-12-17 18:19                 ` Daniel Vetter
  2020-12-17 18:26                   ` Daniel Vetter
  2020-12-17 18:40                   ` Jerome Glisse
  0 siblings, 2 replies; 20+ messages in thread
From: Daniel Vetter @ 2020-12-17 18:19 UTC (permalink / raw)
  To: Jerome Glisse
  Cc: Ralph Campbell, Dave Airlie, John Hubbard, Roland Scheidegger,
	dri-devel, Huang Rui, VMware Graphics, Ben Skeggs,
	Christian König

On Thu, Dec 17, 2020 at 7:09 PM Jerome Glisse <jglisse@redhat.com> wrote:
>
> Adding few folks on cc just to raise awareness and so that i
> could get corrected if i said anything wrong.
>
> On Thu, Dec 17, 2020 at 04:45:55PM +0100, Daniel Vetter wrote:
> > On Thu, Dec 17, 2020 at 4:36 PM Christian König
> > <christian.koenig@amd.com> wrote:
> > > Am 17.12.20 um 16:26 schrieb Daniel Vetter:
> > > > On Thu, Dec 17, 2020 at 4:10 PM Christian König
> > > > <christian.koenig@amd.com> wrote:
> > > >> Am 17.12.20 um 15:36 schrieb Daniel Vetter:
> > > >>> On Thu, Dec 17, 2020 at 2:46 PM Christian König
> > > >>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > > >>>> Am 16.12.20 um 16:09 schrieb Daniel Vetter:
> > > >>>>> On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
> > > >>>>> [SNIP]
> > > >>>>>> +
> > > >>>>>> +/* As long as pages are available make sure to release at least one */
> > > >>>>>> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
> > > >>>>>> +                                      struct shrink_control *sc)
> > > >>>>>> +{
> > > >>>>>> +    struct ttm_operation_ctx ctx = {
> > > >>>>>> +            .no_wait_gpu = true
> > > >>>>> Iirc there's an eventual shrinker limit where it gets desperate. I think
> > > >>>>> once we hit that, we should allow gpu waits. But it's not passed to
> > > >>>>> shrinkers for reasons, so maybe we should have a second round that tries
> > > >>>>> to more actively shrink objects if we fell substantially short of what
> > > >>>>> reclaim expected us to do?
> > > >>>> I think we should try to avoid waiting for the GPU in the shrinker callback.
> > > >>>>
> > > >>>> When we get HMM we will have cases where the shrinker is called from
> > > >>>> there and we can't wait for the GPU then without causing deadlocks.
> > > >>> Uh that doesn't work. Also, the current rules are that you are allowed
> > > >>> to call dma_fence_wait from shrinker callbacks, so that shipped sailed
> > > >>> already. This is because shrinkers are a less restrictive context than
> > > >>> mmu notifier invalidation, and we wait in there too.
> > > >>>
> > > >>> So if you can't wait in shrinkers, you also can't wait in mmu
> > > >>> notifiers (and also not in HMM, wĥich is the same thing). Why do you
> > > >>> need this?
> > > >> The core concept of HMM is that pages are faulted in on demand and it is
> > > >> perfectly valid for one of those pages to be on disk.
> > > >>
> > > >> So when a page fault happens we might need to be able to allocate memory
> > > >> and fetch something from disk to handle that.
> > > >>
> > > >> When this memory allocation then in turn waits for the GPU which is
> > > >> running the HMM process we are pretty much busted.
> > > > Yeah you can't do that. That's the entire infinite fences discussions.
> > >
> > > Yes, exactly.
> > >
> > > > For HMM to work, we need to stop using dma_fence for userspace sync,
> > >
> > > I was considering of separating that into a dma_fence and a hmm_fence.
> > > Or something like this.
> >
> > The trouble is that dma_fence it all its forms is uapi. And on gpus
> > without page fault support dma_fence_wait is still required in
> > allocation contexts. So creating a new kernel structure doesn't really
> > solve anything I think, it needs entire new uapi completely decoupled
> > from memory management. Last time we've done new uapi was probably
> > modifiers, and that's still not rolled out years later.
>
> With hmm there should not be any fence ! You do not need them.
> If you feel you need them than you are doing something horribly
> wrong. See below on what HMM needs and what it means.
>
>
> > > > and you can only use the amdkfd style preempt fences. And preempting
> > > > while the pagefault is pending is I thought something we require.
> > >
> > > Yeah, problem is that most hardware can't do that :)
> > >
> > > Getting page faults to work is hard enough, preempting while waiting for
> > > a fault to return is not something which was anticipated :)
> >
> > Hm last summer in a thread you said you've blocked that because it
> > doesn't work. I agreed, page fault without preempt is rather tough to
> > make work.
> >
> > > > Iow, the HMM page fault handler must not be a dma-fence critical
> > > > section, i.e. it's not allowed to hold up any dma_fence, ever.
> > >
> > > What do you mean with that?
> >
> > dma_fence_signalling_begin/end() annotations essentially, i.e.
> > cross-release dependencies. Or the other way round, if you want to be
> > able to allocate memory you have to guarantee that you're never
> > holding up a dma_fence.
>
> Correct nothing regarding dma/ttm/gem should creep into HMM code
> path.
>
>
> For HMM what you want when handling GPU fault is doing it without
> holding any GPU driver locks so that the regular page fault handler
> code path can go back into the GPU driver (through shrinker) without
> worrying about it.
>
> This is how nouveau does it:
>     - get event about page fault (might hold some GPU lock)
>     - walk the event buffer to get all faulting addresses
>       (might hold some GPU lock)
>
>     ! DROP ALL GPU/DRIVER LOCK !
>
>     - try to coallesce fault together (adjacent address
>       trigger a fault for a single range)
>     - call in HMM/mmu notifier helpers to handle the fault
>     - take GPU lock (svmm->mutex for nouveau)
>     - from the fault result update GPU page table
>
>     - Tell GPU it can resume (this can be done after releasing
>       the lock below, it is just up to the device driver folks
>       to decide when to do that).
>
>     - unlock GPU (svmm->mutex for nouveau)
>
> (see nouveau_svm.c look for range_fault)
>
> GPU page fault should never need to rely on shrinker to succeed
> into reclaiming memory. The rational here is that regular memory
> reclaim (which includes regular anonymous or mmap file back
> memory) will be able to make forward progress even during GPU
> page fault. This of course means that while servicing a GPU page
> fault you might freeup memory that was also use by the GPU and
> which will re-fault again but the lru list should make that very
> unlikely.
>
>
> Note that for regular memory reclaim to make forward progress
> means that the GPU _must_ support asynchronous GPU page table
> update ie being able to unmap things from GPU page table and
> flush GPU TLB without having to wait for anything running on
> the GPU. This is where not all hardware might work. Only things
> populated through HMM need to be unmapped (dma/ttm/gem should
> not be considered unless they too can be unmapped without
> deadlock).
>
> For nouveau this is done through register write (see
> gf100_vmm_invalidate and gp100_vmm_invalidate_pdb and the
> GPU page table itself is updated through the CPU).
>
> I think issue for AMD here is that you rely on updating the
> GPU page table through the GPU command processor and this
> will not work with HMM. Unless you have a CP channel that
> can always make forward progress no matter what. Or update
> the GPU page table with the CPU and flush GPU TLB through
> some register write (iirc this should be doable on AMD but
> i forgetting things now).
>
>
> There is no need for preemption even if it is a nice to have
> feature (lazy GPU design architect and hardware designer ! ;)).
>
>
> Now if dma/ttm/gem GPU memory allocation use all the system
> memory then we are in trouble. This was the whole reason for
> limiting ourself, once upon a time, to not use over 50% of
> memory with dma/ttm/gem. I do not have any good way to solve
> that. If to make forward progress the GPU needs to allocate
> more memory but it already has exhausted all memory through
> dma/ttm/gem then there is just nothing we can do.
>
> I still believe we should limit dma/ttm/gem memory usage so
> that we never endup pinning all system memory in GPU realms.

Well this is exactly what we're trying to do here. And to be able to
shrink dma/ttm/gem memory you eventually have to call dma_fence_wait,
which is where the entire trouble comes in.

The other source of trouble is buffer based userptr, where we also
call dma_fence_wait.

And of course, through dependencies (even across gpus), this
dma_fence_wait could end up being blocked on running some (non-hmm)
context on the same gpu engine which is currently running your hmm
context and is stuck on some kind of fault.

Also, with the recent dma-fence annotation patch we already locked
down the rule that dma_fence_wait from memory reclaim paths is ok
(i915 already has no limit on system memory usage, and Christian
really wants to drop it for ttm drivers too).

> Hopes this help clarify requirement and expectation.

So ... what now?

I see a few options:
- require preempt. Tough on the hardware.
- rework the entire memory handling. Not sure that's possible, since
it would mean that HMM gpus and buffer-based memory management gpus
cannot co-exist on the same system. Flag days like that across an
entire subsystem don't work.
- gang scheduling, i.e. when a gpu is running an HMM context it must
guarantee that there's not a single buffer-based dma-buf/ttm/gem
context pending (and block any execbuf/cs ioctl before we submit a new
one)
- .. something else?

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-17 18:19                 ` Daniel Vetter
@ 2020-12-17 18:26                   ` Daniel Vetter
  2020-12-17 19:22                     ` Daniel Vetter
  2020-12-17 18:40                   ` Jerome Glisse
  1 sibling, 1 reply; 20+ messages in thread
From: Daniel Vetter @ 2020-12-17 18:26 UTC (permalink / raw)
  To: Jerome Glisse
  Cc: Ralph Campbell, Dave Airlie, John Hubbard, Roland Scheidegger,
	dri-devel, Huang Rui, VMware Graphics, Ben Skeggs,
	Christian König

On Thu, Dec 17, 2020 at 7:19 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Thu, Dec 17, 2020 at 7:09 PM Jerome Glisse <jglisse@redhat.com> wrote:
> >
> > Adding few folks on cc just to raise awareness and so that i
> > could get corrected if i said anything wrong.
> >
> > On Thu, Dec 17, 2020 at 04:45:55PM +0100, Daniel Vetter wrote:
> > > On Thu, Dec 17, 2020 at 4:36 PM Christian König
> > > <christian.koenig@amd.com> wrote:
> > > > Am 17.12.20 um 16:26 schrieb Daniel Vetter:
> > > > > On Thu, Dec 17, 2020 at 4:10 PM Christian König
> > > > > <christian.koenig@amd.com> wrote:
> > > > >> Am 17.12.20 um 15:36 schrieb Daniel Vetter:
> > > > >>> On Thu, Dec 17, 2020 at 2:46 PM Christian König
> > > > >>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > > > >>>> Am 16.12.20 um 16:09 schrieb Daniel Vetter:
> > > > >>>>> On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
> > > > >>>>> [SNIP]
> > > > >>>>>> +
> > > > >>>>>> +/* As long as pages are available make sure to release at least one */
> > > > >>>>>> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
> > > > >>>>>> +                                      struct shrink_control *sc)
> > > > >>>>>> +{
> > > > >>>>>> +    struct ttm_operation_ctx ctx = {
> > > > >>>>>> +            .no_wait_gpu = true
> > > > >>>>> Iirc there's an eventual shrinker limit where it gets desperate. I think
> > > > >>>>> once we hit that, we should allow gpu waits. But it's not passed to
> > > > >>>>> shrinkers for reasons, so maybe we should have a second round that tries
> > > > >>>>> to more actively shrink objects if we fell substantially short of what
> > > > >>>>> reclaim expected us to do?
> > > > >>>> I think we should try to avoid waiting for the GPU in the shrinker callback.
> > > > >>>>
> > > > >>>> When we get HMM we will have cases where the shrinker is called from
> > > > >>>> there and we can't wait for the GPU then without causing deadlocks.
> > > > >>> Uh that doesn't work. Also, the current rules are that you are allowed
> > > > >>> to call dma_fence_wait from shrinker callbacks, so that shipped sailed
> > > > >>> already. This is because shrinkers are a less restrictive context than
> > > > >>> mmu notifier invalidation, and we wait in there too.
> > > > >>>
> > > > >>> So if you can't wait in shrinkers, you also can't wait in mmu
> > > > >>> notifiers (and also not in HMM, wĥich is the same thing). Why do you
> > > > >>> need this?
> > > > >> The core concept of HMM is that pages are faulted in on demand and it is
> > > > >> perfectly valid for one of those pages to be on disk.
> > > > >>
> > > > >> So when a page fault happens we might need to be able to allocate memory
> > > > >> and fetch something from disk to handle that.
> > > > >>
> > > > >> When this memory allocation then in turn waits for the GPU which is
> > > > >> running the HMM process we are pretty much busted.
> > > > > Yeah you can't do that. That's the entire infinite fences discussions.
> > > >
> > > > Yes, exactly.
> > > >
> > > > > For HMM to work, we need to stop using dma_fence for userspace sync,
> > > >
> > > > I was considering of separating that into a dma_fence and a hmm_fence.
> > > > Or something like this.
> > >
> > > The trouble is that dma_fence it all its forms is uapi. And on gpus
> > > without page fault support dma_fence_wait is still required in
> > > allocation contexts. So creating a new kernel structure doesn't really
> > > solve anything I think, it needs entire new uapi completely decoupled
> > > from memory management. Last time we've done new uapi was probably
> > > modifiers, and that's still not rolled out years later.
> >
> > With hmm there should not be any fence ! You do not need them.
> > If you feel you need them than you are doing something horribly
> > wrong. See below on what HMM needs and what it means.
> >
> >
> > > > > and you can only use the amdkfd style preempt fences. And preempting
> > > > > while the pagefault is pending is I thought something we require.
> > > >
> > > > Yeah, problem is that most hardware can't do that :)
> > > >
> > > > Getting page faults to work is hard enough, preempting while waiting for
> > > > a fault to return is not something which was anticipated :)
> > >
> > > Hm last summer in a thread you said you've blocked that because it
> > > doesn't work. I agreed, page fault without preempt is rather tough to
> > > make work.
> > >
> > > > > Iow, the HMM page fault handler must not be a dma-fence critical
> > > > > section, i.e. it's not allowed to hold up any dma_fence, ever.
> > > >
> > > > What do you mean with that?
> > >
> > > dma_fence_signalling_begin/end() annotations essentially, i.e.
> > > cross-release dependencies. Or the other way round, if you want to be
> > > able to allocate memory you have to guarantee that you're never
> > > holding up a dma_fence.
> >
> > Correct nothing regarding dma/ttm/gem should creep into HMM code
> > path.
> >
> >
> > For HMM what you want when handling GPU fault is doing it without
> > holding any GPU driver locks so that the regular page fault handler
> > code path can go back into the GPU driver (through shrinker) without
> > worrying about it.
> >
> > This is how nouveau does it:
> >     - get event about page fault (might hold some GPU lock)
> >     - walk the event buffer to get all faulting addresses
> >       (might hold some GPU lock)
> >
> >     ! DROP ALL GPU/DRIVER LOCK !
> >
> >     - try to coallesce fault together (adjacent address
> >       trigger a fault for a single range)
> >     - call in HMM/mmu notifier helpers to handle the fault
> >     - take GPU lock (svmm->mutex for nouveau)
> >     - from the fault result update GPU page table
> >
> >     - Tell GPU it can resume (this can be done after releasing
> >       the lock below, it is just up to the device driver folks
> >       to decide when to do that).
> >
> >     - unlock GPU (svmm->mutex for nouveau)
> >
> > (see nouveau_svm.c look for range_fault)
> >
> > GPU page fault should never need to rely on shrinker to succeed
> > into reclaiming memory. The rational here is that regular memory
> > reclaim (which includes regular anonymous or mmap file back
> > memory) will be able to make forward progress even during GPU
> > page fault. This of course means that while servicing a GPU page
> > fault you might freeup memory that was also use by the GPU and
> > which will re-fault again but the lru list should make that very
> > unlikely.
> >
> >
> > Note that for regular memory reclaim to make forward progress
> > means that the GPU _must_ support asynchronous GPU page table
> > update ie being able to unmap things from GPU page table and
> > flush GPU TLB without having to wait for anything running on
> > the GPU. This is where not all hardware might work. Only things
> > populated through HMM need to be unmapped (dma/ttm/gem should
> > not be considered unless they too can be unmapped without
> > deadlock).
> >
> > For nouveau this is done through register write (see
> > gf100_vmm_invalidate and gp100_vmm_invalidate_pdb and the
> > GPU page table itself is updated through the CPU).
> >
> > I think issue for AMD here is that you rely on updating the
> > GPU page table through the GPU command processor and this
> > will not work with HMM. Unless you have a CP channel that
> > can always make forward progress no matter what. Or update
> > the GPU page table with the CPU and flush GPU TLB through
> > some register write (iirc this should be doable on AMD but
> > i forgetting things now).
> >
> >
> > There is no need for preemption even if it is a nice to have
> > feature (lazy GPU design architect and hardware designer ! ;)).
> >
> >
> > Now if dma/ttm/gem GPU memory allocation use all the system
> > memory then we are in trouble. This was the whole reason for
> > limiting ourself, once upon a time, to not use over 50% of
> > memory with dma/ttm/gem. I do not have any good way to solve
> > that. If to make forward progress the GPU needs to allocate
> > more memory but it already has exhausted all memory through
> > dma/ttm/gem then there is just nothing we can do.
> >
> > I still believe we should limit dma/ttm/gem memory usage so
> > that we never endup pinning all system memory in GPU realms.
>
> Well this is exactly what we're trying to do here. And to be able to
> shrink dma/ttm/gem memory you eventually have to call dma_fence_wait,
> which is where the entire trouble comes in.
>
> The other source of trouble is buffer based userptr, where we also
> call dma_fence_wait.
>
> And of course through depdencies (even across gpus) this
> dma_fence_wait could end up being blocked on running some (non-hmm)
> context on the same gpu engine which is currently running your hmm
> context and stuck on some kind of fault.
>
> Also, with the recent dma-fence annotation patch we already locked
> down the rule that dma_fence_wait from memory reclaim paths is ok
> (i915 already has no limit on system memory usage, and Christian
> really wants to drop it for ttm drivers too).
>
> > Hopes this help clarify requirement and expectation.
>
> So ... what now?
>
> I see a few options:
> - require preempt. Tough on the hardware.
> - rework the entire memory handling. Not sure that's possible, since
> it would mean that HMM gpus and buffer based memory mangement gpus
> cannot co-exist on the same system. Flag days like that across an
> entire subsystem dont work.
> - gang scheduling, i.e. when a gpu is running an HMM context is must
> guarantee that there's not a single buffer based dma-buf/ttm/gem
> context pending (and block any execbuf/cs ioctl before we submit a new
> one)

This here wouldn't be the worst solution, even on desktops:
- we already stall at execbuf for timeline semaphores, so this isn't even new
- when the gpu is occupied by an hmm task, the execbuf/cs ioctl can
issue a low-priority preempt request to make sure the desktop doesn't
freeze forever. As long as that happens before we have created a
dma_fence for that cs, it's fine to preempt the HMM context.
- preempted HMM tasks would immediately try to re-run, to make sure
that they get fair scheduling alongside the execbuf tasks

It's not going to be great since the pipeline goes down the gutter, but
no frozen desktop while running HMM jobs. And as long as you spend the
majority of the time running only one or the other, the gpu runs at full
speed with all the pipelining. So no perf impact on gaming or on
overnight compute jobs.
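
Very roughly, the gating could look something like this; the my_gpu_* names
and the counters are made up for illustration, and a real driver would tie
this into its scheduler and preemption support:

#include <linux/errno.h>
#include <linux/mutex.h>

struct my_gpu_sched {                   /* placeholder per-GPU state */
        struct mutex lock;
        unsigned int dma_fence_jobs;    /* execbuf/cs jobs with a dma_fence */
        unsigned int hmm_jobs;          /* page-faulting (HMM) contexts */
};

/* Placeholder: ask running HMM work to preempt. */
static void my_gpu_request_hmm_preempt(struct my_gpu_sched *sched);

/* Before creating the dma_fence for a new execbuf/cs submission. */
static int my_gpu_begin_dma_fence_job(struct my_gpu_sched *sched)
{
        int ret = 0;

        mutex_lock(&sched->lock);
        if (sched->hmm_jobs) {
                /* Safe to preempt: no dma_fence exists yet for this job. */
                my_gpu_request_hmm_preempt(sched);
                ret = -EAGAIN;          /* caller retries the submission */
        } else {
                sched->dma_fence_jobs++;
        }
        mutex_unlock(&sched->lock);
        return ret;
}

/* Before (re)starting an HMM context. */
static int my_gpu_begin_hmm_job(struct my_gpu_sched *sched)
{
        int ret = 0;

        mutex_lock(&sched->lock);
        if (sched->dma_fence_jobs)
                ret = -EBUSY;   /* a pending dma_fence may depend on the GPU */
        else
                sched->hmm_jobs++;
        mutex_unlock(&sched->lock);
        return ret;
}
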
-Daniel

> - .. something else?
>
> Cheers, Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch



-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-17 18:19                 ` Daniel Vetter
  2020-12-17 18:26                   ` Daniel Vetter
@ 2020-12-17 18:40                   ` Jerome Glisse
  2020-12-17 19:20                     ` Daniel Vetter
  1 sibling, 1 reply; 20+ messages in thread
From: Jerome Glisse @ 2020-12-17 18:40 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Ralph Campbell, Dave Airlie, John Hubbard, Roland Scheidegger,
	dri-devel, Huang Rui, VMware Graphics, Ben Skeggs,
	Christian König

On Thu, Dec 17, 2020 at 07:19:07PM +0100, Daniel Vetter wrote:
> On Thu, Dec 17, 2020 at 7:09 PM Jerome Glisse <jglisse@redhat.com> wrote:
> >
> > Adding few folks on cc just to raise awareness and so that i
> > could get corrected if i said anything wrong.
> >
> > On Thu, Dec 17, 2020 at 04:45:55PM +0100, Daniel Vetter wrote:
> > > On Thu, Dec 17, 2020 at 4:36 PM Christian König
> > > <christian.koenig@amd.com> wrote:
> > > > Am 17.12.20 um 16:26 schrieb Daniel Vetter:
> > > > > On Thu, Dec 17, 2020 at 4:10 PM Christian König
> > > > > <christian.koenig@amd.com> wrote:
> > > > >> Am 17.12.20 um 15:36 schrieb Daniel Vetter:
> > > > >>> On Thu, Dec 17, 2020 at 2:46 PM Christian König
> > > > >>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > > > >>>> Am 16.12.20 um 16:09 schrieb Daniel Vetter:
> > > > >>>>> On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
> > > > >>>>> [SNIP]
> > > > >>>>>> +
> > > > >>>>>> +/* As long as pages are available make sure to release at least one */
> > > > >>>>>> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
> > > > >>>>>> +                                      struct shrink_control *sc)
> > > > >>>>>> +{
> > > > >>>>>> +    struct ttm_operation_ctx ctx = {
> > > > >>>>>> +            .no_wait_gpu = true
> > > > >>>>> Iirc there's an eventual shrinker limit where it gets desperate. I think
> > > > >>>>> once we hit that, we should allow gpu waits. But it's not passed to
> > > > >>>>> shrinkers for reasons, so maybe we should have a second round that tries
> > > > >>>>> to more actively shrink objects if we fell substantially short of what
> > > > >>>>> reclaim expected us to do?
> > > > >>>> I think we should try to avoid waiting for the GPU in the shrinker callback.
> > > > >>>>
> > > > >>>> When we get HMM we will have cases where the shrinker is called from
> > > > >>>> there and we can't wait for the GPU then without causing deadlocks.
> > > > >>> Uh that doesn't work. Also, the current rules are that you are allowed
> > > > >>> to call dma_fence_wait from shrinker callbacks, so that shipped sailed
> > > > >>> already. This is because shrinkers are a less restrictive context than
> > > > >>> mmu notifier invalidation, and we wait in there too.
> > > > >>>
> > > > >>> So if you can't wait in shrinkers, you also can't wait in mmu
> > > > >>> notifiers (and also not in HMM, wĥich is the same thing). Why do you
> > > > >>> need this?
> > > > >> The core concept of HMM is that pages are faulted in on demand and it is
> > > > >> perfectly valid for one of those pages to be on disk.
> > > > >>
> > > > >> So when a page fault happens we might need to be able to allocate memory
> > > > >> and fetch something from disk to handle that.
> > > > >>
> > > > >> When this memory allocation then in turn waits for the GPU which is
> > > > >> running the HMM process we are pretty much busted.
> > > > > Yeah you can't do that. That's the entire infinite fences discussions.
> > > >
> > > > Yes, exactly.
> > > >
> > > > > For HMM to work, we need to stop using dma_fence for userspace sync,
> > > >
> > > > I was considering of separating that into a dma_fence and a hmm_fence.
> > > > Or something like this.
> > >
> > > The trouble is that dma_fence it all its forms is uapi. And on gpus
> > > without page fault support dma_fence_wait is still required in
> > > allocation contexts. So creating a new kernel structure doesn't really
> > > solve anything I think, it needs entire new uapi completely decoupled
> > > from memory management. Last time we've done new uapi was probably
> > > modifiers, and that's still not rolled out years later.
> >
> > With hmm there should not be any fence ! You do not need them.
> > If you feel you need them than you are doing something horribly
> > wrong. See below on what HMM needs and what it means.
> >
> >
> > > > > and you can only use the amdkfd style preempt fences. And preempting
> > > > > while the pagefault is pending is I thought something we require.
> > > >
> > > > Yeah, problem is that most hardware can't do that :)
> > > >
> > > > Getting page faults to work is hard enough, preempting while waiting for
> > > > a fault to return is not something which was anticipated :)
> > >
> > > Hm last summer in a thread you said you've blocked that because it
> > > doesn't work. I agreed, page fault without preempt is rather tough to
> > > make work.
> > >
> > > > > Iow, the HMM page fault handler must not be a dma-fence critical
> > > > > section, i.e. it's not allowed to hold up any dma_fence, ever.
> > > >
> > > > What do you mean with that?
> > >
> > > dma_fence_signalling_begin/end() annotations essentially, i.e.
> > > cross-release dependencies. Or the other way round, if you want to be
> > > able to allocate memory you have to guarantee that you're never
> > > holding up a dma_fence.
> >
> > Correct nothing regarding dma/ttm/gem should creep into HMM code
> > path.
> >
> >
> > For HMM what you want when handling GPU fault is doing it without
> > holding any GPU driver locks so that the regular page fault handler
> > code path can go back into the GPU driver (through shrinker) without
> > worrying about it.
> >
> > This is how nouveau does it:
> >     - get event about page fault (might hold some GPU lock)
> >     - walk the event buffer to get all faulting addresses
> >       (might hold some GPU lock)
> >
> >     ! DROP ALL GPU/DRIVER LOCK !
> >
> >     - try to coallesce fault together (adjacent address
> >       trigger a fault for a single range)
> >     - call in HMM/mmu notifier helpers to handle the fault
> >     - take GPU lock (svmm->mutex for nouveau)
> >     - from the fault result update GPU page table
> >
> >     - Tell GPU it can resume (this can be done after releasing
> >       the lock below, it is just up to the device driver folks
> >       to decide when to do that).
> >
> >     - unlock GPU (svmm->mutex for nouveau)
> >
> > (see nouveau_svm.c look for range_fault)
> >
> > GPU page fault should never need to rely on shrinker to succeed
> > into reclaiming memory. The rational here is that regular memory
> > reclaim (which includes regular anonymous or mmap file back
> > memory) will be able to make forward progress even during GPU
> > page fault. This of course means that while servicing a GPU page
> > fault you might freeup memory that was also use by the GPU and
> > which will re-fault again but the lru list should make that very
> > unlikely.
> >
> >
> > Note that for regular memory reclaim to make forward progress
> > means that the GPU _must_ support asynchronous GPU page table
> > update ie being able to unmap things from GPU page table and
> > flush GPU TLB without having to wait for anything running on
> > the GPU. This is where not all hardware might work. Only things
> > populated through HMM need to be unmapped (dma/ttm/gem should
> > not be considered unless they too can be unmapped without
> > deadlock).
> >
> > For nouveau this is done through register write (see
> > gf100_vmm_invalidate and gp100_vmm_invalidate_pdb and the
> > GPU page table itself is updated through the CPU).
> >
> > I think issue for AMD here is that you rely on updating the
> > GPU page table through the GPU command processor and this
> > will not work with HMM. Unless you have a CP channel that
> > can always make forward progress no matter what. Or update
> > the GPU page table with the CPU and flush GPU TLB through
> > some register write (iirc this should be doable on AMD but
> > i forgetting things now).
> >
> >
> > There is no need for preemption even if it is a nice to have
> > feature (lazy GPU design architect and hardware designer ! ;)).
> >
> >
> > Now if dma/ttm/gem GPU memory allocation use all the system
> > memory then we are in trouble. This was the whole reason for
> > limiting ourself, once upon a time, to not use over 50% of
> > memory with dma/ttm/gem. I do not have any good way to solve
> > that. If to make forward progress the GPU needs to allocate
> > more memory but it already has exhausted all memory through
> > dma/ttm/gem then there is just nothing we can do.
> >
> > I still believe we should limit dma/ttm/gem memory usage so
> > that we never endup pinning all system memory in GPU realms.
> 
> Well this is exactly what we're trying to do here. And to be able to
> shrink dma/ttm/gem memory you eventually have to call dma_fence_wait,
> which is where the entire trouble comes in.
> 
> The other source of trouble is buffer based userptr, where we also
> call dma_fence_wait.
> 
> And of course through depdencies (even across gpus) this
> dma_fence_wait could end up being blocked on running some (non-hmm)
> context on the same gpu engine which is currently running your hmm
> context and stuck on some kind of fault.
> 
> Also, with the recent dma-fence annotation patch we already locked
> down the rule that dma_fence_wait from memory reclaim paths is ok
> (i915 already has no limit on system memory usage, and Christian
> really wants to drop it for ttm drivers too).
> 
> > Hopes this help clarify requirement and expectation.
> 
> So ... what now?
> 
> I see a few options:
> - require preempt. Tough on the hardware.
> - rework the entire memory handling. Not sure that's possible, since
> it would mean that HMM gpus and buffer based memory mangement gpus
> cannot co-exist on the same system. Flag days like that across an
> entire subsystem dont work.
> - gang scheduling, i.e. when a gpu is running an HMM context is must
> guarantee that there's not a single buffer based dma-buf/ttm/gem
> context pending (and block any execbuf/cs ioctl before we submit a new
> one)

HMM should not play a role here; HMM should be the polite user, assuming
you follow what I laid out above, i.e. HMM never pins memory and always
allows memory reclaim to make forward progress without having to
use any kind of fence.

The issue is all about dma/ttm/gem and shrinker needing to wait for
fences for those.


> - .. something else?
> 

One of the new ideas I had on the topic is to have the GPU driver
scheduler (the thing that actually schedules userspace command buffers
in the kernel) keep track of how much memory it has pinned through its
command queue, and refuse to schedule anything more above some
threshold (it would wait until pending GPU activity finishes up,
which makes the memory use gauge go down below the threshold).
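
Roughly what I have in mind, just as a sketch (none of this is an
existing API, every name below is made up):

#include <linux/spinlock.h>
#include <linux/wait.h>

/* Per-scheduler accounting of memory pinned by in-flight jobs. */
struct sched_mem_account {
        spinlock_t lock;
        unsigned long pinned_pages;     /* pinned by queued/running jobs */
        unsigned long threshold;        /* e.g. a fraction of totalram_pages() */
        wait_queue_head_t wait;
};

/* Call from the scheduler before pushing a job to the hardware ring. */
static int sched_throttle_job(struct sched_mem_account *acct,
                              unsigned long job_pages)
{
        spin_lock(&acct->lock);
        while (acct->pinned_pages + job_pages > acct->threshold) {
                spin_unlock(&acct->lock);
                /* wait for already scheduled jobs to finish and unpin */
                if (wait_event_interruptible(acct->wait,
                                READ_ONCE(acct->pinned_pages) + job_pages <=
                                acct->threshold))
                        return -ERESTARTSYS;
                spin_lock(&acct->lock);
        }
        acct->pinned_pages += job_pages;
        spin_unlock(&acct->lock);
        return 0;
}

/* Call when a job completes, e.g. from the scheduler's free_job path. */
static void sched_job_done(struct sched_mem_account *acct,
                           unsigned long job_pages)
{
        spin_lock(&acct->lock);
        acct->pinned_pages -= job_pages;
        spin_unlock(&acct->lock);
        wake_up_all(&acct->wait);
}

With something like this, the drawback mentioned below is simply that
a job with job_pages above the threshold can never be admitted.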

The big issue is with userspace-managed command rings. I am not sure
what all the hardware capabilities are there, but maybe if we can
unbind (from the kernel) the userspace ring any time we cross some
threshold, then we can circle back on that idea.

Another drawback is that you cannot schedule a single job that uses
more memory than the threshold.


Also, one way to minimize the issue might be to swap out dma/ttm/gem
objects small chunk by small chunk instead of having to first allocate
the whole object's size worth of system memory before swapping it out.
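
I.e. something along these lines instead of allocating backing storage
for the whole object upfront (pure sketch; struct swap_backend and
swap_backend_write() do not exist, they stand in for whatever writes
pages to shmem/swap):

#include <linux/mm.h>
#include <linux/kernel.h>

#define SWAP_CHUNK_PAGES 64     /* 256KiB chunks, completely arbitrary */

struct swap_backend;            /* placeholder for the real backing storage */
int swap_backend_write(struct swap_backend *swap, unsigned long offset,
                       struct page **pages, unsigned long n);

static int swapout_in_chunks(struct swap_backend *swap,
                             struct page **pages, unsigned long num_pages)
{
        unsigned long i, j, n;
        int ret;

        for (i = 0; i < num_pages; i += n) {
                n = min_t(unsigned long, num_pages - i, SWAP_CHUNK_PAGES);

                /* write one chunk out to the swap backend */
                ret = swap_backend_write(swap, i, &pages[i], n);
                if (ret)
                        return ret;

                /* drop our references so this chunk can be reclaimed now */
                for (j = 0; j < n; j++)
                        put_page(pages[i + j]);
        }
        return 0;
}

That way the peak extra memory needed during swapout is one chunk
instead of the full object size.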

Jérôme

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-17 18:40                   ` Jerome Glisse
@ 2020-12-17 19:20                     ` Daniel Vetter
  0 siblings, 0 replies; 20+ messages in thread
From: Daniel Vetter @ 2020-12-17 19:20 UTC (permalink / raw)
  To: Jerome Glisse
  Cc: Ralph Campbell, Dave Airlie, John Hubbard, Roland Scheidegger,
	dri-devel, Huang Rui, VMware Graphics, Ben Skeggs,
	Christian König

On Thu, Dec 17, 2020 at 7:40 PM Jerome Glisse <jglisse@redhat.com> wrote:
>
> On Thu, Dec 17, 2020 at 07:19:07PM +0100, Daniel Vetter wrote:
> > On Thu, Dec 17, 2020 at 7:09 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > >
> > > Adding few folks on cc just to raise awareness and so that i
> > > could get corrected if i said anything wrong.
> > >
> > > On Thu, Dec 17, 2020 at 04:45:55PM +0100, Daniel Vetter wrote:
> > > > On Thu, Dec 17, 2020 at 4:36 PM Christian König
> > > > <christian.koenig@amd.com> wrote:
> > > > > Am 17.12.20 um 16:26 schrieb Daniel Vetter:
> > > > > > On Thu, Dec 17, 2020 at 4:10 PM Christian König
> > > > > > <christian.koenig@amd.com> wrote:
> > > > > >> Am 17.12.20 um 15:36 schrieb Daniel Vetter:
> > > > > >>> On Thu, Dec 17, 2020 at 2:46 PM Christian König
> > > > > >>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > > > > >>>> Am 16.12.20 um 16:09 schrieb Daniel Vetter:
> > > > > >>>>> On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
> > > > > >>>>> [SNIP]
> > > > > >>>>>> +
> > > > > >>>>>> +/* As long as pages are available make sure to release at least one */
> > > > > >>>>>> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
> > > > > >>>>>> +                                      struct shrink_control *sc)
> > > > > >>>>>> +{
> > > > > >>>>>> +    struct ttm_operation_ctx ctx = {
> > > > > >>>>>> +            .no_wait_gpu = true
> > > > > >>>>> Iirc there's an eventual shrinker limit where it gets desperate. I think
> > > > > >>>>> once we hit that, we should allow gpu waits. But it's not passed to
> > > > > >>>>> shrinkers for reasons, so maybe we should have a second round that tries
> > > > > >>>>> to more actively shrink objects if we fell substantially short of what
> > > > > >>>>> reclaim expected us to do?
> > > > > >>>> I think we should try to avoid waiting for the GPU in the shrinker callback.
> > > > > >>>>
> > > > > >>>> When we get HMM we will have cases where the shrinker is called from
> > > > > >>>> there and we can't wait for the GPU then without causing deadlocks.
> > > > > >>> Uh that doesn't work. Also, the current rules are that you are allowed
> > > > > >>> to call dma_fence_wait from shrinker callbacks, so that shipped sailed
> > > > > >>> already. This is because shrinkers are a less restrictive context than
> > > > > >>> mmu notifier invalidation, and we wait in there too.
> > > > > >>>
> > > > > >>> So if you can't wait in shrinkers, you also can't wait in mmu
> > > > > >>> notifiers (and also not in HMM, which is the same thing). Why do you
> > > > > >>> need this?
> > > > > >> The core concept of HMM is that pages are faulted in on demand and it is
> > > > > >> perfectly valid for one of those pages to be on disk.
> > > > > >>
> > > > > >> So when a page fault happens we might need to be able to allocate memory
> > > > > >> and fetch something from disk to handle that.
> > > > > >>
> > > > > >> When this memory allocation then in turn waits for the GPU which is
> > > > > >> running the HMM process we are pretty much busted.
> > > > > > Yeah you can't do that. That's the entire infinite fences discussions.
> > > > >
> > > > > Yes, exactly.
> > > > >
> > > > > > For HMM to work, we need to stop using dma_fence for userspace sync,
> > > > >
> > > > > I was considering of separating that into a dma_fence and a hmm_fence.
> > > > > Or something like this.
> > > >
> > > > The trouble is that dma_fence it all its forms is uapi. And on gpus
> > > > without page fault support dma_fence_wait is still required in
> > > > allocation contexts. So creating a new kernel structure doesn't really
> > > > solve anything I think, it needs entire new uapi completely decoupled
> > > > from memory management. Last time we've done new uapi was probably
> > > > modifiers, and that's still not rolled out years later.
> > >
> > > With hmm there should not be any fence ! You do not need them.
> > > If you feel you need them than you are doing something horribly
> > > wrong. See below on what HMM needs and what it means.
> > >
> > >
> > > > > > and you can only use the amdkfd style preempt fences. And preempting
> > > > > > while the pagefault is pending is I thought something we require.
> > > > >
> > > > > Yeah, problem is that most hardware can't do that :)
> > > > >
> > > > > Getting page faults to work is hard enough, preempting while waiting for
> > > > > a fault to return is not something which was anticipated :)
> > > >
> > > > Hm last summer in a thread you said you've blocked that because it
> > > > doesn't work. I agreed, page fault without preempt is rather tough to
> > > > make work.
> > > >
> > > > > > Iow, the HMM page fault handler must not be a dma-fence critical
> > > > > > section, i.e. it's not allowed to hold up any dma_fence, ever.
> > > > >
> > > > > What do you mean with that?
> > > >
> > > > dma_fence_signalling_begin/end() annotations essentially, i.e.
> > > > cross-release dependencies. Or the other way round, if you want to be
> > > > able to allocate memory you have to guarantee that you're never
> > > > holding up a dma_fence.
> > >
> > > Correct nothing regarding dma/ttm/gem should creep into HMM code
> > > path.
> > >
> > >
> > > For HMM what you want when handling GPU fault is doing it without
> > > holding any GPU driver locks so that the regular page fault handler
> > > code path can go back into the GPU driver (through shrinker) without
> > > worrying about it.
> > >
> > > This is how nouveau does it:
> > >     - get event about page fault (might hold some GPU lock)
> > >     - walk the event buffer to get all faulting addresses
> > >       (might hold some GPU lock)
> > >
> > >     ! DROP ALL GPU/DRIVER LOCK !
> > >
> > >     - try to coallesce fault together (adjacent address
> > >       trigger a fault for a single range)
> > >     - call in HMM/mmu notifier helpers to handle the fault
> > >     - take GPU lock (svmm->mutex for nouveau)
> > >     - from the fault result update GPU page table
> > >
> > >     - Tell GPU it can resume (this can be done after releasing
> > >       the lock below, it is just up to the device driver folks
> > >       to decide when to do that).
> > >
> > >     - unlock GPU (svmm->mutex for nouveau)
> > >
> > > (see nouveau_svm.c look for range_fault)
> > >
> > > GPU page fault should never need to rely on shrinker to succeed
> > > into reclaiming memory. The rational here is that regular memory
> > > reclaim (which includes regular anonymous or mmap file back
> > > memory) will be able to make forward progress even during GPU
> > > page fault. This of course means that while servicing a GPU page
> > > fault you might freeup memory that was also use by the GPU and
> > > which will re-fault again but the lru list should make that very
> > > unlikely.
> > >
> > >
> > > Note that for regular memory reclaim to make forward progress
> > > means that the GPU _must_ support asynchronous GPU page table
> > > update ie being able to unmap things from GPU page table and
> > > flush GPU TLB without having to wait for anything running on
> > > the GPU. This is where not all hardware might work. Only things
> > > populated through HMM need to be unmapped (dma/ttm/gem should
> > > not be considered unless they too can be unmapped without
> > > deadlock).
> > >
> > > For nouveau this is done through register write (see
> > > gf100_vmm_invalidate and gp100_vmm_invalidate_pdb and the
> > > GPU page table itself is updated through the CPU).
> > >
> > > I think issue for AMD here is that you rely on updating the
> > > GPU page table through the GPU command processor and this
> > > will not work with HMM. Unless you have a CP channel that
> > > can always make forward progress no matter what. Or update
> > > the GPU page table with the CPU and flush GPU TLB through
> > > some register write (iirc this should be doable on AMD but
> > > i forgetting things now).
> > >
> > >
> > > There is no need for preemption even if it is a nice to have
> > > feature (lazy GPU design architect and hardware designer ! ;)).
> > >
> > >
> > > Now if dma/ttm/gem GPU memory allocation use all the system
> > > memory then we are in trouble. This was the whole reason for
> > > limiting ourself, once upon a time, to not use over 50% of
> > > memory with dma/ttm/gem. I do not have any good way to solve
> > > that. If to make forward progress the GPU needs to allocate
> > > more memory but it already has exhausted all memory through
> > > dma/ttm/gem then there is just nothing we can do.
> > >
> > > I still believe we should limit dma/ttm/gem memory usage so
> > > that we never endup pinning all system memory in GPU realms.
> >
> > Well this is exactly what we're trying to do here. And to be able to
> > shrink dma/ttm/gem memory you eventually have to call dma_fence_wait,
> > which is where the entire trouble comes in.
> >
> > The other source of trouble is buffer based userptr, where we also
> > call dma_fence_wait.
> >
> > And of course through depdencies (even across gpus) this
> > dma_fence_wait could end up being blocked on running some (non-hmm)
> > context on the same gpu engine which is currently running your hmm
> > context and stuck on some kind of fault.
> >
> > Also, with the recent dma-fence annotation patch we already locked
> > down the rule that dma_fence_wait from memory reclaim paths is ok
> > (i915 already has no limit on system memory usage, and Christian
> > really wants to drop it for ttm drivers too).
> >
> > > Hopes this help clarify requirement and expectation.
> >
> > So ... what now?
> >
> > I see a few options:
> > - require preempt. Tough on the hardware.
> > - rework the entire memory handling. Not sure that's possible, since
> > it would mean that HMM gpus and buffer based memory mangement gpus
> > cannot co-exist on the same system. Flag days like that across an
> > entire subsystem dont work.
> > - gang scheduling, i.e. when a gpu is running an HMM context is must
> > guarantee that there's not a single buffer based dma-buf/ttm/gem
> > context pending (and block any execbuf/cs ioctl before we submit a new
> > one)
>
> HMM should not play a role here; HMM should be the polite user, assuming
> you follow what I laid out above, i.e. HMM never pins memory and always
> allows memory reclaim to make forward progress without having to
> use any kind of fence.

HMM is not pinning any memory with this scheme. That wouldn't work.

> The issue is all about dma/ttm/gem and shrinker needing to wait for
> fences for those.
>
>
> > - .. something else?
> >
>
> One of the new ideas I had on the topic is to have the GPU driver
> scheduler (the thing that actually schedules userspace command buffers
> in the kernel) keep track of how much memory it has pinned through its
> command queue, and refuse to schedule anything more above some
> threshold (it would wait until pending GPU activity finishes up,
> which makes the memory use gauge go down below the threshold).

"pin all gpu memory, fixed limit" is what we're trying to get away from.

We could make it more flexible with our own gpu shrinker as a side
wagon, like the existing ttm_shrink_work. But that's again not great,
because then we're trying to balance memory usage without being
integrated into the core mm reclaim logic, and that doesn't work that
well from a control point of view.
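
For illustration, the core-mm integrated shrinker from the v2 patch
could grow the "second, more desperate round" idea from earlier in the
thread roughly like this. Big caveat: this assumes ttm_bo_swapout() is
changed to return the number of pages it freed (today it returns 0 or
a negative error), and the 50% heuristic is completely made up:

#include <linux/shrinker.h>
#include <linux/gfp.h>
#include <drm/ttm/ttm_bo_api.h>

static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
                                          struct shrink_control *sc)
{
        struct ttm_operation_ctx ctx = { .no_wait_gpu = true };
        unsigned long freed = 0;
        int ret;

        /* round 1: only swap out BOs which are already idle */
        while (freed < sc->nr_to_scan) {
                ret = ttm_bo_swapout(&ctx, GFP_NOFS);
                if (ret <= 0)
                        break;
                freed += ret;
        }

        /* round 2: we fell well short, allow blocking on fences */
        if (freed < sc->nr_to_scan / 2) {
                ctx.no_wait_gpu = false;
                ret = ttm_bo_swapout(&ctx, GFP_NOFS);
                if (ret > 0)
                        freed += ret;
        }

        return freed ? freed : SHRINK_STOP;
}

Whether round 2 is actually safe is exactly the dma_fence_wait-from-
reclaim question above, so treat this as a strawman.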

> The big issue is with userspace-managed command rings. I am not sure
> what all the hardware capabilities are there, but maybe if we can
> unbind (from the kernel) the userspace ring any time we cross some
> threshold, then we can circle back on that idea.

A userspace-managed ring needs ctx preemption or you have infinite
fences. I think that should be clear, and I think this is orthogonal
to the entire hmm/page fault handling topic.
-Daniel

> Another drawback is that you cannot schedule a single job that uses
> more memory than the threshold.
>
>
> Also, one way to minimize the issue might be to swap out dma/ttm/gem
> objects small chunk by small chunk instead of having to first allocate
> the whole object's size worth of system memory before swapping it out.
>
> Jérôme
>


-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2
  2020-12-17 18:26                   ` Daniel Vetter
@ 2020-12-17 19:22                     ` Daniel Vetter
  0 siblings, 0 replies; 20+ messages in thread
From: Daniel Vetter @ 2020-12-17 19:22 UTC (permalink / raw)
  To: Jerome Glisse
  Cc: Ralph Campbell, Dave Airlie, John Hubbard, Roland Scheidegger,
	dri-devel, Huang Rui, VMware Graphics, Ben Skeggs,
	Christian König

On Thu, Dec 17, 2020 at 7:26 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Thu, Dec 17, 2020 at 7:19 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Thu, Dec 17, 2020 at 7:09 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > >
> > > Adding few folks on cc just to raise awareness and so that i
> > > could get corrected if i said anything wrong.
> > >
> > > On Thu, Dec 17, 2020 at 04:45:55PM +0100, Daniel Vetter wrote:
> > > > On Thu, Dec 17, 2020 at 4:36 PM Christian König
> > > > <christian.koenig@amd.com> wrote:
> > > > > Am 17.12.20 um 16:26 schrieb Daniel Vetter:
> > > > > > On Thu, Dec 17, 2020 at 4:10 PM Christian König
> > > > > > <christian.koenig@amd.com> wrote:
> > > > > >> Am 17.12.20 um 15:36 schrieb Daniel Vetter:
> > > > > >>> On Thu, Dec 17, 2020 at 2:46 PM Christian König
> > > > > >>> <ckoenig.leichtzumerken@gmail.com> wrote:
> > > > > >>>> Am 16.12.20 um 16:09 schrieb Daniel Vetter:
> > > > > >>>>> On Wed, Dec 16, 2020 at 03:04:26PM +0100, Christian König wrote:
> > > > > >>>>> [SNIP]
> > > > > >>>>>> +
> > > > > >>>>>> +/* As long as pages are available make sure to release at least one */
> > > > > >>>>>> +static unsigned long ttm_tt_shrinker_scan(struct shrinker *shrink,
> > > > > >>>>>> +                                      struct shrink_control *sc)
> > > > > >>>>>> +{
> > > > > >>>>>> +    struct ttm_operation_ctx ctx = {
> > > > > >>>>>> +            .no_wait_gpu = true
> > > > > >>>>> Iirc there's an eventual shrinker limit where it gets desperate. I think
> > > > > >>>>> once we hit that, we should allow gpu waits. But it's not passed to
> > > > > >>>>> shrinkers for reasons, so maybe we should have a second round that tries
> > > > > >>>>> to more actively shrink objects if we fell substantially short of what
> > > > > >>>>> reclaim expected us to do?
> > > > > >>>> I think we should try to avoid waiting for the GPU in the shrinker callback.
> > > > > >>>>
> > > > > >>>> When we get HMM we will have cases where the shrinker is called from
> > > > > >>>> there and we can't wait for the GPU then without causing deadlocks.
> > > > > >>> Uh that doesn't work. Also, the current rules are that you are allowed
> > > > > >>> to call dma_fence_wait from shrinker callbacks, so that shipped sailed
> > > > > >>> already. This is because shrinkers are a less restrictive context than
> > > > > >>> mmu notifier invalidation, and we wait in there too.
> > > > > >>>
> > > > > >>> So if you can't wait in shrinkers, you also can't wait in mmu
> > > > > >>> notifiers (and also not in HMM, which is the same thing). Why do you
> > > > > >>> need this?
> > > > > >> The core concept of HMM is that pages are faulted in on demand and it is
> > > > > >> perfectly valid for one of those pages to be on disk.
> > > > > >>
> > > > > >> So when a page fault happens we might need to be able to allocate memory
> > > > > >> and fetch something from disk to handle that.
> > > > > >>
> > > > > >> When this memory allocation then in turn waits for the GPU which is
> > > > > >> running the HMM process we are pretty much busted.
> > > > > > Yeah you can't do that. That's the entire infinite fences discussions.
> > > > >
> > > > > Yes, exactly.
> > > > >
> > > > > > For HMM to work, we need to stop using dma_fence for userspace sync,
> > > > >
> > > > > I was considering of separating that into a dma_fence and a hmm_fence.
> > > > > Or something like this.
> > > >
> > > > The trouble is that dma_fence it all its forms is uapi. And on gpus
> > > > without page fault support dma_fence_wait is still required in
> > > > allocation contexts. So creating a new kernel structure doesn't really
> > > > solve anything I think, it needs entire new uapi completely decoupled
> > > > from memory management. Last time we've done new uapi was probably
> > > > modifiers, and that's still not rolled out years later.
> > >
> > > With hmm there should not be any fence ! You do not need them.
> > > If you feel you need them than you are doing something horribly
> > > wrong. See below on what HMM needs and what it means.
> > >
> > >
> > > > > > and you can only use the amdkfd style preempt fences. And preempting
> > > > > > while the pagefault is pending is I thought something we require.
> > > > >
> > > > > Yeah, problem is that most hardware can't do that :)
> > > > >
> > > > > Getting page faults to work is hard enough, preempting while waiting for
> > > > > a fault to return is not something which was anticipated :)
> > > >
> > > > Hm last summer in a thread you said you've blocked that because it
> > > > doesn't work. I agreed, page fault without preempt is rather tough to
> > > > make work.
> > > >
> > > > > > Iow, the HMM page fault handler must not be a dma-fence critical
> > > > > > section, i.e. it's not allowed to hold up any dma_fence, ever.
> > > > >
> > > > > What do you mean with that?
> > > >
> > > > dma_fence_signalling_begin/end() annotations essentially, i.e.
> > > > cross-release dependencies. Or the other way round, if you want to be
> > > > able to allocate memory you have to guarantee that you're never
> > > > holding up a dma_fence.
> > >
> > > Correct nothing regarding dma/ttm/gem should creep into HMM code
> > > path.
> > >
> > >
> > > For HMM what you want when handling GPU fault is doing it without
> > > holding any GPU driver locks so that the regular page fault handler
> > > code path can go back into the GPU driver (through shrinker) without
> > > worrying about it.
> > >
> > > This is how nouveau does it:
> > >     - get event about page fault (might hold some GPU lock)
> > >     - walk the event buffer to get all faulting addresses
> > >       (might hold some GPU lock)
> > >
> > >     ! DROP ALL GPU/DRIVER LOCK !
> > >
> > >     - try to coallesce fault together (adjacent address
> > >       trigger a fault for a single range)
> > >     - call in HMM/mmu notifier helpers to handle the fault
> > >     - take GPU lock (svmm->mutex for nouveau)
> > >     - from the fault result update GPU page table
> > >
> > >     - Tell GPU it can resume (this can be done after releasing
> > >       the lock below, it is just up to the device driver folks
> > >       to decide when to do that).
> > >
> > >     - unlock GPU (svmm->mutex for nouveau)
> > >
> > > (see nouveau_svm.c look for range_fault)
> > >
> > > GPU page fault should never need to rely on shrinker to succeed
> > > into reclaiming memory. The rational here is that regular memory
> > > reclaim (which includes regular anonymous or mmap file back
> > > memory) will be able to make forward progress even during GPU
> > > page fault. This of course means that while servicing a GPU page
> > > fault you might freeup memory that was also use by the GPU and
> > > which will re-fault again but the lru list should make that very
> > > unlikely.
> > >
> > >
> > > Note that for regular memory reclaim to make forward progress
> > > means that the GPU _must_ support asynchronous GPU page table
> > > update ie being able to unmap things from GPU page table and
> > > flush GPU TLB without having to wait for anything running on
> > > the GPU. This is where not all hardware might work. Only things
> > > populated through HMM need to be unmapped (dma/ttm/gem should
> > > not be considered unless they too can be unmapped without
> > > deadlock).
> > >
> > > For nouveau this is done through register write (see
> > > gf100_vmm_invalidate and gp100_vmm_invalidate_pdb and the
> > > GPU page table itself is updated through the CPU).
> > >
> > > I think issue for AMD here is that you rely on updating the
> > > GPU page table through the GPU command processor and this
> > > will not work with HMM. Unless you have a CP channel that
> > > can always make forward progress no matter what. Or update
> > > the GPU page table with the CPU and flush GPU TLB through
> > > some register write (iirc this should be doable on AMD but
> > > i forgetting things now).
> > >
> > >
> > > There is no need for preemption even if it is a nice to have
> > > feature (lazy GPU design architect and hardware designer ! ;)).
> > >
> > >
> > > Now if dma/ttm/gem GPU memory allocation use all the system
> > > memory then we are in trouble. This was the whole reason for
> > > limiting ourself, once upon a time, to not use over 50% of
> > > memory with dma/ttm/gem. I do not have any good way to solve
> > > that. If to make forward progress the GPU needs to allocate
> > > more memory but it already has exhausted all memory through
> > > dma/ttm/gem then there is just nothing we can do.
> > >
> > > I still believe we should limit dma/ttm/gem memory usage so
> > > that we never endup pinning all system memory in GPU realms.
> >
> > Well this is exactly what we're trying to do here. And to be able to
> > shrink dma/ttm/gem memory you eventually have to call dma_fence_wait,
> > which is where the entire trouble comes in.
> >
> > The other source of trouble is buffer based userptr, where we also
> > call dma_fence_wait.
> >
> > And of course through depdencies (even across gpus) this
> > dma_fence_wait could end up being blocked on running some (non-hmm)
> > context on the same gpu engine which is currently running your hmm
> > context and stuck on some kind of fault.
> >
> > Also, with the recent dma-fence annotation patch we already locked
> > down the rule that dma_fence_wait from memory reclaim paths is ok
> > (i915 already has no limit on system memory usage, and Christian
> > really wants to drop it for ttm drivers too).
> >
> > > Hopes this help clarify requirement and expectation.
> >
> > So ... what now?
> >
> > I see a few options:
> > - require preempt. Tough on the hardware.
> > - rework the entire memory handling. Not sure that's possible, since
> > it would mean that HMM gpus and buffer based memory mangement gpus
> > cannot co-exist on the same system. Flag days like that across an
> > entire subsystem dont work.
> > - gang scheduling, i.e. when a gpu is running an HMM context is must
> > guarantee that there's not a single buffer based dma-buf/ttm/gem
> > context pending (and block any execbuf/cs ioctl before we submit a new
> > one)
>
> This here wouldn't be the worst solution, even on desktops:
> - we already stall at execbuf for timeline semaphores, so this isn't even new
> - when the gpu is occupied by an hmm task, then execbuf/cs ioctl can
> issue a low-priority preempt request to make sure the desktop doesn't
> freeze forever. As long as that happens before we have a dma_fence for
> that cs created, it's all fine to preempt HMM context.
> - preempted HMM tasks would immediately try to re-run, to make sure
> that they get fair scheduling between the execbuf tasks
>
> It's not going to be great since the pipeline is down the gutters, but
> no frozen desktop while running HMM jobs. And as long as you do
> majority time only one or the other the gpu runs at full speed with
> all the pipelining. So no perf impact on gaming or on overnight
> compute jobs.

Also, since this gang scheduling between hmm mode and non-hmm mode is
per-engine, it also solves the problem of using the copy engines to
move vram or write pagetables: as long as the kernel guarantees to
never run an hmm task on such an engine (simplest to do by outright
reserving one for the kernel exclusively), you won't deadlock with a
dma_fence ctx and an hmm ctx biting each other. That way we don't have
to rewrite the entire vram management for an eventual ZONE_DEVICE
world. I think at least.
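
As a strawman, the per-engine admission check could be as dumb as this
(all structs and names here are invented, nothing driver specific):

#include <linux/mutex.h>

enum engine_mode { ENGINE_IDLE, ENGINE_DMA_FENCE, ENGINE_HMM };

struct engine_sched {
        struct mutex lock;
        enum engine_mode mode;          /* what the engine currently runs */
        unsigned int active_jobs;
};

/* Returns true if a job of the given mode may be queued right now,
 * false if the caller first has to drain/preempt the other mode.
 * The kernel-reserved copy engine simply never sees ENGINE_HMM. */
static bool engine_admit_job(struct engine_sched *e, enum engine_mode mode)
{
        bool ok;

        mutex_lock(&e->lock);
        ok = (e->mode == ENGINE_IDLE || e->mode == mode);
        if (ok) {
                e->mode = mode;
                e->active_jobs++;
        }
        mutex_unlock(&e->lock);
        return ok;
}

static void engine_job_done(struct engine_sched *e)
{
        mutex_lock(&e->lock);
        if (--e->active_jobs == 0)
                e->mode = ENGINE_IDLE;  /* the other mode may run now */
        mutex_unlock(&e->lock);
}

The important bit is that execbuf/cs blocks (or queues a preempt
request) while engine_admit_job() returns false, before any dma_fence
for the new job is published.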
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2020-12-17 19:22 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-16 14:04 [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2 Christian König
2020-12-16 14:04 ` [PATCH 2/2] drm/ttm: move memory accounting into vmwgfx Christian König
2020-12-16 15:26   ` Daniel Vetter
2020-12-16 16:55   ` kernel test robot
2020-12-16 16:55     ` kernel test robot
2020-12-16 18:19   ` kernel test robot
2020-12-16 18:19     ` kernel test robot
2020-12-16 15:09 ` [PATCH 1/2] drm/ttm: rework ttm_tt page limit v2 Daniel Vetter
2020-12-17 13:46   ` Christian König
2020-12-17 14:36     ` Daniel Vetter
2020-12-17 15:10       ` Christian König
2020-12-17 15:26         ` Daniel Vetter
2020-12-17 15:35           ` Christian König
2020-12-17 15:45             ` Daniel Vetter
2020-12-17 18:09               ` Jerome Glisse
2020-12-17 18:19                 ` Daniel Vetter
2020-12-17 18:26                   ` Daniel Vetter
2020-12-17 19:22                     ` Daniel Vetter
2020-12-17 18:40                   ` Jerome Glisse
2020-12-17 19:20                     ` Daniel Vetter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.