* [PATCH v2 0/6] Support module unload and hotunplug
@ 2021-11-05 19:38 Zack Rusin
  2021-11-05 19:38 ` [PATCH v2 1/6] drm/vmwgfx: Remove the deprecated lower mem limit Zack Rusin
                   ` (5 more replies)
  0 siblings, 6 replies; 24+ messages in thread
From: Zack Rusin @ 2021-11-05 19:38 UTC (permalink / raw)
  To: dri-devel; +Cc: Thomas Hellström, Christian König

v2: Introduces a TTM documentation change to clarify the outcome of the
discussion that happened after the first version of this series was sent.
Also removes the pointless "unlikely" in the "Introduce a new placement for
MOB page tables" commit that Thomas noticed.

This is a largely trivial set that makes vmwgfx support module reload
and PCI hot-unplug. It also makes IGT's core_hotunplug tests pass
instead of oops'ing the kernel.

Cc: Christian König <christian.koenig@amd.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>

Zack Rusin (6):
  drm/vmwgfx: Remove the deprecated lower mem limit
  drm/vmwgfx: Release ttm memory if probe fails
  drm/vmwgfx: Fail to initialize on broken configs
  drm/vmwgfx: Introduce a new placement for MOB page tables
  drm/vmwgfx: Switch the internal BO's to ttm_bo_type_kernel
  drm/ttm: Clarify that the TTM_PL_SYSTEM buffers need to stay idle

 drivers/gpu/drm/vmwgfx/Makefile               |  2 +-
 drivers/gpu/drm/vmwgfx/ttm_memory.c           | 99 +------------------
 drivers/gpu/drm/vmwgfx/ttm_memory.h           |  6 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_bo.c            |  2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c           |  7 ++
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c           | 40 +++++---
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.h           | 12 ++-
 .../gpu/drm/vmwgfx/vmwgfx_system_manager.c    | 90 +++++++++++++++++
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c    | 58 +++++------
 include/drm/ttm/ttm_placement.h               | 10 ++
 10 files changed, 174 insertions(+), 152 deletions(-)
 create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_system_manager.c

-- 
2.32.0


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v2 1/6] drm/vmwgfx: Remove the deprecated lower mem limit
  2021-11-05 19:38 [PATCH v2 0/6] Support module unload and hotunplug Zack Rusin
@ 2021-11-05 19:38 ` Zack Rusin
  2021-11-05 19:38 ` [PATCH v2 2/6] drm/vmwgfx: Release ttm memory if probe fails Zack Rusin
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 24+ messages in thread
From: Zack Rusin @ 2021-11-05 19:38 UTC (permalink / raw)
  To: dri-devel; +Cc: Martin Krastev

During the transition to the new page allocator TTM lost the ability
to constrain allocations via the lower_mem_limit. The code has been
unused since commit 256dd44bd897 ("drm/ttm: nuke old page allocator")
and there's no reason to keep it.

Fixes: 256dd44bd897 ("drm/ttm: nuke old page allocator")
Signed-off-by: Zack Rusin <zackr@vmware.com>
Reviewed-by: Martin Krastev <krastevm@vmware.com>
---
 drivers/gpu/drm/vmwgfx/ttm_memory.c | 99 +----------------------------
 drivers/gpu/drm/vmwgfx/ttm_memory.h |  6 +-
 2 files changed, 2 insertions(+), 103 deletions(-)

diff --git a/drivers/gpu/drm/vmwgfx/ttm_memory.c b/drivers/gpu/drm/vmwgfx/ttm_memory.c
index edd17c30d5a5..2ced4c06ca45 100644
--- a/drivers/gpu/drm/vmwgfx/ttm_memory.c
+++ b/drivers/gpu/drm/vmwgfx/ttm_memory.c
@@ -34,7 +34,6 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/swap.h>
 
 #include <drm/drm_device.h>
 #include <drm/drm_file.h>
@@ -173,69 +172,7 @@ static struct kobj_type ttm_mem_zone_kobj_type = {
 	.sysfs_ops = &ttm_mem_zone_ops,
 	.default_attrs = ttm_mem_zone_attrs,
 };
-
-static struct attribute ttm_mem_global_lower_mem_limit = {
-	.name = "lower_mem_limit",
-	.mode = S_IRUGO | S_IWUSR
-};
-
-static ssize_t ttm_mem_global_show(struct kobject *kobj,
-				 struct attribute *attr,
-				 char *buffer)
-{
-	struct ttm_mem_global *glob =
-		container_of(kobj, struct ttm_mem_global, kobj);
-	uint64_t val = 0;
-
-	spin_lock(&glob->lock);
-	val = glob->lower_mem_limit;
-	spin_unlock(&glob->lock);
-	/* convert from number of pages to KB */
-	val <<= (PAGE_SHIFT - 10);
-	return snprintf(buffer, PAGE_SIZE, "%llu\n",
-			(unsigned long long) val);
-}
-
-static ssize_t ttm_mem_global_store(struct kobject *kobj,
-				  struct attribute *attr,
-				  const char *buffer,
-				  size_t size)
-{
-	int chars;
-	uint64_t val64;
-	unsigned long val;
-	struct ttm_mem_global *glob =
-		container_of(kobj, struct ttm_mem_global, kobj);
-
-	chars = sscanf(buffer, "%lu", &val);
-	if (chars == 0)
-		return size;
-
-	val64 = val;
-	/* convert from KB to number of pages */
-	val64 >>= (PAGE_SHIFT - 10);
-
-	spin_lock(&glob->lock);
-	glob->lower_mem_limit = val64;
-	spin_unlock(&glob->lock);
-
-	return size;
-}
-
-static struct attribute *ttm_mem_global_attrs[] = {
-	&ttm_mem_global_lower_mem_limit,
-	NULL
-};
-
-static const struct sysfs_ops ttm_mem_global_ops = {
-	.show = &ttm_mem_global_show,
-	.store = &ttm_mem_global_store,
-};
-
-static struct kobj_type ttm_mem_glob_kobj_type = {
-	.sysfs_ops = &ttm_mem_global_ops,
-	.default_attrs = ttm_mem_global_attrs,
-};
+static struct kobj_type ttm_mem_glob_kobj_type = {0};
 
 static bool ttm_zones_above_swap_target(struct ttm_mem_global *glob,
 					bool from_wq, uint64_t extra)
@@ -435,11 +372,6 @@ int ttm_mem_global_init(struct ttm_mem_global *glob, struct device *dev)
 
 	si_meminfo(&si);
 
-	spin_lock(&glob->lock);
-	/* set it as 0 by default to keep original behavior of OOM */
-	glob->lower_mem_limit = 0;
-	spin_unlock(&glob->lock);
-
 	ret = ttm_mem_init_kernel_zone(glob, &si);
 	if (unlikely(ret != 0))
 		goto out_no_zone;
@@ -527,35 +459,6 @@ void ttm_mem_global_free(struct ttm_mem_global *glob,
 }
 EXPORT_SYMBOL(ttm_mem_global_free);
 
-/*
- * check if the available mem is under lower memory limit
- *
- * a. if no swap disk at all or free swap space is under swap_mem_limit
- * but available system mem is bigger than sys_mem_limit, allow TTM
- * allocation;
- *
- * b. if the available system mem is less than sys_mem_limit but free
- * swap disk is bigger than swap_mem_limit, allow TTM allocation.
- */
-bool
-ttm_check_under_lowerlimit(struct ttm_mem_global *glob,
-			uint64_t num_pages,
-			struct ttm_operation_ctx *ctx)
-{
-	int64_t available;
-
-	/* We allow over commit during suspend */
-	if (ctx->force_alloc)
-		return false;
-
-	available = get_nr_swap_pages() + si_mem_available();
-	available -= num_pages;
-	if (available < glob->lower_mem_limit)
-		return true;
-
-	return false;
-}
-
 static int ttm_mem_global_reserve(struct ttm_mem_global *glob,
 				  struct ttm_mem_zone *single_zone,
 				  uint64_t amount, bool reserve)
diff --git a/drivers/gpu/drm/vmwgfx/ttm_memory.h b/drivers/gpu/drm/vmwgfx/ttm_memory.h
index c50dba774485..7b0d617ebcb1 100644
--- a/drivers/gpu/drm/vmwgfx/ttm_memory.h
+++ b/drivers/gpu/drm/vmwgfx/ttm_memory.h
@@ -50,8 +50,6 @@
  * @work: The workqueue callback for the shrink queue.
  * @lock: Lock to protect the @shrink - and the memory accounting members,
  * that is, essentially the whole structure with some exceptions.
- * @lower_mem_limit: include lower limit of swap space and lower limit of
- * system memory.
  * @zones: Array of pointers to accounting zones.
  * @num_zones: Number of populated entries in the @zones array.
  * @zone_kernel: Pointer to the kernel zone.
@@ -69,7 +67,6 @@ extern struct ttm_mem_global {
 	struct workqueue_struct *swap_queue;
 	struct work_struct work;
 	spinlock_t lock;
-	uint64_t lower_mem_limit;
 	struct ttm_mem_zone *zones[TTM_MEM_MAX_ZONES];
 	unsigned int num_zones;
 	struct ttm_mem_zone *zone_kernel;
@@ -91,6 +88,5 @@ int ttm_mem_global_alloc_page(struct ttm_mem_global *glob,
 void ttm_mem_global_free_page(struct ttm_mem_global *glob,
 			      struct page *page, uint64_t size);
 size_t ttm_round_pot(size_t size);
-bool ttm_check_under_lowerlimit(struct ttm_mem_global *glob, uint64_t num_pages,
-				struct ttm_operation_ctx *ctx);
+
 #endif
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH v2 2/6] drm/vmwgfx: Release ttm memory if probe fails
  2021-11-05 19:38 [PATCH v2 0/6] Support module unload and hotunplug Zack Rusin
  2021-11-05 19:38 ` [PATCH v2 1/6] drm/vmwgfx: Remove the deprecated lower mem limit Zack Rusin
@ 2021-11-05 19:38 ` Zack Rusin
  2021-11-05 19:38 ` [PATCH v2 3/6] drm/vmwgfx: Fail to initialize on broken configs Zack Rusin
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 24+ messages in thread
From: Zack Rusin @ 2021-11-05 19:38 UTC (permalink / raw)
  To: dri-devel; +Cc: Martin Krastev

The ttm mem global state was leaking if the vmwgfx driver load failed.

In case of a driver load failure we have to make sure we also release
the ttm mem global state.
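
For readability, the probe path after this change looks roughly as
follows (a sketch reconstructed from the diff below; local declarations
and the surrounding vmwgfx_drv.c context, such as the driver and
ttm_mem_glob globals, are assumed):

  static int vmw_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
  {
  	struct vmw_private *vmw;
  	int ret;

  	ret = drm_aperture_remove_conflicting_pci_framebuffers(pdev, &driver);
  	if (ret)
  		goto out_error;

  	ret = pcim_enable_device(pdev);
  	if (ret)
  		goto out_error;

  	vmw = devm_drm_dev_alloc(&pdev->dev, &driver,
  				 struct vmw_private, drm);
  	if (IS_ERR(vmw)) {
  		ret = PTR_ERR(vmw);
  		goto out_error;
  	}

  	pci_set_drvdata(pdev, &vmw->drm);

  	ret = ttm_mem_global_init(&ttm_mem_glob, &pdev->dev);
  	if (ret)
  		goto out_error;

  	ret = vmw_driver_load(vmw, ent->device);
  	if (ret)
  		goto out_release;

  	ret = drm_dev_register(&vmw->drm, 0);
  	if (ret)
  		goto out_unload;

  	return 0;
  out_unload:
  	vmw_driver_unload(&vmw->drm);		/* undo vmw_driver_load() */
  out_release:
  	ttm_mem_global_release(&ttm_mem_glob);	/* undo ttm_mem_global_init() */
  out_error:
  	return ret;
  }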

Signed-off-by: Zack Rusin <zackr@vmware.com>
Reviewed-by: Martin Krastev <krastevm@vmware.com>
---
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index ab9a1750e1df..8d0b083ba267 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -1617,34 +1617,40 @@ static int vmw_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	ret = drm_aperture_remove_conflicting_pci_framebuffers(pdev, &driver);
 	if (ret)
-		return ret;
+		goto out_error;
 
 	ret = pcim_enable_device(pdev);
 	if (ret)
-		return ret;
+		goto out_error;
 
 	vmw = devm_drm_dev_alloc(&pdev->dev, &driver,
 				 struct vmw_private, drm);
-	if (IS_ERR(vmw))
-		return PTR_ERR(vmw);
+	if (IS_ERR(vmw)) {
+		ret = PTR_ERR(vmw);
+		goto out_error;
+	}
 
 	pci_set_drvdata(pdev, &vmw->drm);
 
 	ret = ttm_mem_global_init(&ttm_mem_glob, &pdev->dev);
 	if (ret)
-		return ret;
+		goto out_error;
 
 	ret = vmw_driver_load(vmw, ent->device);
 	if (ret)
-		return ret;
+		goto out_release;
 
 	ret = drm_dev_register(&vmw->drm, 0);
-	if (ret) {
-		vmw_driver_unload(&vmw->drm);
-		return ret;
-	}
+	if (ret)
+		goto out_unload;
 
 	return 0;
+out_unload:
+	vmw_driver_unload(&vmw->drm);
+out_release:
+	ttm_mem_global_release(&ttm_mem_glob);
+out_error:
+	return ret;
 }
 
 static int __init vmwgfx_init(void)
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH v2 3/6] drm/vmwgfx: Fail to initialize on broken configs
  2021-11-05 19:38 [PATCH v2 0/6] Support module unload and hotunplug Zack Rusin
  2021-11-05 19:38 ` [PATCH v2 1/6] drm/vmwgfx: Remove the deprecated lower mem limit Zack Rusin
  2021-11-05 19:38 ` [PATCH v2 2/6] drm/vmwgfx: Release ttm memory if probe fails Zack Rusin
@ 2021-11-05 19:38 ` Zack Rusin
  2021-11-05 19:38 ` [PATCH v2 4/6] drm/vmwgfx: Introduce a new placement for MOB page tables Zack Rusin
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 24+ messages in thread
From: Zack Rusin @ 2021-11-05 19:38 UTC (permalink / raw)
  To: dri-devel; +Cc: Martin Krastev

Some of our hosts have a bug where rescanning a PCI bus results in
stale FIFO memory being mapped on the host. This makes any FIFO
communication impossible, resulting in various kernel crashes.

Instead of crashing unexpectedly, predictably fail to load the driver,
which keeps the rest of the system usable.

Fixes: fb1d9738ca05 ("drm/vmwgfx: Add DRM driver for VMware Virtual GPU")
Signed-off-by: Zack Rusin <zackr@vmware.com>
Reviewed-by: Martin Krastev <krastevm@vmware.com>
---
 drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
index 67db472d3493..a3bfbb6c3e14 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
@@ -145,6 +145,13 @@ struct vmw_fifo_state *vmw_fifo_create(struct vmw_private *dev_priv)
 		 (unsigned int) max,
 		 (unsigned int) min,
 		 (unsigned int) fifo->capabilities);
+
+	if (unlikely(min >= max)) {
+		drm_warn(&dev_priv->drm,
+			 "FIFO memory is not usable. Driver failed to initialize.");
+		return ERR_PTR(-ENXIO);
+	}
+
 	return fifo;
 }
 
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH v2 4/6] drm/vmwgfx: Introduce a new placement for MOB page tables
  2021-11-05 19:38 [PATCH v2 0/6] Support module unload and hotunplug Zack Rusin
                   ` (2 preceding siblings ...)
  2021-11-05 19:38 ` [PATCH v2 3/6] drm/vmwgfx: Fail to initialize on broken configs Zack Rusin
@ 2021-11-05 19:38 ` Zack Rusin
  2021-11-05 19:38 ` [PATCH v2 5/6] drm/vmwgfx: Switch the internal BO's to ttm_bo_type_kernel Zack Rusin
  2021-11-05 19:38 ` [PATCH v2 6/6] drm/ttm: Clarify that the TTM_PL_SYSTEM buffers need to stay idle Zack Rusin
  5 siblings, 0 replies; 24+ messages in thread
From: Zack Rusin @ 2021-11-05 19:38 UTC (permalink / raw)
  To: dri-devel; +Cc: Martin Krastev, Christian König, Thomas Hellström

For larger (bigger than a page) and noncontiguous MOBs we have to
create page tables that allow the host to find the memory. Those page
tables just use regular system memory. Unfortunately, in TTM such BOs
are not allowed to be busy and thus can't be fenced, but we have to
fence these BOs because we don't want to destroy the page tables while
the host is still executing command buffers which might be accessing
them.

To solve this we introduce a new placement, VMW_PL_SYSTEM, which is
very similar to TTM_PL_SYSTEM except that it allows fencing. This
fixes kernel oopses during unloading of the driver (and PCI hot
remove/add) which were caused by busy BOs in TTM_PL_SYSTEM being
present on the delayed-deletion list in TTM (the TTM_PL_SYSTEM manager
is destroyed before the delayed deletions are executed).

Signed-off-by: Zack Rusin <zackr@vmware.com>
Reviewed-by: Martin Krastev <krastevm@vmware.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/vmwgfx/Makefile               |  2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c           | 14 ++-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.h           | 12 ++-
 .../gpu/drm/vmwgfx/vmwgfx_system_manager.c    | 90 +++++++++++++++++++
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c    | 58 ++++++------
 5 files changed, 138 insertions(+), 38 deletions(-)
 create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_system_manager.c

diff --git a/drivers/gpu/drm/vmwgfx/Makefile b/drivers/gpu/drm/vmwgfx/Makefile
index bc323f7d4032..0188a312c38c 100644
--- a/drivers/gpu/drm/vmwgfx/Makefile
+++ b/drivers/gpu/drm/vmwgfx/Makefile
@@ -9,7 +9,7 @@ vmwgfx-y := vmwgfx_execbuf.o vmwgfx_gmr.o vmwgfx_kms.o vmwgfx_drv.o \
 	    vmwgfx_cotable.o vmwgfx_so.o vmwgfx_binding.o vmwgfx_msg.o \
 	    vmwgfx_simple_resource.o vmwgfx_va.o vmwgfx_blit.o \
 	    vmwgfx_validation.o vmwgfx_page_dirty.o vmwgfx_streamoutput.o \
-            vmwgfx_devcaps.o ttm_object.o ttm_memory.o
+	    vmwgfx_devcaps.o ttm_object.o ttm_memory.o vmwgfx_system_manager.o
 
 vmwgfx-$(CONFIG_DRM_FBDEV_EMULATION) += vmwgfx_fb.o
 vmwgfx-$(CONFIG_TRANSPARENT_HUGEPAGE) += vmwgfx_thp.o
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 8d0b083ba267..daf65615308a 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -1071,6 +1071,12 @@ static int vmw_driver_load(struct vmw_private *dev_priv, u32 pci_id)
 				 "3D will be disabled.\n");
 			dev_priv->has_mob = false;
 		}
+		if (vmw_sys_man_init(dev_priv) != 0) {
+			drm_info(&dev_priv->drm,
+				 "No MOB page table memory available. "
+				 "3D will be disabled.\n");
+			dev_priv->has_mob = false;
+		}
 	}
 
 	if (dev_priv->has_mob && (dev_priv->capabilities & SVGA_CAP_DX)) {
@@ -1121,8 +1127,10 @@ static int vmw_driver_load(struct vmw_private *dev_priv, u32 pci_id)
 	vmw_overlay_close(dev_priv);
 	vmw_kms_close(dev_priv);
 out_no_kms:
-	if (dev_priv->has_mob)
+	if (dev_priv->has_mob) {
 		vmw_gmrid_man_fini(dev_priv, VMW_PL_MOB);
+		vmw_sys_man_fini(dev_priv);
+	}
 	if (dev_priv->has_gmr)
 		vmw_gmrid_man_fini(dev_priv, VMW_PL_GMR);
 	vmw_devcaps_destroy(dev_priv);
@@ -1172,8 +1180,10 @@ static void vmw_driver_unload(struct drm_device *dev)
 		vmw_gmrid_man_fini(dev_priv, VMW_PL_GMR);
 
 	vmw_release_device_early(dev_priv);
-	if (dev_priv->has_mob)
+	if (dev_priv->has_mob) {
 		vmw_gmrid_man_fini(dev_priv, VMW_PL_MOB);
+		vmw_sys_man_fini(dev_priv);
+	}
 	vmw_devcaps_destroy(dev_priv);
 	vmw_vram_manager_fini(dev_priv);
 	ttm_device_fini(&dev_priv->bdev);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
index a833751099b5..df19dfb3ce18 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
@@ -82,8 +82,9 @@
 			VMWGFX_NUM_GB_SURFACE +\
 			VMWGFX_NUM_GB_SCREEN_TARGET)
 
-#define VMW_PL_GMR (TTM_PL_PRIV + 0)
-#define VMW_PL_MOB (TTM_PL_PRIV + 1)
+#define VMW_PL_GMR      (TTM_PL_PRIV + 0)
+#define VMW_PL_MOB      (TTM_PL_PRIV + 1)
+#define VMW_PL_SYSTEM   (TTM_PL_PRIV + 2)
 
 #define VMW_RES_CONTEXT ttm_driver_type0
 #define VMW_RES_SURFACE ttm_driver_type1
@@ -1039,7 +1040,6 @@ extern struct ttm_placement vmw_vram_placement;
 extern struct ttm_placement vmw_vram_sys_placement;
 extern struct ttm_placement vmw_vram_gmr_placement;
 extern struct ttm_placement vmw_sys_placement;
-extern struct ttm_placement vmw_evictable_placement;
 extern struct ttm_placement vmw_srf_placement;
 extern struct ttm_placement vmw_mob_placement;
 extern struct ttm_placement vmw_nonfixed_placement;
@@ -1251,6 +1251,12 @@ int vmw_overlay_num_free_overlays(struct vmw_private *dev_priv);
 int vmw_gmrid_man_init(struct vmw_private *dev_priv, int type);
 void vmw_gmrid_man_fini(struct vmw_private *dev_priv, int type);
 
+/**
+ * System memory manager
+ */
+int vmw_sys_man_init(struct vmw_private *dev_priv);
+void vmw_sys_man_fini(struct vmw_private *dev_priv);
+
 /**
  * Prime - vmwgfx_prime.c
  */
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_system_manager.c b/drivers/gpu/drm/vmwgfx/vmwgfx_system_manager.c
new file mode 100644
index 000000000000..b0005b03a617
--- /dev/null
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_system_manager.c
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright 2021 VMware, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "vmwgfx_drv.h"
+
+#include <drm/ttm/ttm_bo_driver.h>
+#include <drm/ttm/ttm_device.h>
+#include <drm/ttm/ttm_placement.h>
+#include <drm/ttm/ttm_resource.h>
+#include <linux/slab.h>
+
+
+static int vmw_sys_man_alloc(struct ttm_resource_manager *man,
+			     struct ttm_buffer_object *bo,
+			     const struct ttm_place *place,
+			     struct ttm_resource **res)
+{
+	*res = kzalloc(sizeof(**res), GFP_KERNEL);
+	if (!*res)
+		return -ENOMEM;
+
+	ttm_resource_init(bo, place, *res);
+	return 0;
+}
+
+static void vmw_sys_man_free(struct ttm_resource_manager *man,
+			     struct ttm_resource *res)
+{
+	kfree(res);
+}
+
+static const struct ttm_resource_manager_func vmw_sys_manager_func = {
+	.alloc = vmw_sys_man_alloc,
+	.free = vmw_sys_man_free,
+};
+
+int vmw_sys_man_init(struct vmw_private *dev_priv)
+{
+	struct ttm_device *bdev = &dev_priv->bdev;
+	struct ttm_resource_manager *man =
+			kzalloc(sizeof(*man), GFP_KERNEL);
+
+	if (!man)
+		return -ENOMEM;
+
+	man->use_tt = true;
+	man->func = &vmw_sys_manager_func;
+
+	ttm_resource_manager_init(man, 0);
+	ttm_set_driver_manager(bdev, VMW_PL_SYSTEM, man);
+	ttm_resource_manager_set_used(man, true);
+	return 0;
+}
+
+void vmw_sys_man_fini(struct vmw_private *dev_priv)
+{
+	struct ttm_resource_manager *man = ttm_manager_type(&dev_priv->bdev,
+							    VMW_PL_SYSTEM);
+
+	ttm_resource_manager_evict_all(&dev_priv->bdev, man);
+
+	ttm_resource_manager_set_used(man, false);
+	ttm_resource_manager_cleanup(man);
+
+	ttm_set_driver_manager(&dev_priv->bdev, VMW_PL_SYSTEM, NULL);
+	kfree(man);
+}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index e899a936a42a..b15228e7dbeb 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
@@ -92,6 +92,13 @@ static const struct ttm_place gmr_vram_placement_flags[] = {
 	}
 };
 
+static const struct ttm_place vmw_sys_placement_flags = {
+	.fpfn = 0,
+	.lpfn = 0,
+	.mem_type = VMW_PL_SYSTEM,
+	.flags = 0
+};
+
 struct ttm_placement vmw_vram_gmr_placement = {
 	.num_placement = 2,
 	.placement = vram_gmr_placement_flags,
@@ -113,28 +120,11 @@ struct ttm_placement vmw_sys_placement = {
 	.busy_placement = &sys_placement_flags
 };
 
-static const struct ttm_place evictable_placement_flags[] = {
-	{
-		.fpfn = 0,
-		.lpfn = 0,
-		.mem_type = TTM_PL_SYSTEM,
-		.flags = 0
-	}, {
-		.fpfn = 0,
-		.lpfn = 0,
-		.mem_type = TTM_PL_VRAM,
-		.flags = 0
-	}, {
-		.fpfn = 0,
-		.lpfn = 0,
-		.mem_type = VMW_PL_GMR,
-		.flags = 0
-	}, {
-		.fpfn = 0,
-		.lpfn = 0,
-		.mem_type = VMW_PL_MOB,
-		.flags = 0
-	}
+struct ttm_placement vmw_pt_sys_placement = {
+	.num_placement = 1,
+	.placement = &vmw_sys_placement_flags,
+	.num_busy_placement = 1,
+	.busy_placement = &vmw_sys_placement_flags
 };
 
 static const struct ttm_place nonfixed_placement_flags[] = {
@@ -156,13 +146,6 @@ static const struct ttm_place nonfixed_placement_flags[] = {
 	}
 };
 
-struct ttm_placement vmw_evictable_placement = {
-	.num_placement = 4,
-	.placement = evictable_placement_flags,
-	.num_busy_placement = 1,
-	.busy_placement = &sys_placement_flags
-};
-
 struct ttm_placement vmw_srf_placement = {
 	.num_placement = 1,
 	.num_busy_placement = 2,
@@ -484,6 +467,9 @@ static int vmw_ttm_bind(struct ttm_device *bdev,
 				    &vmw_be->vsgt, ttm->num_pages,
 				    vmw_be->gmr_id);
 		break;
+	case VMW_PL_SYSTEM:
+		/* Nothing to be done for a system bind */
+		break;
 	default:
 		BUG();
 	}
@@ -507,6 +493,8 @@ static void vmw_ttm_unbind(struct ttm_device *bdev,
 	case VMW_PL_MOB:
 		vmw_mob_unbind(vmw_be->dev_priv, vmw_be->mob);
 		break;
+	case VMW_PL_SYSTEM:
+		break;
 	default:
 		BUG();
 	}
@@ -624,6 +612,7 @@ static int vmw_ttm_io_mem_reserve(struct ttm_device *bdev, struct ttm_resource *
 
 	switch (mem->mem_type) {
 	case TTM_PL_SYSTEM:
+	case VMW_PL_SYSTEM:
 	case VMW_PL_GMR:
 	case VMW_PL_MOB:
 		return 0;
@@ -670,6 +659,11 @@ static void vmw_swap_notify(struct ttm_buffer_object *bo)
 	(void) ttm_bo_wait(bo, false, false);
 }
 
+static bool vmw_memtype_is_system(uint32_t mem_type)
+{
+	return mem_type == TTM_PL_SYSTEM || mem_type == VMW_PL_SYSTEM;
+}
+
 static int vmw_move(struct ttm_buffer_object *bo,
 		    bool evict,
 		    struct ttm_operation_ctx *ctx,
@@ -680,7 +674,7 @@ static int vmw_move(struct ttm_buffer_object *bo,
 	struct ttm_resource_manager *new_man = ttm_manager_type(bo->bdev, new_mem->mem_type);
 	int ret;
 
-	if (new_man->use_tt && new_mem->mem_type != TTM_PL_SYSTEM) {
+	if (new_man->use_tt && !vmw_memtype_is_system(new_mem->mem_type)) {
 		ret = vmw_ttm_bind(bo->bdev, bo->ttm, new_mem);
 		if (ret)
 			return ret;
@@ -689,7 +683,7 @@ static int vmw_move(struct ttm_buffer_object *bo,
 	vmw_move_notify(bo, bo->resource, new_mem);
 
 	if (old_man->use_tt && new_man->use_tt) {
-		if (bo->resource->mem_type == TTM_PL_SYSTEM) {
+		if (vmw_memtype_is_system(bo->resource->mem_type)) {
 			ttm_bo_move_null(bo, new_mem);
 			return 0;
 		}
@@ -736,7 +730,7 @@ int vmw_bo_create_and_populate(struct vmw_private *dev_priv,
 	int ret;
 
 	ret = vmw_bo_create_kernel(dev_priv, bo_size,
-				   &vmw_sys_placement,
+				   &vmw_pt_sys_placement,
 				   &bo);
 	if (unlikely(ret != 0))
 		return ret;
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH v2 5/6] drm/vmwgfx: Switch the internal BO's to ttm_bo_type_kernel
  2021-11-05 19:38 [PATCH v2 0/6] Support module unload and hotunplug Zack Rusin
                   ` (3 preceding siblings ...)
  2021-11-05 19:38 ` [PATCH v2 4/6] drm/vmwgfx: Introduce a new placement for MOB page tables Zack Rusin
@ 2021-11-05 19:38 ` Zack Rusin
  2021-11-05 19:38 ` [PATCH v2 6/6] drm/ttm: Clarify that the TTM_PL_SYSTEM buffers need to stay idle Zack Rusin
  5 siblings, 0 replies; 24+ messages in thread
From: Zack Rusin @ 2021-11-05 19:38 UTC (permalink / raw)
  To: dri-devel; +Cc: Martin Krastev

There's never a need to access our internal kernel BOs from
user space. Those objects are used exclusively for internal
support of guest-backed surfaces (in otable setup and MOB
page tables) and there's no need for them to be of device
type, i.e. mmappable from user space.

Signed-off-by: Zack Rusin <zackr@vmware.com>
Reviewed-by: Martin Krastev <krastevm@vmware.com>
---
 drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
index fd007f1c1776..c97a3d5e90ce 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
@@ -494,7 +494,7 @@ int vmw_bo_create_kernel(struct vmw_private *dev_priv, unsigned long size,
 	drm_vma_node_reset(&bo->base.vma_node);
 
 	ret = ttm_bo_init_reserved(&dev_priv->bdev, bo, size,
-				   ttm_bo_type_device, placement, 0,
+				   ttm_bo_type_kernel, placement, 0,
 				   &ctx, NULL, NULL, NULL);
 	if (unlikely(ret))
 		goto error_account;
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH v2 6/6] drm/ttm: Clarify that the TTM_PL_SYSTEM buffers need to stay idle
  2021-11-05 19:38 [PATCH v2 0/6] Support module unload and hotunplug Zack Rusin
                   ` (4 preceding siblings ...)
  2021-11-05 19:38 ` [PATCH v2 5/6] drm/vmwgfx: Switch the internal BO's to ttm_bo_type_kernel Zack Rusin
@ 2021-11-05 19:38 ` Zack Rusin
  2021-11-08 11:07   ` Christian König
  5 siblings, 1 reply; 24+ messages in thread
From: Zack Rusin @ 2021-11-05 19:38 UTC (permalink / raw)
  To: dri-devel; +Cc: Thomas Hellström, Christian König

TTM was designed to allow TTM_PL_SYSTEM buffers to be fenced, but over
the years the code that was meant to handle it broke, and new changes
cannot deal with buffers which have been placed in TTM_PL_SYSTEM but
do not remain idle.
CPU buffers which need to be fenced and shared with accelerators should
be placed in driver-specific placements that can explicitly handle
CPU/accelerator buffer fencing.
Currently, apart from things silently failing, nothing is enforcing
that requirement, which means that it's easy for drivers and new
developers to get this wrong. To avoid the confusion we can document
this requirement and clarify the solution.
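
As a concrete illustration, such a driver-specific placement looks
roughly like the VMW_PL_SYSTEM placement added in patch 4/6 of this
series; in the sketch below the MY_PL_SYSTEM / my_* names are made up
for illustration and are not an existing API:

  #include <drm/ttm/ttm_placement.h>

  /* A driver-private system-memory placement which, unlike TTM_PL_SYSTEM,
   * is allowed to carry fences (compare VMW_PL_SYSTEM in patch 4/6).
   */
  #define MY_PL_SYSTEM	(TTM_PL_PRIV + 0)

  static const struct ttm_place my_sys_place = {
  	.fpfn = 0,
  	.lpfn = 0,
  	.mem_type = MY_PL_SYSTEM,
  	.flags = 0,
  };

  struct ttm_placement my_sys_placement = {
  	.num_placement = 1,
  	.placement = &my_sys_place,
  	.num_busy_placement = 1,
  	.busy_placement = &my_sys_place,
  };

The driver then registers a ttm_resource_manager with use_tt = true for
that mem_type and handles it in its bind/unbind/move paths, as
vmwgfx_system_manager.c and vmwgfx_ttm_buffer.c do in patch 4/6.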

This came up during a discussion on dri-devel:
https://lore.kernel.org/dri-devel/232f45e9-8748-1243-09bf-56763e6668b3@amd.com

Signed-off-by: Zack Rusin <zackr@vmware.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 include/drm/ttm/ttm_placement.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/drm/ttm/ttm_placement.h b/include/drm/ttm/ttm_placement.h
index 76d1b9119a2b..89dfb58ff199 100644
--- a/include/drm/ttm/ttm_placement.h
+++ b/include/drm/ttm/ttm_placement.h
@@ -35,6 +35,16 @@
 
 /*
  * Memory regions for data placement.
+ *
+ * Due to the fact that TTM_PL_SYSTEM BO's can be accessed by the hardware
+ * and are not directly evictable they're handled slightly differently
+ * from other placements. The most important and driver visible side-effect
+ * of that is that TTM_PL_SYSTEM BO's are not allowed to be fenced and have
+ * to remain idle. For BO's which reside in system memory but for which
+ * the accelerator requires direct access (i.e. their usage needs to be
+ * synchronized between the CPU and accelerator via fences) a new, driver
+ * private placement should be introduced that can handle such scenarios.
+ *
  */
 
 #define TTM_PL_SYSTEM           0
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 6/6] drm/ttm: Clarify that the TTM_PL_SYSTEM buffers need to stay idle
  2021-11-05 19:38 ` [PATCH v2 6/6] drm/ttm: Clarify that the TTM_PL_SYSTEM buffers need to stay idle Zack Rusin
@ 2021-11-08 11:07   ` Christian König
  2021-11-08 15:42     ` [PATCH v3] " Zack Rusin
  2021-11-10 14:50     ` [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control Zack Rusin
  0 siblings, 2 replies; 24+ messages in thread
From: Christian König @ 2021-11-08 11:07 UTC (permalink / raw)
  To: Zack Rusin, dri-devel; +Cc: Thomas Hellström

Am 05.11.21 um 20:38 schrieb Zack Rusin:
> TTM was designed to allow TTM_PL_SYSTEM buffer to be fenced but over
> the years the code that was meant to handle it was broken and new
> changes can not deal with buffers which have been placed in TTM_PL_SYSTEM
> buf do not remain idle.
> CPU buffers which need to be fenced and shared with accelerators should
> be placed in driver specific placements that can explicitly handle
> CPU/accelerator buffer fencing.
> Currently, apart, from things silently failing nothing is enforcing
> that requirement which means that it's easy for drivers and new
> developers to get this wrong. To avoid the confusion we can document
> this requirement and clarify the solution.
>
> This came up during a discussion on dri-devel:
> https://lore.kernel.org/dri-devel/232f45e9-8748-1243-09bf-56763e6668b3@amd.com
>
> Signed-off-by: Zack Rusin <zackr@vmware.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
>   include/drm/ttm/ttm_placement.h | 10 ++++++++++
>   1 file changed, 10 insertions(+)
>
> diff --git a/include/drm/ttm/ttm_placement.h b/include/drm/ttm/ttm_placement.h
> index 76d1b9119a2b..89dfb58ff199 100644
> --- a/include/drm/ttm/ttm_placement.h
> +++ b/include/drm/ttm/ttm_placement.h
> @@ -35,6 +35,16 @@
>   
>   /*
>    * Memory regions for data placement.
> + *
> + * Due to the fact that TTM_PL_SYSTEM BO's can be accessed by the hardware
> + * and are not directly evictable they're handled slightly differently
> + * from other placements. The most important and driver visible side-effect
> + * of that is that TTM_PL_SYSTEM BO's are not allowed to be fenced and have
> + * to remain idle. For BO's which reside in system memory but for which
> + * the accelerator requires direct access (i.e. their usage needs to be
> + * synchronized between the CPU and accelerator via fences) a new, driver
> + * private placement should be introduced that can handle such scenarios.
> + *

That we don't have fences on them is just the tip of the iceberg and 
mostly a nice-to-have.

The major problem is that TTM_PL_SYSTEM buffers are considered to be 
under TTM's control and are swapped out whenever TTM thinks it is a 
good idea.

That needs to be handled manually in the driver if TTM_PL_SYSTEM is 
ever considered a valid placement.

Christian.

>    */
>   
>   #define TTM_PL_SYSTEM           0


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v3] drm/ttm: Clarify that the TTM_PL_SYSTEM buffers need to stay idle
  2021-11-08 11:07   ` Christian König
@ 2021-11-08 15:42     ` Zack Rusin
  2021-11-10 14:50     ` [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control Zack Rusin
  1 sibling, 0 replies; 24+ messages in thread
From: Zack Rusin @ 2021-11-08 15:42 UTC (permalink / raw)
  To: dri-devel; +Cc: Thomas Hellström, Christian König

TTM was designed to allow TTM_PL_SYSTEM buffers to be fenced, but over
the years the code that was meant to handle it broke, and new changes
cannot deal with buffers which have been placed in TTM_PL_SYSTEM but
do not remain idle.
CPU buffers which need to be fenced and shared with accelerators should
be placed in driver-specific placements that can explicitly handle
CPU/accelerator buffer fencing.
Currently, apart from things silently failing, nothing is enforcing
that requirement, which means that it's easy for drivers and new
developers to get this wrong. To avoid the confusion we can document
this requirement and clarify the solution.

This came up during a discussion on dri-devel:
https://lore.kernel.org/dri-devel/232f45e9-8748-1243-09bf-56763e6668b3@amd.com

Signed-off-by: Zack Rusin <zackr@vmware.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 include/drm/ttm/ttm_placement.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/include/drm/ttm/ttm_placement.h b/include/drm/ttm/ttm_placement.h
index 76d1b9119a2b..8074d0f6cae5 100644
--- a/include/drm/ttm/ttm_placement.h
+++ b/include/drm/ttm/ttm_placement.h
@@ -35,6 +35,17 @@
 
 /*
  * Memory regions for data placement.
+ *
+ * Buffers placed in TTM_PL_SYSTEM are considered under TTMs control and can
+ * be swapped out whenever TTMs thinks it is a good idea.
+ * In cases where drivers would like to use TTM_PL_SYSTEM as a valid
+ * placement they need to be able to handle the issues that arise due to the
+ * above manually.
+ *
+ * For BO's which reside in system memory but for which the accelerator
+ * requires direct access (i.e. their usage needs to be synchronized
+ * between the CPU and accelerator via fences) a new, driver private
+ * placement that can handle such scenarios is a good idea.
  */
 
 #define TTM_PL_SYSTEM           0
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-08 11:07   ` Christian König
  2021-11-08 15:42     ` [PATCH v3] " Zack Rusin
@ 2021-11-10 14:50     ` Zack Rusin
  2021-11-11 16:44       ` Zack Rusin
  2021-11-24  8:22       ` Christian König
  1 sibling, 2 replies; 24+ messages in thread
From: Zack Rusin @ 2021-11-10 14:50 UTC (permalink / raw)
  To: dri-devel; +Cc: Thomas Hellström, Christian König

TTM takes full control over TTM_PL_SYSTEM placed buffers. This makes
driver-internal usage of TTM_PL_SYSTEM prone to errors because it
requires the drivers to manually handle all interactions between TTM,
which can swap out those buffers whenever it thinks it's the right
thing to do, and the driver.

CPU buffers which need to be fenced and shared with accelerators should
be placed in driver-specific placements that can explicitly handle
CPU/accelerator buffer fencing.
Currently, apart from things silently failing, nothing is enforcing
that requirement, which means that it's easy for drivers and new
developers to get this wrong. To avoid the confusion we can document
this requirement and clarify the solution.

This came up during a discussion on dri-devel:
https://lore.kernel.org/dri-devel/232f45e9-8748-1243-09bf-56763e6668b3@amd.com

Signed-off-by: Zack Rusin <zackr@vmware.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 include/drm/ttm/ttm_placement.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/include/drm/ttm/ttm_placement.h b/include/drm/ttm/ttm_placement.h
index 76d1b9119a2b..8074d0f6cae5 100644
--- a/include/drm/ttm/ttm_placement.h
+++ b/include/drm/ttm/ttm_placement.h
@@ -35,6 +35,17 @@
 
 /*
  * Memory regions for data placement.
+ *
+ * Buffers placed in TTM_PL_SYSTEM are considered under TTMs control and can
+ * be swapped out whenever TTMs thinks it is a good idea.
+ * In cases where drivers would like to use TTM_PL_SYSTEM as a valid
+ * placement they need to be able to handle the issues that arise due to the
+ * above manually.
+ *
+ * For BO's which reside in system memory but for which the accelerator
+ * requires direct access (i.e. their usage needs to be synchronized
+ * between the CPU and accelerator via fences) a new, driver private
+ * placement that can handle such scenarios is a good idea.
  */
 
 #define TTM_PL_SYSTEM           0
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-10 14:50     ` [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control Zack Rusin
@ 2021-11-11 16:44       ` Zack Rusin
  2021-11-13 11:26         ` Thomas Hellström
  2021-11-24  8:22       ` Christian König
  1 sibling, 1 reply; 24+ messages in thread
From: Zack Rusin @ 2021-11-11 16:44 UTC (permalink / raw)
  To: dri-devel; +Cc: thomas.hellstrom, christian.koenig

On Wed, 2021-11-10 at 09:50 -0500, Zack Rusin wrote:
> TTM takes full control over TTM_PL_SYSTEM placed buffers. This makes
> driver internal usage of TTM_PL_SYSTEM prone to errors because it
> requires the drivers to manually handle all interactions between TTM
> which can swap out those buffers whenever it thinks it's the right
> thing to do and driver.
> 
> CPU buffers which need to be fenced and shared with accelerators
> should
> be placed in driver specific placements that can explicitly handle
> CPU/accelerator buffer fencing.
> Currently, apart, from things silently failing nothing is enforcing
> that requirement which means that it's easy for drivers and new
> developers to get this wrong. To avoid the confusion we can document
> this requirement and clarify the solution.
> 
> This came up during a discussion on dri-devel:
> https://lore.kernel.org/dri-devel/232f45e9-8748-1243-09bf-56763e6668b3@amd.com


Polite and gentle ping on that one. Are we ok with the wording here?

z

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-11 16:44       ` Zack Rusin
@ 2021-11-13 11:26         ` Thomas Hellström
  2021-11-16  7:19           ` Christian König
  0 siblings, 1 reply; 24+ messages in thread
From: Thomas Hellström @ 2021-11-13 11:26 UTC (permalink / raw)
  To: Zack Rusin, dri-devel; +Cc: christian.koenig

Hi, Zack,

On 11/11/21 17:44, Zack Rusin wrote:
> On Wed, 2021-11-10 at 09:50 -0500, Zack Rusin wrote:
>> TTM takes full control over TTM_PL_SYSTEM placed buffers. This makes
>> driver internal usage of TTM_PL_SYSTEM prone to errors because it
>> requires the drivers to manually handle all interactions between TTM
>> which can swap out those buffers whenever it thinks it's the right
>> thing to do and driver.
>>
>> CPU buffers which need to be fenced and shared with accelerators
>> should
>> be placed in driver specific placements that can explicitly handle
>> CPU/accelerator buffer fencing.
>> Currently, apart, from things silently failing nothing is enforcing
>> that requirement which means that it's easy for drivers and new
>> developers to get this wrong. To avoid the confusion we can document
>> this requirement and clarify the solution.
>>
>> This came up during a discussion on dri-devel:
>> https://lore.kernel.org/dri-devel/232f45e9-8748-1243-09bf-56763e6668b3@amd.com

I took a slightly deeper look into this. I think we need to formalize 
this a bit more to understand pros and cons and what the restrictions 
are really all about. Anybody looking at the previous discussion will 
mostly see arguments similar to "this is stupid and difficult" and "it 
has always been this way" which are not really constructive.

First disregarding all accounting stuff, I think this all boils down to 
TTM_PL_SYSTEM having three distinct states:
1) POPULATED
2) LIMBO (Or whatever we want to call it. No pages present)
3) SWAPPED.
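
In shorthand (an illustrative enum only; TTM does not define such an
enum, the state is implicit in the BO's ttm_tt):

  enum sys_bo_state {
  	SYS_POPULATED,	/* pages are allocated and present */
  	SYS_LIMBO,	/* no pages allocated yet */
  	SYS_SWAPPED,	/* contents written out to shmem */
  };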

The ttm_bo_move_memcpy() helper understands these, and any standalone 
driver implementation of the move() callback _currently_ needs to 
understand these as well, unless using the ttm_bo_move_memcpy() helper.

Now using a bounce domain to proxy SYSTEM means that the driver can 
forget about the SWAPPED state, it's automatically handled by the move 
setup code. However, another pitfall is LIMBO, in that when you move 
from SYSTEM/LIMBO to your bounce domain, the BO will be populated. So 
any naive accelerated move() implementation creating a 1GB BO in fixed 
memory, like VRAM, will needlessly allocate and free 1GB of system 
memory in the process instead of just performing a clear operation. 
Looks like amdgpu suffers from this?

I think what is really needed is either

a) A TTM helper that helps move callback implementations resolve the 
issues populating system from LIMBO or SWAP, and then also formalize 
driver notification for swapping. At a minimum, I think the 
swap_notify() callback needs to be able to return a late error.

b) Make LIMBO and SWAPPED distinct memory regions. (I think I'd vote for 
this without looking into it in detail).

In both these cases, we should really make SYSTEM bindable by GPU, 
otherwise we'd just be trading one pitfall for another related one 
without really resolving the root problem.

As for fencing not being supported by SYSTEM, I'm not sure why we don't 
want this, because it would for example prohibit async 
ttm_move_memcpy(), and also async unbinding of ttm_tt memory like MOBs 
on vmwgfx. (I think it's still sync).

There might be an accounting issue related to this as well, but I guess 
Christian would need to chime in on this. If so, I think it needs to be 
well understood and documented (in TTM, not in AMD drivers).

Thanks,

/Thomas


>
> Polite and gentle ping on that one. Are we ok with the wording here?
>
> z

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-13 11:26         ` Thomas Hellström
@ 2021-11-16  7:19           ` Christian König
  2021-11-16  7:43             ` Thomas Hellström
  0 siblings, 1 reply; 24+ messages in thread
From: Christian König @ 2021-11-16  7:19 UTC (permalink / raw)
  To: Thomas Hellström, Zack Rusin, dri-devel

Am 13.11.21 um 12:26 schrieb Thomas Hellström:
> Hi, Zack,
>
> On 11/11/21 17:44, Zack Rusin wrote:
>> On Wed, 2021-11-10 at 09:50 -0500, Zack Rusin wrote:
>>> TTM takes full control over TTM_PL_SYSTEM placed buffers. This makes
>>> driver internal usage of TTM_PL_SYSTEM prone to errors because it
>>> requires the drivers to manually handle all interactions between TTM
>>> which can swap out those buffers whenever it thinks it's the right
>>> thing to do and driver.
>>>
>>> CPU buffers which need to be fenced and shared with accelerators
>>> should
>>> be placed in driver specific placements that can explicitly handle
>>> CPU/accelerator buffer fencing.
>>> Currently, apart, from things silently failing nothing is enforcing
>>> that requirement which means that it's easy for drivers and new
>>> developers to get this wrong. To avoid the confusion we can document
>>> this requirement and clarify the solution.
>>>
>>> This came up during a discussion on dri-devel:
>>> https://lore.kernel.org/dri-devel/232f45e9-8748-1243-09bf-56763e6668b3@amd.com
>
> I took a slightly deeper look into this. I think we need to formalize 
> this a bit more to understand pros and cons and what the restrictions 
> are really all about. Anybody looking at the prevous discussion will 
> mostly see arguments similar to "this is stupid and difficult" and "it 
> has always been this way" which are not really constructive.
>
> First disregarding all accounting stuff, I think this all boils down 
> to TTM_PL_SYSTEM having three distinct states:
> 1) POPULATED
> 2) LIMBO (Or whatever we want to call it. No pages present)
> 3) SWAPPED.
>
> The ttm_bo_move_memcpy() helper understands these, and any standalone 
> driver implementation of the move() callback _currently_ needs to 
> understand these as well, unless using the ttm_bo_move_memcpy() helper.
>
> Now using a bounce domain to proxy SYSTEM means that the driver can 
> forget about the SWAPPED state, it's automatically handled by the move 
> setup code. However, another pitfall is LIMBO, in that if when you 
> move from SYSTEM/LIMBO to your bounce domain, the BO will be 
> populated. So any naive accelerated move() implementation creating a 
> 1GB BO in fixed memory, like VRAM, will needlessly allocate and free 
> 1GB of system memory in the process instead of just performing a clear 
> operation. Looks like amdgpu suffers from this?
>
> I think what is really needed is either
>
> a) A TTM helper that helps move callback implementations resolve the 
> issues populating system from LIMBO or SWAP, and then also formalize 
> driver notification for swapping. At a minimum, I think the 
> swap_notify() callback needs to be able to return a late error.
>
> b) Make LIMBO and SWAPPED distinct memory regions. (I think I'd vote 
> for this without looking into it in detail).
>
> In both these cases, we should really make SYSTEM bindable by GPU, 
> otherwise we'd just be trading one pitfall for another related without 
> really resolving the root problem.
>
> As for fencing not being supported by SYSTEM, I'm not sure why we 
> don't want this, because it would for example prohibit async 
> ttm_move_memcpy(), and also, async unbinding of ttm_tt memory like MOB 
> on vmgfx. (I think it's still sync).
>
> There might be an accounting issue related to this as well, but I 
> guess Christian would need to chime in on this. If so, I think it 
> needs to be well understood and documented (in TTM, not in AMD drivers).

I think the problem goes deeper than what has been mentioned here so far.

Having fences attached to BOs in the system domain is probably ok, but 
the key point is that the BOs in the system domain are under TTM's 
control and should not be touched by the driver.

What we have now is that TTM's internals, like the allocation state of 
BOs in system memory (the populated, limbo, swapped you mentioned 
above), are leaking into the drivers, and I think exactly that is the 
part which doesn't work reliably here. You can of course get that 
working, but that requires knowledge of the internal state, which in my 
eyes was always illegal.

What we could do is to split the system domain into SYSTEM and SWAP and 
then say only the swap domain is under TTM's control.

Regards,
Christian.

>
> Thanks,
>
> /Thomas
>
>
>>
>> Polite and gentle ping on that one. Are we ok with the wording here?
>>
>> z


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-16  7:19           ` Christian König
@ 2021-11-16  7:43             ` Thomas Hellström
  2021-11-16  8:20               ` Christian König
  0 siblings, 1 reply; 24+ messages in thread
From: Thomas Hellström @ 2021-11-16  7:43 UTC (permalink / raw)
  To: Christian König, Zack Rusin, dri-devel


On 11/16/21 08:19, Christian König wrote:
> Am 13.11.21 um 12:26 schrieb Thomas Hellström:
>> Hi, Zack,
>>
>> On 11/11/21 17:44, Zack Rusin wrote:
>>> On Wed, 2021-11-10 at 09:50 -0500, Zack Rusin wrote:
>>>> TTM takes full control over TTM_PL_SYSTEM placed buffers. This makes
>>>> driver internal usage of TTM_PL_SYSTEM prone to errors because it
>>>> requires the drivers to manually handle all interactions between TTM
>>>> which can swap out those buffers whenever it thinks it's the right
>>>> thing to do and driver.
>>>>
>>>> CPU buffers which need to be fenced and shared with accelerators
>>>> should
>>>> be placed in driver specific placements that can explicitly handle
>>>> CPU/accelerator buffer fencing.
>>>> Currently, apart, from things silently failing nothing is enforcing
>>>> that requirement which means that it's easy for drivers and new
>>>> developers to get this wrong. To avoid the confusion we can document
>>>> this requirement and clarify the solution.
>>>>
>>>> This came up during a discussion on dri-devel:
>>>> https://lore.kernel.org/dri-devel/232f45e9-8748-1243-09bf-56763e6668b3@amd.com
>>
>> I took a slightly deeper look into this. I think we need to formalize 
>> this a bit more to understand pros and cons and what the restrictions 
>> are really all about. Anybody looking at the prevous discussion will 
>> mostly see arguments similar to "this is stupid and difficult" and 
>> "it has always been this way" which are not really constructive.
>>
>> First disregarding all accounting stuff, I think this all boils down 
>> to TTM_PL_SYSTEM having three distinct states:
>> 1) POPULATED
>> 2) LIMBO (Or whatever we want to call it. No pages present)
>> 3) SWAPPED.
>>
>> The ttm_bo_move_memcpy() helper understands these, and any standalone 
>> driver implementation of the move() callback _currently_ needs to 
>> understand these as well, unless using the ttm_bo_move_memcpy() helper.
>>
>> Now using a bounce domain to proxy SYSTEM means that the driver can 
>> forget about the SWAPPED state, it's automatically handled by the 
>> move setup code. However, another pitfall is LIMBO, in that if when 
>> you move from SYSTEM/LIMBO to your bounce domain, the BO will be 
>> populated. So any naive accelerated move() implementation creating a 
>> 1GB BO in fixed memory, like VRAM, will needlessly allocate and free 
>> 1GB of system memory in the process instead of just performing a 
>> clear operation. Looks like amdgpu suffers from this?
>>
>> I think what is really needed is either
>>
>> a) A TTM helper that helps move callback implementations resolve the 
>> issues populating system from LIMBO or SWAP, and then also formalize 
>> driver notification for swapping. At a minimum, I think the 
>> swap_notify() callback needs to be able to return a late error.
>>
>> b) Make LIMBO and SWAPPED distinct memory regions. (I think I'd vote 
>> for this without looking into it in detail).
>>
>> In both these cases, we should really make SYSTEM bindable by GPU, 
>> otherwise we'd just be trading one pitfall for another related 
>> without really resolving the root problem.
>>
>> As for fencing not being supported by SYSTEM, I'm not sure why we 
>> don't want this, because it would for example prohibit async 
>> ttm_move_memcpy(), and also, async unbinding of ttm_tt memory like 
>> MOB on vmgfx. (I think it's still sync).
>>
>> There might be an accounting issue related to this as well, but I 
>> guess Christian would need to chime in on this. If so, I think it 
>> needs to be well understood and documented (in TTM, not in AMD drivers).
>
> I think the problem goes deeper than what has been mentioned here so far.
>
> Having fences attached to BOs in the system domain is probably ok, but 
> the key point is that the BOs in the system domain are under TTMs 
> control and should not be touched by the driver.
>
> What we have now is that TTMs internals like the allocation state of 
> BOs in system memory (the populated, limbo, swapped you mentioned 
> above) is leaking into the drivers and I think exactly that is the 
> part which doesn't work reliable here. You can of course can get that 
> working, but that requires knowledge of the internal state which in my 
> eyes was always illegal.
>
Well, I tend to agree to some extent, but then, as said above, even 
disregarding swap will cause trouble with the limbo state, because the 
driver's move callback would need knowledge of that to implement moves 
limbo -> vram efficiently.

> What we could do is to split the system domain into SYSTEM and SWAP 
> and then say only the swap domain is under TTMs control.

This probably needs some thought also on how to handle the limbo state?

I could craft an RFC hiding these states with helpers that we could 
compare against the SYSTEM + SWAP memory type.

/Thomas

>
> Regards,
> Christian.
>
>>
>> Thanks,
>>
>> /Thomas
>>
>>
>>>
>>> Polite and gentle ping on that one. Are we ok with the wording here?
>>>
>>> z
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-16  7:43             ` Thomas Hellström
@ 2021-11-16  8:20               ` Christian König
  2021-11-16  8:33                 ` Thomas Hellström
  2021-11-16 15:53                 ` Zack Rusin
  0 siblings, 2 replies; 24+ messages in thread
From: Christian König @ 2021-11-16  8:20 UTC (permalink / raw)
  To: Thomas Hellström, Zack Rusin, dri-devel

Am 16.11.21 um 08:43 schrieb Thomas Hellström:
> On 11/16/21 08:19, Christian König wrote:
>> Am 13.11.21 um 12:26 schrieb Thomas Hellström:
>>> Hi, Zack,
>>>
>>> On 11/11/21 17:44, Zack Rusin wrote:
>>>> On Wed, 2021-11-10 at 09:50 -0500, Zack Rusin wrote:
>>>>> TTM takes full control over TTM_PL_SYSTEM placed buffers. This makes
>>>>> driver internal usage of TTM_PL_SYSTEM prone to errors because it
>>>>> requires the drivers to manually handle all interactions between TTM
>>>>> which can swap out those buffers whenever it thinks it's the right
>>>>> thing to do and driver.
>>>>>
>>>>> CPU buffers which need to be fenced and shared with accelerators
>>>>> should
>>>>> be placed in driver specific placements that can explicitly handle
>>>>> CPU/accelerator buffer fencing.
>>>>> Currently, apart, from things silently failing nothing is enforcing
>>>>> that requirement which means that it's easy for drivers and new
>>>>> developers to get this wrong. To avoid the confusion we can document
>>>>> this requirement and clarify the solution.
>>>>>
>>>>> This came up during a discussion on dri-devel:
>>>>> https://lore.kernel.org/dri-devel/232f45e9-8748-1243-09bf-56763e6668b3@amd.com
>>>
>>> I took a slightly deeper look into this. I think we need to 
>>> formalize this a bit more to understand pros and cons and what the 
>>> restrictions are really all about. Anybody looking at the prevous 
>>> discussion will mostly see arguments similar to "this is stupid and 
>>> difficult" and "it has always been this way" which are not really 
>>> constructive.
>>>
>>> First disregarding all accounting stuff, I think this all boils down 
>>> to TTM_PL_SYSTEM having three distinct states:
>>> 1) POPULATED
>>> 2) LIMBO (Or whatever we want to call it. No pages present)
>>> 3) SWAPPED.
>>>
>>> The ttm_bo_move_memcpy() helper understands these, and any 
>>> standalone driver implementation of the move() callback _currently_ 
>>> needs to understand these as well, unless using the 
>>> ttm_bo_move_memcpy() helper.
>>>
>>> Now using a bounce domain to proxy SYSTEM means that the driver can 
>>> forget about the SWAPPED state, it's automatically handled by the 
>>> move setup code. However, another pitfall is LIMBO, in that if when 
>>> you move from SYSTEM/LIMBO to your bounce domain, the BO will be 
>>> populated. So any naive accelerated move() implementation creating a 
>>> 1GB BO in fixed memory, like VRAM, will needlessly allocate and free 
>>> 1GB of system memory in the process instead of just performing a 
>>> clear operation. Looks like amdgpu suffers from this?
>>>
>>> I think what is really needed is either
>>>
>>> a) A TTM helper that helps move callback implementations resolve the 
>>> issues populating system from LIMBO or SWAP, and then also formalize 
>>> driver notification for swapping. At a minimum, I think the 
>>> swap_notify() callback needs to be able to return a late error.
>>>
>>> b) Make LIMBO and SWAPPED distinct memory regions. (I think I'd vote 
>>> for this without looking into it in detail).
>>>
>>> In both these cases, we should really make SYSTEM bindable by GPU, 
>>> otherwise we'd just be trading one pitfall for another related 
>>> without really resolving the root problem.
>>>
>>> As for fencing not being supported by SYSTEM, I'm not sure why we 
>>> don't want this, because it would for example prohibit async 
>>> ttm_move_memcpy(), and also, async unbinding of ttm_tt memory like 
>>> MOB on vmgfx. (I think it's still sync).
>>>
>>> There might be an accounting issue related to this as well, but I 
>>> guess Christian would need to chime in on this. If so, I think it 
>>> needs to be well understood and documented (in TTM, not in AMD 
>>> drivers).
>>
>> I think the problem goes deeper than what has been mentioned here so 
>> far.
>>
>> Having fences attached to BOs in the system domain is probably ok, 
>> but the key point is that the BOs in the system domain are under TTMs 
>> control and should not be touched by the driver.
>>
>> What we have now is that TTMs internals like the allocation state of 
>> BOs in system memory (the populated, limbo, swapped you mentioned 
>> above) is leaking into the drivers and I think exactly that is the 
>> part which doesn't work reliable here. You can of course can get that 
>> working, but that requires knowledge of the internal state which in 
>> my eyes was always illegal.
>>
> Well, I tend to agree to some extent, but then, like said above even 
> disregarding swap will cause trouble with the limbo state, because the 
> driver's move callback would need knowledge of that to implement moves 
> limbo -> vram efficiently.

Well, my long-term plan is to audit the code base once more and remove 
the limbo state from the SYSTEM domain.

E.g. instead of a SYSTEM BO without pages, you allocate a BO without a 
resource in general, which is now possible since bo->resource is a pointer.

This would still allow us to allocate "empty shell" BOs, but validating 
such a BO doesn't cause a move; it just allocates the resource for the 
first time.

The problem so far was just that we access bo->resource way too often 
without checking it.

Regards,
Christian.
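
As a rough sketch of that direction (not code from any patch in this
thread), a driver move() callback in such a world could look something
like the following. my_clear_vram() and my_copy_to_vram() are
hypothetical driver helpers, and the signature is simplified compared
to the real ttm_device_funcs.move():

#include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_bo_driver.h>
#include <drm/ttm/ttm_placement.h>
#include <drm/ttm/ttm_resource.h>

/* Hypothetical driver helpers, not real TTM or driver API. */
int my_clear_vram(struct ttm_buffer_object *bo, struct ttm_resource *new_mem);
int my_copy_to_vram(struct ttm_buffer_object *bo, struct ttm_resource *old_mem,
		    struct ttm_resource *new_mem);

static int my_move(struct ttm_buffer_object *bo, bool evict,
		   struct ttm_operation_ctx *ctx,
		   struct ttm_resource *new_mem)
{
	struct ttm_resource *old_mem = bo->resource;

	/* "Empty shell": first validation, no source data to migrate. */
	if (!old_mem && new_mem->mem_type == TTM_PL_VRAM)
		return my_clear_vram(bo, new_mem);

	/* Populated system pages going to VRAM need a real copy. */
	if (old_mem && old_mem->mem_type == TTM_PL_SYSTEM &&
	    new_mem->mem_type == TTM_PL_VRAM)
		return my_copy_to_vram(bo, old_mem, new_mem);

	/* Everything else: fall back to TTM's CPU copy. */
	return ttm_bo_move_memcpy(bo, ctx, new_mem);
}

The point being that the needless "bounce 1GB through system memory"
case Thomas describes above never happens: an empty shell going to VRAM
is just a clear.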

>
>> What we could do is to split the system domain into SYSTEM and SWAP 
>> and then say only the swap domain is under TTMs control.
>
> This probably needs some thought also on how to handle the limbo state?
>
> I could craft an RFC hiding these states with helpers that we could 
> compare against the SYSTEM + SWAP memory type.
>
> /Thomas
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Thanks,
>>>
>>> /Thomas
>>>
>>>
>>>>
>>>> Polite and gentle ping on that one. Are we ok with the wording here?
>>>>
>>>> z
>>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-16  8:20               ` Christian König
@ 2021-11-16  8:33                 ` Thomas Hellström
  2021-11-16  8:54                   ` Christian König
  2021-11-16 15:53                 ` Zack Rusin
  1 sibling, 1 reply; 24+ messages in thread
From: Thomas Hellström @ 2021-11-16  8:33 UTC (permalink / raw)
  To: Christian König, Zack Rusin, dri-devel


On 11/16/21 09:20, Christian König wrote:
> Am 16.11.21 um 08:43 schrieb Thomas Hellström:
>> On 11/16/21 08:19, Christian König wrote:
>>> Am 13.11.21 um 12:26 schrieb Thomas Hellström:
>>>> Hi, Zack,
>>>>
>>>> On 11/11/21 17:44, Zack Rusin wrote:
>>>>> On Wed, 2021-11-10 at 09:50 -0500, Zack Rusin wrote:
>>>>>> TTM takes full control over TTM_PL_SYSTEM placed buffers. This makes
>>>>>> driver internal usage of TTM_PL_SYSTEM prone to errors because it
>>>>>> requires the drivers to manually handle all interactions between TTM
>>>>>> which can swap out those buffers whenever it thinks it's the right
>>>>>> thing to do and driver.
>>>>>>
>>>>>> CPU buffers which need to be fenced and shared with accelerators
>>>>>> should
>>>>>> be placed in driver specific placements that can explicitly handle
>>>>>> CPU/accelerator buffer fencing.
>>>>>> Currently, apart, from things silently failing nothing is enforcing
>>>>>> that requirement which means that it's easy for drivers and new
>>>>>> developers to get this wrong. To avoid the confusion we can document
>>>>>> this requirement and clarify the solution.
>>>>>>
>>>>>> This came up during a discussion on dri-devel:
>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2F232f45e9-8748-1243-09bf-56763e6668b3%40amd.com&amp;data=04%7C01%7Cchristian.koenig%40amd.com%7C55e15a3b151b401993ca08d9a8d4c878%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637726454113422983%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=HSg2rZf1yFsCCOUOcoG5Y0ogGE%2FsUymh3UqJYvZ1%2BDM%3D&amp;reserved=0 
>>>>>>
>>>>
>>>> I took a slightly deeper look into this. I think we need to 
>>>> formalize this a bit more to understand pros and cons and what the 
>>>> restrictions are really all about. Anybody looking at the prevous 
>>>> discussion will mostly see arguments similar to "this is stupid and 
>>>> difficult" and "it has always been this way" which are not really 
>>>> constructive.
>>>>
>>>> First disregarding all accounting stuff, I think this all boils 
>>>> down to TTM_PL_SYSTEM having three distinct states:
>>>> 1) POPULATED
>>>> 2) LIMBO (Or whatever we want to call it. No pages present)
>>>> 3) SWAPPED.
>>>>
>>>> The ttm_bo_move_memcpy() helper understands these, and any 
>>>> standalone driver implementation of the move() callback _currently_ 
>>>> needs to understand these as well, unless using the 
>>>> ttm_bo_move_memcpy() helper.
>>>>
>>>> Now using a bounce domain to proxy SYSTEM means that the driver can 
>>>> forget about the SWAPPED state, it's automatically handled by the 
>>>> move setup code. However, another pitfall is LIMBO, in that if when 
>>>> you move from SYSTEM/LIMBO to your bounce domain, the BO will be 
>>>> populated. So any naive accelerated move() implementation creating 
>>>> a 1GB BO in fixed memory, like VRAM, will needlessly allocate and 
>>>> free 1GB of system memory in the process instead of just performing 
>>>> a clear operation. Looks like amdgpu suffers from this?
>>>>
>>>> I think what is really needed is either
>>>>
>>>> a) A TTM helper that helps move callback implementations resolve 
>>>> the issues populating system from LIMBO or SWAP, and then also 
>>>> formalize driver notification for swapping. At a minimum, I think 
>>>> the swap_notify() callback needs to be able to return a late error.
>>>>
>>>> b) Make LIMBO and SWAPPED distinct memory regions. (I think I'd 
>>>> vote for this without looking into it in detail).
>>>>
>>>> In both these cases, we should really make SYSTEM bindable by GPU, 
>>>> otherwise we'd just be trading one pitfall for another related 
>>>> without really resolving the root problem.
>>>>
>>>> As for fencing not being supported by SYSTEM, I'm not sure why we 
>>>> don't want this, because it would for example prohibit async 
>>>> ttm_move_memcpy(), and also, async unbinding of ttm_tt memory like 
>>>> MOB on vmgfx. (I think it's still sync).
>>>>
>>>> There might be an accounting issue related to this as well, but I 
>>>> guess Christian would need to chime in on this. If so, I think it 
>>>> needs to be well understood and documented (in TTM, not in AMD 
>>>> drivers).
>>>
>>> I think the problem goes deeper than what has been mentioned here so 
>>> far.
>>>
>>> Having fences attached to BOs in the system domain is probably ok, 
>>> but the key point is that the BOs in the system domain are under 
>>> TTMs control and should not be touched by the driver.
>>>
>>> What we have now is that TTMs internals like the allocation state of 
>>> BOs in system memory (the populated, limbo, swapped you mentioned 
>>> above) is leaking into the drivers and I think exactly that is the 
>>> part which doesn't work reliable here. You can of course can get 
>>> that working, but that requires knowledge of the internal state 
>>> which in my eyes was always illegal.
>>>
>> Well, I tend to agree to some extent, but then, like said above even 
>> disregarding swap will cause trouble with the limbo state, because 
>> the driver's move callback would need knowledge of that to implement 
>> moves limbo -> vram efficiently.
>
> Well my long term plan is to audit the code base once more and remove 
> the limbo state from the SYSTEM domain.
>
> E.g. instead of a SYSTEM BO without pages you allocate a BO without a 
> resource in general which is now possible since bo->resource is a 
> pointer.
>
> This would still allow us to allocate "empty shell" BOs. But a 
> validation of those BOs doesn't cause a move, but rather just 
> allocates the resource for the first time.
>
> The problem so far was just that we access bo->resource way to often 
> without checking it.

So the driver would then at least need to be aware of these empty-shell 
BOs without a resource for their move callbacks? (Again thinking of the 
move from empty shell -> VRAM.)

Thanks,

/Thomas



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-16  8:33                 ` Thomas Hellström
@ 2021-11-16  8:54                   ` Christian König
  2021-11-16  9:00                     ` Thomas Hellström
  0 siblings, 1 reply; 24+ messages in thread
From: Christian König @ 2021-11-16  8:54 UTC (permalink / raw)
  To: Thomas Hellström, Zack Rusin, dri-devel

Am 16.11.21 um 09:33 schrieb Thomas Hellström:
> On 11/16/21 09:20, Christian König wrote:
>> Am 16.11.21 um 08:43 schrieb Thomas Hellström:
>>> On 11/16/21 08:19, Christian König wrote:
>>>> Am 13.11.21 um 12:26 schrieb Thomas Hellström:
>>>>> Hi, Zack,
>>>>>
>>>>> On 11/11/21 17:44, Zack Rusin wrote:
>>>>>> On Wed, 2021-11-10 at 09:50 -0500, Zack Rusin wrote:
>>>>>>> TTM takes full control over TTM_PL_SYSTEM placed buffers. This 
>>>>>>> makes
>>>>>>> driver internal usage of TTM_PL_SYSTEM prone to errors because it
>>>>>>> requires the drivers to manually handle all interactions between 
>>>>>>> TTM
>>>>>>> which can swap out those buffers whenever it thinks it's the right
>>>>>>> thing to do and driver.
>>>>>>>
>>>>>>> CPU buffers which need to be fenced and shared with accelerators
>>>>>>> should
>>>>>>> be placed in driver specific placements that can explicitly handle
>>>>>>> CPU/accelerator buffer fencing.
>>>>>>> Currently, apart, from things silently failing nothing is enforcing
>>>>>>> that requirement which means that it's easy for drivers and new
>>>>>>> developers to get this wrong. To avoid the confusion we can 
>>>>>>> document
>>>>>>> this requirement and clarify the solution.
>>>>>>>
>>>>>>> This came up during a discussion on dri-devel:
>>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2F232f45e9-8748-1243-09bf-56763e6668b3%40amd.com&amp;data=04%7C01%7Cchristian.koenig%40amd.com%7C582935bfd2d94d97fa4808d9a8dbd574%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637726484406316306%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=4p9KuhMpWHabLEuIvJB2JEuKRhYx2gUuDywUuZ86s0o%3D&amp;reserved=0 
>>>>>>>
>>>>>
>>>>> I took a slightly deeper look into this. I think we need to 
>>>>> formalize this a bit more to understand pros and cons and what the 
>>>>> restrictions are really all about. Anybody looking at the prevous 
>>>>> discussion will mostly see arguments similar to "this is stupid 
>>>>> and difficult" and "it has always been this way" which are not 
>>>>> really constructive.
>>>>>
>>>>> First disregarding all accounting stuff, I think this all boils 
>>>>> down to TTM_PL_SYSTEM having three distinct states:
>>>>> 1) POPULATED
>>>>> 2) LIMBO (Or whatever we want to call it. No pages present)
>>>>> 3) SWAPPED.
>>>>>
>>>>> The ttm_bo_move_memcpy() helper understands these, and any 
>>>>> standalone driver implementation of the move() callback 
>>>>> _currently_ needs to understand these as well, unless using the 
>>>>> ttm_bo_move_memcpy() helper.
>>>>>
>>>>> Now using a bounce domain to proxy SYSTEM means that the driver 
>>>>> can forget about the SWAPPED state, it's automatically handled by 
>>>>> the move setup code. However, another pitfall is LIMBO, in that if 
>>>>> when you move from SYSTEM/LIMBO to your bounce domain, the BO will 
>>>>> be populated. So any naive accelerated move() implementation 
>>>>> creating a 1GB BO in fixed memory, like VRAM, will needlessly 
>>>>> allocate and free 1GB of system memory in the process instead of 
>>>>> just performing a clear operation. Looks like amdgpu suffers from 
>>>>> this?
>>>>>
>>>>> I think what is really needed is either
>>>>>
>>>>> a) A TTM helper that helps move callback implementations resolve 
>>>>> the issues populating system from LIMBO or SWAP, and then also 
>>>>> formalize driver notification for swapping. At a minimum, I think 
>>>>> the swap_notify() callback needs to be able to return a late error.
>>>>>
>>>>> b) Make LIMBO and SWAPPED distinct memory regions. (I think I'd 
>>>>> vote for this without looking into it in detail).
>>>>>
>>>>> In both these cases, we should really make SYSTEM bindable by GPU, 
>>>>> otherwise we'd just be trading one pitfall for another related 
>>>>> without really resolving the root problem.
>>>>>
>>>>> As for fencing not being supported by SYSTEM, I'm not sure why we 
>>>>> don't want this, because it would for example prohibit async 
>>>>> ttm_move_memcpy(), and also, async unbinding of ttm_tt memory like 
>>>>> MOB on vmgfx. (I think it's still sync).
>>>>>
>>>>> There might be an accounting issue related to this as well, but I 
>>>>> guess Christian would need to chime in on this. If so, I think it 
>>>>> needs to be well understood and documented (in TTM, not in AMD 
>>>>> drivers).
>>>>
>>>> I think the problem goes deeper than what has been mentioned here 
>>>> so far.
>>>>
>>>> Having fences attached to BOs in the system domain is probably ok, 
>>>> but the key point is that the BOs in the system domain are under 
>>>> TTMs control and should not be touched by the driver.
>>>>
>>>> What we have now is that TTMs internals like the allocation state 
>>>> of BOs in system memory (the populated, limbo, swapped you 
>>>> mentioned above) is leaking into the drivers and I think exactly 
>>>> that is the part which doesn't work reliable here. You can of 
>>>> course can get that working, but that requires knowledge of the 
>>>> internal state which in my eyes was always illegal.
>>>>
>>> Well, I tend to agree to some extent, but then, like said above even 
>>> disregarding swap will cause trouble with the limbo state, because 
>>> the driver's move callback would need knowledge of that to implement 
>>> moves limbo -> vram efficiently.
>>
>> Well my long term plan is to audit the code base once more and remove 
>> the limbo state from the SYSTEM domain.
>>
>> E.g. instead of a SYSTEM BO without pages you allocate a BO without a 
>> resource in general which is now possible since bo->resource is a 
>> pointer.
>>
>> This would still allow us to allocate "empty shell" BOs. But a 
>> validation of those BOs doesn't cause a move, but rather just 
>> allocates the resource for the first time.
>>
>> The problem so far was just that we access bo->resource way to often 
>> without checking it.
>
> So the driver would then at least need to be aware of these empty 
> shell bos without resource for their move callbacks? (Again thinking 
> of the move from empty shell -> VRAM).

My thinking goes more in the direction that, to the driver, this looks 
like a BO directly allocated in VRAM.

We could of course also make it a move, but offhand I don't see a reason 
for it.

Christian.

>
> Thanks,
>
> /Thomas
>
>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-16  8:54                   ` Christian König
@ 2021-11-16  9:00                     ` Thomas Hellström
  2021-11-16  9:09                       ` Christian König
  0 siblings, 1 reply; 24+ messages in thread
From: Thomas Hellström @ 2021-11-16  9:00 UTC (permalink / raw)
  To: Christian König, Zack Rusin, dri-devel


On 11/16/21 09:54, Christian König wrote:
> Am 16.11.21 um 09:33 schrieb Thomas Hellström:
>> On 11/16/21 09:20, Christian König wrote:
>>> Am 16.11.21 um 08:43 schrieb Thomas Hellström:
>>>> On 11/16/21 08:19, Christian König wrote:
>>>>> Am 13.11.21 um 12:26 schrieb Thomas Hellström:
>>>>>> Hi, Zack,
>>>>>>
>>>>>> On 11/11/21 17:44, Zack Rusin wrote:
>>>>>>> On Wed, 2021-11-10 at 09:50 -0500, Zack Rusin wrote:
>>>>>>>> TTM takes full control over TTM_PL_SYSTEM placed buffers. This 
>>>>>>>> makes
>>>>>>>> driver internal usage of TTM_PL_SYSTEM prone to errors because it
>>>>>>>> requires the drivers to manually handle all interactions 
>>>>>>>> between TTM
>>>>>>>> which can swap out those buffers whenever it thinks it's the right
>>>>>>>> thing to do and driver.
>>>>>>>>
>>>>>>>> CPU buffers which need to be fenced and shared with accelerators
>>>>>>>> should
>>>>>>>> be placed in driver specific placements that can explicitly handle
>>>>>>>> CPU/accelerator buffer fencing.
>>>>>>>> Currently, apart, from things silently failing nothing is 
>>>>>>>> enforcing
>>>>>>>> that requirement which means that it's easy for drivers and new
>>>>>>>> developers to get this wrong. To avoid the confusion we can 
>>>>>>>> document
>>>>>>>> this requirement and clarify the solution.
>>>>>>>>
>>>>>>>> This came up during a discussion on dri-devel:
>>>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2F232f45e9-8748-1243-09bf-56763e6668b3%40amd.com&amp;data=04%7C01%7Cchristian.koenig%40amd.com%7C582935bfd2d94d97fa4808d9a8dbd574%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637726484406316306%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=4p9KuhMpWHabLEuIvJB2JEuKRhYx2gUuDywUuZ86s0o%3D&amp;reserved=0 
>>>>>>>>
>>>>>>
>>>>>> I took a slightly deeper look into this. I think we need to 
>>>>>> formalize this a bit more to understand pros and cons and what 
>>>>>> the restrictions are really all about. Anybody looking at the 
>>>>>> prevous discussion will mostly see arguments similar to "this is 
>>>>>> stupid and difficult" and "it has always been this way" which are 
>>>>>> not really constructive.
>>>>>>
>>>>>> First disregarding all accounting stuff, I think this all boils 
>>>>>> down to TTM_PL_SYSTEM having three distinct states:
>>>>>> 1) POPULATED
>>>>>> 2) LIMBO (Or whatever we want to call it. No pages present)
>>>>>> 3) SWAPPED.
>>>>>>
>>>>>> The ttm_bo_move_memcpy() helper understands these, and any 
>>>>>> standalone driver implementation of the move() callback 
>>>>>> _currently_ needs to understand these as well, unless using the 
>>>>>> ttm_bo_move_memcpy() helper.
>>>>>>
>>>>>> Now using a bounce domain to proxy SYSTEM means that the driver 
>>>>>> can forget about the SWAPPED state, it's automatically handled by 
>>>>>> the move setup code. However, another pitfall is LIMBO, in that 
>>>>>> if when you move from SYSTEM/LIMBO to your bounce domain, the BO 
>>>>>> will be populated. So any naive accelerated move() implementation 
>>>>>> creating a 1GB BO in fixed memory, like VRAM, will needlessly 
>>>>>> allocate and free 1GB of system memory in the process instead of 
>>>>>> just performing a clear operation. Looks like amdgpu suffers from 
>>>>>> this?
>>>>>>
>>>>>> I think what is really needed is either
>>>>>>
>>>>>> a) A TTM helper that helps move callback implementations resolve 
>>>>>> the issues populating system from LIMBO or SWAP, and then also 
>>>>>> formalize driver notification for swapping. At a minimum, I think 
>>>>>> the swap_notify() callback needs to be able to return a late error.
>>>>>>
>>>>>> b) Make LIMBO and SWAPPED distinct memory regions. (I think I'd 
>>>>>> vote for this without looking into it in detail).
>>>>>>
>>>>>> In both these cases, we should really make SYSTEM bindable by 
>>>>>> GPU, otherwise we'd just be trading one pitfall for another 
>>>>>> related without really resolving the root problem.
>>>>>>
>>>>>> As for fencing not being supported by SYSTEM, I'm not sure why we 
>>>>>> don't want this, because it would for example prohibit async 
>>>>>> ttm_move_memcpy(), and also, async unbinding of ttm_tt memory 
>>>>>> like MOB on vmgfx. (I think it's still sync).
>>>>>>
>>>>>> There might be an accounting issue related to this as well, but I 
>>>>>> guess Christian would need to chime in on this. If so, I think it 
>>>>>> needs to be well understood and documented (in TTM, not in AMD 
>>>>>> drivers).
>>>>>
>>>>> I think the problem goes deeper than what has been mentioned here 
>>>>> so far.
>>>>>
>>>>> Having fences attached to BOs in the system domain is probably ok, 
>>>>> but the key point is that the BOs in the system domain are under 
>>>>> TTMs control and should not be touched by the driver.
>>>>>
>>>>> What we have now is that TTMs internals like the allocation state 
>>>>> of BOs in system memory (the populated, limbo, swapped you 
>>>>> mentioned above) is leaking into the drivers and I think exactly 
>>>>> that is the part which doesn't work reliable here. You can of 
>>>>> course can get that working, but that requires knowledge of the 
>>>>> internal state which in my eyes was always illegal.
>>>>>
>>>> Well, I tend to agree to some extent, but then, like said above 
>>>> even disregarding swap will cause trouble with the limbo state, 
>>>> because the driver's move callback would need knowledge of that to 
>>>> implement moves limbo -> vram efficiently.
>>>
>>> Well my long term plan is to audit the code base once more and 
>>> remove the limbo state from the SYSTEM domain.
>>>
>>> E.g. instead of a SYSTEM BO without pages you allocate a BO without 
>>> a resource in general which is now possible since bo->resource is a 
>>> pointer.
>>>
>>> This would still allow us to allocate "empty shell" BOs. But a 
>>> validation of those BOs doesn't cause a move, but rather just 
>>> allocates the resource for the first time.
>>>
>>> The problem so far was just that we access bo->resource way to often 
>>> without checking it.
>>
>> So the driver would then at least need to be aware of these empty 
>> shell bos without resource for their move callbacks? (Again thinking 
>> of the move from empty shell -> VRAM).
>
> My thinking goes more into the direction that this looks like a BO 
> directly allocated in VRAM to the driver.
>
> We could of course also make it a move, but of hand I don't see a 
> reason for it.

As long as there is a way to provide accelerated VRAM clearing if 
necessary, the directly-allocated view sounds fine with me. (Looking at 
amdgpu, it looks like you clear on resource destroy? I'm not fully sure 
that would work with all i915 use cases.)

/Thomas


>
> Christian.
>
>>
>> Thanks,
>>
>> /Thomas
>>
>>
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-16  9:00                     ` Thomas Hellström
@ 2021-11-16  9:09                       ` Christian König
  2021-11-16 15:49                         ` Daniel Vetter
  0 siblings, 1 reply; 24+ messages in thread
From: Christian König @ 2021-11-16  9:09 UTC (permalink / raw)
  To: Thomas Hellström, Zack Rusin, dri-devel

Am 16.11.21 um 10:00 schrieb Thomas Hellström:
> On 11/16/21 09:54, Christian König wrote:
>> Am 16.11.21 um 09:33 schrieb Thomas Hellström:
>>> On 11/16/21 09:20, Christian König wrote:
>>>> Am 16.11.21 um 08:43 schrieb Thomas Hellström:
>>>>> On 11/16/21 08:19, Christian König wrote:
>>>>>> [SNIP]
>>>>
>>>> Well my long term plan is to audit the code base once more and 
>>>> remove the limbo state from the SYSTEM domain.
>>>>
>>>> E.g. instead of a SYSTEM BO without pages you allocate a BO without 
>>>> a resource in general which is now possible since bo->resource is a 
>>>> pointer.
>>>>
>>>> This would still allow us to allocate "empty shell" BOs. But a 
>>>> validation of those BOs doesn't cause a move, but rather just 
>>>> allocates the resource for the first time.
>>>>
>>>> The problem so far was just that we access bo->resource way to 
>>>> often without checking it.
>>>
>>> So the driver would then at least need to be aware of these empty 
>>> shell bos without resource for their move callbacks? (Again thinking 
>>> of the move from empty shell -> VRAM).
>>
>> My thinking goes more into the direction that this looks like a BO 
>> directly allocated in VRAM to the driver.
>>
>> We could of course also make it a move, but of hand I don't see a 
>> reason for it.
>
> As long as there is a way to provide accelerated VRAM clearing if 
> necessary the directly allocated view sounds fine with me. (Looking at 
> amdgpu it looks like you clear on resource destroy? I'm not fully sure 
> that would work with all i915 use cases)

In amdgpu we have both. The AMDGPU_GEM_CREATE_VRAM_CLEARED flag clears 
the memory on allocation and the AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE 
flag makes sure it is wiped on release.

Wiping on release is sometimes faster because you don't need to wait for 
the clear to finish before you can use the buffer for the first time.

But thinking about it once more, it might be a good idea to have move 
callbacks for empty shells and freshly allocated BOs as well, so that 
the driver is informed about the state change of the BO.

Regards,
Christian.
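
For reference, a minimal userspace sketch of requesting both behaviours
through the amdgpu GEM_CREATE ioctl. This is illustrative only: error
handling and opening the render node are omitted, and the header path
may differ depending on whether the libdrm or kernel uapi copy of
amdgpu_drm.h is used.

#include <string.h>
#include <stdint.h>
#include <xf86drm.h>
#include <amdgpu_drm.h>

/* Allocate a 1 MiB VRAM BO that is cleared on allocation and wiped
 * again when the last reference is dropped. */
static int alloc_cleared_vram_bo(int fd, uint32_t *handle)
{
	union drm_amdgpu_gem_create args;
	int ret;

	memset(&args, 0, sizeof(args));
	args.in.bo_size = 1 << 20;
	args.in.alignment = 1 << 12;
	args.in.domains = AMDGPU_GEM_DOMAIN_VRAM;
	args.in.domain_flags = AMDGPU_GEM_CREATE_VRAM_CLEARED |
			       AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;

	ret = drmCommandWriteRead(fd, DRM_AMDGPU_GEM_CREATE,
				  &args, sizeof(args));
	if (ret)
		return ret;

	*handle = args.out.handle;
	return 0;
}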

>
> /Thomas
>
>
>>
>> Christian.
>>
>>>
>>> Thanks,
>>>
>>> /Thomas
>>>
>>>
>>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-16  9:09                       ` Christian König
@ 2021-11-16 15:49                         ` Daniel Vetter
  0 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2021-11-16 15:49 UTC (permalink / raw)
  To: Christian König; +Cc: Thomas Hellström, dri-devel

On Tue, Nov 16, 2021 at 10:09:44AM +0100, Christian König wrote:
> Am 16.11.21 um 10:00 schrieb Thomas Hellström:
> > On 11/16/21 09:54, Christian König wrote:
> > > Am 16.11.21 um 09:33 schrieb Thomas Hellström:
> > > > On 11/16/21 09:20, Christian König wrote:
> > > > > Am 16.11.21 um 08:43 schrieb Thomas Hellström:
> > > > > > On 11/16/21 08:19, Christian König wrote:
> > > > > > > [SNIP]
> > > > > 
> > > > > Well my long term plan is to audit the code base once more
> > > > > and remove the limbo state from the SYSTEM domain.
> > > > > 
> > > > > E.g. instead of a SYSTEM BO without pages you allocate a BO
> > > > > without a resource in general which is now possible since
> > > > > bo->resource is a pointer.
> > > > > 
> > > > > This would still allow us to allocate "empty shell" BOs. But
> > > > > a validation of those BOs doesn't cause a move, but rather
> > > > > just allocates the resource for the first time.
> > > > > 
> > > > > The problem so far was just that we access bo->resource way
> > > > > to often without checking it.
> > > > 
> > > > So the driver would then at least need to be aware of these
> > > > empty shell bos without resource for their move callbacks?
> > > > (Again thinking of the move from empty shell -> VRAM).
> > > 
> > > My thinking goes more into the direction that this looks like a BO
> > > directly allocated in VRAM to the driver.
> > > 
> > > We could of course also make it a move, but of hand I don't see a
> > > reason for it.
> > 
> > As long as there is a way to provide accelerated VRAM clearing if
> > necessary the directly allocated view sounds fine with me. (Looking at
> > amdgpu it looks like you clear on resource destroy? I'm not fully sure
> > that would work with all i915 use cases)
> 
> In amdgpu we have both. The AMDGPU_GEM_CREATE_VRAM_CLEARED flag clears the
> memory on allocation and AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE flag makes
> sure it is wiped on release.

btw how does this work when there was random previous stuff in that
memory region?

Like, the more we do hmm-style integration, the more you cannot assume
that critical memory is cleared on free/release, because that's just not
how core mm/ works, where pages are all cleared on alloc before userspace
sees them.

Entirely sidetrack discussion :-)
-Daniel

> Wiping on release is sometimes faster because you don't need to wait for the
> clear to finish before you can first use it.
> 
> But thinking about it once more it might be a good idea to have move
> callbacks for empty shells and freshly allocated BOs as well, so that the
> driver is informed about the state change of the BO.
> 
> Regards,
> Christian.
> 
> > 
> > /Thomas
> > 
> > 
> > > 
> > > Christian.
> > > 
> > > > 
> > > > Thanks,
> > > > 
> > > > /Thomas
> > > > 
> > > > 
> > > 
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-16  8:20               ` Christian König
  2021-11-16  8:33                 ` Thomas Hellström
@ 2021-11-16 15:53                 ` Zack Rusin
  2021-11-22 14:15                   ` Christian König
  1 sibling, 1 reply; 24+ messages in thread
From: Zack Rusin @ 2021-11-16 15:53 UTC (permalink / raw)
  To: Christian König; +Cc: Thomas Hellström, dri-devel



> On Nov 16, 2021, at 03:20, Christian König <christian.koenig@amd.com> wrote:
> 
> Am 16.11.21 um 08:43 schrieb Thomas Hellström:
>> On 11/16/21 08:19, Christian König wrote:
>>> Am 13.11.21 um 12:26 schrieb Thomas Hellström:
>>>> Hi, Zack,
>>>> 
>>>> On 11/11/21 17:44, Zack Rusin wrote:
>>>>> On Wed, 2021-11-10 at 09:50 -0500, Zack Rusin wrote:
>>>>>> TTM takes full control over TTM_PL_SYSTEM placed buffers. This makes
>>>>>> driver internal usage of TTM_PL_SYSTEM prone to errors because it
>>>>>> requires the drivers to manually handle all interactions between TTM
>>>>>> which can swap out those buffers whenever it thinks it's the right
>>>>>> thing to do and driver.
>>>>>> 
>>>>>> CPU buffers which need to be fenced and shared with accelerators
>>>>>> should
>>>>>> be placed in driver specific placements that can explicitly handle
>>>>>> CPU/accelerator buffer fencing.
>>>>>> Currently, apart, from things silently failing nothing is enforcing
>>>>>> that requirement which means that it's easy for drivers and new
>>>>>> developers to get this wrong. To avoid the confusion we can document
>>>>>> this requirement and clarify the solution.
>>>>>> 
>>>>>> This came up during a discussion on dri-devel:
>>>>>> https://nam04.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2F232f45e9-8748-1243-09bf-56763e6668b3%40amd.com&amp;data=04%7C01%7Czackr%40vmware.com%7C2a4063bb30f84dc921ba08d9a8d9ea8c%7Cb39138ca3cee4b4aa4d6cd83d9dd62f0%7C0%7C0%7C637726476178183950%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=UF0kuamK%2FfETPW22EbEaQ0rUrDqSQbBtFwT5DwknY1s%3D&amp;reserved=0 
>>>> 
>>>> I took a slightly deeper look into this. I think we need to formalize this a bit more to understand pros and cons and what the restrictions are really all about. Anybody looking at the prevous discussion will mostly see arguments similar to "this is stupid and difficult" and "it has always been this way" which are not really constructive.
>>>> 
>>>> First disregarding all accounting stuff, I think this all boils down to TTM_PL_SYSTEM having three distinct states:
>>>> 1) POPULATED
>>>> 2) LIMBO (Or whatever we want to call it. No pages present)
>>>> 3) SWAPPED.
>>>> 
>>>> The ttm_bo_move_memcpy() helper understands these, and any standalone driver implementation of the move() callback _currently_ needs to understand these as well, unless using the ttm_bo_move_memcpy() helper.
>>>> 
>>>> Now using a bounce domain to proxy SYSTEM means that the driver can forget about the SWAPPED state, it's automatically handled by the move setup code. However, another pitfall is LIMBO, in that if when you move from SYSTEM/LIMBO to your bounce domain, the BO will be populated. So any naive accelerated move() implementation creating a 1GB BO in fixed memory, like VRAM, will needlessly allocate and free 1GB of system memory in the process instead of just performing a clear operation. Looks like amdgpu suffers from this?
>>>> 
>>>> I think what is really needed is either
>>>> 
>>>> a) A TTM helper that helps move callback implementations resolve the issues populating system from LIMBO or SWAP, and then also formalize driver notification for swapping. At a minimum, I think the swap_notify() callback needs to be able to return a late error.
>>>> 
>>>> b) Make LIMBO and SWAPPED distinct memory regions. (I think I'd vote for this without looking into it in detail).
>>>> 
>>>> In both these cases, we should really make SYSTEM bindable by GPU, otherwise we'd just be trading one pitfall for another related without really resolving the root problem.
>>>> 
>>>> As for fencing not being supported by SYSTEM, I'm not sure why we don't want this, because it would for example prohibit async ttm_move_memcpy(), and also, async unbinding of ttm_tt memory like MOB on vmgfx. (I think it's still sync).
>>>> 
>>>> There might be an accounting issue related to this as well, but I guess Christian would need to chime in on this. If so, I think it needs to be well understood and documented (in TTM, not in AMD drivers).
>>> 
>>> I think the problem goes deeper than what has been mentioned here so far.
>>> 
>>> Having fences attached to BOs in the system domain is probably ok, but the key point is that the BOs in the system domain are under TTMs control and should not be touched by the driver.
>>> 
>>> What we have now is that TTMs internals like the allocation state of BOs in system memory (the populated, limbo, swapped you mentioned above) is leaking into the drivers and I think exactly that is the part which doesn't work reliable here. You can of course can get that working, but that requires knowledge of the internal state which in my eyes was always illegal.
>>> 
>> Well, I tend to agree to some extent, but then, like said above even disregarding swap will cause trouble with the limbo state, because the driver's move callback would need knowledge of that to implement moves limbo -> vram efficiently.
> 
> Well my long term plan is to audit the code base once more and remove the limbo state from the SYSTEM domain.
> 
> E.g. instead of a SYSTEM BO without pages you allocate a BO without a resource in general which is now possible since bo->resource is a pointer.
> 
> This would still allow us to allocate "empty shell" BOs. But a validation of those BOs doesn't cause a move, but rather just allocates the resource for the first time.
> 
> The problem so far was just that we access bo->resource way to often without checking it.

So all in all this would be a two-step process: 1) eliminate the “limbo” state, 2) split PL_SYSTEM into PL_SYSTEM, which drivers could use for really anything, and PL_SWAP, which would be under complete TTM control, yes? If that’s the case, it would certainly make my life a lot easier (because drivers wouldn’t need to introduce/manage their own system placements) and I think it would make the code a lot easier to understand.

That’s a bit of work though, so the question of what happens until this lands comes to mind. Is introducing VMW_PL_SYSTEM and documenting that PL_SYSTEM is under complete TTM control (like this patch does) the way to go, or do we want to start working on the above immediately? Because I’d love to be able to unload vmwgfx without the kernel oopsing =)

z
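
For the interim option, the driver-private placement amounts to
something like the sketch below. This is illustrative only: the index
just needs to be a free TTM_PL_PRIV-based slot for the driver, and a
matching ttm_resource_manager still has to be registered for the new
mem_type (which is what the VMW_PL_SYSTEM patch in this series does).

#include <drm/ttm/ttm_placement.h>

/* Illustrative only: a driver-private "system" placement of the kind
 * discussed here. */
#define MY_PL_SYSTEM	(TTM_PL_PRIV + 2)

static const struct ttm_place my_sys_place = {
	.fpfn = 0,
	.lpfn = 0,
	.mem_type = MY_PL_SYSTEM,
	.flags = 0,
};

static struct ttm_placement my_sys_placement = {
	.num_placement = 1,
	.placement = &my_sys_place,
	.num_busy_placement = 1,
	.busy_placement = &my_sys_place,
};

BOs that the device needs to access are then validated into this
placement instead of TTM_PL_SYSTEM, so TTM never swaps them out behind
the driver's back.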


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-16 15:53                 ` Zack Rusin
@ 2021-11-22 14:15                   ` Christian König
  2021-11-22 15:05                     ` Zack Rusin
  0 siblings, 1 reply; 24+ messages in thread
From: Christian König @ 2021-11-22 14:15 UTC (permalink / raw)
  To: Zack Rusin; +Cc: Thomas Hellström, dri-devel

Am 16.11.21 um 16:53 schrieb Zack Rusin:
>> On Nov 16, 2021, at 03:20, Christian König <christian.koenig@amd.com> wrote:
>>
>> Am 16.11.21 um 08:43 schrieb Thomas Hellström:
>>> On 11/16/21 08:19, Christian König wrote:
>>>> Am 13.11.21 um 12:26 schrieb Thomas Hellström:
>>>>> Hi, Zack,
>>>>>
>>>>> On 11/11/21 17:44, Zack Rusin wrote:
>>>>>> On Wed, 2021-11-10 at 09:50 -0500, Zack Rusin wrote:
>>>>>>> TTM takes full control over TTM_PL_SYSTEM placed buffers. This makes
>>>>>>> driver internal usage of TTM_PL_SYSTEM prone to errors because it
>>>>>>> requires the drivers to manually handle all interactions between TTM
>>>>>>> which can swap out those buffers whenever it thinks it's the right
>>>>>>> thing to do and driver.
>>>>>>>
>>>>>>> CPU buffers which need to be fenced and shared with accelerators
>>>>>>> should
>>>>>>> be placed in driver specific placements that can explicitly handle
>>>>>>> CPU/accelerator buffer fencing.
>>>>>>> Currently, apart, from things silently failing nothing is enforcing
>>>>>>> that requirement which means that it's easy for drivers and new
>>>>>>> developers to get this wrong. To avoid the confusion we can document
>>>>>>> this requirement and clarify the solution.
>>>>>>>
>>>>>>> This came up during a discussion on dri-devel:
>>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2F232f45e9-8748-1243-09bf-56763e6668b3%40amd.com&amp;data=04%7C01%7Cchristian.koenig%40amd.com%7C57658f299a72436627e608d9a9194209%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637726748229186505%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=UbE43u8a0MPNgXtzcqkJJfSe0I%2BA5Yojz7yoh7e6Oyo%3D&amp;reserved=0
>>>>> I took a slightly deeper look into this. I think we need to formalize this a bit more to understand pros and cons and what the restrictions are really all about. Anybody looking at the prevous discussion will mostly see arguments similar to "this is stupid and difficult" and "it has always been this way" which are not really constructive.
>>>>>
>>>>> First disregarding all accounting stuff, I think this all boils down to TTM_PL_SYSTEM having three distinct states:
>>>>> 1) POPULATED
>>>>> 2) LIMBO (Or whatever we want to call it. No pages present)
>>>>> 3) SWAPPED.
>>>>>
>>>>> The ttm_bo_move_memcpy() helper understands these, and any standalone driver implementation of the move() callback _currently_ needs to understand these as well, unless using the ttm_bo_move_memcpy() helper.
>>>>>
>>>>> Now using a bounce domain to proxy SYSTEM means that the driver can forget about the SWAPPED state, it's automatically handled by the move setup code. However, another pitfall is LIMBO, in that if when you move from SYSTEM/LIMBO to your bounce domain, the BO will be populated. So any naive accelerated move() implementation creating a 1GB BO in fixed memory, like VRAM, will needlessly allocate and free 1GB of system memory in the process instead of just performing a clear operation. Looks like amdgpu suffers from this?
>>>>>
>>>>> I think what is really needed is either
>>>>>
>>>>> a) A TTM helper that helps move callback implementations resolve the issues populating system from LIMBO or SWAP, and then also formalize driver notification for swapping. At a minimum, I think the swap_notify() callback needs to be able to return a late error.
>>>>>
>>>>> b) Make LIMBO and SWAPPED distinct memory regions. (I think I'd vote for this without looking into it in detail).
>>>>>
>>>>> In both these cases, we should really make SYSTEM bindable by GPU, otherwise we'd just be trading one pitfall for another related without really resolving the root problem.
>>>>>
>>>>> As for fencing not being supported by SYSTEM, I'm not sure why we don't want this, because it would for example prohibit async ttm_move_memcpy(), and also, async unbinding of ttm_tt memory like MOB on vmgfx. (I think it's still sync).
>>>>>
>>>>> There might be an accounting issue related to this as well, but I guess Christian would need to chime in on this. If so, I think it needs to be well understood and documented (in TTM, not in AMD drivers).
>>>> I think the problem goes deeper than what has been mentioned here so far.
>>>>
>>>> Having fences attached to BOs in the system domain is probably ok, but the key point is that the BOs in the system domain are under TTMs control and should not be touched by the driver.
>>>>
>>>> What we have now is that TTMs internals like the allocation state of BOs in system memory (the populated, limbo, swapped you mentioned above) is leaking into the drivers and I think exactly that is the part which doesn't work reliable here. You can of course can get that working, but that requires knowledge of the internal state which in my eyes was always illegal.
>>>>
>>> Well, I tend to agree to some extent, but then, like said above even disregarding swap will cause trouble with the limbo state, because the driver's move callback would need knowledge of that to implement moves limbo -> vram efficiently.
>> Well my long term plan is to audit the code base once more and remove the limbo state from the SYSTEM domain.
>>
>> E.g. instead of a SYSTEM BO without pages you allocate a BO without a resource in general which is now possible since bo->resource is a pointer.
>>
>> This would still allow us to allocate "empty shell" BOs. But a validation of those BOs doesn't cause a move, but rather just allocates the resource for the first time.
>>
>> The problem so far was just that we access bo->resource way to often without checking it.
> So all in all this would be a two step process 1) Eliminate the “limbo” state, 2) Split PL_SYSTEM into PL_SYSTEM, that drivers could use for really anything, and PL_SWAP which would be under complete TTM control, yes? If that’s the case that would certainly make my life a lot easier (because the drivers wouldn’t need to introduce/manage their own system placements) and I think it would make the code a lot easier to understand.

We also have a couple of other use cases for this, yes.

> That’s a bit of work though so the question what happens until this lands comes to mind. Is introduction of VMW_PL_SYSTEM and documenting that PL_SYSTEM is under complete TTM control (like this patch does) the way to go or do we want to start working on the above immediately? Because I’d love to be able to unload vmwgfx without kernel oopsing =)

I think documenting and getting into a clean state should be a 
prerequisite for new development (even if the fix for the unload is 
trivial).

Regards,
Christian.

>
> z
>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-22 14:15                   ` Christian König
@ 2021-11-22 15:05                     ` Zack Rusin
  0 siblings, 0 replies; 24+ messages in thread
From: Zack Rusin @ 2021-11-22 15:05 UTC (permalink / raw)
  To: christian.koenig; +Cc: thomas.hellstrom, dri-devel

On Mon, 2021-11-22 at 15:15 +0100, Christian König wrote:
> Am 16.11.21 um 16:53 schrieb Zack Rusin:
> > > On Nov 16, 2021, at 03:20, Christian König
> > > <christian.koenig@amd.com> wrote:
> > > 
> > > Am 16.11.21 um 08:43 schrieb Thomas Hellström:
> > > > On 11/16/21 08:19, Christian König wrote:
> > > > > Am 13.11.21 um 12:26 schrieb Thomas Hellström:
> > > > > > Hi, Zack,
> > > > > > 
> > > > > > On 11/11/21 17:44, Zack Rusin wrote:
> > > > > > > On Wed, 2021-11-10 at 09:50 -0500, Zack Rusin wrote:
> > > > > > > > TTM takes full control over TTM_PL_SYSTEM placed
> > > > > > > > buffers. This makes
> > > > > > > > driver internal usage of TTM_PL_SYSTEM prone to errors
> > > > > > > > because it
> > > > > > > > requires the drivers to manually handle all
> > > > > > > > interactions between TTM
> > > > > > > > which can swap out those buffers whenever it thinks
> > > > > > > > it's the right
> > > > > > > > thing to do and driver.
> > > > > > > > 
> > > > > > > > CPU buffers which need to be fenced and shared with
> > > > > > > > accelerators
> > > > > > > > should
> > > > > > > > be placed in driver specific placements that can
> > > > > > > > explicitly handle
> > > > > > > > CPU/accelerator buffer fencing.
> > > > > > > > Currently, apart, from things silently failing nothing
> > > > > > > > is enforcing
> > > > > > > > that requirement which means that it's easy for drivers
> > > > > > > > and new
> > > > > > > > developers to get this wrong. To avoid the confusion we
> > > > > > > > can document
> > > > > > > > this requirement and clarify the solution.
> > > > > > > > 
> > > > > > > > This came up during a discussion on dri-devel:
> > > > > > > > https://nam04.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2F232f45e9-8748-1243-09bf-56763e6668b3%40amd.com&amp;data=04%7C01%7Czackr%40vmware.com%7C084389d8acc04ffdbbb808d9adc28cfe%7Cb39138ca3cee4b4aa4d6cd83d9dd62f0%7C0%7C0%7C637731873379429725%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=ahHm7HV9VPAhfBKsk9xOBcnObsJXHvVAbCdEhJ%2BjJYU%3D&amp;reserved=0
> > > > > > I took a slightly deeper look into this. I think we need to
> > > > > > formalize this a bit more to understand pros and cons and
> > > > > > what the restrictions are really all about. Anybody looking
> > > > > > at the prevous discussion will mostly see arguments similar
> > > > > > to "this is stupid and difficult" and "it has always been
> > > > > > this way" which are not really constructive.
> > > > > > 
> > > > > > First disregarding all accounting stuff, I think this all
> > > > > > boils down to TTM_PL_SYSTEM having three distinct states:
> > > > > > 1) POPULATED
> > > > > > 2) LIMBO (Or whatever we want to call it. No pages present)
> > > > > > 3) SWAPPED.
> > > > > > 
> > > > > > The ttm_bo_move_memcpy() helper understands these, and any
> > > > > > standalone driver implementation of the move() callback
> > > > > > _currently_ needs to understand these as well, unless using
> > > > > > the ttm_bo_move_memcpy() helper.
> > > > > > 
> > > > > > Now using a bounce domain to proxy SYSTEM means that the
> > > > > > driver can forget about the SWAPPED state, it's
> > > > > > automatically handled by the move setup code. However,
> > > > > > another pitfall is LIMBO, in that if when you move from
> > > > > > SYSTEM/LIMBO to your bounce domain, the BO will be
> > > > > > populated. So any naive accelerated move() implementation
> > > > > > creating a 1GB BO in fixed memory, like VRAM, will
> > > > > > needlessly allocate and free 1GB of system memory in the
> > > > > > process instead of just performing a clear operation. Looks
> > > > > > like amdgpu suffers from this?
> > > > > > 
> > > > > > I think what is really needed is either
> > > > > > 
> > > > > > a) A TTM helper that helps move callback implementations
> > > > > > resolve the issues populating system from LIMBO or SWAP,
> > > > > > and then also formalize driver notification for swapping.
> > > > > > At a minimum, I think the swap_notify() callback needs to
> > > > > > be able to return a late error.
> > > > > > 
> > > > > > b) Make LIMBO and SWAPPED distinct memory regions. (I think
> > > > > > I'd vote for this without looking into it in detail).
> > > > > > 
> > > > > > In both these cases, we should really make SYSTEM bindable
> > > > > > by GPU, otherwise we'd just be trading one pitfall for
> > > > > > another related without really resolving the root problem.
> > > > > > 
> > > > > > As for fencing not being supported by SYSTEM, I'm not sure
> > > > > > why we don't want this, because it would for example
> > > > > > prohibit async ttm_move_memcpy(), and also, async unbinding
> > > > > > of ttm_tt memory like MOB on vmgfx. (I think it's still
> > > > > > sync).
> > > > > > 
> > > > > > There might be an accounting issue related to this as well,
> > > > > > but I guess Christian would need to chime in on this. If
> > > > > > so, I think it needs to be well understood and documented
> > > > > > (in TTM, not in AMD drivers).
> > > > > I think the problem goes deeper than what has been mentioned
> > > > > here so far.
> > > > > 
> > > > > Having fences attached to BOs in the system domain is
> > > > > probably ok, but the key point is that the BOs in the system
> > > > > domain are under TTMs control and should not be touched by
> > > > > the driver.
> > > > > 
> > > > > What we have now is that TTMs internals like the allocation
> > > > > state of BOs in system memory (the populated, limbo, swapped
> > > > > you mentioned above) is leaking into the drivers and I think
> > > > > exactly that is the part which doesn't work reliable here.
> > > > > You can of course can get that working, but that requires
> > > > > knowledge of the internal state which in my eyes was always
> > > > > illegal.
> > > > > 
> > > > Well, I tend to agree to some extent, but then, like said above
> > > > even disregarding swap will cause trouble with the limbo state,
> > > > because the driver's move callback would need knowledge of that
> > > > to implement moves limbo -> vram efficiently.
> > > Well my long term plan is to audit the code base once more and
> > > remove the limbo state from the SYSTEM domain.
> > > 
> > > E.g. instead of a SYSTEM BO without pages you allocate a BO
> > > without a resource in general which is now possible since bo-
> > > >resource is a pointer.
> > > 
> > > This would still allow us to allocate "empty shell" BOs. But a
> > > validation of those BOs doesn't cause a move, but rather just
> > > allocates the resource for the first time.
> > > 
> > > The problem so far was just that we access bo->resource way to
> > > often without checking it.
> > So all in all this would be a two step process 1) Eliminate the
> > “limbo” state, 2) Split PL_SYSTEM into PL_SYSTEM, that drivers
> > could use for really anything, and PL_SWAP which would be under
> > complete TTM control, yes? If that’s the case that would certainly
> > make my life a lot easier (because the drivers wouldn’t need to
> > introduce/manage their own system placements) and I think it would
> > make the code a lot easier to understand.
> 
> We also have a couple of other use cases for this, yes.
> 
> > That’s a bit of work though so the question what happens until this
> > lands comes to mind. Is introduction of VMW_PL_SYSTEM and
> > documenting that PL_SYSTEM is under complete TTM control (like this
> > patch does) the way to go or do we want to start working on the
> > above immediately? Because I’d love to be able to unload vmwgfx
> > without kernel oopsing =)
> 
> I think documenting and getting into a clean state should be 
> prerequisite for new development (even if the fix for the unload is 
> trivial).

That sounds great (both points). Could you double check the wording and
ack it (ideally both the docs and the VMW_PL_SYSTEM patch) if you think
it's ready to go in?

z

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control
  2021-11-10 14:50     ` [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control Zack Rusin
  2021-11-11 16:44       ` Zack Rusin
@ 2021-11-24  8:22       ` Christian König
  1 sibling, 0 replies; 24+ messages in thread
From: Christian König @ 2021-11-24  8:22 UTC (permalink / raw)
  To: Zack Rusin, dri-devel; +Cc: Thomas Hellström

Am 10.11.21 um 15:50 schrieb Zack Rusin:
> TTM takes full control over TTM_PL_SYSTEM placed buffers. This makes
> driver internal usage of TTM_PL_SYSTEM prone to errors because it
> requires the drivers to manually handle all interactions between TTM
> which can swap out those buffers whenever it thinks it's the right
> thing to do and driver.
>
> CPU buffers which need to be fenced and shared with accelerators should
> be placed in driver specific placements that can explicitly handle
> CPU/accelerator buffer fencing.
> Currently, apart, from things silently failing nothing is enforcing
> that requirement which means that it's easy for drivers and new
> developers to get this wrong. To avoid the confusion we can document
> this requirement and clarify the solution.
>
> This came up during a discussion on dri-devel:
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fdri-devel%2F232f45e9-8748-1243-09bf-56763e6668b3%40amd.com&amp;data=04%7C01%7Cchristian.koenig%40amd.com%7Cbcf8d8977e68448fa20808d9a4597927%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637721526467013347%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=3pldUWJ8u7AEDxtG0vQBINIL7%2FQE4HiE%2FQ7x8fi0MK8%3D&amp;reserved=0
>
> Signed-off-by: Zack Rusin <zackr@vmware.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   include/drm/ttm/ttm_placement.h | 11 +++++++++++
>   1 file changed, 11 insertions(+)
>
> diff --git a/include/drm/ttm/ttm_placement.h b/include/drm/ttm/ttm_placement.h
> index 76d1b9119a2b..8074d0f6cae5 100644
> --- a/include/drm/ttm/ttm_placement.h
> +++ b/include/drm/ttm/ttm_placement.h
> @@ -35,6 +35,17 @@
>   
>   /*
>    * Memory regions for data placement.
> + *
> + * Buffers placed in TTM_PL_SYSTEM are considered under TTM's control and can
> + * be swapped out whenever TTM thinks it is a good idea.
> + * In cases where drivers would like to use TTM_PL_SYSTEM as a valid
> + * placement, they need to be able to handle the issues that arise due to the
> + * above manually.
> + *
> + * For BOs which reside in system memory but for which the accelerator
> + * requires direct access (i.e. their usage needs to be synchronized
> + * between the CPU and accelerator via fences), a new, driver-private
> + * placement that can handle such scenarios is a good idea.
>    */
>   
>   #define TTM_PL_SYSTEM           0
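
As a usage sketch of the guidance above (assuming a driver-private
placement like the my_sys_placement sketched earlier in this thread;
my_bo_validate_for_device() is a hypothetical helper):

#include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_placement.h>

extern struct ttm_placement my_sys_placement;

/* Keep device-accessible BOs out of TTM_PL_SYSTEM by validating them
 * into the driver-private placement instead. */
static int my_bo_validate_for_device(struct ttm_buffer_object *bo)
{
	struct ttm_operation_ctx ctx = {
		.interruptible = true,
		.no_wait_gpu = false,
	};

	return ttm_bo_validate(bo, &my_sys_placement, &ctx);
}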


^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2021-11-24  8:22 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-05 19:38 [PATCH v2 0/6] Support module unload and hotunplug Zack Rusin
2021-11-05 19:38 ` [PATCH v2 1/6] drm/vmwgfx: Remove the deprecated lower mem limit Zack Rusin
2021-11-05 19:38 ` [PATCH v2 2/6] drm/vmwgfx: Release ttm memory if probe fails Zack Rusin
2021-11-05 19:38 ` [PATCH v2 3/6] drm/vmwgfx: Fail to initialize on broken configs Zack Rusin
2021-11-05 19:38 ` [PATCH v2 4/6] drm/vmwgfx: Introduce a new placement for MOB page tables Zack Rusin
2021-11-05 19:38 ` [PATCH v2 5/6] drm/vmwgfx: Switch the internal BO's to ttm_bo_type_kernel Zack Rusin
2021-11-05 19:38 ` [PATCH v2 6/6] drm/ttm: Clarify that the TTM_PL_SYSTEM buffers need to stay idle Zack Rusin
2021-11-08 11:07   ` Christian König
2021-11-08 15:42     ` [PATCH v3] " Zack Rusin
2021-11-10 14:50     ` [PATCH v4] drm/ttm: Clarify that the TTM_PL_SYSTEM is under TTMs control Zack Rusin
2021-11-11 16:44       ` Zack Rusin
2021-11-13 11:26         ` Thomas Hellström
2021-11-16  7:19           ` Christian König
2021-11-16  7:43             ` Thomas Hellström
2021-11-16  8:20               ` Christian König
2021-11-16  8:33                 ` Thomas Hellström
2021-11-16  8:54                   ` Christian König
2021-11-16  9:00                     ` Thomas Hellström
2021-11-16  9:09                       ` Christian König
2021-11-16 15:49                         ` Daniel Vetter
2021-11-16 15:53                 ` Zack Rusin
2021-11-22 14:15                   ` Christian König
2021-11-22 15:05                     ` Zack Rusin
2021-11-24  8:22       ` Christian König
