All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
@ 2021-07-22 12:41 Christian König
  2021-07-22 12:41 ` [PATCH 2/5] drm/amdgpu: unbind in amdgpu_ttm_tt_unpopulate Christian König
                   ` (4 more replies)
  0 siblings, 5 replies; 21+ messages in thread
From: Christian König @ 2021-07-22 12:41 UTC (permalink / raw)
  To: dri-devel, airlied, daniel

Doing this in vmw_ttm_destroy() is to late.

It turned out that this is not a good idea at all because it leaves pointers
to freed up system memory pages in the GART tables of the drivers.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index b0973c27e774..904031d03dbe 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
@@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
 	struct vmw_ttm_tt *vmw_be =
 		container_of(ttm, struct vmw_ttm_tt, dma_ttm);
 
-	vmw_ttm_unbind(bdev, ttm);
 	ttm_tt_destroy_common(bdev, ttm);
 	vmw_ttm_unmap_dma(vmw_be);
-	if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
-		ttm_tt_fini(&vmw_be->dma_ttm);
-	else
-		ttm_tt_fini(ttm);
-
+	ttm_tt_fini(ttm);
 	if (vmw_be->mob)
 		vmw_mob_destroy(vmw_be->mob);
 
@@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev,
 						 dma_ttm);
 	unsigned int i;
 
+	vmw_ttm_unbind(bdev, ttm);
+
 	if (vmw_tt->mob) {
 		vmw_mob_destroy(vmw_tt->mob);
 		vmw_tt->mob = NULL;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 2/5] drm/amdgpu: unbind in amdgpu_ttm_tt_unpopulate
  2021-07-22 12:41 [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate Christian König
@ 2021-07-22 12:41 ` Christian König
  2021-07-22 12:41 ` [PATCH 3/5] drm/nouveau: unbind in nouveau_ttm_tt_unpopulate Christian König
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 21+ messages in thread
From: Christian König @ 2021-07-22 12:41 UTC (permalink / raw)
  To: dri-devel, airlied, daniel

Doing this in amdgpu_ttm_backend_destroy() is to late.

It turned out that this is not a good idea at all because it leaves pointers
to freed up system memory pages in the GART tables of the drivers.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index b46726e47bce..42b5162814b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1070,7 +1070,6 @@ static void amdgpu_ttm_backend_destroy(struct ttm_device *bdev,
 {
 	struct amdgpu_ttm_tt *gtt = (void *)ttm;
 
-	amdgpu_ttm_backend_unbind(bdev, ttm);
 	ttm_tt_destroy_common(bdev, ttm);
 	if (gtt->usertask)
 		put_task_struct(gtt->usertask);
@@ -1154,6 +1153,8 @@ static void amdgpu_ttm_tt_unpopulate(struct ttm_device *bdev,
 	struct amdgpu_ttm_tt *gtt = (void *)ttm;
 	struct amdgpu_device *adev;
 
+	amdgpu_ttm_backend_unbind(bdev, ttm);
+
 	if (gtt && gtt->userptr) {
 		amdgpu_ttm_tt_set_user_pages(ttm, NULL);
 		kfree(ttm->sg);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 3/5] drm/nouveau: unbind in nouveau_ttm_tt_unpopulate
  2021-07-22 12:41 [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate Christian König
  2021-07-22 12:41 ` [PATCH 2/5] drm/amdgpu: unbind in amdgpu_ttm_tt_unpopulate Christian König
@ 2021-07-22 12:41 ` Christian König
  2021-07-22 12:41 ` [PATCH 4/5] drm/radeon: unbind in radeon_ttm_tt_unpopulate() Christian König
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 21+ messages in thread
From: Christian König @ 2021-07-22 12:41 UTC (permalink / raw)
  To: dri-devel, airlied, daniel

Doing this in nouveau_ttm_tt_destroy()/nouveau_sgdma_destroy() is to late.

It turned out that this is not a good idea at all because it leaves pointers
to freed up system memory pages in the GART tables of the drivers.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/nouveau/nouveau_bo.c    | 3 ++-
 drivers/gpu/drm/nouveau/nouveau_sgdma.c | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 085023624fb0..5f309a4ec211 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1276,6 +1276,8 @@ nouveau_ttm_tt_unpopulate(struct ttm_device *bdev,
 	if (slave)
 		return;
 
+	nouveau_ttm_tt_unbind(bdev, ttm);
+
 	drm = nouveau_bdev(bdev);
 	dev = drm->dev->dev;
 
@@ -1289,7 +1291,6 @@ nouveau_ttm_tt_destroy(struct ttm_device *bdev,
 #if IS_ENABLED(CONFIG_AGP)
 	struct nouveau_drm *drm = nouveau_bdev(bdev);
 	if (drm->agp.bridge) {
-		ttm_agp_unbind(ttm);
 		ttm_tt_destroy_common(bdev, ttm);
 		ttm_agp_destroy(ttm);
 		return;
diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
index 256ec5b35473..bde92a9dae7a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
@@ -21,7 +21,6 @@ nouveau_sgdma_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
 	struct nouveau_sgdma_be *nvbe = (struct nouveau_sgdma_be *)ttm;
 
 	if (ttm) {
-		nouveau_sgdma_unbind(bdev, ttm);
 		ttm_tt_destroy_common(bdev, ttm);
 		ttm_tt_fini(&nvbe->ttm);
 		kfree(nvbe);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 4/5] drm/radeon: unbind in radeon_ttm_tt_unpopulate()
  2021-07-22 12:41 [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate Christian König
  2021-07-22 12:41 ` [PATCH 2/5] drm/amdgpu: unbind in amdgpu_ttm_tt_unpopulate Christian König
  2021-07-22 12:41 ` [PATCH 3/5] drm/nouveau: unbind in nouveau_ttm_tt_unpopulate Christian König
@ 2021-07-22 12:41 ` Christian König
  2021-07-22 12:41 ` [PATCH 5/5] drm/ttm: revert "flip tt destroy ordering." Christian König
  2021-07-23  8:47 ` [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate Daniel Vetter
  4 siblings, 0 replies; 21+ messages in thread
From: Christian König @ 2021-07-22 12:41 UTC (permalink / raw)
  To: dri-devel, airlied, daniel

Doing this in radeon_ttm_tt_destroy() is to late.

It turned out that this is not a good idea at all because it leaves pointers
to freed up system memory pages in the GART tables of the drivers.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/radeon/radeon_ttm.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index a06d4cc2fb1c..ee343b76db54 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -488,9 +488,7 @@ static void radeon_ttm_backend_destroy(struct ttm_device *bdev, struct ttm_tt *t
 {
 	struct radeon_ttm_tt *gtt = (void *)ttm;
 
-	radeon_ttm_backend_unbind(bdev, ttm);
 	ttm_tt_destroy_common(bdev, ttm);
-
 	ttm_tt_fini(&gtt->ttm);
 	kfree(gtt);
 }
@@ -574,6 +572,8 @@ static void radeon_ttm_tt_unpopulate(struct ttm_device *bdev, struct ttm_tt *ttm
 	struct radeon_ttm_tt *gtt = radeon_ttm_tt_to_gtt(rdev, ttm);
 	bool slave = !!(ttm->page_flags & TTM_PAGE_FLAG_SG);
 
+	radeon_ttm_tt_unbind(bdev, ttm);
+
 	if (gtt && gtt->userptr) {
 		kfree(ttm->sg);
 		ttm->page_flags &= ~TTM_PAGE_FLAG_SG;
@@ -651,7 +651,6 @@ static void radeon_ttm_tt_destroy(struct ttm_device *bdev,
 	struct radeon_device *rdev = radeon_get_rdev(bdev);
 
 	if (rdev->flags & RADEON_IS_AGP) {
-		ttm_agp_unbind(ttm);
 		ttm_tt_destroy_common(bdev, ttm);
 		ttm_agp_destroy(ttm);
 		return;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 5/5] drm/ttm: revert "flip tt destroy ordering."
  2021-07-22 12:41 [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate Christian König
                   ` (2 preceding siblings ...)
  2021-07-22 12:41 ` [PATCH 4/5] drm/radeon: unbind in radeon_ttm_tt_unpopulate() Christian König
@ 2021-07-22 12:41 ` Christian König
  2021-07-23  8:47 ` [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate Daniel Vetter
  4 siblings, 0 replies; 21+ messages in thread
From: Christian König @ 2021-07-22 12:41 UTC (permalink / raw)
  To: dri-devel, airlied, daniel

It turned out that this is not a good idea at all because it leaves pointers
to freed up system memory pages in the GART tables of the drivers.

Instead change the drivers to unbind their page tables during unpopulate and
merge those things back into ttm_tt_destroy() again.

This reverts commit 7626168fd132009c79a0457bccc58014abc738f5.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 1 -
 drivers/gpu/drm/drm_gem_vram_helper.c      | 1 -
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c    | 1 -
 drivers/gpu/drm/nouveau/nouveau_bo.c       | 1 -
 drivers/gpu/drm/nouveau/nouveau_sgdma.c    | 1 -
 drivers/gpu/drm/qxl/qxl_ttm.c              | 1 -
 drivers/gpu/drm/radeon/radeon_ttm.c        | 2 --
 drivers/gpu/drm/ttm/ttm_tt.c               | 7 +------
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 1 -
 include/drm/ttm/ttm_tt.h                   | 7 -------
 10 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 42b5162814b1..6a6e04b64a80 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1070,7 +1070,6 @@ static void amdgpu_ttm_backend_destroy(struct ttm_device *bdev,
 {
 	struct amdgpu_ttm_tt *gtt = (void *)ttm;
 
-	ttm_tt_destroy_common(bdev, ttm);
 	if (gtt->usertask)
 		put_task_struct(gtt->usertask);
 
diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index 1e9b82e51a07..cc81fbac1a13 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -846,7 +846,6 @@ static const struct drm_gem_object_funcs drm_gem_vram_object_funcs = {
 
 static void bo_driver_ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt *tt)
 {
-	ttm_tt_destroy_common(bdev, tt);
 	ttm_tt_fini(tt);
 	kfree(tt);
 }
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index bf33724bed5c..e646aac9d7a4 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -118,7 +118,6 @@ static void i915_ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
 {
 	struct i915_ttm_tt *i915_tt = container_of(ttm, typeof(*i915_tt), ttm);
 
-	ttm_tt_destroy_common(bdev, ttm);
 	kfree(i915_tt);
 }
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 5f309a4ec211..3177f89cf000 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1291,7 +1291,6 @@ nouveau_ttm_tt_destroy(struct ttm_device *bdev,
 #if IS_ENABLED(CONFIG_AGP)
 	struct nouveau_drm *drm = nouveau_bdev(bdev);
 	if (drm->agp.bridge) {
-		ttm_tt_destroy_common(bdev, ttm);
 		ttm_agp_destroy(ttm);
 		return;
 	}
diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
index bde92a9dae7a..85c03c83259b 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
@@ -21,7 +21,6 @@ nouveau_sgdma_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
 	struct nouveau_sgdma_be *nvbe = (struct nouveau_sgdma_be *)ttm;
 
 	if (ttm) {
-		ttm_tt_destroy_common(bdev, ttm);
 		ttm_tt_fini(&nvbe->ttm);
 		kfree(nvbe);
 	}
diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c
index 19fd39d9a00c..e91d8154143e 100644
--- a/drivers/gpu/drm/qxl/qxl_ttm.c
+++ b/drivers/gpu/drm/qxl/qxl_ttm.c
@@ -101,7 +101,6 @@ int qxl_ttm_io_mem_reserve(struct ttm_device *bdev,
  */
 static void qxl_ttm_backend_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
 {
-	ttm_tt_destroy_common(bdev, ttm);
 	ttm_tt_fini(ttm);
 	kfree(ttm);
 }
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index ee343b76db54..7793249bc549 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -488,7 +488,6 @@ static void radeon_ttm_backend_destroy(struct ttm_device *bdev, struct ttm_tt *t
 {
 	struct radeon_ttm_tt *gtt = (void *)ttm;
 
-	ttm_tt_destroy_common(bdev, ttm);
 	ttm_tt_fini(&gtt->ttm);
 	kfree(gtt);
 }
@@ -651,7 +650,6 @@ static void radeon_ttm_tt_destroy(struct ttm_device *bdev,
 	struct radeon_device *rdev = radeon_get_rdev(bdev);
 
 	if (rdev->flags & RADEON_IS_AGP) {
-		ttm_tt_destroy_common(bdev, ttm);
 		ttm_agp_destroy(ttm);
 		return;
 	}
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 24031a8acd2d..41f38d9c3e1c 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -123,7 +123,7 @@ static int ttm_sg_tt_alloc_page_directory(struct ttm_tt *ttm)
 	return 0;
 }
 
-void ttm_tt_destroy_common(struct ttm_device *bdev, struct ttm_tt *ttm)
+void ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
 {
 	ttm_tt_unpopulate(bdev, ttm);
 
@@ -131,11 +131,6 @@ void ttm_tt_destroy_common(struct ttm_device *bdev, struct ttm_tt *ttm)
 		fput(ttm->swap_storage);
 
 	ttm->swap_storage = NULL;
-}
-EXPORT_SYMBOL(ttm_tt_destroy_common);
-
-void ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
-{
 	bdev->funcs->ttm_tt_destroy(bdev, ttm);
 }
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index 904031d03dbe..f35bdc1cb197 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
@@ -526,7 +526,6 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
 	struct vmw_ttm_tt *vmw_be =
 		container_of(ttm, struct vmw_ttm_tt, dma_ttm);
 
-	ttm_tt_destroy_common(bdev, ttm);
 	vmw_ttm_unmap_dma(vmw_be);
 	ttm_tt_fini(ttm);
 	if (vmw_be->mob)
diff --git a/include/drm/ttm/ttm_tt.h b/include/drm/ttm/ttm_tt.h
index 818680c6a8ed..e402dab1d0f6 100644
--- a/include/drm/ttm/ttm_tt.h
+++ b/include/drm/ttm/ttm_tt.h
@@ -134,13 +134,6 @@ void ttm_tt_fini(struct ttm_tt *ttm);
  */
 void ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt *ttm);
 
-/**
- * ttm_tt_destroy_common:
- *
- * Called from driver to destroy common path.
- */
-void ttm_tt_destroy_common(struct ttm_device *bdev, struct ttm_tt *ttm);
-
 /**
  * ttm_tt_swapin:
  *
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-07-22 12:41 [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate Christian König
                   ` (3 preceding siblings ...)
  2021-07-22 12:41 ` [PATCH 5/5] drm/ttm: revert "flip tt destroy ordering." Christian König
@ 2021-07-23  8:47 ` Daniel Vetter
  2021-07-23  9:13   ` Christian König
  4 siblings, 1 reply; 21+ messages in thread
From: Daniel Vetter @ 2021-07-23  8:47 UTC (permalink / raw)
  To: Christian König; +Cc: airlied, dri-devel

On Thu, Jul 22, 2021 at 02:41:23PM +0200, Christian König wrote:
> Doing this in vmw_ttm_destroy() is to late.
> 
> It turned out that this is not a good idea at all because it leaves pointers
> to freed up system memory pages in the GART tables of the drivers.

So I wanted to review this series, and I can't reconcile your claim here
with the demidlayering Dave has done. The driver patches here don't
ouright undo what Dave has done, but that means the bug has been
preexisting since forever (or is due to some other change?), and your
commit message is a bit confusing here.

The final patch just undoes the demidlayering from Dave, and I really
don't see where there's even a functional change there.

And even these patches here don't really change a hole lot with the
calling sequence for at least final teardown: ttm_tt_destroy_common calls
ttm_tt_unpopulate as the first thing, so at least there there's no change.

Can you pls elaborate more clearly what exactly you're fixing and what
exactly needs to be reordered and where this bug is from (commit sha1)? As
is I'm playing detective and the evidence presented is extremely since and
I can't reconcile it at all.

I mean I know you don't like typing commit message and documentation, but
it does get occasionally rather frustrating on the reviewer side if I have
to interpolate between some very sparse hints for this stuff :-/
-Daniel

> 
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
>  1 file changed, 3 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> index b0973c27e774..904031d03dbe 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> @@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
>  	struct vmw_ttm_tt *vmw_be =
>  		container_of(ttm, struct vmw_ttm_tt, dma_ttm);
>  
> -	vmw_ttm_unbind(bdev, ttm);
>  	ttm_tt_destroy_common(bdev, ttm);
>  	vmw_ttm_unmap_dma(vmw_be);
> -	if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
> -		ttm_tt_fini(&vmw_be->dma_ttm);
> -	else
> -		ttm_tt_fini(ttm);
> -
> +	ttm_tt_fini(ttm);
>  	if (vmw_be->mob)
>  		vmw_mob_destroy(vmw_be->mob);
>  
> @@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev,
>  						 dma_ttm);
>  	unsigned int i;
>  
> +	vmw_ttm_unbind(bdev, ttm);
> +
>  	if (vmw_tt->mob) {
>  		vmw_mob_destroy(vmw_tt->mob);
>  		vmw_tt->mob = NULL;
> -- 
> 2.25.1
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-07-23  8:47 ` [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate Daniel Vetter
@ 2021-07-23  9:13   ` Christian König
  2021-07-23  9:21     ` Daniel Vetter
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2021-07-23  9:13 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: airlied, dri-devel

Am 23.07.21 um 10:47 schrieb Daniel Vetter:
> On Thu, Jul 22, 2021 at 02:41:23PM +0200, Christian König wrote:
>> Doing this in vmw_ttm_destroy() is to late.
>>
>> It turned out that this is not a good idea at all because it leaves pointers
>> to freed up system memory pages in the GART tables of the drivers.
> So I wanted to review this series, and I can't reconcile your claim here
> with the demidlayering Dave has done. The driver patches here don't
> ouright undo what Dave has done, but that means the bug has been
> preexisting since forever (or is due to some other change?), and your
> commit message is a bit confusing here.
>
> The final patch just undoes the demidlayering from Dave, and I really
> don't see where there's even a functional change there.
>
> And even these patches here don't really change a hole lot with the
> calling sequence for at least final teardown: ttm_tt_destroy_common calls
> ttm_tt_unpopulate as the first thing, so at least there there's no change.
>
> Can you pls elaborate more clearly what exactly you're fixing and what
> exactly needs to be reordered and where this bug is from (commit sha1)? As
> is I'm playing detective and the evidence presented is extremely since and
> I can't reconcile it at all.
>
> I mean I know you don't like typing commit message and documentation, but
> it does get occasionally rather frustrating on the reviewer side if I have
> to interpolate between some very sparse hints for this stuff :-/

Yeah, when have seen the history it's rather obvious what's wrong here 
and I expected Dave to review it himself.

Previously we had three states in TTM for a tt object: Allocated -> 
Populated -> Bound which on destruction where done in the order unbind 
-> unpopulate -> free.

Dave moved handling of the bound state into the drivers since it is 
basically a driver decision and not a TTM decision what should be bound 
and what not (that part perfectly makes sense).

The problem is that he also moved doing the unbind into the free 
callback instead of the unpopulate callback. This result in stale page 
pointers in the GART if that unpopulate operation isn't immediately 
followed by a free.

Thinking more about it if we do populated->unpopulated->populated then 
we would also have stale pointers to the old pages which is even worse.

This is also not de-midlayering since we already have a proper 
ttm_tt_init()/ttm_tt_fini() functions which should work nicely for the 
tt object.

Christian.

> -Daniel
>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
>>   1 file changed, 3 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
>> index b0973c27e774..904031d03dbe 100644
>> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
>> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
>> @@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
>>   	struct vmw_ttm_tt *vmw_be =
>>   		container_of(ttm, struct vmw_ttm_tt, dma_ttm);
>>   
>> -	vmw_ttm_unbind(bdev, ttm);
>>   	ttm_tt_destroy_common(bdev, ttm);
>>   	vmw_ttm_unmap_dma(vmw_be);
>> -	if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
>> -		ttm_tt_fini(&vmw_be->dma_ttm);
>> -	else
>> -		ttm_tt_fini(ttm);
>> -
>> +	ttm_tt_fini(ttm);
>>   	if (vmw_be->mob)
>>   		vmw_mob_destroy(vmw_be->mob);
>>   
>> @@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev,
>>   						 dma_ttm);
>>   	unsigned int i;
>>   
>> +	vmw_ttm_unbind(bdev, ttm);
>> +
>>   	if (vmw_tt->mob) {
>>   		vmw_mob_destroy(vmw_tt->mob);
>>   		vmw_tt->mob = NULL;
>> -- 
>> 2.25.1
>>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-07-23  9:13   ` Christian König
@ 2021-07-23  9:21     ` Daniel Vetter
  2021-07-23  9:40       ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Daniel Vetter @ 2021-07-23  9:21 UTC (permalink / raw)
  To: Christian König; +Cc: Dave Airlie, dri-devel

On Fri, Jul 23, 2021 at 11:13 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 23.07.21 um 10:47 schrieb Daniel Vetter:
> > On Thu, Jul 22, 2021 at 02:41:23PM +0200, Christian König wrote:
> >> Doing this in vmw_ttm_destroy() is to late.
> >>
> >> It turned out that this is not a good idea at all because it leaves pointers
> >> to freed up system memory pages in the GART tables of the drivers.
> > So I wanted to review this series, and I can't reconcile your claim here
> > with the demidlayering Dave has done. The driver patches here don't
> > ouright undo what Dave has done, but that means the bug has been
> > preexisting since forever (or is due to some other change?), and your
> > commit message is a bit confusing here.
> >
> > The final patch just undoes the demidlayering from Dave, and I really
> > don't see where there's even a functional change there.
> >
> > And even these patches here don't really change a hole lot with the
> > calling sequence for at least final teardown: ttm_tt_destroy_common calls
> > ttm_tt_unpopulate as the first thing, so at least there there's no change.
> >
> > Can you pls elaborate more clearly what exactly you're fixing and what
> > exactly needs to be reordered and where this bug is from (commit sha1)? As
> > is I'm playing detective and the evidence presented is extremely since and
> > I can't reconcile it at all.
> >
> > I mean I know you don't like typing commit message and documentation, but
> > it does get occasionally rather frustrating on the reviewer side if I have
> > to interpolate between some very sparse hints for this stuff :-/
>
> Yeah, when have seen the history it's rather obvious what's wrong here
> and I expected Dave to review it himself.
>
> Previously we had three states in TTM for a tt object: Allocated ->
> Populated -> Bound which on destruction where done in the order unbind
> -> unpopulate -> free.
>
> Dave moved handling of the bound state into the drivers since it is
> basically a driver decision and not a TTM decision what should be bound
> and what not (that part perfectly makes sense).

I haven't reviewed all the patches from Dave, only the one you pointed
at (in the last patch). And that one I still can't match up with your
description. If there's other commits relevant, can you pls dig them
out?

Like it all makes sense what you're saying and matches the code, I
just can't match it up with the commit you're referencing.

> The problem is that he also moved doing the unbind into the free
> callback instead of the unpopulate callback. This result in stale page
> pointers in the GART if that unpopulate operation isn't immediately
> followed by a free.
>
> Thinking more about it if we do populated->unpopulated->populated then
> we would also have stale pointers to the old pages which is even worse.
>
> This is also not de-midlayering since we already have a proper
> ttm_tt_init()/ttm_tt_fini() functions which should work nicely for the
> tt object.

Well you're last patch moves the ttm_tt_destroy_common stuff back into
ttm, which kinda is de-demidlayering. So I'm confused.

Other bit: I think it'd be good to document this properly in the
callbacks, and maybe ideally go about and kerneldoc-ify the entire
ttm_tt.h header. Otherwise when we eventually (never?) get around to
that, everyone has forgotten these semantic details and issues again.
-Daniel

> Christian.
>
> > -Daniel
> >
> >> Signed-off-by: Christian König <christian.koenig@amd.com>
> >> ---
> >>   drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
> >>   1 file changed, 3 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> >> index b0973c27e774..904031d03dbe 100644
> >> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> >> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> >> @@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
> >>      struct vmw_ttm_tt *vmw_be =
> >>              container_of(ttm, struct vmw_ttm_tt, dma_ttm);
> >>
> >> -    vmw_ttm_unbind(bdev, ttm);
> >>      ttm_tt_destroy_common(bdev, ttm);
> >>      vmw_ttm_unmap_dma(vmw_be);
> >> -    if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
> >> -            ttm_tt_fini(&vmw_be->dma_ttm);
> >> -    else
> >> -            ttm_tt_fini(ttm);
> >> -
> >> +    ttm_tt_fini(ttm);
> >>      if (vmw_be->mob)
> >>              vmw_mob_destroy(vmw_be->mob);
> >>
> >> @@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev,
> >>                                               dma_ttm);
> >>      unsigned int i;
> >>
> >> +    vmw_ttm_unbind(bdev, ttm);
> >> +
> >>      if (vmw_tt->mob) {
> >>              vmw_mob_destroy(vmw_tt->mob);
> >>              vmw_tt->mob = NULL;
> >> --
> >> 2.25.1
> >>
>


-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-07-23  9:21     ` Daniel Vetter
@ 2021-07-23  9:40       ` Christian König
  2021-07-26  0:03         ` Dave Airlie
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2021-07-23  9:40 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Dave Airlie, dri-devel

Am 23.07.21 um 11:21 schrieb Daniel Vetter:
> On Fri, Jul 23, 2021 at 11:13 AM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Am 23.07.21 um 10:47 schrieb Daniel Vetter:
>>> On Thu, Jul 22, 2021 at 02:41:23PM +0200, Christian König wrote:
>>>> Doing this in vmw_ttm_destroy() is to late.
>>>>
>>>> It turned out that this is not a good idea at all because it leaves pointers
>>>> to freed up system memory pages in the GART tables of the drivers.
>>> So I wanted to review this series, and I can't reconcile your claim here
>>> with the demidlayering Dave has done. The driver patches here don't
>>> ouright undo what Dave has done, but that means the bug has been
>>> preexisting since forever (or is due to some other change?), and your
>>> commit message is a bit confusing here.
>>>
>>> The final patch just undoes the demidlayering from Dave, and I really
>>> don't see where there's even a functional change there.
>>>
>>> And even these patches here don't really change a hole lot with the
>>> calling sequence for at least final teardown: ttm_tt_destroy_common calls
>>> ttm_tt_unpopulate as the first thing, so at least there there's no change.
>>>
>>> Can you pls elaborate more clearly what exactly you're fixing and what
>>> exactly needs to be reordered and where this bug is from (commit sha1)? As
>>> is I'm playing detective and the evidence presented is extremely since and
>>> I can't reconcile it at all.
>>>
>>> I mean I know you don't like typing commit message and documentation, but
>>> it does get occasionally rather frustrating on the reviewer side if I have
>>> to interpolate between some very sparse hints for this stuff :-/
>> Yeah, when have seen the history it's rather obvious what's wrong here
>> and I expected Dave to review it himself.
>>
>> Previously we had three states in TTM for a tt object: Allocated ->
>> Populated -> Bound which on destruction where done in the order unbind
>> -> unpopulate -> free.
>>
>> Dave moved handling of the bound state into the drivers since it is
>> basically a driver decision and not a TTM decision what should be bound
>> and what not (that part perfectly makes sense).
> I haven't reviewed all the patches from Dave, only the one you pointed
> at (in the last patch). And that one I still can't match up with your
> description. If there's other commits relevant, can you pls dig them
> out?
>
> Like it all makes sense what you're saying and matches the code, I
> just can't match it up with the commit you're referencing.

That is the patch directly following the one I've mentioned:

commit 37bff6542c4e140a11657406c1bab50a40329cc1
Author: Dave Airlie <airlied@redhat.com>
Date:   Thu Sep 17 13:24:50 2020 +1000

     drm/ttm: move unbind into the tt destroy.

     This moves unbind into the driver side on destroy paths.

I will add a Fixes tag to make that clear.

But this patch also just moves the undbind from the TTM destroy path to 
the driver destroy path.

To be honest I'm not 100% sure either when the when the unbind moved 
from the unpopulate path into the destroy path, but I think that this 
wasn't always the case. Let me try to dig that up.

>> The problem is that he also moved doing the unbind into the free
>> callback instead of the unpopulate callback. This result in stale page
>> pointers in the GART if that unpopulate operation isn't immediately
>> followed by a free.
>>
>> Thinking more about it if we do populated->unpopulated->populated then
>> we would also have stale pointers to the old pages which is even worse.
>>
>> This is also not de-midlayering since we already have a proper
>> ttm_tt_init()/ttm_tt_fini() functions which should work nicely for the
>> tt object.
> Well you're last patch moves the ttm_tt_destroy_common stuff back into
> ttm, which kinda is de-demidlayering. So I'm confused.

Ah, yes that is correct. I've also considered to move this in 
ttm_tt_fini instead of there.

But that would be a larger change and I wanted to fix the problem at 
hand first, potentially even adding a CC stable tag.

> Other bit: I think it'd be good to document this properly in the
> callbacks, and maybe ideally go about and kerneldoc-ify the entire
> ttm_tt.h header. Otherwise when we eventually (never?) get around to
> that, everyone has forgotten these semantic details and issues again.

Already working towards including more of the TTM headers and code files 
in kerneldoc. But not quite there yet.

But you know, normal human: Only equipped with one head and two hands 
and not cloneable.

Cheers,
Christian.

> -Daniel
>
>> Christian.
>>
>>> -Daniel
>>>
>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
>>>>    1 file changed, 3 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
>>>> index b0973c27e774..904031d03dbe 100644
>>>> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
>>>> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
>>>> @@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
>>>>       struct vmw_ttm_tt *vmw_be =
>>>>               container_of(ttm, struct vmw_ttm_tt, dma_ttm);
>>>>
>>>> -    vmw_ttm_unbind(bdev, ttm);
>>>>       ttm_tt_destroy_common(bdev, ttm);
>>>>       vmw_ttm_unmap_dma(vmw_be);
>>>> -    if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
>>>> -            ttm_tt_fini(&vmw_be->dma_ttm);
>>>> -    else
>>>> -            ttm_tt_fini(ttm);
>>>> -
>>>> +    ttm_tt_fini(ttm);
>>>>       if (vmw_be->mob)
>>>>               vmw_mob_destroy(vmw_be->mob);
>>>>
>>>> @@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev,
>>>>                                                dma_ttm);
>>>>       unsigned int i;
>>>>
>>>> +    vmw_ttm_unbind(bdev, ttm);
>>>> +
>>>>       if (vmw_tt->mob) {
>>>>               vmw_mob_destroy(vmw_tt->mob);
>>>>               vmw_tt->mob = NULL;
>>>> --
>>>> 2.25.1
>>>>
>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-07-23  9:40       ` Christian König
@ 2021-07-26  0:03         ` Dave Airlie
  2021-07-26 19:35           ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Dave Airlie @ 2021-07-26  0:03 UTC (permalink / raw)
  To: Christian König; +Cc: Dave Airlie, dri-devel

On Fri, 23 Jul 2021 at 19:40, Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 23.07.21 um 11:21 schrieb Daniel Vetter:
> > On Fri, Jul 23, 2021 at 11:13 AM Christian König
> > <ckoenig.leichtzumerken@gmail.com> wrote:
> >> Am 23.07.21 um 10:47 schrieb Daniel Vetter:
> >>> On Thu, Jul 22, 2021 at 02:41:23PM +0200, Christian König wrote:
> >>>> Doing this in vmw_ttm_destroy() is to late.
> >>>>
> >>>> It turned out that this is not a good idea at all because it leaves pointers
> >>>> to freed up system memory pages in the GART tables of the drivers.
> >>> So I wanted to review this series, and I can't reconcile your claim here
> >>> with the demidlayering Dave has done. The driver patches here don't
> >>> ouright undo what Dave has done, but that means the bug has been
> >>> preexisting since forever (or is due to some other change?), and your
> >>> commit message is a bit confusing here.
> >>>
> >>> The final patch just undoes the demidlayering from Dave, and I really
> >>> don't see where there's even a functional change there.
> >>>
> >>> And even these patches here don't really change a hole lot with the
> >>> calling sequence for at least final teardown: ttm_tt_destroy_common calls
> >>> ttm_tt_unpopulate as the first thing, so at least there there's no change.
> >>>
> >>> Can you pls elaborate more clearly what exactly you're fixing and what
> >>> exactly needs to be reordered and where this bug is from (commit sha1)? As
> >>> is I'm playing detective and the evidence presented is extremely since and
> >>> I can't reconcile it at all.
> >>>
> >>> I mean I know you don't like typing commit message and documentation, but
> >>> it does get occasionally rather frustrating on the reviewer side if I have
> >>> to interpolate between some very sparse hints for this stuff :-/
> >> Yeah, when have seen the history it's rather obvious what's wrong here
> >> and I expected Dave to review it himself.
> >>
> >> Previously we had three states in TTM for a tt object: Allocated ->
> >> Populated -> Bound which on destruction where done in the order unbind
> >> -> unpopulate -> free.
> >>
> >> Dave moved handling of the bound state into the drivers since it is
> >> basically a driver decision and not a TTM decision what should be bound
> >> and what not (that part perfectly makes sense).
> > I haven't reviewed all the patches from Dave, only the one you pointed
> > at (in the last patch). And that one I still can't match up with your
> > description. If there's other commits relevant, can you pls dig them
> > out?
> >
> > Like it all makes sense what you're saying and matches the code, I
> > just can't match it up with the commit you're referencing.
>
> That is the patch directly following the one I've mentioned:
>
> commit 37bff6542c4e140a11657406c1bab50a40329cc1
> Author: Dave Airlie <airlied@redhat.com>
> Date:   Thu Sep 17 13:24:50 2020 +1000
>
>      drm/ttm: move unbind into the tt destroy.
>
>      This moves unbind into the driver side on destroy paths.
>
> I will add a Fixes tag to make that clear.
>
> But this patch also just moves the undbind from the TTM destroy path to
> the driver destroy path.
>
> To be honest I'm not 100% sure either when the when the unbind moved
> from the unpopulate path into the destroy path, but I think that this
> wasn't always the case. Let me try to dig that up.
>
> >> The problem is that he also moved doing the unbind into the free
> >> callback instead of the unpopulate callback. This result in stale page
> >> pointers in the GART if that unpopulate operation isn't immediately
> >> followed by a free.
> >>
> >> Thinking more about it if we do populated->unpopulated->populated then
> >> we would also have stale pointers to the old pages which is even worse.
> >>
> >> This is also not de-midlayering since we already have a proper
> >> ttm_tt_init()/ttm_tt_fini() functions which should work nicely for the
> >> tt object.
> > Well you're last patch moves the ttm_tt_destroy_common stuff back into
> > ttm, which kinda is de-demidlayering. So I'm confused.
>
> Ah, yes that is correct. I've also considered to move this in
> ttm_tt_fini instead of there.
>
> But that would be a larger change and I wanted to fix the problem at
> hand first, potentially even adding a CC stable tag.
>
> > Other bit: I think it'd be good to document this properly in the
> > callbacks, and maybe ideally go about and kerneldoc-ify the entire
> > ttm_tt.h header. Otherwise when we eventually (never?) get around to
> > that, everyone has forgotten these semantic details and issues again.
>
> Already working towards including more of the TTM headers and code files
> in kerneldoc. But not quite there yet.
>
> But you know, normal human: Only equipped with one head and two hands
> and not cloneable.

I'm the same, but I'm not seeing where this problem happens at all, do
we have any backtraces or demos for this?

I split bind/unbind into the driver, but the driver should still
always be moving things to unbound states before an unpopulate is
called, is there a driver doing something unexpected here?

at worst I'd like to see a WARN_ON put in first and a test in igt that
triggers it, because right now I'm not see that path through the
drivers/ttm that leads to unpopulated pages ending up happening while
bound.

From 5.14-rc3 unpopulate is called from ttm_bo_pipeline_gutting in
non-ghost path and there is no unbind,
pipeline gutting is called from evict/validate, when there is no
placement suggested for the object, is this case getting hit somewhere
without the driver having previously unbound things?

ttm_tt_destroy_common: calls unpopulate, everyone calls this directly
after unbinding
ttm_tt_swapout: calls unpopulate, we don't swapout objects from TT
directly, we should always unbind first, this used to have an assert
against that,
ttm_tt_populate: call unpopulate in failure path

So any place I can see unpopulate getting called with a bound TT
should be a bug, and fixed, we could protect against it better but I'm
not seeing the need for this series to outright revert things back as
helping.

Dave.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-07-26  0:03         ` Dave Airlie
@ 2021-07-26 19:35           ` Christian König
  2021-07-26 20:03             ` Dave Airlie
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2021-07-26 19:35 UTC (permalink / raw)
  To: Dave Airlie; +Cc: Dave Airlie, dri-devel

Am 26.07.21 um 02:03 schrieb Dave Airlie:
> [SNIP]
>> But you know, normal human: Only equipped with one head and two hands
>> and not cloneable.
> I'm the same, but I'm not seeing where this problem happens at all, do
> we have any backtraces or demos for this?

I've stumbled over this while working on some patches which accidentally 
broke delayed delete and caused random memory corruption and was 
wondering how that even happened in the first place.

Having stale PTEs in the GART isn't a problem unless you break other 
things. Which is also the reason why I haven't added a CC stable yet.

> I split bind/unbind into the driver, but the driver should still
> always be moving things to unbound states before an unpopulate is
> called, is there a driver doing something unexpected here?

Possible, I was only testing with amdgpu and the GART handling is rather 
special there (which was one of the reasons why we did that in the first 
place).

> at worst I'd like to see a WARN_ON put in first and a test in igt that
> triggers it, because right now I'm not see that path through the
> drivers/ttm that leads to unpopulated pages ending up happening while
> bound.
>
>  From 5.14-rc3 unpopulate is called from ttm_bo_pipeline_gutting in
> non-ghost path and there is no unbind,
> pipeline gutting is called from evict/validate, when there is no
> placement suggested for the object, is this case getting hit somewhere
> without the driver having previously unbound things?

Yes, that will hit absolutely. Most likely with VM page tables for 
amdgpu which uses this.

> ttm_tt_destroy_common: calls unpopulate, everyone calls this directly
> after unbinding
> ttm_tt_swapout: calls unpopulate, we don't swapout objects from TT
> directly, we should always unbind first, this used to have an assert
> against that,
> ttm_tt_populate: call unpopulate in failure path

unbinding was moved into the driver, so ttm_tt_swapout() won't unbind 
anything before calling unpopulate as far as I can see.

> So any place I can see unpopulate getting called with a bound TT
> should be a bug, and fixed, we could protect against it better but I'm
> not seeing the need for this series to outright revert things back as
> helping.

I'm not reverting this because it is offhand wrong, but making sure the 
GART is clear before unpopulating the TT object sounds like the much 
more natural thing to do and the state machine is something I certainly 
don't see in the backend.

Regards,
Christian.

>
> Dave.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-07-26 19:35           ` Christian König
@ 2021-07-26 20:03             ` Dave Airlie
  2021-07-27  9:28               ` Daniel Vetter
  2021-07-28 11:18               ` Christian König
  0 siblings, 2 replies; 21+ messages in thread
From: Dave Airlie @ 2021-07-26 20:03 UTC (permalink / raw)
  To: Christian König; +Cc: Dave Airlie, dri-devel

On Tue, 27 Jul 2021 at 05:35, Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 26.07.21 um 02:03 schrieb Dave Airlie:
> > [SNIP]
> >> But you know, normal human: Only equipped with one head and two hands
> >> and not cloneable.
> > I'm the same, but I'm not seeing where this problem happens at all, do
> > we have any backtraces or demos for this?
>
> I've stumbled over this while working on some patches which accidentally
> broke delayed delete and caused random memory corruption and was
> wondering how that even happened in the first place.
>
> Having stale PTEs in the GART isn't a problem unless you break other
> things. Which is also the reason why I haven't added a CC stable yet.
>
> > I split bind/unbind into the driver, but the driver should still
> > always be moving things to unbound states before an unpopulate is
> > called, is there a driver doing something unexpected here?
>
> Possible, I was only testing with amdgpu and the GART handling is rather
> special there (which was one of the reasons why we did that in the first
> place).
>
> > at worst I'd like to see a WARN_ON put in first and a test in igt that
> > triggers it, because right now I'm not see that path through the
> > drivers/ttm that leads to unpopulated pages ending up happening while
> > bound.
> >
> >  From 5.14-rc3 unpopulate is called from ttm_bo_pipeline_gutting in
> > non-ghost path and there is no unbind,
> > pipeline gutting is called from evict/validate, when there is no
> > placement suggested for the object, is this case getting hit somewhere
> > without the driver having previously unbound things?
>
> Yes, that will hit absolutely. Most likely with VM page tables for
> amdgpu which uses this.

My thing is here we moved binding/unbinding to the driver, if the
driver has a bug
I'd expect the fix to be in the driver side here. I think giving
drivers control over something
and enforcing it in the core/helpers is fine, but I don't think we
should be adding back
the midlayering.

> > ttm_tt_destroy_common: calls unpopulate, everyone calls this directly
> > after unbinding
> > ttm_tt_swapout: calls unpopulate, we don't swapout objects from TT
> > directly, we should always unbind first, this used to have an assert
> > against that,
> > ttm_tt_populate: call unpopulate in failure path
>
> unbinding was moved into the driver, so ttm_tt_swapout() won't unbind
> anything before calling unpopulate as far as I can see.

But we won't call swapout on BOs that aren't in SYSTEM and to be in SYSTEM,
the bo would have to go through the driver move function which will unbind it.

>
> > So any place I can see unpopulate getting called with a bound TT
> > should be a bug, and fixed, we could protect against it better but I'm
> > not seeing the need for this series to outright revert things back as
> > helping.
>
> I'm not reverting this because it is offhand wrong, but making sure the
> GART is clear before unpopulating the TT object sounds like the much
> more natural thing to do and the state machine is something I certainly
> don't see in the backend.
>

I don't think that's the core's responsibility anymore, I'm fine with
adding code and
even an flag that says if the the TT is bound/unbound that unpopulate
can WARN_ON()
against so we get a backtrace and track down where something is
getting unpopulated
without going through the driver move function to be unbound.

Dave.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-07-26 20:03             ` Dave Airlie
@ 2021-07-27  9:28               ` Daniel Vetter
  2021-07-28 11:18               ` Christian König
  1 sibling, 0 replies; 21+ messages in thread
From: Daniel Vetter @ 2021-07-27  9:28 UTC (permalink / raw)
  To: Dave Airlie; +Cc: Christian König, dri-devel, Dave Airlie

On Tue, Jul 27, 2021 at 06:03:12AM +1000, Dave Airlie wrote:
> On Tue, 27 Jul 2021 at 05:35, Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
> >
> > Am 26.07.21 um 02:03 schrieb Dave Airlie:
> > > [SNIP]
> > >> But you know, normal human: Only equipped with one head and two hands
> > >> and not cloneable.
> > > I'm the same, but I'm not seeing where this problem happens at all, do
> > > we have any backtraces or demos for this?
> >
> > I've stumbled over this while working on some patches which accidentally
> > broke delayed delete and caused random memory corruption and was
> > wondering how that even happened in the first place.
> >
> > Having stale PTEs in the GART isn't a problem unless you break other
> > things. Which is also the reason why I haven't added a CC stable yet.
> >
> > > I split bind/unbind into the driver, but the driver should still
> > > always be moving things to unbound states before an unpopulate is
> > > called, is there a driver doing something unexpected here?
> >
> > Possible, I was only testing with amdgpu and the GART handling is rather
> > special there (which was one of the reasons why we did that in the first
> > place).
> >
> > > at worst I'd like to see a WARN_ON put in first and a test in igt that
> > > triggers it, because right now I'm not see that path through the
> > > drivers/ttm that leads to unpopulated pages ending up happening while
> > > bound.
> > >
> > >  From 5.14-rc3 unpopulate is called from ttm_bo_pipeline_gutting in
> > > non-ghost path and there is no unbind,
> > > pipeline gutting is called from evict/validate, when there is no
> > > placement suggested for the object, is this case getting hit somewhere
> > > without the driver having previously unbound things?
> >
> > Yes, that will hit absolutely. Most likely with VM page tables for
> > amdgpu which uses this.
> 
> My thing is here we moved binding/unbinding to the driver, if the
> driver has a bug
> I'd expect the fix to be in the driver side here. I think giving
> drivers control over something
> and enforcing it in the core/helpers is fine, but I don't think we
> should be adding back
> the midlayering.
> 
> > > ttm_tt_destroy_common: calls unpopulate, everyone calls this directly
> > > after unbinding
> > > ttm_tt_swapout: calls unpopulate, we don't swapout objects from TT
> > > directly, we should always unbind first, this used to have an assert
> > > against that,
> > > ttm_tt_populate: call unpopulate in failure path
> >
> > unbinding was moved into the driver, so ttm_tt_swapout() won't unbind
> > anything before calling unpopulate as far as I can see.
> 
> But we won't call swapout on BOs that aren't in SYSTEM and to be in SYSTEM,
> the bo would have to go through the driver move function which will unbind it.
> 
> >
> > > So any place I can see unpopulate getting called with a bound TT
> > > should be a bug, and fixed, we could protect against it better but I'm
> > > not seeing the need for this series to outright revert things back as
> > > helping.
> >
> > I'm not reverting this because it is offhand wrong, but making sure the
> > GART is clear before unpopulating the TT object sounds like the much
> > more natural thing to do and the state machine is something I certainly
> > don't see in the backend.
> >
> 
> I don't think that's the core's responsibility anymore, I'm fine with
> adding code and
> even an flag that says if the the TT is bound/unbound that unpopulate
> can WARN_ON()
> against so we get a backtrace and track down where something is
> getting unpopulated
> without going through the driver move function to be unbound.

Yeah I think sprinkling a few WARN_ON around to make sure drivers use the
functions correctly is the right thing. Re-midlayering because we don't
trust drivers to do things correctly isn't.

Aside from this principle I'll let you two duke out what's actually going
on :-)
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-07-26 20:03             ` Dave Airlie
  2021-07-27  9:28               ` Daniel Vetter
@ 2021-07-28 11:18               ` Christian König
  1 sibling, 0 replies; 21+ messages in thread
From: Christian König @ 2021-07-28 11:18 UTC (permalink / raw)
  To: Dave Airlie; +Cc: Dave Airlie, dri-devel

Am 26.07.21 um 22:03 schrieb Dave Airlie:
> On Tue, 27 Jul 2021 at 05:35, Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Am 26.07.21 um 02:03 schrieb Dave Airlie:
>>> [SNIP]
>>>> But you know, normal human: Only equipped with one head and two hands
>>>> and not cloneable.
>>> I'm the same, but I'm not seeing where this problem happens at all, do
>>> we have any backtraces or demos for this?
>> I've stumbled over this while working on some patches which accidentally
>> broke delayed delete and caused random memory corruption and was
>> wondering how that even happened in the first place.
>>
>> Having stale PTEs in the GART isn't a problem unless you break other
>> things. Which is also the reason why I haven't added a CC stable yet.
>>
>>> I split bind/unbind into the driver, but the driver should still
>>> always be moving things to unbound states before an unpopulate is
>>> called, is there a driver doing something unexpected here?
>> Possible, I was only testing with amdgpu and the GART handling is rather
>> special there (which was one of the reasons why we did that in the first
>> place).
>>
>>> at worst I'd like to see a WARN_ON put in first and a test in igt that
>>> triggers it, because right now I'm not see that path through the
>>> drivers/ttm that leads to unpopulated pages ending up happening while
>>> bound.
>>>
>>>   From 5.14-rc3 unpopulate is called from ttm_bo_pipeline_gutting in
>>> non-ghost path and there is no unbind,
>>> pipeline gutting is called from evict/validate, when there is no
>>> placement suggested for the object, is this case getting hit somewhere
>>> without the driver having previously unbound things?
>> Yes, that will hit absolutely. Most likely with VM page tables for
>> amdgpu which uses this.
> My thing is here we moved binding/unbinding to the driver, if the
> driver has a bug
> I'd expect the fix to be in the driver side here. I think giving
> drivers control over something
> and enforcing it in the core/helpers is fine, but I don't think we
> should be adding back
> the midlayering.

Ok, then we are pretty much already on the same page here.

I've just reverted the patch because I thought it would be cleaner for 
eventually backporting it.



>
>>> ttm_tt_destroy_common: calls unpopulate, everyone calls this directly
>>> after unbinding
>>> ttm_tt_swapout: calls unpopulate, we don't swapout objects from TT
>>> directly, we should always unbind first, this used to have an assert
>>> against that,
>>> ttm_tt_populate: call unpopulate in failure path
>> unbinding was moved into the driver, so ttm_tt_swapout() won't unbind
>> anything before calling unpopulate as far as I can see.
> But we won't call swapout on BOs that aren't in SYSTEM and to be in SYSTEM,
> the bo would have to go through the driver move function which will unbind it.

Ah, good point.

>
>>> So any place I can see unpopulate getting called with a bound TT
>>> should be a bug, and fixed, we could protect against it better but I'm
>>> not seeing the need for this series to outright revert things back as
>>> helping.
>> I'm not reverting this because it is offhand wrong, but making sure the
>> GART is clear before unpopulating the TT object sounds like the much
>> more natural thing to do and the state machine is something I certainly
>> don't see in the backend.
>>
> I don't think that's the core's responsibility anymore, I'm fine with
> adding code and
> even an flag that says if the the TT is bound/unbound that unpopulate
> can WARN_ON()
> against so we get a backtrace and track down where something is
> getting unpopulated
> without going through the driver move function to be unbound.

I was not talking about bound/unbound, but rather unpopulating before 
destroying.

The requirement we have is that the tt object is unpopulated before it 
is destroyed and the state machine of that object is essentially 
controlled by its BO and not the object itself.

I will prepare a patch making that cleaner.

Thanks,
Christian.

>
> Dave.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-08-26 10:11       ` Christian König
@ 2021-08-26 12:50         ` Daniel Vetter
  0 siblings, 0 replies; 21+ messages in thread
From: Daniel Vetter @ 2021-08-26 12:50 UTC (permalink / raw)
  To: Christian König
  Cc: Daniel Vetter, Thomas Hellström, daniel.vetter, airlied, dri-devel

On Thu, Aug 26, 2021 at 12:11:04PM +0200, Christian König wrote:
> 
> 
> Am 26.08.21 um 10:49 schrieb Daniel Vetter:
> > On Mon, Aug 23, 2021 at 01:15:20PM +0200, Thomas Hellström wrote:
> > > On Mon, 2021-08-23 at 13:05 +0200, Christian König wrote:
> > > > Adding Thomas on CC as well.
> > > > 
> > > > Just a gentle ping. I think the patch set makes sense now.
> > > > 
> > > > Regards,
> > > > Christian.
> > > > 
> > > > Am 28.07.21 um 15:05 schrieb Christian König:
> > > > > Doing this in vmw_ttm_destroy() is to late.
> > > > > 
> > > > > It turned out that this is not a good idea at all because it leaves
> > > > > pointers
> > > > > to freed up system memory pages in the GART tables of the drivers.
> > > > > 
> > > > > Signed-off-by: Christian König <christian.koenig@amd.com>
> > > > > ---
> > > > >    drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
> > > > >    1 file changed, 3 insertions(+), 6 deletions(-)
> > > > > 
> > > Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > For next time around I think recording a bit more of the discussions and
> > git history in these would be really good. At least I'd like to get more
> > people ramped up on ttm work, and for that to work out things need to be a
> > bit more accessible ... The above commit message is pretty much useless if
> > you ever hit it in a git blame, if you haven't been involved in any of
> > these discussions.
> 
> I've pushed it with a link tag back to the patches in patchwork, but I
> should probably include a link tag to the older versions as well for
> completeness.

Imo references to the commits that landed which broke stuff and full
quotes of the relevant discussions in the other thread. Just adding the
links still means you have to painstakingly stitch the story together
yourself, but at least it would be somewhat possible.

Really I think stuff like this should be documented in kerneldoc for these
callbacks, and we just got to start doing that. Waiting until magically
someone else fixes the ttm kerneldoc wont happen :-)
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-08-26  8:49     ` Daniel Vetter
@ 2021-08-26 10:11       ` Christian König
  2021-08-26 12:50         ` Daniel Vetter
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2021-08-26 10:11 UTC (permalink / raw)
  To: Daniel Vetter, Thomas Hellström; +Cc: daniel.vetter, airlied, dri-devel



Am 26.08.21 um 10:49 schrieb Daniel Vetter:
> On Mon, Aug 23, 2021 at 01:15:20PM +0200, Thomas Hellström wrote:
>> On Mon, 2021-08-23 at 13:05 +0200, Christian König wrote:
>>> Adding Thomas on CC as well.
>>>
>>> Just a gentle ping. I think the patch set makes sense now.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 28.07.21 um 15:05 schrieb Christian König:
>>>> Doing this in vmw_ttm_destroy() is to late.
>>>>
>>>> It turned out that this is not a good idea at all because it leaves
>>>> pointers
>>>> to freed up system memory pages in the GART tables of the drivers.
>>>>
>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
>>>>    1 file changed, 3 insertions(+), 6 deletions(-)
>>>>
>> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> For next time around I think recording a bit more of the discussions and
> git history in these would be really good. At least I'd like to get more
> people ramped up on ttm work, and for that to work out things need to be a
> bit more accessible ... The above commit message is pretty much useless if
> you ever hit it in a git blame, if you haven't been involved in any of
> these discussions.

I've pushed it with a link tag back to the patches in patchwork, but I 
should probably include a link tag to the older versions as well for 
completeness.

Christian.

> -Daniel


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-08-23 11:15   ` Thomas Hellström
@ 2021-08-26  8:49     ` Daniel Vetter
  2021-08-26 10:11       ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Daniel Vetter @ 2021-08-26  8:49 UTC (permalink / raw)
  To: Thomas Hellström
  Cc: Christian König, daniel.vetter, airlied, dri-devel

On Mon, Aug 23, 2021 at 01:15:20PM +0200, Thomas Hellström wrote:
> On Mon, 2021-08-23 at 13:05 +0200, Christian König wrote:
> > Adding Thomas on CC as well.
> > 
> > Just a gentle ping. I think the patch set makes sense now.
> > 
> > Regards,
> > Christian.
> > 
> > Am 28.07.21 um 15:05 schrieb Christian König:
> > > Doing this in vmw_ttm_destroy() is to late.
> > > 
> > > It turned out that this is not a good idea at all because it leaves
> > > pointers
> > > to freed up system memory pages in the GART tables of the drivers.
> > > 
> > > Signed-off-by: Christian König <christian.koenig@amd.com>
> > > ---
> > >   drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
> > >   1 file changed, 3 insertions(+), 6 deletions(-)
> > > 
> 
> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>

For next time around I think recording a bit more of the discussions and
git history in these would be really good. At least I'd like to get more
people ramped up on ttm work, and for that to work out things need to be a
bit more accessible ... The above commit message is pretty much useless if
you ever hit it in a git blame, if you haven't been involved in any of
these discussions.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-08-23 11:05 ` Christian König
@ 2021-08-23 11:15   ` Thomas Hellström
  2021-08-26  8:49     ` Daniel Vetter
  0 siblings, 1 reply; 21+ messages in thread
From: Thomas Hellström @ 2021-08-23 11:15 UTC (permalink / raw)
  To: Christian König, daniel.vetter, airlied; +Cc: dri-devel

On Mon, 2021-08-23 at 13:05 +0200, Christian König wrote:
> Adding Thomas on CC as well.
> 
> Just a gentle ping. I think the patch set makes sense now.
> 
> Regards,
> Christian.
> 
> Am 28.07.21 um 15:05 schrieb Christian König:
> > Doing this in vmw_ttm_destroy() is to late.
> > 
> > It turned out that this is not a good idea at all because it leaves
> > pointers
> > to freed up system memory pages in the GART tables of the drivers.
> > 
> > Signed-off-by: Christian König <christian.koenig@amd.com>
> > ---
> >   drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
> >   1 file changed, 3 insertions(+), 6 deletions(-)
> > 

Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
  2021-07-28 13:05 Christian König
@ 2021-08-23 11:05 ` Christian König
  2021-08-23 11:15   ` Thomas Hellström
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2021-08-23 11:05 UTC (permalink / raw)
  To: daniel.vetter, airlied; +Cc: dri-devel, Thomas Hellström

Adding Thomas on CC as well.

Just a gentle ping. I think the patch set makes sense now.

Regards,
Christian.

Am 28.07.21 um 15:05 schrieb Christian König:
> Doing this in vmw_ttm_destroy() is to late.
>
> It turned out that this is not a good idea at all because it leaves pointers
> to freed up system memory pages in the GART tables of the drivers.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
>   1 file changed, 3 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> index b0973c27e774..904031d03dbe 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
> @@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
>   	struct vmw_ttm_tt *vmw_be =
>   		container_of(ttm, struct vmw_ttm_tt, dma_ttm);
>   
> -	vmw_ttm_unbind(bdev, ttm);
>   	ttm_tt_destroy_common(bdev, ttm);
>   	vmw_ttm_unmap_dma(vmw_be);
> -	if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
> -		ttm_tt_fini(&vmw_be->dma_ttm);
> -	else
> -		ttm_tt_fini(ttm);
> -
> +	ttm_tt_fini(ttm);
>   	if (vmw_be->mob)
>   		vmw_mob_destroy(vmw_be->mob);
>   
> @@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev,
>   						 dma_ttm);
>   	unsigned int i;
>   
> +	vmw_ttm_unbind(bdev, ttm);
> +
>   	if (vmw_tt->mob) {
>   		vmw_mob_destroy(vmw_tt->mob);
>   		vmw_tt->mob = NULL;


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
@ 2021-07-28 13:05 Christian König
  2021-08-23 11:05 ` Christian König
  0 siblings, 1 reply; 21+ messages in thread
From: Christian König @ 2021-07-28 13:05 UTC (permalink / raw)
  To: daniel.vetter, airlied; +Cc: dri-devel

Doing this in vmw_ttm_destroy() is to late.

It turned out that this is not a good idea at all because it leaves pointers
to freed up system memory pages in the GART tables of the drivers.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index b0973c27e774..904031d03dbe 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
@@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
 	struct vmw_ttm_tt *vmw_be =
 		container_of(ttm, struct vmw_ttm_tt, dma_ttm);
 
-	vmw_ttm_unbind(bdev, ttm);
 	ttm_tt_destroy_common(bdev, ttm);
 	vmw_ttm_unmap_dma(vmw_be);
-	if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
-		ttm_tt_fini(&vmw_be->dma_ttm);
-	else
-		ttm_tt_fini(ttm);
-
+	ttm_tt_fini(ttm);
 	if (vmw_be->mob)
 		vmw_mob_destroy(vmw_be->mob);
 
@@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev,
 						 dma_ttm);
 	unsigned int i;
 
+	vmw_ttm_unbind(bdev, ttm);
+
 	if (vmw_tt->mob) {
 		vmw_mob_destroy(vmw_tt->mob);
 		vmw_tt->mob = NULL;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate
@ 2021-07-22  8:55 Christian König
  0 siblings, 0 replies; 21+ messages in thread
From: Christian König @ 2021-07-22  8:55 UTC (permalink / raw)
  To: dri-devel, airlied

Turned out that doing this in vmw_ttm_destroy() is to late.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index b0973c27e774..904031d03dbe 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
@@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm)
 	struct vmw_ttm_tt *vmw_be =
 		container_of(ttm, struct vmw_ttm_tt, dma_ttm);
 
-	vmw_ttm_unbind(bdev, ttm);
 	ttm_tt_destroy_common(bdev, ttm);
 	vmw_ttm_unmap_dma(vmw_be);
-	if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
-		ttm_tt_fini(&vmw_be->dma_ttm);
-	else
-		ttm_tt_fini(ttm);
-
+	ttm_tt_fini(ttm);
 	if (vmw_be->mob)
 		vmw_mob_destroy(vmw_be->mob);
 
@@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev,
 						 dma_ttm);
 	unsigned int i;
 
+	vmw_ttm_unbind(bdev, ttm);
+
 	if (vmw_tt->mob) {
 		vmw_mob_destroy(vmw_tt->mob);
 		vmw_tt->mob = NULL;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2021-08-26 12:50 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-22 12:41 [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate Christian König
2021-07-22 12:41 ` [PATCH 2/5] drm/amdgpu: unbind in amdgpu_ttm_tt_unpopulate Christian König
2021-07-22 12:41 ` [PATCH 3/5] drm/nouveau: unbind in nouveau_ttm_tt_unpopulate Christian König
2021-07-22 12:41 ` [PATCH 4/5] drm/radeon: unbind in radeon_ttm_tt_unpopulate() Christian König
2021-07-22 12:41 ` [PATCH 5/5] drm/ttm: revert "flip tt destroy ordering." Christian König
2021-07-23  8:47 ` [PATCH 1/5] drm/vmwgfx: unbind in vmw_ttm_unpopulate Daniel Vetter
2021-07-23  9:13   ` Christian König
2021-07-23  9:21     ` Daniel Vetter
2021-07-23  9:40       ` Christian König
2021-07-26  0:03         ` Dave Airlie
2021-07-26 19:35           ` Christian König
2021-07-26 20:03             ` Dave Airlie
2021-07-27  9:28               ` Daniel Vetter
2021-07-28 11:18               ` Christian König
  -- strict thread matches above, loose matches on Subject: below --
2021-07-28 13:05 Christian König
2021-08-23 11:05 ` Christian König
2021-08-23 11:15   ` Thomas Hellström
2021-08-26  8:49     ` Daniel Vetter
2021-08-26 10:11       ` Christian König
2021-08-26 12:50         ` Daniel Vetter
2021-07-22  8:55 Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.