* [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
@ 2021-10-19 23:27 Jason Gunthorpe
  2021-10-20  6:34 ` Thomas Hellström
  0 siblings, 1 reply; 15+ messages in thread
From: Jason Gunthorpe @ 2021-10-19 23:27 UTC (permalink / raw)
  To: David Airlie, Christian Koenig, Daniel Vetter, dri-devel,
	Huang Rui, Thomas Hellström
  Cc: Dan Williams, Ralph Campbell, Roland Scheidegger

PUD and PMD entries do not have a special bit.

get_user_pages_fast() considers any page that passes pmd_huge() as
usable:

	if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
		     pmd_devmap(pmd))) {

And vmf_insert_pfn_pmd_prot() unconditionally sets

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));

e.g. on x86 the PMD entry will have _PAGE_PRESENT | _PAGE_PSE set.

As such, gup_huge_pmd() will try to dereference a struct page:

	head = try_grab_compound_head(pmd_page(orig), refs, flags);

and thus crash.
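
For contrast, the PTE-level GUP_fast path can screen out such mappings
because PTEs do carry a special bit; gup_pte_range() bails out on them,
roughly (from mm/gup.c):

	} else if (pte_special(pte))
		goto pte_unmap;

There is no equivalent check gup_huge_pmd() or gup_huge_pud() could make.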

Thomas further noticed that the drivers do not expect the struct page
to be used by anything; in particular, the refcount increment above
will cause them to malfunction.

Thus none of this can work correctly in the face of GUP_fast. Delete it
entirely. It can return someday, along with a proper PMD/PUD_SPECIAL bit
in the page table itself to gate GUP_fast.

Fixes: 314b6580adc5 ("drm/ttm, drm/vmwgfx: Support huge TTM pagefaults")
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  2 +-
 drivers/gpu/drm/nouveau/nouveau_gem.c      |  2 +-
 drivers/gpu/drm/radeon/radeon_gem.c        |  2 +-
 drivers/gpu/drm/ttm/ttm_bo_vm.c            | 94 +---------------------
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.h        |  4 -
 drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 72 +----------------
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c   |  3 -
 include/drm/ttm/ttm_bo_api.h               |  3 +-
 8 files changed, 7 insertions(+), 175 deletions(-)

v2:
 - Remove the entire thing as per Thomas's advice
v1: https://lore.kernel.org/r/0-v1-69e7da97f81f+21c-ttm_pmd_jgg@nvidia.com

After this patch the only users of the vmf_insert_pfn_pud/pmd_prot() functions
are DAX, and DAX always has a struct page. Eliminating this non-working case
will simplify fixing the refcounting on ZONE_DEVICE pages.
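
As a sketch of why DAX is safe here: DAX inserts its pfns with both
PFN_DEV and PFN_MAP set, so the entry becomes a devmap entry that
GUP_fast resolves through get_dev_pagemap() and the ZONE_DEVICE struct
page. Roughly, from mm/huge_memory.c and include/linux/pfn_t.h:

	/* insert_pfn_pmd() */
	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);

	/* pfn_t_devmap() - true only when both flags are set */
	const u64 flags = PFN_DEV|PFN_MAP;
	return (pfn.val & flags) == flags;

The deleted TTM code passed PFN_DEV alone, so its huge entries looked
like ordinary huge pages to GUP_fast.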

Thanks,
Jason

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index d6aa032890ee8b..a1e63ba4c54a59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -61,7 +61,7 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
 		}
 
 		 ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
-						TTM_BO_VM_NUM_PREFAULT, 1);
+						TTM_BO_VM_NUM_PREFAULT);
 
 		 drm_dev_exit(idx);
 	} else {
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 8c2ecc28272322..c89d5964148fd5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -56,7 +56,7 @@ static vm_fault_t nouveau_ttm_fault(struct vm_fault *vmf)
 
 	nouveau_bo_del_io_reserve_lru(bo);
 	prot = vm_get_page_prot(vma->vm_flags);
-	ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT, 1);
+	ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
 	nouveau_bo_add_io_reserve_lru(bo);
 	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
 		return ret;
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index 458f92a7088797..a36a4f2c76b097 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -61,7 +61,7 @@ static vm_fault_t radeon_gem_fault(struct vm_fault *vmf)
 		goto unlock_resv;
 
 	ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
-				       TTM_BO_VM_NUM_PREFAULT, 1);
+				       TTM_BO_VM_NUM_PREFAULT);
 	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
 		goto unlock_mclk;
 
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index f56be5bc0861ec..e5af7f9e94b273 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -171,89 +171,6 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
 }
 EXPORT_SYMBOL(ttm_bo_vm_reserve);
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/**
- * ttm_bo_vm_insert_huge - Insert a pfn for PUD or PMD faults
- * @vmf: Fault data
- * @bo: The buffer object
- * @page_offset: Page offset from bo start
- * @fault_page_size: The size of the fault in pages.
- * @pgprot: The page protections.
- * Does additional checking whether it's possible to insert a PUD or PMD
- * pfn and performs the insertion.
- *
- * Return: VM_FAULT_NOPAGE on successful insertion, VM_FAULT_FALLBACK if
- * a huge fault was not possible, or on insertion error.
- */
-static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
-					struct ttm_buffer_object *bo,
-					pgoff_t page_offset,
-					pgoff_t fault_page_size,
-					pgprot_t pgprot)
-{
-	pgoff_t i;
-	vm_fault_t ret;
-	unsigned long pfn;
-	pfn_t pfnt;
-	struct ttm_tt *ttm = bo->ttm;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
-
-	/* Fault should not cross bo boundary. */
-	page_offset &= ~(fault_page_size - 1);
-	if (page_offset + fault_page_size > bo->resource->num_pages)
-		goto out_fallback;
-
-	if (bo->resource->bus.is_iomem)
-		pfn = ttm_bo_io_mem_pfn(bo, page_offset);
-	else
-		pfn = page_to_pfn(ttm->pages[page_offset]);
-
-	/* pfn must be fault_page_size aligned. */
-	if ((pfn & (fault_page_size - 1)) != 0)
-		goto out_fallback;
-
-	/* Check that memory is contiguous. */
-	if (!bo->resource->bus.is_iomem) {
-		for (i = 1; i < fault_page_size; ++i) {
-			if (page_to_pfn(ttm->pages[page_offset + i]) != pfn + i)
-				goto out_fallback;
-		}
-	} else if (bo->bdev->funcs->io_mem_pfn) {
-		for (i = 1; i < fault_page_size; ++i) {
-			if (ttm_bo_io_mem_pfn(bo, page_offset + i) != pfn + i)
-				goto out_fallback;
-		}
-	}
-
-	pfnt = __pfn_to_pfn_t(pfn, PFN_DEV);
-	if (fault_page_size == (HPAGE_PMD_SIZE >> PAGE_SHIFT))
-		ret = vmf_insert_pfn_pmd_prot(vmf, pfnt, pgprot, write);
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-	else if (fault_page_size == (HPAGE_PUD_SIZE >> PAGE_SHIFT))
-		ret = vmf_insert_pfn_pud_prot(vmf, pfnt, pgprot, write);
-#endif
-	else
-		WARN_ON_ONCE(ret = VM_FAULT_FALLBACK);
-
-	if (ret != VM_FAULT_NOPAGE)
-		goto out_fallback;
-
-	return VM_FAULT_NOPAGE;
-out_fallback:
-	count_vm_event(THP_FAULT_FALLBACK);
-	return VM_FAULT_FALLBACK;
-}
-#else
-static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
-					struct ttm_buffer_object *bo,
-					pgoff_t page_offset,
-					pgoff_t fault_page_size,
-					pgprot_t pgprot)
-{
-	return VM_FAULT_FALLBACK;
-}
-#endif
-
 /**
  * ttm_bo_vm_fault_reserved - TTM fault helper
  * @vmf: The struct vm_fault given as argument to the fault callback
@@ -261,7 +178,6 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
  * @num_prefault: Maximum number of prefault pages. The caller may want to
  * specify this based on madvice settings and the size of the GPU object
  * backed by the memory.
- * @fault_page_size: The size of the fault in pages.
  *
  * This function inserts one or more page table entries pointing to the
  * memory backing the buffer object, and then returns a return code
@@ -275,8 +191,7 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
  */
 vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 				    pgprot_t prot,
-				    pgoff_t num_prefault,
-				    pgoff_t fault_page_size)
+				    pgoff_t num_prefault)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct ttm_buffer_object *bo = vma->vm_private_data;
@@ -327,11 +242,6 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 		prot = pgprot_decrypted(prot);
 	}
 
-	/* We don't prefault on huge faults. Yet. */
-	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && fault_page_size != 1)
-		return ttm_bo_vm_insert_huge(vmf, bo, page_offset,
-					     fault_page_size, prot);
-
 	/*
 	 * Speculatively prefault a number of pages. Only error on
 	 * first page.
@@ -429,7 +339,7 @@ vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
 
 	prot = vma->vm_page_prot;
 	if (drm_dev_enter(ddev, &idx)) {
-		ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT, 1);
+		ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
 		drm_dev_exit(idx);
 	} else {
 		ret = ttm_bo_vm_dummy_page(vmf, prot);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
index a833751099b559..858aff99a3fe53 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
@@ -1550,10 +1550,6 @@ void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
 			pgoff_t start, pgoff_t end);
 vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
 vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
-				enum page_entry_size pe_size);
-#endif
 
 /* Transparent hugepage support - vmwgfx_thp.c */
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
index e5a9a5cbd01a7c..922317d1acc8a0 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
@@ -477,7 +477,7 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
 	else
 		prot = vm_get_page_prot(vma->vm_flags);
 
-	ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, 1);
+	ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
 	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
 		return ret;
 
@@ -486,73 +486,3 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
 
 	return ret;
 }
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
-				enum page_entry_size pe_size)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
-	    vma->vm_private_data;
-	struct vmw_buffer_object *vbo =
-		container_of(bo, struct vmw_buffer_object, base);
-	pgprot_t prot;
-	vm_fault_t ret;
-	pgoff_t fault_page_size;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
-
-	switch (pe_size) {
-	case PE_SIZE_PMD:
-		fault_page_size = HPAGE_PMD_SIZE >> PAGE_SHIFT;
-		break;
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-	case PE_SIZE_PUD:
-		fault_page_size = HPAGE_PUD_SIZE >> PAGE_SHIFT;
-		break;
-#endif
-	default:
-		WARN_ON_ONCE(1);
-		return VM_FAULT_FALLBACK;
-	}
-
-	/* Always do write dirty-tracking and COW on PTE level. */
-	if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping(vma->vm_flags)))
-		return VM_FAULT_FALLBACK;
-
-	ret = ttm_bo_vm_reserve(bo, vmf);
-	if (ret)
-		return ret;
-
-	if (vbo->dirty) {
-		pgoff_t allowed_prefault;
-		unsigned long page_offset;
-
-		page_offset = vmf->pgoff -
-			drm_vma_node_start(&bo->base.vma_node);
-		if (page_offset >= bo->resource->num_pages ||
-		    vmw_resources_clean(vbo, page_offset,
-					page_offset + PAGE_SIZE,
-					&allowed_prefault)) {
-			ret = VM_FAULT_SIGBUS;
-			goto out_unlock;
-		}
-
-		/*
-		 * Write protect, so we get a new fault on write, and can
-		 * split.
-		 */
-		prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
-	} else {
-		prot = vm_get_page_prot(vma->vm_flags);
-	}
-
-	ret = ttm_bo_vm_fault_reserved(vmf, prot, 1, fault_page_size);
-	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
-		return ret;
-
-out_unlock:
-	dma_resv_unlock(bo->base.resv);
-
-	return ret;
-}
-#endif
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
index e6b1f98ec99f09..0a4c340252ec4a 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
@@ -61,9 +61,6 @@ int vmw_mmap(struct file *filp, struct vm_area_struct *vma)
 		.fault = vmw_bo_vm_fault,
 		.open = ttm_bo_vm_open,
 		.close = ttm_bo_vm_close,
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		.huge_fault = vmw_bo_vm_huge_fault,
-#endif
 	};
 	struct drm_file *file_priv = filp->private_data;
 	struct vmw_private *dev_priv = vmw_priv(file_priv->minor->dev);
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index f681bbdbc6982e..36f7eb9d066395 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -594,8 +594,7 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
 
 vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
 				    pgprot_t prot,
-				    pgoff_t num_prefault,
-				    pgoff_t fault_page_size);
+				    pgoff_t num_prefault);
 
 vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf);
 

base-commit: 519d81956ee277b4419c723adfb154603c2565ba
-- 
2.33.0



* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-19 23:27 [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs Jason Gunthorpe
@ 2021-10-20  6:34 ` Thomas Hellström
  2021-10-20  6:41   ` Christian König
  2021-10-20 14:09   ` Jason Gunthorpe
  0 siblings, 2 replies; 15+ messages in thread
From: Thomas Hellström @ 2021-10-20  6:34 UTC (permalink / raw)
  To: Jason Gunthorpe, David Airlie, Christian Koenig, Daniel Vetter,
	dri-devel, Huang Rui
  Cc: Dan Williams, Ralph Campbell, Roland Scheidegger


On 10/20/21 01:27, Jason Gunthorpe wrote:
> PUD and PMD entries do not have a special bit.
>
> get_user_pages_fast() considers any page that passes pmd_huge() as
> usable:
>
> 	if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
> 		     pmd_devmap(pmd))) {
>
> And vmf_insert_pfn_pmd_prot() unconditionally sets
>
> 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
>
> e.g. on x86 the PMD entry will have _PAGE_PRESENT | _PAGE_PSE set.
>
> As such, gup_huge_pmd() will try to dereference a struct page:
>
> 	head = try_grab_compound_head(pmd_page(orig), refs, flags);
>
> and thus crash.
>
> Thomas further noticed that the drivers do not expect the struct page
> to be used by anything; in particular, the refcount increment above
> will cause them to malfunction.
>
> Thus none of this can work correctly in the face of GUP_fast. Delete it
> entirely. It can return someday, along with a proper PMD/PUD_SPECIAL bit
> in the page table itself to gate GUP_fast.
>
> Fixes: 314b6580adc5 ("drm/ttm, drm/vmwgfx: Support huge TTM pagefaults")
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  2 +-
>   drivers/gpu/drm/nouveau/nouveau_gem.c      |  2 +-
>   drivers/gpu/drm/radeon/radeon_gem.c        |  2 +-
>   drivers/gpu/drm/ttm/ttm_bo_vm.c            | 94 +---------------------
>   drivers/gpu/drm/vmwgfx/vmwgfx_drv.h        |  4 -
>   drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 72 +----------------
>   drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c   |  3 -
>   include/drm/ttm/ttm_bo_api.h               |  3 +-
>   8 files changed, 7 insertions(+), 175 deletions(-)
>
> v2:
>   - Remove the entire thing as per Thomas's advice
> v1: https://lore.kernel.org/r/0-v1-69e7da97f81f+21c-ttm_pmd_jgg@nvidia.com
>
> After this patch the only users of the vmf_insert_pfn_pud/pmd_prot() functions
> are DAX, and DAX always has a struct page. Eliminating this non-working case
> will simplify fixing the refcounting on ZONE_DEVICE pages.
>
> Thanks,
> Jason

I think the patch subject needs updating to reflect that we're disabling 
PUD/PMDs completely.
With that fixed,

Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>

Follow-up question: if we resurrect this in the proper way (and in that
case only for x86_64), is there something we need to pay particular
attention to WRT the ZONE_DEVICE refcounting fixes you mention above?

Thanks,

Thomas


> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index d6aa032890ee8b..a1e63ba4c54a59 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -61,7 +61,7 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
>   		}
>   
>   		 ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> -						TTM_BO_VM_NUM_PREFAULT, 1);
> +						TTM_BO_VM_NUM_PREFAULT);
>   
>   		 drm_dev_exit(idx);
>   	} else {
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index 8c2ecc28272322..c89d5964148fd5 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -56,7 +56,7 @@ static vm_fault_t nouveau_ttm_fault(struct vm_fault *vmf)
>   
>   	nouveau_bo_del_io_reserve_lru(bo);
>   	prot = vm_get_page_prot(vma->vm_flags);
> -	ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT, 1);
> +	ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
>   	nouveau_bo_add_io_reserve_lru(bo);
>   	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
>   		return ret;
> diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
> index 458f92a7088797..a36a4f2c76b097 100644
> --- a/drivers/gpu/drm/radeon/radeon_gem.c
> +++ b/drivers/gpu/drm/radeon/radeon_gem.c
> @@ -61,7 +61,7 @@ static vm_fault_t radeon_gem_fault(struct vm_fault *vmf)
>   		goto unlock_resv;
>   
>   	ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> -				       TTM_BO_VM_NUM_PREFAULT, 1);
> +				       TTM_BO_VM_NUM_PREFAULT);
>   	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
>   		goto unlock_mclk;
>   
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> index f56be5bc0861ec..e5af7f9e94b273 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> @@ -171,89 +171,6 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
>   }
>   EXPORT_SYMBOL(ttm_bo_vm_reserve);
>   
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -/**
> - * ttm_bo_vm_insert_huge - Insert a pfn for PUD or PMD faults
> - * @vmf: Fault data
> - * @bo: The buffer object
> - * @page_offset: Page offset from bo start
> - * @fault_page_size: The size of the fault in pages.
> - * @pgprot: The page protections.
> - * Does additional checking whether it's possible to insert a PUD or PMD
> - * pfn and performs the insertion.
> - *
> - * Return: VM_FAULT_NOPAGE on successful insertion, VM_FAULT_FALLBACK if
> - * a huge fault was not possible, or on insertion error.
> - */
> -static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
> -					struct ttm_buffer_object *bo,
> -					pgoff_t page_offset,
> -					pgoff_t fault_page_size,
> -					pgprot_t pgprot)
> -{
> -	pgoff_t i;
> -	vm_fault_t ret;
> -	unsigned long pfn;
> -	pfn_t pfnt;
> -	struct ttm_tt *ttm = bo->ttm;
> -	bool write = vmf->flags & FAULT_FLAG_WRITE;
> -
> -	/* Fault should not cross bo boundary. */
> -	page_offset &= ~(fault_page_size - 1);
> -	if (page_offset + fault_page_size > bo->resource->num_pages)
> -		goto out_fallback;
> -
> -	if (bo->resource->bus.is_iomem)
> -		pfn = ttm_bo_io_mem_pfn(bo, page_offset);
> -	else
> -		pfn = page_to_pfn(ttm->pages[page_offset]);
> -
> -	/* pfn must be fault_page_size aligned. */
> -	if ((pfn & (fault_page_size - 1)) != 0)
> -		goto out_fallback;
> -
> -	/* Check that memory is contiguous. */
> -	if (!bo->resource->bus.is_iomem) {
> -		for (i = 1; i < fault_page_size; ++i) {
> -			if (page_to_pfn(ttm->pages[page_offset + i]) != pfn + i)
> -				goto out_fallback;
> -		}
> -	} else if (bo->bdev->funcs->io_mem_pfn) {
> -		for (i = 1; i < fault_page_size; ++i) {
> -			if (ttm_bo_io_mem_pfn(bo, page_offset + i) != pfn + i)
> -				goto out_fallback;
> -		}
> -	}
> -
> -	pfnt = __pfn_to_pfn_t(pfn, PFN_DEV);
> -	if (fault_page_size == (HPAGE_PMD_SIZE >> PAGE_SHIFT))
> -		ret = vmf_insert_pfn_pmd_prot(vmf, pfnt, pgprot, write);
> -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
> -	else if (fault_page_size == (HPAGE_PUD_SIZE >> PAGE_SHIFT))
> -		ret = vmf_insert_pfn_pud_prot(vmf, pfnt, pgprot, write);
> -#endif
> -	else
> -		WARN_ON_ONCE(ret = VM_FAULT_FALLBACK);
> -
> -	if (ret != VM_FAULT_NOPAGE)
> -		goto out_fallback;
> -
> -	return VM_FAULT_NOPAGE;
> -out_fallback:
> -	count_vm_event(THP_FAULT_FALLBACK);
> -	return VM_FAULT_FALLBACK;
> -}
> -#else
> -static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
> -					struct ttm_buffer_object *bo,
> -					pgoff_t page_offset,
> -					pgoff_t fault_page_size,
> -					pgprot_t pgprot)
> -{
> -	return VM_FAULT_FALLBACK;
> -}
> -#endif
> -
>   /**
>    * ttm_bo_vm_fault_reserved - TTM fault helper
>    * @vmf: The struct vm_fault given as argument to the fault callback
> @@ -261,7 +178,6 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
>    * @num_prefault: Maximum number of prefault pages. The caller may want to
>    * specify this based on madvice settings and the size of the GPU object
>    * backed by the memory.
> - * @fault_page_size: The size of the fault in pages.
>    *
>    * This function inserts one or more page table entries pointing to the
>    * memory backing the buffer object, and then returns a return code
> @@ -275,8 +191,7 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
>    */
>   vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>   				    pgprot_t prot,
> -				    pgoff_t num_prefault,
> -				    pgoff_t fault_page_size)
> +				    pgoff_t num_prefault)
>   {
>   	struct vm_area_struct *vma = vmf->vma;
>   	struct ttm_buffer_object *bo = vma->vm_private_data;
> @@ -327,11 +242,6 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>   		prot = pgprot_decrypted(prot);
>   	}
>   
> -	/* We don't prefault on huge faults. Yet. */
> -	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && fault_page_size != 1)
> -		return ttm_bo_vm_insert_huge(vmf, bo, page_offset,
> -					     fault_page_size, prot);
> -
>   	/*
>   	 * Speculatively prefault a number of pages. Only error on
>   	 * first page.
> @@ -429,7 +339,7 @@ vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
>   
>   	prot = vma->vm_page_prot;
>   	if (drm_dev_enter(ddev, &idx)) {
> -		ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT, 1);
> +		ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
>   		drm_dev_exit(idx);
>   	} else {
>   		ret = ttm_bo_vm_dummy_page(vmf, prot);
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> index a833751099b559..858aff99a3fe53 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> @@ -1550,10 +1550,6 @@ void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
>   			pgoff_t start, pgoff_t end);
>   vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
>   vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
> -				enum page_entry_size pe_size);
> -#endif
>   
>   /* Transparent hugepage support - vmwgfx_thp.c */
>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> index e5a9a5cbd01a7c..922317d1acc8a0 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> @@ -477,7 +477,7 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
>   	else
>   		prot = vm_get_page_prot(vma->vm_flags);
>   
> -	ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, 1);
> +	ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
>   	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
>   		return ret;
>   
> @@ -486,73 +486,3 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
>   
>   	return ret;
>   }
> -
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
> -				enum page_entry_size pe_size)
> -{
> -	struct vm_area_struct *vma = vmf->vma;
> -	struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
> -	    vma->vm_private_data;
> -	struct vmw_buffer_object *vbo =
> -		container_of(bo, struct vmw_buffer_object, base);
> -	pgprot_t prot;
> -	vm_fault_t ret;
> -	pgoff_t fault_page_size;
> -	bool write = vmf->flags & FAULT_FLAG_WRITE;
> -
> -	switch (pe_size) {
> -	case PE_SIZE_PMD:
> -		fault_page_size = HPAGE_PMD_SIZE >> PAGE_SHIFT;
> -		break;
> -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
> -	case PE_SIZE_PUD:
> -		fault_page_size = HPAGE_PUD_SIZE >> PAGE_SHIFT;
> -		break;
> -#endif
> -	default:
> -		WARN_ON_ONCE(1);
> -		return VM_FAULT_FALLBACK;
> -	}
> -
> -	/* Always do write dirty-tracking and COW on PTE level. */
> -	if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping(vma->vm_flags)))
> -		return VM_FAULT_FALLBACK;
> -
> -	ret = ttm_bo_vm_reserve(bo, vmf);
> -	if (ret)
> -		return ret;
> -
> -	if (vbo->dirty) {
> -		pgoff_t allowed_prefault;
> -		unsigned long page_offset;
> -
> -		page_offset = vmf->pgoff -
> -			drm_vma_node_start(&bo->base.vma_node);
> -		if (page_offset >= bo->resource->num_pages ||
> -		    vmw_resources_clean(vbo, page_offset,
> -					page_offset + PAGE_SIZE,
> -					&allowed_prefault)) {
> -			ret = VM_FAULT_SIGBUS;
> -			goto out_unlock;
> -		}
> -
> -		/*
> -		 * Write protect, so we get a new fault on write, and can
> -		 * split.
> -		 */
> -		prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
> -	} else {
> -		prot = vm_get_page_prot(vma->vm_flags);
> -	}
> -
> -	ret = ttm_bo_vm_fault_reserved(vmf, prot, 1, fault_page_size);
> -	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
> -		return ret;
> -
> -out_unlock:
> -	dma_resv_unlock(bo->base.resv);
> -
> -	return ret;
> -}
> -#endif
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
> index e6b1f98ec99f09..0a4c340252ec4a 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
> @@ -61,9 +61,6 @@ int vmw_mmap(struct file *filp, struct vm_area_struct *vma)
>   		.fault = vmw_bo_vm_fault,
>   		.open = ttm_bo_vm_open,
>   		.close = ttm_bo_vm_close,
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -		.huge_fault = vmw_bo_vm_huge_fault,
> -#endif
>   	};
>   	struct drm_file *file_priv = filp->private_data;
>   	struct vmw_private *dev_priv = vmw_priv(file_priv->minor->dev);
> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
> index f681bbdbc6982e..36f7eb9d066395 100644
> --- a/include/drm/ttm/ttm_bo_api.h
> +++ b/include/drm/ttm/ttm_bo_api.h
> @@ -594,8 +594,7 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
>   
>   vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>   				    pgprot_t prot,
> -				    pgoff_t num_prefault,
> -				    pgoff_t fault_page_size);
> +				    pgoff_t num_prefault);
>   
>   vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf);
>   
>
> base-commit: 519d81956ee277b4419c723adfb154603c2565ba


* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-20  6:34 ` Thomas Hellström
@ 2021-10-20  6:41   ` Christian König
  2021-10-20  6:52     ` Thomas Hellström
  2021-10-20 19:37     ` Jason Gunthorpe
  2021-10-20 14:09   ` Jason Gunthorpe
  1 sibling, 2 replies; 15+ messages in thread
From: Christian König @ 2021-10-20  6:41 UTC (permalink / raw)
  To: Thomas Hellström, Jason Gunthorpe, David Airlie,
	Daniel Vetter, dri-devel, Huang Rui
  Cc: Dan Williams, Ralph Campbell, Roland Scheidegger

Am 20.10.21 um 08:34 schrieb Thomas Hellström:
>
> On 10/20/21 01:27, Jason Gunthorpe wrote:
>> PUD and PMD entries do not have a special bit.
>>
>> get_user_pages_fast() considers any page that passes pmd_huge() as
>> usable:
>>
>>     if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
>>              pmd_devmap(pmd))) {
>>
>> And vmf_insert_pfn_pmd_prot() unconditionally sets
>>
>>     entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
>>
>> e.g. on x86 the PMD entry will have _PAGE_PRESENT | _PAGE_PSE set.
>>
>> As such, gup_huge_pmd() will try to dereference a struct page:
>>
>>     head = try_grab_compound_head(pmd_page(orig), refs, flags);
>>
>> and thus crash.
>>
>> Thomas further noticed that the drivers do not expect the struct page
>> to be used by anything; in particular, the refcount increment above
>> will cause them to malfunction.
>>
>> Thus none of this can work correctly in the face of GUP_fast. Delete it
>> entirely. It can return someday, along with a proper PMD/PUD_SPECIAL bit
>> in the page table itself to gate GUP_fast.
>>
>> Fixes: 314b6580adc5 ("drm/ttm, drm/vmwgfx: Support huge TTM pagefaults")
>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  2 +-
>>   drivers/gpu/drm/nouveau/nouveau_gem.c      |  2 +-
>>   drivers/gpu/drm/radeon/radeon_gem.c        |  2 +-
>>   drivers/gpu/drm/ttm/ttm_bo_vm.c            | 94 +---------------------
>>   drivers/gpu/drm/vmwgfx/vmwgfx_drv.h        |  4 -
>>   drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 72 +----------------
>>   drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c   |  3 -
>>   include/drm/ttm/ttm_bo_api.h               |  3 +-
>>   8 files changed, 7 insertions(+), 175 deletions(-)
>>
>> v2:
>>   - Remove the entire thing as per Thomas's advice
>> v1: 
>> https://lore.kernel.org/r/0-v1-69e7da97f81f+21c-ttm_pmd_jgg@nvidia.com
>>
>> After this patch the only users of the vmf_insert_pfn_pud/pmd_prot()
>> functions are DAX, and DAX always has a struct page. Eliminating this
>> non-working case will simplify fixing the refcounting on ZONE_DEVICE
>> pages.
>>
>> Thanks,
>> Jason
>
> I think the patch subject needs updating to reflect that we're 
> disabling PUD/PMDs completely.
> With that fixed,
>
> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>

Yeah, agree. A commit message like "drop huge page faults, they don't 
work atm" would be rather helpful.

Apart from that Reviewed-by: Christian König <christian.koenig@amd.com> 
as well.

Regards,
Christian.

>
> Follow-up question: if we resurrect this in the proper way (and in
> that case only for x86_64), is there something we need to pay
> particular attention to WRT the ZONE_DEVICE refcounting fixes you
> mention above?

Well, I think we certainly need some use case which really shows that
this is faster, to justify the added complexity.

Regards,
Christian.

>
> Thanks,
>
> Thomas
>
>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index d6aa032890ee8b..a1e63ba4c54a59 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -61,7 +61,7 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault 
>> *vmf)
>>           }
>>              ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
>> -                        TTM_BO_VM_NUM_PREFAULT, 1);
>> +                        TTM_BO_VM_NUM_PREFAULT);
>>              drm_dev_exit(idx);
>>       } else {
>> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c 
>> b/drivers/gpu/drm/nouveau/nouveau_gem.c
>> index 8c2ecc28272322..c89d5964148fd5 100644
>> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
>> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
>> @@ -56,7 +56,7 @@ static vm_fault_t nouveau_ttm_fault(struct vm_fault 
>> *vmf)
>>         nouveau_bo_del_io_reserve_lru(bo);
>>       prot = vm_get_page_prot(vma->vm_flags);
>> -    ret = ttm_bo_vm_fault_reserved(vmf, prot, 
>> TTM_BO_VM_NUM_PREFAULT, 1);
>> +    ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
>>       nouveau_bo_add_io_reserve_lru(bo);
>>       if (ret == VM_FAULT_RETRY && !(vmf->flags & 
>> FAULT_FLAG_RETRY_NOWAIT))
>>           return ret;
>> diff --git a/drivers/gpu/drm/radeon/radeon_gem.c 
>> b/drivers/gpu/drm/radeon/radeon_gem.c
>> index 458f92a7088797..a36a4f2c76b097 100644
>> --- a/drivers/gpu/drm/radeon/radeon_gem.c
>> +++ b/drivers/gpu/drm/radeon/radeon_gem.c
>> @@ -61,7 +61,7 @@ static vm_fault_t radeon_gem_fault(struct vm_fault 
>> *vmf)
>>           goto unlock_resv;
>>         ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
>> -                       TTM_BO_VM_NUM_PREFAULT, 1);
>> +                       TTM_BO_VM_NUM_PREFAULT);
>>       if (ret == VM_FAULT_RETRY && !(vmf->flags & 
>> FAULT_FLAG_RETRY_NOWAIT))
>>           goto unlock_mclk;
>>   diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c 
>> b/drivers/gpu/drm/ttm/ttm_bo_vm.c
>> index f56be5bc0861ec..e5af7f9e94b273 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
>> @@ -171,89 +171,6 @@ vm_fault_t ttm_bo_vm_reserve(struct 
>> ttm_buffer_object *bo,
>>   }
>>   EXPORT_SYMBOL(ttm_bo_vm_reserve);
>>   -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -/**
>> - * ttm_bo_vm_insert_huge - Insert a pfn for PUD or PMD faults
>> - * @vmf: Fault data
>> - * @bo: The buffer object
>> - * @page_offset: Page offset from bo start
>> - * @fault_page_size: The size of the fault in pages.
>> - * @pgprot: The page protections.
>> - * Does additional checking whether it's possible to insert a PUD or 
>> PMD
>> - * pfn and performs the insertion.
>> - *
>> - * Return: VM_FAULT_NOPAGE on successful insertion, 
>> VM_FAULT_FALLBACK if
>> - * a huge fault was not possible, or on insertion error.
>> - */
>> -static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
>> -                    struct ttm_buffer_object *bo,
>> -                    pgoff_t page_offset,
>> -                    pgoff_t fault_page_size,
>> -                    pgprot_t pgprot)
>> -{
>> -    pgoff_t i;
>> -    vm_fault_t ret;
>> -    unsigned long pfn;
>> -    pfn_t pfnt;
>> -    struct ttm_tt *ttm = bo->ttm;
>> -    bool write = vmf->flags & FAULT_FLAG_WRITE;
>> -
>> -    /* Fault should not cross bo boundary. */
>> -    page_offset &= ~(fault_page_size - 1);
>> -    if (page_offset + fault_page_size > bo->resource->num_pages)
>> -        goto out_fallback;
>> -
>> -    if (bo->resource->bus.is_iomem)
>> -        pfn = ttm_bo_io_mem_pfn(bo, page_offset);
>> -    else
>> -        pfn = page_to_pfn(ttm->pages[page_offset]);
>> -
>> -    /* pfn must be fault_page_size aligned. */
>> -    if ((pfn & (fault_page_size - 1)) != 0)
>> -        goto out_fallback;
>> -
>> -    /* Check that memory is contiguous. */
>> -    if (!bo->resource->bus.is_iomem) {
>> -        for (i = 1; i < fault_page_size; ++i) {
>> -            if (page_to_pfn(ttm->pages[page_offset + i]) != pfn + i)
>> -                goto out_fallback;
>> -        }
>> -    } else if (bo->bdev->funcs->io_mem_pfn) {
>> -        for (i = 1; i < fault_page_size; ++i) {
>> -            if (ttm_bo_io_mem_pfn(bo, page_offset + i) != pfn + i)
>> -                goto out_fallback;
>> -        }
>> -    }
>> -
>> -    pfnt = __pfn_to_pfn_t(pfn, PFN_DEV);
>> -    if (fault_page_size == (HPAGE_PMD_SIZE >> PAGE_SHIFT))
>> -        ret = vmf_insert_pfn_pmd_prot(vmf, pfnt, pgprot, write);
>> -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>> -    else if (fault_page_size == (HPAGE_PUD_SIZE >> PAGE_SHIFT))
>> -        ret = vmf_insert_pfn_pud_prot(vmf, pfnt, pgprot, write);
>> -#endif
>> -    else
>> -        WARN_ON_ONCE(ret = VM_FAULT_FALLBACK);
>> -
>> -    if (ret != VM_FAULT_NOPAGE)
>> -        goto out_fallback;
>> -
>> -    return VM_FAULT_NOPAGE;
>> -out_fallback:
>> -    count_vm_event(THP_FAULT_FALLBACK);
>> -    return VM_FAULT_FALLBACK;
>> -}
>> -#else
>> -static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
>> -                    struct ttm_buffer_object *bo,
>> -                    pgoff_t page_offset,
>> -                    pgoff_t fault_page_size,
>> -                    pgprot_t pgprot)
>> -{
>> -    return VM_FAULT_FALLBACK;
>> -}
>> -#endif
>> -
>>   /**
>>    * ttm_bo_vm_fault_reserved - TTM fault helper
>>    * @vmf: The struct vm_fault given as argument to the fault callback
>> @@ -261,7 +178,6 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct 
>> vm_fault *vmf,
>>    * @num_prefault: Maximum number of prefault pages. The caller may 
>> want to
>>    * specify this based on madvice settings and the size of the GPU 
>> object
>>    * backed by the memory.
>> - * @fault_page_size: The size of the fault in pages.
>>    *
>>    * This function inserts one or more page table entries pointing to 
>> the
>>    * memory backing the buffer object, and then returns a return code
>> @@ -275,8 +191,7 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct 
>> vm_fault *vmf,
>>    */
>>   vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>>                       pgprot_t prot,
>> -                    pgoff_t num_prefault,
>> -                    pgoff_t fault_page_size)
>> +                    pgoff_t num_prefault)
>>   {
>>       struct vm_area_struct *vma = vmf->vma;
>>       struct ttm_buffer_object *bo = vma->vm_private_data;
>> @@ -327,11 +242,6 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct 
>> vm_fault *vmf,
>>           prot = pgprot_decrypted(prot);
>>       }
>>   -    /* We don't prefault on huge faults. Yet. */
>> -    if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && fault_page_size 
>> != 1)
>> -        return ttm_bo_vm_insert_huge(vmf, bo, page_offset,
>> -                         fault_page_size, prot);
>> -
>>       /*
>>        * Speculatively prefault a number of pages. Only error on
>>        * first page.
>> @@ -429,7 +339,7 @@ vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
>>         prot = vma->vm_page_prot;
>>       if (drm_dev_enter(ddev, &idx)) {
>> -        ret = ttm_bo_vm_fault_reserved(vmf, prot, 
>> TTM_BO_VM_NUM_PREFAULT, 1);
>> +        ret = ttm_bo_vm_fault_reserved(vmf, prot, 
>> TTM_BO_VM_NUM_PREFAULT);
>>           drm_dev_exit(idx);
>>       } else {
>>           ret = ttm_bo_vm_dummy_page(vmf, prot);
>> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h 
>> b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
>> index a833751099b559..858aff99a3fe53 100644
>> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
>> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
>> @@ -1550,10 +1550,6 @@ void vmw_bo_dirty_unmap(struct 
>> vmw_buffer_object *vbo,
>>               pgoff_t start, pgoff_t end);
>>   vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
>>   vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
>> -                enum page_entry_size pe_size);
>> -#endif
>>     /* Transparent hugepage support - vmwgfx_thp.c */
>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c 
>> b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
>> index e5a9a5cbd01a7c..922317d1acc8a0 100644
>> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
>> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
>> @@ -477,7 +477,7 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
>>       else
>>           prot = vm_get_page_prot(vma->vm_flags);
>>   -    ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, 1);
>> +    ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
>>       if (ret == VM_FAULT_RETRY && !(vmf->flags & 
>> FAULT_FLAG_RETRY_NOWAIT))
>>           return ret;
>>   @@ -486,73 +486,3 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
>>         return ret;
>>   }
>> -
>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
>> -                enum page_entry_size pe_size)
>> -{
>> -    struct vm_area_struct *vma = vmf->vma;
>> -    struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
>> -        vma->vm_private_data;
>> -    struct vmw_buffer_object *vbo =
>> -        container_of(bo, struct vmw_buffer_object, base);
>> -    pgprot_t prot;
>> -    vm_fault_t ret;
>> -    pgoff_t fault_page_size;
>> -    bool write = vmf->flags & FAULT_FLAG_WRITE;
>> -
>> -    switch (pe_size) {
>> -    case PE_SIZE_PMD:
>> -        fault_page_size = HPAGE_PMD_SIZE >> PAGE_SHIFT;
>> -        break;
>> -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>> -    case PE_SIZE_PUD:
>> -        fault_page_size = HPAGE_PUD_SIZE >> PAGE_SHIFT;
>> -        break;
>> -#endif
>> -    default:
>> -        WARN_ON_ONCE(1);
>> -        return VM_FAULT_FALLBACK;
>> -    }
>> -
>> -    /* Always do write dirty-tracking and COW on PTE level. */
>> -    if (write && (READ_ONCE(vbo->dirty) || 
>> is_cow_mapping(vma->vm_flags)))
>> -        return VM_FAULT_FALLBACK;
>> -
>> -    ret = ttm_bo_vm_reserve(bo, vmf);
>> -    if (ret)
>> -        return ret;
>> -
>> -    if (vbo->dirty) {
>> -        pgoff_t allowed_prefault;
>> -        unsigned long page_offset;
>> -
>> -        page_offset = vmf->pgoff -
>> -            drm_vma_node_start(&bo->base.vma_node);
>> -        if (page_offset >= bo->resource->num_pages ||
>> -            vmw_resources_clean(vbo, page_offset,
>> -                    page_offset + PAGE_SIZE,
>> -                    &allowed_prefault)) {
>> -            ret = VM_FAULT_SIGBUS;
>> -            goto out_unlock;
>> -        }
>> -
>> -        /*
>> -         * Write protect, so we get a new fault on write, and can
>> -         * split.
>> -         */
>> -        prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
>> -    } else {
>> -        prot = vm_get_page_prot(vma->vm_flags);
>> -    }
>> -
>> -    ret = ttm_bo_vm_fault_reserved(vmf, prot, 1, fault_page_size);
>> -    if (ret == VM_FAULT_RETRY && !(vmf->flags & 
>> FAULT_FLAG_RETRY_NOWAIT))
>> -        return ret;
>> -
>> -out_unlock:
>> -    dma_resv_unlock(bo->base.resv);
>> -
>> -    return ret;
>> -}
>> -#endif
>> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c 
>> b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
>> index e6b1f98ec99f09..0a4c340252ec4a 100644
>> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
>> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
>> @@ -61,9 +61,6 @@ int vmw_mmap(struct file *filp, struct 
>> vm_area_struct *vma)
>>           .fault = vmw_bo_vm_fault,
>>           .open = ttm_bo_vm_open,
>>           .close = ttm_bo_vm_close,
>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -        .huge_fault = vmw_bo_vm_huge_fault,
>> -#endif
>>       };
>>       struct drm_file *file_priv = filp->private_data;
>>       struct vmw_private *dev_priv = vmw_priv(file_priv->minor->dev);
>> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
>> index f681bbdbc6982e..36f7eb9d066395 100644
>> --- a/include/drm/ttm/ttm_bo_api.h
>> +++ b/include/drm/ttm/ttm_bo_api.h
>> @@ -594,8 +594,7 @@ vm_fault_t ttm_bo_vm_reserve(struct 
>> ttm_buffer_object *bo,
>>     vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>>                       pgprot_t prot,
>> -                    pgoff_t num_prefault,
>> -                    pgoff_t fault_page_size);
>> +                    pgoff_t num_prefault);
>>     vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf);
>>
>> base-commit: 519d81956ee277b4419c723adfb154603c2565ba



* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-20  6:41   ` Christian König
@ 2021-10-20  6:52     ` Thomas Hellström
  2021-10-20 19:37     ` Jason Gunthorpe
  1 sibling, 0 replies; 15+ messages in thread
From: Thomas Hellström @ 2021-10-20  6:52 UTC (permalink / raw)
  To: Christian König, Jason Gunthorpe, David Airlie,
	Daniel Vetter, dri-devel, Huang Rui
  Cc: Dan Williams, Ralph Campbell, Roland Scheidegger


On 10/20/21 08:41, Christian König wrote:
> Am 20.10.21 um 08:34 schrieb Thomas Hellström:
>>
>> On 10/20/21 01:27, Jason Gunthorpe wrote:
>>> PUD and PMD entries do not have a special bit.
>>>
>>> get_user_pages_fast() considers any page that passes pmd_huge() as
>>> usable:
>>>
>>>     if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
>>>              pmd_devmap(pmd))) {
>>>
>>> And vmf_insert_pfn_pmd_prot() unconditionally sets
>>>
>>>     entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
>>>
>>> e.g. on x86 the PMD entry will have _PAGE_PRESENT | _PAGE_PSE set.
>>>
>>> As such, gup_huge_pmd() will try to dereference a struct page:
>>>
>>>     head = try_grab_compound_head(pmd_page(orig), refs, flags);
>>>
>>> and thus crash.
>>>
>>> Thomas further noticed that the drivers do not expect the struct page
>>> to be used by anything; in particular, the refcount increment above
>>> will cause them to malfunction.
>>>
>>> Thus none of this can work correctly in the face of GUP_fast. Delete it
>>> entirely. It can return someday, along with a proper PMD/PUD_SPECIAL bit
>>> in the page table itself to gate GUP_fast.
>>>
>>> Fixes: 314b6580adc5 ("drm/ttm, drm/vmwgfx: Support huge TTM 
>>> pagefaults")
>>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  2 +-
>>>   drivers/gpu/drm/nouveau/nouveau_gem.c      |  2 +-
>>>   drivers/gpu/drm/radeon/radeon_gem.c        |  2 +-
>>>   drivers/gpu/drm/ttm/ttm_bo_vm.c            | 94 
>>> +---------------------
>>>   drivers/gpu/drm/vmwgfx/vmwgfx_drv.h        |  4 -
>>>   drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 72 +----------------
>>>   drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c   |  3 -
>>>   include/drm/ttm/ttm_bo_api.h               |  3 +-
>>>   8 files changed, 7 insertions(+), 175 deletions(-)
>>>
>>> v2:
>>>   - Remove the entire thing as per Thomas's advice
>>> v1: 
>>> https://lore.kernel.org/r/0-v1-69e7da97f81f+21c-ttm_pmd_jgg@nvidia.com
>>>
>>> After this patch the only users of the vmf_insert_pfn_pud/pmd_prot()
>>> functions are DAX, and DAX always has a struct page. Eliminating this
>>> non-working case will simplify fixing the refcounting on ZONE_DEVICE
>>> pages.
>>>
>>> Thanks,
>>> Jason
>>
>> I think the patch subject needs updating to reflect that we're 
>> disabling PUD/PMDs completely.
>> With that fixed,
>>
>> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>
> Yeah, agree. A commit message like "drop huge page faults, they don't 
> work atm" would be rather helpful.
>
> Apart from that Reviewed-by: Christian König 
> <christian.koenig@amd.com> as well.
>
> Regards,
> Christian.
>
>>
>> Follow-up question: if we resurrect this in the proper way (and in
>> that case only for x86_64), is there something we need to pay
>> particular attention to WRT the ZONE_DEVICE refcounting fixes you
>> mention above?
>
> Well, I think we certainly need some use case which really shows that
> this is faster, to justify the added complexity.

Fair enough, although to some extent this is about saving page-table
memory and being nice to the rest of the system as well. I guess in a
way it is similar to having TTM support huge page allocations in the
first place. But yeah, I agree listing the benefits and weighing them
against the added complexity should be a prerequisite.
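
(For scale: mapping a 1 GiB BO with 4 KiB PTEs needs 1 GiB / 4 KiB =
262144 PTEs, i.e. 512 page-table pages or 2 MiB of page tables, while
2 MiB PMD entries need a single 4 KiB table of 512 entries. The exact
numbers are an x86-64 illustration, not figures from this thread.)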

But for us this is not top priority ATM.

/Thomas

>
> Regards,
> Christian.
>
>>
>> Thanks,
>>
>> Thomas
>>
>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> index d6aa032890ee8b..a1e63ba4c54a59 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> @@ -61,7 +61,7 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault 
>>> *vmf)
>>>           }
>>>              ret = ttm_bo_vm_fault_reserved(vmf, 
>>> vmf->vma->vm_page_prot,
>>> -                        TTM_BO_VM_NUM_PREFAULT, 1);
>>> +                        TTM_BO_VM_NUM_PREFAULT);
>>>              drm_dev_exit(idx);
>>>       } else {
>>> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c 
>>> b/drivers/gpu/drm/nouveau/nouveau_gem.c
>>> index 8c2ecc28272322..c89d5964148fd5 100644
>>> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
>>> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
>>> @@ -56,7 +56,7 @@ static vm_fault_t nouveau_ttm_fault(struct 
>>> vm_fault *vmf)
>>>         nouveau_bo_del_io_reserve_lru(bo);
>>>       prot = vm_get_page_prot(vma->vm_flags);
>>> -    ret = ttm_bo_vm_fault_reserved(vmf, prot, 
>>> TTM_BO_VM_NUM_PREFAULT, 1);
>>> +    ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
>>>       nouveau_bo_add_io_reserve_lru(bo);
>>>       if (ret == VM_FAULT_RETRY && !(vmf->flags & 
>>> FAULT_FLAG_RETRY_NOWAIT))
>>>           return ret;
>>> diff --git a/drivers/gpu/drm/radeon/radeon_gem.c 
>>> b/drivers/gpu/drm/radeon/radeon_gem.c
>>> index 458f92a7088797..a36a4f2c76b097 100644
>>> --- a/drivers/gpu/drm/radeon/radeon_gem.c
>>> +++ b/drivers/gpu/drm/radeon/radeon_gem.c
>>> @@ -61,7 +61,7 @@ static vm_fault_t radeon_gem_fault(struct vm_fault 
>>> *vmf)
>>>           goto unlock_resv;
>>>         ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
>>> -                       TTM_BO_VM_NUM_PREFAULT, 1);
>>> +                       TTM_BO_VM_NUM_PREFAULT);
>>>       if (ret == VM_FAULT_RETRY && !(vmf->flags & 
>>> FAULT_FLAG_RETRY_NOWAIT))
>>>           goto unlock_mclk;
>>>   diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c 
>>> b/drivers/gpu/drm/ttm/ttm_bo_vm.c
>>> index f56be5bc0861ec..e5af7f9e94b273 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
>>> @@ -171,89 +171,6 @@ vm_fault_t ttm_bo_vm_reserve(struct 
>>> ttm_buffer_object *bo,
>>>   }
>>>   EXPORT_SYMBOL(ttm_bo_vm_reserve);
>>>   -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> -/**
>>> - * ttm_bo_vm_insert_huge - Insert a pfn for PUD or PMD faults
>>> - * @vmf: Fault data
>>> - * @bo: The buffer object
>>> - * @page_offset: Page offset from bo start
>>> - * @fault_page_size: The size of the fault in pages.
>>> - * @pgprot: The page protections.
>>> - * Does additional checking whether it's possible to insert a PUD 
>>> or PMD
>>> - * pfn and performs the insertion.
>>> - *
>>> - * Return: VM_FAULT_NOPAGE on successful insertion, 
>>> VM_FAULT_FALLBACK if
>>> - * a huge fault was not possible, or on insertion error.
>>> - */
>>> -static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
>>> -                    struct ttm_buffer_object *bo,
>>> -                    pgoff_t page_offset,
>>> -                    pgoff_t fault_page_size,
>>> -                    pgprot_t pgprot)
>>> -{
>>> -    pgoff_t i;
>>> -    vm_fault_t ret;
>>> -    unsigned long pfn;
>>> -    pfn_t pfnt;
>>> -    struct ttm_tt *ttm = bo->ttm;
>>> -    bool write = vmf->flags & FAULT_FLAG_WRITE;
>>> -
>>> -    /* Fault should not cross bo boundary. */
>>> -    page_offset &= ~(fault_page_size - 1);
>>> -    if (page_offset + fault_page_size > bo->resource->num_pages)
>>> -        goto out_fallback;
>>> -
>>> -    if (bo->resource->bus.is_iomem)
>>> -        pfn = ttm_bo_io_mem_pfn(bo, page_offset);
>>> -    else
>>> -        pfn = page_to_pfn(ttm->pages[page_offset]);
>>> -
>>> -    /* pfn must be fault_page_size aligned. */
>>> -    if ((pfn & (fault_page_size - 1)) != 0)
>>> -        goto out_fallback;
>>> -
>>> -    /* Check that memory is contiguous. */
>>> -    if (!bo->resource->bus.is_iomem) {
>>> -        for (i = 1; i < fault_page_size; ++i) {
>>> -            if (page_to_pfn(ttm->pages[page_offset + i]) != pfn + i)
>>> -                goto out_fallback;
>>> -        }
>>> -    } else if (bo->bdev->funcs->io_mem_pfn) {
>>> -        for (i = 1; i < fault_page_size; ++i) {
>>> -            if (ttm_bo_io_mem_pfn(bo, page_offset + i) != pfn + i)
>>> -                goto out_fallback;
>>> -        }
>>> -    }
>>> -
>>> -    pfnt = __pfn_to_pfn_t(pfn, PFN_DEV);
>>> -    if (fault_page_size == (HPAGE_PMD_SIZE >> PAGE_SHIFT))
>>> -        ret = vmf_insert_pfn_pmd_prot(vmf, pfnt, pgprot, write);
>>> -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>>> -    else if (fault_page_size == (HPAGE_PUD_SIZE >> PAGE_SHIFT))
>>> -        ret = vmf_insert_pfn_pud_prot(vmf, pfnt, pgprot, write);
>>> -#endif
>>> -    else
>>> -        WARN_ON_ONCE(ret = VM_FAULT_FALLBACK);
>>> -
>>> -    if (ret != VM_FAULT_NOPAGE)
>>> -        goto out_fallback;
>>> -
>>> -    return VM_FAULT_NOPAGE;
>>> -out_fallback:
>>> -    count_vm_event(THP_FAULT_FALLBACK);
>>> -    return VM_FAULT_FALLBACK;
>>> -}
>>> -#else
>>> -static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
>>> -                    struct ttm_buffer_object *bo,
>>> -                    pgoff_t page_offset,
>>> -                    pgoff_t fault_page_size,
>>> -                    pgprot_t pgprot)
>>> -{
>>> -    return VM_FAULT_FALLBACK;
>>> -}
>>> -#endif
>>> -
>>>   /**
>>>    * ttm_bo_vm_fault_reserved - TTM fault helper
>>>    * @vmf: The struct vm_fault given as argument to the fault callback
>>> @@ -261,7 +178,6 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct 
>>> vm_fault *vmf,
>>>    * @num_prefault: Maximum number of prefault pages. The caller may want to
>>>    * specify this based on madvice settings and the size of the GPU object
>>>    * backed by the memory.
>>> - * @fault_page_size: The size of the fault in pages.
>>>    *
>>>    * This function inserts one or more page table entries pointing to the
>>>    * memory backing the buffer object, and then returns a return code
>>> @@ -275,8 +191,7 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
>>>    */
>>>   vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>>>                       pgprot_t prot,
>>> -                    pgoff_t num_prefault,
>>> -                    pgoff_t fault_page_size)
>>> +                    pgoff_t num_prefault)
>>>   {
>>>       struct vm_area_struct *vma = vmf->vma;
>>>       struct ttm_buffer_object *bo = vma->vm_private_data;
>>> @@ -327,11 +242,6 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>>>           prot = pgprot_decrypted(prot);
>>>       }
>>>
>>> -    /* We don't prefault on huge faults. Yet. */
>>> -    if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && fault_page_size != 1)
>>> -        return ttm_bo_vm_insert_huge(vmf, bo, page_offset,
>>> -                         fault_page_size, prot);
>>> -
>>>       /*
>>>        * Speculatively prefault a number of pages. Only error on
>>>        * first page.
>>> @@ -429,7 +339,7 @@ vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
>>>
>>>       prot = vma->vm_page_prot;
>>>       if (drm_dev_enter(ddev, &idx)) {
>>> -        ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT, 1);
>>> +        ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
>>>           drm_dev_exit(idx);
>>>       } else {
>>>           ret = ttm_bo_vm_dummy_page(vmf, prot);
>>> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
>>> index a833751099b559..858aff99a3fe53 100644
>>> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
>>> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
>>> @@ -1550,10 +1550,6 @@ void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
>>>               pgoff_t start, pgoff_t end);
>>>   vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
>>>   vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
>>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> -vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
>>> -                enum page_entry_size pe_size);
>>> -#endif
>>>
>>>   /* Transparent hugepage support - vmwgfx_thp.c */
>>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
>>> index e5a9a5cbd01a7c..922317d1acc8a0 100644
>>> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
>>> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
>>> @@ -477,7 +477,7 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
>>>       else
>>>           prot = vm_get_page_prot(vma->vm_flags);
>>>
>>> -    ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, 1);
>>> +    ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
>>>       if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
>>>           return ret;
>>>
>>> @@ -486,73 +486,3 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
>>>
>>>       return ret;
>>>   }
>>> -
>>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> -vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
>>> -                enum page_entry_size pe_size)
>>> -{
>>> -    struct vm_area_struct *vma = vmf->vma;
>>> -    struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
>>> -        vma->vm_private_data;
>>> -    struct vmw_buffer_object *vbo =
>>> -        container_of(bo, struct vmw_buffer_object, base);
>>> -    pgprot_t prot;
>>> -    vm_fault_t ret;
>>> -    pgoff_t fault_page_size;
>>> -    bool write = vmf->flags & FAULT_FLAG_WRITE;
>>> -
>>> -    switch (pe_size) {
>>> -    case PE_SIZE_PMD:
>>> -        fault_page_size = HPAGE_PMD_SIZE >> PAGE_SHIFT;
>>> -        break;
>>> -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>>> -    case PE_SIZE_PUD:
>>> -        fault_page_size = HPAGE_PUD_SIZE >> PAGE_SHIFT;
>>> -        break;
>>> -#endif
>>> -    default:
>>> -        WARN_ON_ONCE(1);
>>> -        return VM_FAULT_FALLBACK;
>>> -    }
>>> -
>>> -    /* Always do write dirty-tracking and COW on PTE level. */
>>> -    if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping(vma->vm_flags)))
>>> -        return VM_FAULT_FALLBACK;
>>> -
>>> -    ret = ttm_bo_vm_reserve(bo, vmf);
>>> -    if (ret)
>>> -        return ret;
>>> -
>>> -    if (vbo->dirty) {
>>> -        pgoff_t allowed_prefault;
>>> -        unsigned long page_offset;
>>> -
>>> -        page_offset = vmf->pgoff -
>>> -            drm_vma_node_start(&bo->base.vma_node);
>>> -        if (page_offset >= bo->resource->num_pages ||
>>> -            vmw_resources_clean(vbo, page_offset,
>>> -                    page_offset + PAGE_SIZE,
>>> -                    &allowed_prefault)) {
>>> -            ret = VM_FAULT_SIGBUS;
>>> -            goto out_unlock;
>>> -        }
>>> -
>>> -        /*
>>> -         * Write protect, so we get a new fault on write, and can
>>> -         * split.
>>> -         */
>>> -        prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
>>> -    } else {
>>> -        prot = vm_get_page_prot(vma->vm_flags);
>>> -    }
>>> -
>>> -    ret = ttm_bo_vm_fault_reserved(vmf, prot, 1, fault_page_size);
>>> -    if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
>>> -        return ret;
>>> -
>>> -out_unlock:
>>> -    dma_resv_unlock(bo->base.resv);
>>> -
>>> -    return ret;
>>> -}
>>> -#endif
>>> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
>>> index e6b1f98ec99f09..0a4c340252ec4a 100644
>>> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
>>> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
>>> @@ -61,9 +61,6 @@ int vmw_mmap(struct file *filp, struct vm_area_struct *vma)
>>>           .fault = vmw_bo_vm_fault,
>>>           .open = ttm_bo_vm_open,
>>>           .close = ttm_bo_vm_close,
>>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> -        .huge_fault = vmw_bo_vm_huge_fault,
>>> -#endif
>>>       };
>>>       struct drm_file *file_priv = filp->private_data;
>>>       struct vmw_private *dev_priv = vmw_priv(file_priv->minor->dev);
>>> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
>>> index f681bbdbc6982e..36f7eb9d066395 100644
>>> --- a/include/drm/ttm/ttm_bo_api.h
>>> +++ b/include/drm/ttm/ttm_bo_api.h
>>> @@ -594,8 +594,7 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
>>>
>>>   vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>>>                       pgprot_t prot,
>>> -                    pgoff_t num_prefault,
>>> -                    pgoff_t fault_page_size);
>>> +                    pgoff_t num_prefault);
>>>
>>>   vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf);
>>>
>>> base-commit: 519d81956ee277b4419c723adfb154603c2565ba
>

* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-20  6:34 ` Thomas Hellström
  2021-10-20  6:41   ` Christian König
@ 2021-10-20 14:09   ` Jason Gunthorpe
  2021-10-21 11:40     ` Daniel Vetter
  1 sibling, 1 reply; 15+ messages in thread
From: Jason Gunthorpe @ 2021-10-20 14:09 UTC (permalink / raw)
  To: Thomas Hellström
  Cc: David Airlie, Christian Koenig, Daniel Vetter, dri-devel,
	Huang Rui, Dan Williams, Ralph Campbell, Roland Scheidegger

On Wed, Oct 20, 2021 at 08:34:33AM +0200, Thomas Hellström wrote:

> Follow-up question: If we resurrect this in the proper way (and in that case
> only for x86_64), is there something we need to pay particular attention to
> WRT the ZONE_DEVICE refcounting fix you mention above?

Similar to PTE, it should be completely separated from ZONE_DEVICE.

Seeing the special bit set at any level should trigger all page table
walkers to never try to get a struct page.

Today some of the page table walkers are trying to do this with
vma_is_special(); all of those should end up using the Pxx_SPECIAL
test instead.
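
To make that concrete, here is a small userspace model of such a gate.
Everything in it is a stand-in: pmd_special() and the PMD_SPECIAL bit are
hypothetical (no such helper or bit exists in mainline at this point), and
the bit positions merely mimic x86's _PAGE_PRESENT and _PAGE_PSE.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_PRESENT (1ULL << 0)   /* mimics x86 _PAGE_PRESENT */
#define PMD_PSE     (1ULL << 7)   /* mimics x86 _PAGE_PSE ("huge") */
#define PMD_SPECIAL (1ULL << 9)   /* hypothetical Pxx_SPECIAL bit */

/* Hypothetical per-entry test, the PMD analogue of pte_special(). */
static bool pmd_special(uint64_t pmd)
{
	return pmd & PMD_SPECIAL;
}

/*
 * What a lockless walker in the style of GUP_fast would do per huge
 * entry: it has no VMA to consult, so the decision must come from the
 * entry itself.
 */
static bool gup_fast_may_take_ref(uint64_t pmd)
{
	if (!(pmd & PMD_PRESENT) || !(pmd & PMD_PSE))
		return false;             /* not a present huge entry */
	if (pmd_special(pmd))
		return false;             /* no struct page behind it */
	return true;
}

int main(void)
{
	uint64_t iomem_pmd = PMD_PRESENT | PMD_PSE | PMD_SPECIAL;
	uint64_t ram_pmd   = PMD_PRESENT | PMD_PSE;

	printf("iomem entry: may ref? %d\n", gup_fast_may_take_ref(iomem_pmd));
	printf("ram entry:   may ref? %d\n", gup_fast_may_take_ref(ram_pmd));
	return 0;
}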

Jason

* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-20  6:41   ` Christian König
  2021-10-20  6:52     ` Thomas Hellström
@ 2021-10-20 19:37     ` Jason Gunthorpe
  2021-10-21  7:04       ` Christian König
  2021-10-21 11:41       ` Daniel Vetter
  1 sibling, 2 replies; 15+ messages in thread
From: Jason Gunthorpe @ 2021-10-20 19:37 UTC (permalink / raw)
  To: Christian König
  Cc: Thomas Hellström, David Airlie, Daniel Vetter, dri-devel,
	Huang Rui, Dan Williams, Ralph Campbell, Roland Scheidegger

On Wed, Oct 20, 2021 at 08:41:24AM +0200, Christian König wrote:

> > I think the patch subject needs updating to reflect that we're disabling
> > PUD/PMDs completely.
> > With that fixed,

Everyone is OK with this?

drm/ttm: remove ttm_bo_vm_insert_huge()

The huge page functionality in TTM does not work safely because PUD and
PMD entries do not have a special bit.

get_user_pages_fast() considers any page that passed pmd_huge() as
usable:

	if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
		     pmd_devmap(pmd))) {

And vmf_insert_pfn_pmd_prot() unconditionally sets

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));

eg on x86 the page will be _PAGE_PRESENT | PAGE_PSE.

As such gup_huge_pmd() will try to deref a struct page:

	head = try_grab_compound_head(pmd_page(orig), refs, flags);

and thus crash.

So, iomem cannot be installed using vmf_insert_pfn_pud/pmd_prot().

Thomas further notices that the drivers are not expecting the struct page
to be used by anything - in particular the refcount incr above will cause
them to malfunction. This means even the struct page memory cannot be
used.

Therefore everything about this is not able to fully work correctly
considering GUP_fast. Delete it entirely. It can return someday along with
a proper PMD/PUD_SPECIAL bit in the page table itself to gate GUP_fast.

Fixes: 314b6580adc5 ("drm/ttm, drm/vmwgfx: Support huge TTM pagefaults")
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
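
As an illustration of why the sequence above ends in a crash, the following
userspace sketch models it. toy_pmd_page() mirrors only the shape of the
kernel's unconditional pfn-to-memmap arithmetic; the base address, struct
size, and physical address below are all made up.

#include <stdint.h>
#include <stdio.h>

#define PMD_PRESENT (1ULL << 0)   /* mimics x86 _PAGE_PRESENT */
#define PMD_PSE     (1ULL << 7)   /* mimics x86 _PAGE_PSE */

struct page;                       /* opaque, as in the kernel */

#define TOY_MEMMAP_BASE    0xffff000000000000ULL  /* made-up vmemmap base */
#define TOY_PAGE_STRUCT_SZ 64                     /* made-up sizeof(struct page) */

/* Toy pmd_page(): like the kernel, it computes a struct page address
 * from the PFN with no check that a struct page actually exists. */
static struct page *toy_pmd_page(uint64_t pmd)
{
	uint64_t pfn = (pmd & ~0xfffULL) >> 12;

	return (struct page *)(TOY_MEMMAP_BASE + pfn * TOY_PAGE_STRUCT_SZ);
}

int main(void)
{
	/* A huge entry pointing at device iomem (made-up physical
	 * address): present and huge, but with no memmap behind the PFN. */
	uint64_t pmd = 0xfd00000000ULL | PMD_PRESENT | PMD_PSE;

	/* The pmd_huge()-style check passes, so a GUP_fast-style walker
	 * would go on to grab a reference through this pointer... */
	if ((pmd & PMD_PRESENT) && (pmd & PMD_PSE)) {
		struct page *head = toy_pmd_page(pmd);

		/* ...which dereferences memory that was never a struct
		 * page - in the kernel, that is the crash. */
		printf("would try_grab_compound_head(%p)\n", (void *)head);
	}
	return 0;
}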

* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-20 19:37     ` Jason Gunthorpe
@ 2021-10-21  7:04       ` Christian König
  2021-10-21  7:05         ` Thomas Hellström
  2021-10-21 11:41       ` Daniel Vetter
  1 sibling, 1 reply; 15+ messages in thread
From: Christian König @ 2021-10-21  7:04 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Thomas Hellström, David Airlie, Daniel Vetter, dri-devel,
	Huang Rui, Dan Williams, Ralph Campbell, Roland Scheidegger

Works for me.

On 20.10.21 21:37, Jason Gunthorpe wrote:
> On Wed, Oct 20, 2021 at 08:41:24AM +0200, Christian König wrote:
>
>>> I think the patch subject needs updating to reflect that we're disabling
>>> PUD/PMDs completely.
>>> With that fixed,
> Everyone is OK with this?
>
> drm/ttm: remove ttm_bo_vm_insert_huge()
>
> The huge page functionality in TTM does not work safely because PUD and
> PMD entries do not have a special bit.
>
> get_user_pages_fast() considers any page that passed pmd_huge() as
> usable:
>
> 	if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
> 		     pmd_devmap(pmd))) {
>
> And vmf_insert_pfn_pmd_prot() unconditionally sets
>
> 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
>
> eg on x86 the page will be _PAGE_PRESENT | PAGE_PSE.
>
> As such gup_huge_pmd() will try to deref a struct page:
>
> 	head = try_grab_compound_head(pmd_page(orig), refs, flags);
>
> and thus crash.
>
> So, iomem cannot be installed using vmf_insert_pfn_pud/pmd_prot().
>
> Thomas further notices that the drivers are not expecting the struct page
> to be used by anything - in particular the refcount incr above will cause
> them to malfunction. This means even the struct page memory cannot be
> used.
>
> Therefore everything about this is not able to fully work correctly
> considering GUP_fast. Delete it entirely. It can return someday along with
> a proper PMD/PUD_SPECIAL bit in the page table itself to gate GUP_fast.
>
> Fixes: 314b6580adc5 ("drm/ttm, drm/vmwgfx: Support huge TTM pagefaults")
> Reviewed-by: Christian König <christian.koenig@amd.com>
> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>


* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-21  7:04       ` Christian König
@ 2021-10-21  7:05         ` Thomas Hellström
  0 siblings, 0 replies; 15+ messages in thread
From: Thomas Hellström @ 2021-10-21  7:05 UTC (permalink / raw)
  To: Christian König, Jason Gunthorpe
  Cc: David Airlie, Daniel Vetter, dri-devel, Huang Rui, Dan Williams,
	Ralph Campbell, Roland Scheidegger

On Thu, 2021-10-21 at 09:04 +0200, Christian König wrote:
> Works for me.

+1
/Thomas


> 
> On 20.10.21 21:37, Jason Gunthorpe wrote:
> > On Wed, Oct 20, 2021 at 08:41:24AM +0200, Christian König wrote:
> > 
> > > > I think the patch subject needs updating to reflect that we're disabling
> > > > PUD/PMDs completely.
> > > > With that fixed,
> > Everyone is OK with this?
> > 
> > drm/ttm: remove ttm_bo_vm_insert_huge()
> > 
> > The huge page functionality in TTM does not work safely because PUD and
> > PMD entries do not have a special bit.
> > 
> > get_user_pages_fast() considers any page that passed pmd_huge() as
> > usable:
> > 
> >         if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
> >                      pmd_devmap(pmd))) {
> > 
> > And vmf_insert_pfn_pmd_prot() unconditionally sets
> > 
> >         entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
> > 
> > eg on x86 the page will be _PAGE_PRESENT | PAGE_PSE.
> > 
> > As such gup_huge_pmd() will try to deref a struct page:
> > 
> >         head = try_grab_compound_head(pmd_page(orig), refs, flags);
> > 
> > and thus crash.
> > 
> > So, iomem cannot be installed using vmf_insert_pfn_pud/pmd_prot().
> > 
> > Thomas further notices that the drivers are not expecting the struct page
> > to be used by anything - in particular the refcount incr above will cause
> > them to malfunction. This means even the struct page memory cannot be
> > used.
> > 
> > Therefore everything about this is not able to fully work correctly
> > considering GUP_fast. Delete it entirely. It can return someday along with
> > a proper PMD/PUD_SPECIAL bit in the page table itself to gate GUP_fast.
> > 
> > Fixes: 314b6580adc5 ("drm/ttm, drm/vmwgfx: Support huge TTM pagefaults")
> > Reviewed-by: Christian König <christian.koenig@amd.com>
> > Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> 



* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-20 14:09   ` Jason Gunthorpe
@ 2021-10-21 11:40     ` Daniel Vetter
  2021-10-22 19:00       ` Jason Gunthorpe
  0 siblings, 1 reply; 15+ messages in thread
From: Daniel Vetter @ 2021-10-21 11:40 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Thomas Hellström, David Airlie, Christian Koenig,
	Daniel Vetter, dri-devel, Huang Rui, Dan Williams,
	Ralph Campbell, Roland Scheidegger

On Wed, Oct 20, 2021 at 11:09:58AM -0300, Jason Gunthorpe wrote:
> On Wed, Oct 20, 2021 at 08:34:33AM +0200, Thomas Hellström wrote:
> 
> > Follow-up question: If we resurrect this in the proper way (and in that case
> > only for x86_64), is there something we need to pay particular attention to
> > WRT the ZONE_DEVICE refcounting fix you mention above?
> 
> > Similar to PTE, it should be completely separated from ZONE_DEVICE.
> 
> Seeing the special bit set at any level should trigger all page table
> walkers to never try to get a struct page.
> 
> Today some of the page table walkers are trying to do this with
> vma_is_special(); all of those should end up using the Pxx_SPECIAL
> test instead.

My understanding is that vma_is_special is for platforms which don't have
pte_special, and hence can't do gup_fast. At least I was assuming this is
why vma_is_special is a thing, and why some architectures cannot do
gup_fast.

I think for pmd and higher that approach isn't feasible, so we'll
probably have to disable VM_PFNMAP hugepages if the arch doesn't support
that.
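
One plausible shape for that gate, sketched with a hypothetical Kconfig
symbol (CONFIG_ARCH_HAS_PMD_SPECIAL does not exist; it stands in for
"this arch has a special bit at the PMD level"):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical arch capability gate: only offer huge PFNMAP faults
 * when the architecture can mark a PMD-level entry as special. */
static bool huge_pfnmap_supported(void)
{
#ifdef CONFIG_ARCH_HAS_PMD_SPECIAL     /* hypothetical Kconfig symbol */
	return true;
#else
	return false;                  /* fall back to PTE-sized inserts */
#endif
}

int main(void)
{
	printf("huge PFNMAP faults supported: %d\n", huge_pfnmap_supported());
	return 0;
}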
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-20 19:37     ` Jason Gunthorpe
  2021-10-21  7:04       ` Christian König
@ 2021-10-21 11:41       ` Daniel Vetter
  2021-10-22 18:57         ` Jason Gunthorpe
  1 sibling, 1 reply; 15+ messages in thread
From: Daniel Vetter @ 2021-10-21 11:41 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Christian König, Thomas Hellström, David Airlie,
	Daniel Vetter, dri-devel, Huang Rui, Dan Williams,
	Ralph Campbell, Roland Scheidegger

On Wed, Oct 20, 2021 at 04:37:02PM -0300, Jason Gunthorpe wrote:
> On Wed, Oct 20, 2021 at 08:41:24AM +0200, Christian König wrote:
> 
> > > I think the patch subject needs updating to reflect that we're disabling
> > > PUD/PMDs completely.
> > > With that fixed,
> 
> Everyone is OK with this?
> 
> drm/ttm: remove ttm_bo_vm_insert_huge()
> 
> The huge page functionality in TTM does not work safely because PUD and
> PMD entries do not have a special bit.
> 
> get_user_pages_fast() considers any page that passed pmd_huge() as
> usable:
> 
> 	if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
> 		     pmd_devmap(pmd))) {
> 
> And vmf_insert_pfn_pmd_prot() unconditionally sets
> 
> 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
> 
> eg on x86 the page will be _PAGE_PRESENT | PAGE_PSE.
> 
> As such gup_huge_pmd() will try to deref a struct page:
> 
> 	head = try_grab_compound_head(pmd_page(orig), refs, flags);
> 
> and thus crash.
> 
> So, iomem cannot be installed using vmf_insert_pfn_pud/pmd_prot().
> 
> Thomas further notices that the drivers are not expecting the struct page
> to be used by anything - in particular the refcount incr above will cause
> them to malfunction. This means even the struct page memory cannot be
> used.
> 
> Therefore everything about this is not able to fully work correctly
> considering GUP_fast. Delete it entirely. It can return someday along with
> a proper PMD/PUD_SPECIAL bit in the page table itself to gate GUP_fast.
> 
> Fixes: 314b6580adc5 ("drm/ttm, drm/vmwgfx: Support huge TTM pagefaults")
> Reviewed-by: Christian König <christian.koenig@amd.com>
> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>

I think we also want cc: stable here.

Do you plan to land this through some dedicated pull for -rc? I think that
makes sense to highlight it, but I can also smash this into some
drm-fixes.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-21 11:41       ` Daniel Vetter
@ 2021-10-22 18:57         ` Jason Gunthorpe
  2021-10-28 15:14           ` Daniel Vetter
  0 siblings, 1 reply; 15+ messages in thread
From: Jason Gunthorpe @ 2021-10-22 18:57 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Christian König, Thomas Hellström, David Airlie,
	dri-devel, Huang Rui, Dan Williams, Ralph Campbell,
	Roland Scheidegger

On Thu, Oct 21, 2021 at 01:41:39PM +0200, Daniel Vetter wrote:
> On Wed, Oct 20, 2021 at 04:37:02PM -0300, Jason Gunthorpe wrote:
> > On Wed, Oct 20, 2021 at 08:41:24AM +0200, Christian König wrote:
> > 
> > > > I think the patch subject needs updating to reflect that we're disabling
> > > > PUD/PMDs completely.
> > > > With that fixed,
> > 
> > Everyone is OK with this?
> > 
> > drm/ttm: remove ttm_bo_vm_insert_huge()
> > 
> > The huge page functionality in TTM does not work safely because PUD and
> > PMD entries do not have a special bit.
> > 
> > get_user_pages_fast() considers any page that passed pmd_huge() as
> > usable:
> > 
> > 	if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
> > 		     pmd_devmap(pmd))) {
> > 
> > And vmf_insert_pfn_pmd_prot() unconditionally sets
> > 
> > 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
> > 
> > eg on x86 the page will be _PAGE_PRESENT | PAGE_PSE.
> > 
> > As such gup_huge_pmd() will try to deref a struct page:
> > 
> > 	head = try_grab_compound_head(pmd_page(orig), refs, flags);
> > 
> > and thus crash.
> > 
> > So, iomem cannot be installed using vmf_insert_pfn_pud/pmd_prot().
> > 
> > Thomas further notices that the drivers are not expecting the struct page
> > to be used by anything - in particular the refcount incr above will cause
> > them to malfunction. This means even the struct page memory cannot be
> > used.
> > 
> > Therefore everything about this is not able to fully work correctly
> > considering GUP_fast. Delete it entirely. It can return someday along with
> > a proper PMD/PUD_SPECIAL bit in the page table itself to gate GUP_fast.
> > 
> > Fixes: 314b6580adc5 ("drm/ttm, drm/vmwgfx: Support huge TTM pagefaults")
> > Reviewed-by: Christian König <christian.koenig@amd.com>
> > Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> 
> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> 
> I think we also want cc: stable here.

Ok
 
> Do you plan to land this through some dedicated pull for -rc? I think that
> makes sense to highlight it, but I can also smash this into some
> drm-fixes.

I was hoping you'd take it. Do you want a v3?

Thanks,
Jason

* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-21 11:40     ` Daniel Vetter
@ 2021-10-22 19:00       ` Jason Gunthorpe
  0 siblings, 0 replies; 15+ messages in thread
From: Jason Gunthorpe @ 2021-10-22 19:00 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Thomas Hellström, David Airlie, Christian Koenig, dri-devel,
	Huang Rui, Dan Williams, Ralph Campbell, Roland Scheidegger

On Thu, Oct 21, 2021 at 01:40:30PM +0200, Daniel Vetter wrote:
> On Wed, Oct 20, 2021 at 11:09:58AM -0300, Jason Gunthorpe wrote:
> > On Wed, Oct 20, 2021 at 08:34:33AM +0200, Thomas Hellström wrote:
> > 
> > > Follow-up question: If we resurrect this in the proper way (and in that case
> > > only for x86_64), is there something we need to pay particular attention to
> > > WRT the ZONE_DEVICE refcounting fix you mention above?
> > 
> > Similar to PTE, it should be completely separated from ZONE_DEVICE.
> > 
> > Seeing the special bit set at any level should trigger all page table
> > walkers to never try to get a struct page.
> > 
> > Today some of the page table walkers are trying to do this with
> > vma_is_special(); all of those should end up using the Pxx_SPECIAL
> > test instead.
> 
> My understanding is that vma_is_special is for platforms which don't have
> pte_special, and hence can't do gup_fast. At least I was assuming this is
> why vma_is_special is a thing, and why some architectures cannot do
> gup_fast.

AFAIK that is what vm_normal_page() is for, as the rules for MIXED are
complicated.

Once the semi-bogus usages in the PUD/PMD page table walkers are
removed, the only remaining users are in arch code around the vdso, and I
haven't tried to figure that out.
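
For reference, a heavily simplified model of the vm_normal_page() decision
tree; the real code in mm/memory.c handles many more cases, and every type
below is a toy stand-in.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct page;                            /* opaque, as in the kernel */

struct toy_vma { bool mixedmap; };      /* stand-in for VM_MIXEDMAP */
struct toy_pte {
	bool special;                   /* stand-in for pte_special() */
	bool pfn_has_memmap;            /* stand-in for pfn_valid() */
	unsigned long pfn;
};

/* Toy vm_normal_page(): returns the struct page behind a PTE, or NULL
 * when there is none to take a reference on. */
static struct page *toy_vm_normal_page(const struct toy_vma *vma,
				       const struct toy_pte *pte)
{
	/* With an arch special bit, the entry itself says "no normal
	 * struct page here" - no VMA heuristics needed. */
	if (pte->special)
		return NULL;

	/* Without it, the kernel falls back to VMA-level rules; MIXED is
	 * the complicated case because PTEs in the same VMA may or may
	 * not have a struct page behind them. */
	if (vma->mixedmap && !pte->pfn_has_memmap)
		return NULL;

	return (struct page *)(0x1000UL + pte->pfn); /* toy memmap lookup */
}

int main(void)
{
	struct toy_vma vma = { .mixedmap = true };
	struct toy_pte io  = { .special = false, .pfn_has_memmap = false };

	printf("iomem pte in MIXED vma -> %p\n",
	       (void *)toy_vm_normal_page(&vma, &io));
	return 0;
}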

Jason

* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-22 18:57         ` Jason Gunthorpe
@ 2021-10-28 15:14           ` Daniel Vetter
  2021-10-28 19:42             ` Jason Gunthorpe
  0 siblings, 1 reply; 15+ messages in thread
From: Daniel Vetter @ 2021-10-28 15:14 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Daniel Vetter, Christian König, Thomas Hellström,
	David Airlie, dri-devel, Huang Rui, Dan Williams, Ralph Campbell,
	Roland Scheidegger

On Fri, Oct 22, 2021 at 03:57:42PM -0300, Jason Gunthorpe wrote:
> On Thu, Oct 21, 2021 at 01:41:39PM +0200, Daniel Vetter wrote:
> > On Wed, Oct 20, 2021 at 04:37:02PM -0300, Jason Gunthorpe wrote:
> > > On Wed, Oct 20, 2021 at 08:41:24AM +0200, Christian König wrote:
> > > 
> > > > > I think the patch subject needs updating to reflect that we're disabling
> > > > > PUD/PMDs completely.
> > > > > With that fixed,
> > > 
> > > Everyone is OK with this?
> > > 
> > > drm/ttm: remove ttm_bo_vm_insert_huge()
> > > 
> > > The huge page functionality in TTM does not work safely because PUD and
> > > PMD entries do not have a special bit.
> > > 
> > > get_user_pages_fast() considers any page that passed pmd_huge() as
> > > usable:
> > > 
> > > 	if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
> > > 		     pmd_devmap(pmd))) {
> > > 
> > > And vmf_insert_pfn_pmd_prot() unconditionally sets
> > > 
> > > 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
> > > 
> > > eg on x86 the page will be _PAGE_PRESENT | PAGE_PSE.
> > > 
> > > As such gup_huge_pmd() will try to deref a struct page:
> > > 
> > > 	head = try_grab_compound_head(pmd_page(orig), refs, flags);
> > > 
> > > and thus crash.
> > > 
> > > So, iomem cannot be installed using vmf_insert_pfn_pud/pmd_prot().
> > > 
> > > Thomas further notices that the drivers are not expecting the struct page
> > > to be used by anything - in particular the refcount incr above will cause
> > > them to malfunction. This means even the struct page memory cannot be
> > > used.
> > > 
> > > Therefore everything about this is not able to fully work correctly
> > > considering GUP_fast. Delete it entirely. It can return someday along with
> > > a proper PMD/PUD_SPECIAL bit in the page table itself to gate GUP_fast.
> > > 
> > > Fixes: 314b6580adc5 ("drm/ttm, drm/vmwgfx: Support huge TTM pagefaults")
> > > Reviewed-by: Christian König <christian.koenig@amd.com>
> > > Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > 
> > Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> > 
> > I think we also want cc: stable here.
> 
> Ok
>  
> > Do you plan to land this through some dedicated pull for -rc? I think that
> > makes sense to highlight it, but I can also smash this into some
> > drm-fixes.
> 
> I was hoping you'd take it. Do you want a v3?

Hm, totally lost this; I'm trying not to be too responsible for mm changes
since they scare me :-) Plus dropping this very late in the release feels a
bit risky.

Ok if I stuff this into drm-next instead?
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-28 15:14           ` Daniel Vetter
@ 2021-10-28 19:42             ` Jason Gunthorpe
  2021-11-05 10:12               ` Daniel Vetter
  0 siblings, 1 reply; 15+ messages in thread
From: Jason Gunthorpe @ 2021-10-28 19:42 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Christian König, Thomas Hellström, David Airlie,
	dri-devel, Huang Rui, Dan Williams, Ralph Campbell,
	Roland Scheidegger

On Thu, Oct 28, 2021 at 05:14:52PM +0200, Daniel Vetter wrote:
> Hm, totally lost this; I'm trying not to be too responsible for mm changes
> since they scare me :-) Plus dropping this very late in the release feels a
> bit risky.
> 
> Ok if I stuff this into drm-next instead?

Sure

Jason

* Re: [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs
  2021-10-28 19:42             ` Jason Gunthorpe
@ 2021-11-05 10:12               ` Daniel Vetter
  0 siblings, 0 replies; 15+ messages in thread
From: Daniel Vetter @ 2021-11-05 10:12 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Thomas Hellström, Ralph Campbell, David Airlie,
	Roland Scheidegger, dri-devel, Huang Rui, Dan Williams,
	Christian König

On Thu, Oct 28, 2021 at 04:42:56PM -0300, Jason Gunthorpe wrote:
> On Thu, Oct 28, 2021 at 05:14:52PM +0200, Daniel Vetter wrote:
> > Hm totally lost this, I'm trying to not be too responsible for mm changes
> > since it scares me :-) Plus dropping this very late in the release feels a
> > bit risky.
> > 
> > Ok if I stuff this into drm-next instead?
> 
> Sure

Finally got around to this; it should show up in the merge window. Apologies
for being everywhere except on dri-devel.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

Thread overview: 15+ messages
2021-10-19 23:27 [PATCH v2] drm/ttm: Do not put non-struct page memory into PUD/PMDs Jason Gunthorpe
2021-10-20  6:34 ` Thomas Hellström
2021-10-20  6:41   ` Christian König
2021-10-20  6:52     ` Thomas Hellström
2021-10-20 19:37     ` Jason Gunthorpe
2021-10-21  7:04       ` Christian König
2021-10-21  7:05         ` Thomas Hellström
2021-10-21 11:41       ` Daniel Vetter
2021-10-22 18:57         ` Jason Gunthorpe
2021-10-28 15:14           ` Daniel Vetter
2021-10-28 19:42             ` Jason Gunthorpe
2021-11-05 10:12               ` Daniel Vetter
2021-10-20 14:09   ` Jason Gunthorpe
2021-10-21 11:40     ` Daniel Vetter
2021-10-22 19:00       ` Jason Gunthorpe
