amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour
@ 2024-04-17  6:21 Arunpravin Paneer Selvam
  2024-04-17  6:49 ` Christian König
  0 siblings, 1 reply; 6+ messages in thread
From: Arunpravin Paneer Selvam @ 2024-04-17  6:21 UTC (permalink / raw)
  To: amd-gfx
  Cc: christian.koenig, alexander.deucher, felix.kuehling, philip.yang,
	Arunpravin Paneer Selvam

Now we have two flags for contiguous VRAM buffer allocation.
If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. Say if the
  allocation fails, we fallback to allocate the individual pages.

When we setTTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
  and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously.
  the allocation fails, we return -ENOSPC.

v2:
  - keep the mem_flags and bo->flags check as is(Christian)
  - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
    amdgpu_bo_pin_restricted function placement range iteration
    loop(Christian)
  - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
    (Christian)
  - Keep the kernel BO allocation as is(Christain)
  - If BO pin vram allocation failed, we need to return -ENOSPC as
    RDMA cannot work with scattered VRAM pages(Philip)

Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com>
Suggested-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++++++++++++++-----
 2 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..caaef7b1df49 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
 		else
 			places[c].flags |= TTM_PL_FLAG_TOPDOWN;
 
-		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
+		if (abo->tbo.type == ttm_bo_type_kernel &&
+		    flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
 			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+
 		c++;
 	}
 
@@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
 		if (!bo->placements[i].lpfn ||
 		    (lpfn && lpfn < bo->placements[i].lpfn))
 			bo->placements[i].lpfn = lpfn;
+
+		if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+		    bo->placements[i].mem_type == TTM_PL_VRAM)
+			bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
 	}
 
 	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..4be8b091099a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,29 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head)
 	return size;
 }
 
+static inline unsigned long
+amdgpu_vram_mgr_calculate_pages_per_block(struct ttm_buffer_object *tbo,
+					  const struct ttm_place *place,
+					  unsigned long bo_flags)
+{
+	unsigned long pages_per_block;
+
+	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+		pages_per_block = ~0ul;
+	} else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		pages_per_block = HPAGE_PMD_NR;
+#else
+		/* default to 2MB */
+		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
+#endif
+		pages_per_block = max_t(uint32_t, pages_per_block,
+					tbo->page_alignment);
+	}
+
+	return pages_per_block;
+}
+
 /**
  * DOC: mem_info_vram_total
  *
@@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
 	struct amdgpu_device *adev = to_amdgpu_device(mgr);
 	u64 vis_usage = 0, max_bytes, min_block_size;
+	struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
 	struct amdgpu_vram_mgr_resource *vres;
 	u64 size, remaining_size, lpfn, fpfn;
+	unsigned long bo_flags = bo->flags;
 	struct drm_buddy *mm = &mgr->mm;
 	struct drm_buddy_block *block;
 	unsigned long pages_per_block;
@@ -468,18 +493,9 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	if (tbo->type != ttm_bo_type_kernel)
 		max_bytes -= AMDGPU_VM_RESERVED_VRAM;
 
-	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-		pages_per_block = ~0ul;
-	} else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		pages_per_block = HPAGE_PMD_NR;
-#else
-		/* default to 2MB */
-		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
-#endif
-		pages_per_block = max_t(uint32_t, pages_per_block,
-					tbo->page_alignment);
-	}
+	pages_per_block =
+		amdgpu_vram_mgr_calculate_pages_per_block(tbo, place,
+							  bo_flags);
 
 	vres = kzalloc(sizeof(*vres), GFP_KERNEL);
 	if (!vres)
@@ -498,7 +514,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	if (place->flags & TTM_PL_FLAG_TOPDOWN)
 		vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
 
-	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
 		vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
 
 	if (fpfn || lpfn != mgr->mm.size)
@@ -529,8 +545,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 					   min_block_size,
 					   &vres->blocks,
 					   vres->flags);
-		if (unlikely(r))
+		if (unlikely(r)) {
+			if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
+			    !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) {
+				/* Fallback to non contiguous allocation */
+				vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
+				bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+
+				pages_per_block =
+					amdgpu_vram_mgr_calculate_pages_per_block(tbo,
+										  place,
+										  bo_flags);
+				continue;
+			}
 			goto error_free_blocks;
+		}
 
 		if (size > remaining_size)
 			remaining_size = 0;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-17  6:21 [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour Arunpravin Paneer Selvam
@ 2024-04-17  6:49 ` Christian König
  2024-04-17 13:27   ` Paneer Selvam, Arunpravin
  0 siblings, 1 reply; 6+ messages in thread
From: Christian König @ 2024-04-17  6:49 UTC (permalink / raw)
  To: Arunpravin Paneer Selvam, amd-gfx
  Cc: alexander.deucher, felix.kuehling, philip.yang

Am 17.04.24 um 08:21 schrieb Arunpravin Paneer Selvam:
> Now we have two flags for contiguous VRAM buffer allocation.
> If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
> it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
> buffer's placement function.
>
> This patch will change the default behaviour of the two flags.
>
> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
> - This means contiguous is not mandatory.
> - we will try to allocate the contiguous buffer. Say if the
>    allocation fails, we fallback to allocate the individual pages.
>
> When we setTTM_PL_FLAG_CONTIGUOUS
> - This means contiguous allocation is mandatory.
> - we are setting this in amdgpu_bo_pin_restricted() before bo validation
>    and check this flag in the vram manager file.
> - if this is set, we should allocate the buffer pages contiguously.
>    the allocation fails, we return -ENOSPC.
>
> v2:
>    - keep the mem_flags and bo->flags check as is(Christian)
>    - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
>      amdgpu_bo_pin_restricted function placement range iteration
>      loop(Christian)
>    - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
>      (Christian)
>    - Keep the kernel BO allocation as is(Christain)
>    - If BO pin vram allocation failed, we need to return -ENOSPC as
>      RDMA cannot work with scattered VRAM pages(Philip)
>
> Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com>
> Suggested-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++++++++++++++-----
>   2 files changed, 50 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 8bc79924d171..caaef7b1df49 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
>   		else
>   			places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>   
> -		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
> +		if (abo->tbo.type == ttm_bo_type_kernel &&
> +		    flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>   			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
> +
>   		c++;
>   	}
>   
> @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>   		if (!bo->placements[i].lpfn ||
>   		    (lpfn && lpfn < bo->placements[i].lpfn))
>   			bo->placements[i].lpfn = lpfn;
> +
> +		if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
> +		    bo->placements[i].mem_type == TTM_PL_VRAM)
> +			bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
>   	}
>   
>   	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);

Nice work, up till here that looks exactly right as far as I can see.

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 8db880244324..4be8b091099a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -88,6 +88,29 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head)
>   	return size;
>   }
>   
> +static inline unsigned long
> +amdgpu_vram_mgr_calculate_pages_per_block(struct ttm_buffer_object *tbo,
> +					  const struct ttm_place *place,
> +					  unsigned long bo_flags)
> +{
> +	unsigned long pages_per_block;
> +
> +	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
> +		pages_per_block = ~0ul;

If I understand it correctly this here enforces the allocation of a 
contiguous buffer in the way that it says we should have only one giant 
page for the whole BO.

> +	} else {
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +		pages_per_block = HPAGE_PMD_NR;
> +#else
> +		/* default to 2MB */
> +		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
> +#endif
> +		pages_per_block = max_t(uint32_t, pages_per_block,
> +					tbo->page_alignment);
> +	}
> +
> +	return pages_per_block;
> +}
> +
>   /**
>    * DOC: mem_info_vram_total
>    *
> @@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>   	struct amdgpu_device *adev = to_amdgpu_device(mgr);
>   	u64 vis_usage = 0, max_bytes, min_block_size;
> +	struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
>   	struct amdgpu_vram_mgr_resource *vres;
>   	u64 size, remaining_size, lpfn, fpfn;
> +	unsigned long bo_flags = bo->flags;
>   	struct drm_buddy *mm = &mgr->mm;
>   	struct drm_buddy_block *block;
>   	unsigned long pages_per_block;
> @@ -468,18 +493,9 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   	if (tbo->type != ttm_bo_type_kernel)
>   		max_bytes -= AMDGPU_VM_RESERVED_VRAM;
>   
> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
> -		pages_per_block = ~0ul;
> -	} else {
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -		pages_per_block = HPAGE_PMD_NR;
> -#else
> -		/* default to 2MB */
> -		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
> -#endif
> -		pages_per_block = max_t(uint32_t, pages_per_block,
> -					tbo->page_alignment);
> -	}
> +	pages_per_block =
> +		amdgpu_vram_mgr_calculate_pages_per_block(tbo, place,
> +							  bo_flags);
>   
>   	vres = kzalloc(sizeof(*vres), GFP_KERNEL);
>   	if (!vres)
> @@ -498,7 +514,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   	if (place->flags & TTM_PL_FLAG_TOPDOWN)
>   		vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
>   
> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
> +	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>   		vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;

And this here just optimizes it in the way that it says try harder to 
make it look contiguous.

Question is if that also works with pages_per_block of 2MiB or does that 
only kick in when pages_per_block is maximal?

>   
>   	if (fpfn || lpfn != mgr->mm.size)
> @@ -529,8 +545,21 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   					   min_block_size,
>   					   &vres->blocks,
>   					   vres->flags);
> -		if (unlikely(r))
> +		if (unlikely(r)) {
> +			if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
> +			    !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) {
> +				/* Fallback to non contiguous allocation */
> +				vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
> +				bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;

Well I would say that this is a rather hacky implementation and should 
be avoided if possible.

> +
> +				pages_per_block =
> +					amdgpu_vram_mgr_calculate_pages_per_block(tbo,
> +										  place,
> +										  bo_flags);

Rather move the AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS handling out of 
amdgpu_vram_mgr_calculate_pages_per_block().

Regards,
Christian.

> +				continue;
> +			}
>   			goto error_free_blocks;
> +		}
>   
>   		if (size > remaining_size)
>   			remaining_size = 0;


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-17  6:49 ` Christian König
@ 2024-04-17 13:27   ` Paneer Selvam, Arunpravin
  2024-04-17 14:32     ` Paneer Selvam, Arunpravin
  0 siblings, 1 reply; 6+ messages in thread
From: Paneer Selvam, Arunpravin @ 2024-04-17 13:27 UTC (permalink / raw)
  To: Christian König, amd-gfx
  Cc: alexander.deucher, felix.kuehling, philip.yang

Hi Christian,

On 4/17/2024 12:19 PM, Christian König wrote:
> Am 17.04.24 um 08:21 schrieb Arunpravin Paneer Selvam:
>> Now we have two flags for contiguous VRAM buffer allocation.
>> If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
>> it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
>> buffer's placement function.
>>
>> This patch will change the default behaviour of the two flags.
>>
>> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
>> - This means contiguous is not mandatory.
>> - we will try to allocate the contiguous buffer. Say if the
>>    allocation fails, we fallback to allocate the individual pages.
>>
>> When we setTTM_PL_FLAG_CONTIGUOUS
>> - This means contiguous allocation is mandatory.
>> - we are setting this in amdgpu_bo_pin_restricted() before bo validation
>>    and check this flag in the vram manager file.
>> - if this is set, we should allocate the buffer pages contiguously.
>>    the allocation fails, we return -ENOSPC.
>>
>> v2:
>>    - keep the mem_flags and bo->flags check as is(Christian)
>>    - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
>>      amdgpu_bo_pin_restricted function placement range iteration
>>      loop(Christian)
>>    - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
>>      (Christian)
>>    - Keep the kernel BO allocation as is(Christain)
>>    - If BO pin vram allocation failed, we need to return -ENOSPC as
>>      RDMA cannot work with scattered VRAM pages(Philip)
>>
>> Signed-off-by: Arunpravin Paneer Selvam 
>> <Arunpravin.PaneerSelvam@amd.com>
>> Suggested-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++++++++++++++-----
>>   2 files changed, 50 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index 8bc79924d171..caaef7b1df49 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct 
>> amdgpu_bo *abo, u32 domain)
>>           else
>>               places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>>   -        if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>> +        if (abo->tbo.type == ttm_bo_type_kernel &&
>> +            flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>               places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>> +
>>           c++;
>>       }
>>   @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
>> *bo, u32 domain,
>>           if (!bo->placements[i].lpfn ||
>>               (lpfn && lpfn < bo->placements[i].lpfn))
>>               bo->placements[i].lpfn = lpfn;
>> +
>> +        if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>> +            bo->placements[i].mem_type == TTM_PL_VRAM)
>> +            bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>       }
>>         r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>
> Nice work, up till here that looks exactly right as far as I can see.
>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 8db880244324..4be8b091099a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -88,6 +88,29 @@ static inline u64 
>> amdgpu_vram_mgr_blocks_size(struct list_head *head)
>>       return size;
>>   }
>>   +static inline unsigned long
>> +amdgpu_vram_mgr_calculate_pages_per_block(struct ttm_buffer_object 
>> *tbo,
>> +                      const struct ttm_place *place,
>> +                      unsigned long bo_flags)
>> +{
>> +    unsigned long pages_per_block;
>> +
>> +    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
>> +        pages_per_block = ~0ul;
>
> If I understand it correctly this here enforces the allocation of a 
> contiguous buffer in the way that it says we should have only one 
> giant page for the whole BO.
yes, for contiguous we don't have the 2MB limit, hence we are setting to 
maximum to avoid restricting the min_block_size to 2MB limitation.
>
>> +    } else {
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +        pages_per_block = HPAGE_PMD_NR;
>> +#else
>> +        /* default to 2MB */
>> +        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>> +#endif
>> +        pages_per_block = max_t(uint32_t, pages_per_block,
>> +                    tbo->page_alignment);
>> +    }
>> +
>> +    return pages_per_block;
>> +}
>> +
>>   /**
>>    * DOC: mem_info_vram_total
>>    *
>> @@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct 
>> ttm_resource_manager *man,
>>       struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>       struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>       u64 vis_usage = 0, max_bytes, min_block_size;
>> +    struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
>>       struct amdgpu_vram_mgr_resource *vres;
>>       u64 size, remaining_size, lpfn, fpfn;
>> +    unsigned long bo_flags = bo->flags;
>>       struct drm_buddy *mm = &mgr->mm;
>>       struct drm_buddy_block *block;
>>       unsigned long pages_per_block;
>> @@ -468,18 +493,9 @@ static int amdgpu_vram_mgr_new(struct 
>> ttm_resource_manager *man,
>>       if (tbo->type != ttm_bo_type_kernel)
>>           max_bytes -= AMDGPU_VM_RESERVED_VRAM;
>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>> -        pages_per_block = ~0ul;
>> -    } else {
>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -        pages_per_block = HPAGE_PMD_NR;
>> -#else
>> -        /* default to 2MB */
>> -        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>> -#endif
>> -        pages_per_block = max_t(uint32_t, pages_per_block,
>> -                    tbo->page_alignment);
>> -    }
>> +    pages_per_block =
>> +        amdgpu_vram_mgr_calculate_pages_per_block(tbo, place,
>> +                              bo_flags);
>>         vres = kzalloc(sizeof(*vres), GFP_KERNEL);
>>       if (!vres)
>> @@ -498,7 +514,7 @@ static int amdgpu_vram_mgr_new(struct 
>> ttm_resource_manager *man,
>>       if (place->flags & TTM_PL_FLAG_TOPDOWN)
>>           vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>           vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>
> And this here just optimizes it in the way that it says try harder to 
> make it look contiguous.
Here I removed the TTM_PL_FLAG_CONTIGUOUS. AFAIU, in both cases (BO 
pinning and normal contiguous request)
this flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is always set.
>
> Question is if that also works with pages_per_block of 2MiB or does 
> that only kick in when pages_per_block is maximal?
AFAIU, if this flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is set, we are 
setting the pages_per_block as maximal, thus
we dont limit the BO. when we set the pages_per_block as maximal, the 
min_block_size would be the tbo->page_alignment,
and this tbo->page_alignment would be the same value as the required 
size. Required size can be less than 2MB or more than 2MB.
Below we have check size >= pages_per_block, when the pages_per_block is 
maximal we don't limit the min_block_size.
>
>>         if (fpfn || lpfn != mgr->mm.size)
>> @@ -529,8 +545,21 @@ static int amdgpu_vram_mgr_new(struct 
>> ttm_resource_manager *man,
>>                          min_block_size,
>>                          &vres->blocks,
>>                          vres->flags);
>> -        if (unlikely(r))
>> +        if (unlikely(r)) {
>> +            if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>> +                !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) {
>> +                /* Fallback to non contiguous allocation */
>> +                vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>> +                bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>
> Well I would say that this is a rather hacky implementation and should 
> be avoided if possible.
sure, I will try to rewrite the code.
>
>> +
>> +                pages_per_block =
>> + amdgpu_vram_mgr_calculate_pages_per_block(tbo,
>> +                                          place,
>> +                                          bo_flags);
>
> Rather move the AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS handling out of 
> amdgpu_vram_mgr_calculate_pages_per_block().
sure.

Thanks,
Arun.
>
> Regards,
> Christian.
>
>> +                continue;
>> +            }
>>               goto error_free_blocks;
>> +        }
>>             if (size > remaining_size)
>>               remaining_size = 0;
>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-17 13:27   ` Paneer Selvam, Arunpravin
@ 2024-04-17 14:32     ` Paneer Selvam, Arunpravin
  2024-04-17 15:28       ` Philip Yang
  0 siblings, 1 reply; 6+ messages in thread
From: Paneer Selvam, Arunpravin @ 2024-04-17 14:32 UTC (permalink / raw)
  To: Christian König, amd-gfx
  Cc: alexander.deucher, felix.kuehling, philip.yang

Hi Christian,

On 4/17/2024 6:57 PM, Paneer Selvam, Arunpravin wrote:
> Hi Christian,
>
> On 4/17/2024 12:19 PM, Christian König wrote:
>> Am 17.04.24 um 08:21 schrieb Arunpravin Paneer Selvam:
>>> Now we have two flags for contiguous VRAM buffer allocation.
>>> If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
>>> it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
>>> buffer's placement function.
>>>
>>> This patch will change the default behaviour of the two flags.
>>>
>>> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
>>> - This means contiguous is not mandatory.
>>> - we will try to allocate the contiguous buffer. Say if the
>>>    allocation fails, we fallback to allocate the individual pages.
>>>
>>> When we setTTM_PL_FLAG_CONTIGUOUS
>>> - This means contiguous allocation is mandatory.
>>> - we are setting this in amdgpu_bo_pin_restricted() before bo 
>>> validation
>>>    and check this flag in the vram manager file.
>>> - if this is set, we should allocate the buffer pages contiguously.
>>>    the allocation fails, we return -ENOSPC.
>>>
>>> v2:
>>>    - keep the mem_flags and bo->flags check as is(Christian)
>>>    - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
>>>      amdgpu_bo_pin_restricted function placement range iteration
>>>      loop(Christian)
>>>    - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
>>>      (Christian)
>>>    - Keep the kernel BO allocation as is(Christain)
>>>    - If BO pin vram allocation failed, we need to return -ENOSPC as
>>>      RDMA cannot work with scattered VRAM pages(Philip)
>>>
>>> Signed-off-by: Arunpravin Paneer Selvam 
>>> <Arunpravin.PaneerSelvam@amd.com>
>>> Suggested-by: Christian König <christian.koenig@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 
>>> +++++++++++++++-----
>>>   2 files changed, 50 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> index 8bc79924d171..caaef7b1df49 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> @@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct 
>>> amdgpu_bo *abo, u32 domain)
>>>           else
>>>               places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>>>   -        if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>> +        if (abo->tbo.type == ttm_bo_type_kernel &&
>>> +            flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>>               places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>> +
>>>           c++;
>>>       }
>>>   @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
>>> *bo, u32 domain,
>>>           if (!bo->placements[i].lpfn ||
>>>               (lpfn && lpfn < bo->placements[i].lpfn))
>>>               bo->placements[i].lpfn = lpfn;
>>> +
>>> +        if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>>> +            bo->placements[i].mem_type == TTM_PL_VRAM)
>>> +            bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>>       }
>>>         r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>
>> Nice work, up till here that looks exactly right as far as I can see.
>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> index 8db880244324..4be8b091099a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> @@ -88,6 +88,29 @@ static inline u64 
>>> amdgpu_vram_mgr_blocks_size(struct list_head *head)
>>>       return size;
>>>   }
>>>   +static inline unsigned long
>>> +amdgpu_vram_mgr_calculate_pages_per_block(struct ttm_buffer_object 
>>> *tbo,
>>> +                      const struct ttm_place *place,
>>> +                      unsigned long bo_flags)
>>> +{
>>> +    unsigned long pages_per_block;
>>> +
>>> +    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
>>> +        pages_per_block = ~0ul;
>>
>> If I understand it correctly this here enforces the allocation of a 
>> contiguous buffer in the way that it says we should have only one 
>> giant page for the whole BO.
> yes, for contiguous we don't have the 2MB limit, hence we are setting 
> to maximum to avoid restricting the min_block_size to 2MB limitation.
>>
>>> +    } else {
>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> +        pages_per_block = HPAGE_PMD_NR;
>>> +#else
>>> +        /* default to 2MB */
>>> +        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>>> +#endif
>>> +        pages_per_block = max_t(uint32_t, pages_per_block,
>>> +                    tbo->page_alignment);
>>> +    }
>>> +
>>> +    return pages_per_block;
>>> +}
>>> +
>>>   /**
>>>    * DOC: mem_info_vram_total
>>>    *
>>> @@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct 
>>> ttm_resource_manager *man,
>>>       struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>>       struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>>       u64 vis_usage = 0, max_bytes, min_block_size;
>>> +    struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
>>>       struct amdgpu_vram_mgr_resource *vres;
>>>       u64 size, remaining_size, lpfn, fpfn;
>>> +    unsigned long bo_flags = bo->flags;
>>>       struct drm_buddy *mm = &mgr->mm;
>>>       struct drm_buddy_block *block;
>>>       unsigned long pages_per_block;
>>> @@ -468,18 +493,9 @@ static int amdgpu_vram_mgr_new(struct 
>>> ttm_resource_manager *man,
>>>       if (tbo->type != ttm_bo_type_kernel)
>>>           max_bytes -= AMDGPU_VM_RESERVED_VRAM;
>>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>>> -        pages_per_block = ~0ul;
>>> -    } else {
>>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> -        pages_per_block = HPAGE_PMD_NR;
>>> -#else
>>> -        /* default to 2MB */
>>> -        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>>> -#endif
>>> -        pages_per_block = max_t(uint32_t, pages_per_block,
>>> -                    tbo->page_alignment);
>>> -    }
>>> +    pages_per_block =
>>> +        amdgpu_vram_mgr_calculate_pages_per_block(tbo, place,
>>> +                              bo_flags);
>>>         vres = kzalloc(sizeof(*vres), GFP_KERNEL);
>>>       if (!vres)
>>> @@ -498,7 +514,7 @@ static int amdgpu_vram_mgr_new(struct 
>>> ttm_resource_manager *man,
>>>       if (place->flags & TTM_PL_FLAG_TOPDOWN)
>>>           vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
>>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>> +    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>>           vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>>
>> And this here just optimizes it in the way that it says try harder to 
>> make it look contiguous.
> Here I removed the TTM_PL_FLAG_CONTIGUOUS. AFAIU, in both cases (BO 
> pinning and normal contiguous request)
> this flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is always set.
>>
>> Question is if that also works with pages_per_block of 2MiB or does 
>> that only kick in when pages_per_block is maximal?
> AFAIU, if this flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is set, we are 
> setting the pages_per_block as maximal, thus
> we dont limit the BO. when we set the pages_per_block as maximal, the 
> min_block_size would be the tbo->page_alignment,
> and this tbo->page_alignment would be the same value as the required 
> size. Required size can be less than 2MB or more than 2MB.
> Below we have check size >= pages_per_block, when the pages_per_block 
> is maximal we don't limit the min_block_size.
a small correction, when we set the pages_per_block as maximal, we don't 
set the min_block_size, the buddy allocator will set the min_block_size
as roundup_pow_of_two(size).

Thanks,
Arun.
>>
>>>         if (fpfn || lpfn != mgr->mm.size)
>>> @@ -529,8 +545,21 @@ static int amdgpu_vram_mgr_new(struct 
>>> ttm_resource_manager *man,
>>>                          min_block_size,
>>>                          &vres->blocks,
>>>                          vres->flags);
>>> -        if (unlikely(r))
>>> +        if (unlikely(r)) {
>>> +            if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>>> +                !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) {
>>> +                /* Fallback to non contiguous allocation */
>>> +                vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>>> +                bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>
>> Well I would say that this is a rather hacky implementation and 
>> should be avoided if possible.
> sure, I will try to rewrite the code.
>>
>>> +
>>> +                pages_per_block =
>>> + amdgpu_vram_mgr_calculate_pages_per_block(tbo,
>>> +                                          place,
>>> +                                          bo_flags);
>>
>> Rather move the AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS handling out of 
>> amdgpu_vram_mgr_calculate_pages_per_block().
> sure.
>
> Thanks,
> Arun.
>>
>> Regards,
>> Christian.
>>
>>> +                continue;
>>> +            }
>>>               goto error_free_blocks;
>>> +        }
>>>             if (size > remaining_size)
>>>               remaining_size = 0;
>>
>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-17 14:32     ` Paneer Selvam, Arunpravin
@ 2024-04-17 15:28       ` Philip Yang
  2024-04-17 15:37         ` Paneer Selvam, Arunpravin
  0 siblings, 1 reply; 6+ messages in thread
From: Philip Yang @ 2024-04-17 15:28 UTC (permalink / raw)
  To: Paneer Selvam, Arunpravin, Christian König, amd-gfx
  Cc: alexander.deucher, felix.kuehling, philip.yang

[-- Attachment #1: Type: text/html, Size: 20238 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-17 15:28       ` Philip Yang
@ 2024-04-17 15:37         ` Paneer Selvam, Arunpravin
  0 siblings, 0 replies; 6+ messages in thread
From: Paneer Selvam, Arunpravin @ 2024-04-17 15:37 UTC (permalink / raw)
  To: Philip Yang, Christian König, amd-gfx
  Cc: alexander.deucher, felix.kuehling, philip.yang

Hi Philip,

On 4/17/2024 8:58 PM, Philip Yang wrote:
>
>
> On 2024-04-17 10:32, Paneer Selvam, Arunpravin wrote:
>> Hi Christian,
>>
>> On 4/17/2024 6:57 PM, Paneer Selvam, Arunpravin wrote:
>>> Hi Christian,
>>>
>>> On 4/17/2024 12:19 PM, Christian König wrote:
>>>> Am 17.04.24 um 08:21 schrieb Arunpravin Paneer Selvam:
>>>>> Now we have two flags for contiguous VRAM buffer allocation.
>>>>> If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
>>>>> it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
>>>>> buffer's placement function.
>>>>>
>>>>> This patch will change the default behaviour of the two flags.
>>>>>
>>>>> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
>>>>> - This means contiguous is not mandatory.
>>>>> - we will try to allocate the contiguous buffer. Say if the
>>>>>    allocation fails, we fallback to allocate the individual pages.
>>>>>
>>>>> When we setTTM_PL_FLAG_CONTIGUOUS
>>>>> - This means contiguous allocation is mandatory.
>>>>> - we are setting this in amdgpu_bo_pin_restricted() before bo 
>>>>> validation
>>>>>    and check this flag in the vram manager file.
>>>>> - if this is set, we should allocate the buffer pages contiguously.
>>>>>    the allocation fails, we return -ENOSPC.
>>>>>
>>>>> v2:
>>>>>    - keep the mem_flags and bo->flags check as is(Christian)
>>>>>    - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
>>>>>      amdgpu_bo_pin_restricted function placement range iteration
>>>>>      loop(Christian)
>>>>>    - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
>>>>>      (Christian)
>>>>>    - Keep the kernel BO allocation as is(Christain)
>>>>>    - If BO pin vram allocation failed, we need to return -ENOSPC as
>>>>>      RDMA cannot work with scattered VRAM pages(Philip)
>>>>>
>>>>> Signed-off-by: Arunpravin Paneer Selvam 
>>>>> <Arunpravin.PaneerSelvam@amd.com>
>>>>> Suggested-by: Christian König <christian.koenig@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 
>>>>> +++++++++++++++-----
>>>>>   2 files changed, 50 insertions(+), 15 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>>> index 8bc79924d171..caaef7b1df49 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>>> @@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct 
>>>>> amdgpu_bo *abo, u32 domain)
>>>>>           else
>>>>>               places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>>>>>   -        if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>>>> +        if (abo->tbo.type == ttm_bo_type_kernel &&
>>>>> +            flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>>>>               places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>>>> +
>>>>>           c++;
>>>>>       }
>>>>>   @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct 
>>>>> amdgpu_bo *bo, u32 domain,
>>>>>           if (!bo->placements[i].lpfn ||
>>>>>               (lpfn && lpfn < bo->placements[i].lpfn))
>>>>>               bo->placements[i].lpfn = lpfn;
>>>>> +
>>>>> +        if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>>>>> +            bo->placements[i].mem_type == TTM_PL_VRAM)
>>>>> +            bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>>>>       }
>>>>>         r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>>
>>>> Nice work, up till here that looks exactly right as far as I can see.
>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>>>> index 8db880244324..4be8b091099a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>>>> @@ -88,6 +88,29 @@ static inline u64 
>>>>> amdgpu_vram_mgr_blocks_size(struct list_head *head)
>>>>>       return size;
>>>>>   }
>>>>>   +static inline unsigned long
>>>>> +amdgpu_vram_mgr_calculate_pages_per_block(struct 
>>>>> ttm_buffer_object *tbo,
>>>>> +                      const struct ttm_place *place,
>>>>> +                      unsigned long bo_flags)
>>>>> +{
>>>>> +    unsigned long pages_per_block;
>>>>> +
>>>>> +    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
>>>>> +        pages_per_block = ~0ul;
>>>>
>>>> If I understand it correctly this here enforces the allocation of a 
>>>> contiguous buffer in the way that it says we should have only one 
>>>> giant page for the whole BO.
>>> yes, for contiguous we don't have the 2MB limit, hence we are 
>>> setting to maximum to avoid restricting the min_block_size to 2MB 
>>> limitation.
>>>>
>>>>> +    } else {
>>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>>> +        pages_per_block = HPAGE_PMD_NR;
>>>>> +#else
>>>>> +        /* default to 2MB */
>>>>> +        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>>>>> +#endif
>>>>> +        pages_per_block = max_t(uint32_t, pages_per_block,
>>>>> +                    tbo->page_alignment);
>>>>> +    }
>>>>> +
>>>>> +    return pages_per_block;
>>>>> +}
>>>>> +
>>>>>   /**
>>>>>    * DOC: mem_info_vram_total
>>>>>    *
>>>>> @@ -451,8 +474,10 @@ static int amdgpu_vram_mgr_new(struct 
>>>>> ttm_resource_manager *man,
>>>>>       struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>>>>       struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>>>>       u64 vis_usage = 0, max_bytes, min_block_size;
>>>>> +    struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
>>>>>       struct amdgpu_vram_mgr_resource *vres;
>>>>>       u64 size, remaining_size, lpfn, fpfn;
>>>>> +    unsigned long bo_flags = bo->flags;
>>>>>       struct drm_buddy *mm = &mgr->mm;
>>>>>       struct drm_buddy_block *block;
>>>>>       unsigned long pages_per_block;
>>>>> @@ -468,18 +493,9 @@ static int amdgpu_vram_mgr_new(struct 
>>>>> ttm_resource_manager *man,
>>>>>       if (tbo->type != ttm_bo_type_kernel)
>>>>>           max_bytes -= AMDGPU_VM_RESERVED_VRAM;
>>>>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>>>>> -        pages_per_block = ~0ul;
>>>>> -    } else {
>>>>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>>> -        pages_per_block = HPAGE_PMD_NR;
>>>>> -#else
>>>>> -        /* default to 2MB */
>>>>> -        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>>>>> -#endif
>>>>> -        pages_per_block = max_t(uint32_t, pages_per_block,
>>>>> -                    tbo->page_alignment);
>>>>> -    }
>>>>> +    pages_per_block =
>>>>> +        amdgpu_vram_mgr_calculate_pages_per_block(tbo, place,
>>>>> +                              bo_flags);
>>>>>         vres = kzalloc(sizeof(*vres), GFP_KERNEL);
>>>>>       if (!vres)
>>>>> @@ -498,7 +514,7 @@ static int amdgpu_vram_mgr_new(struct 
>>>>> ttm_resource_manager *man,
>>>>>       if (place->flags & TTM_PL_FLAG_TOPDOWN)
>>>>>           vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
>>>>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>>>> +    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>>>>           vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>>>>
>>>> And this here just optimizes it in the way that it says try harder 
>>>> to make it look contiguous.
>>> Here I removed the TTM_PL_FLAG_CONTIGUOUS. AFAIU, in both cases (BO 
>>> pinning and normal contiguous request)
>>> this flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is always set.
>>>>
>>>> Question is if that also works with pages_per_block of 2MiB or does 
>>>> that only kick in when pages_per_block is maximal?
>>> AFAIU, if this flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is set, we are 
>>> setting the pages_per_block as maximal, thus
>>> we dont limit the BO. when we set the pages_per_block as maximal, 
>>> the min_block_size would be the tbo->page_alignment,
>>> and this tbo->page_alignment would be the same value as the required 
>>> size. Required size can be less than 2MB or more than 2MB.
>>> Below we have check size >= pages_per_block, when the 
>>> pages_per_block is maximal we don't limit the min_block_size.
>> a small correction, when we set the pages_per_block as maximal, we 
>> don't set the min_block_size, the buddy allocator will set the 
>> min_block_size
>> as roundup_pow_of_two(size).
>
> If setting 2MB pages_per_block, without 
> DRM_BUDDY_CONTIGUOUS_ALLOCATION flag, does buddy alloc from free 2MB 
> blocks first, or buddy split larger blocks into smaller blocks, as we 
> want to keep larger block for contiguous allocation.
>
when we set the 2MB pages_per_block without contiguous flag, buddy 
allocator first tries to find the 2MB blocks in the free list, if it 
doesn't find the blocks it will go to the next power of two (say 4MB) 
and split that into 2BM to satisfy the minimum alignment (2MB limitation).

If the required size is less than 2MB, buddy allocator goes with the 
tbo->page_alignment(if this is set) or default size (4KB).

Thanks,
Arun.
>
> Regards,
>
> Philip
>
>>
>> Thanks,
>> Arun.
>>>>
>>>>>         if (fpfn || lpfn != mgr->mm.size)
>>>>> @@ -529,8 +545,21 @@ static int amdgpu_vram_mgr_new(struct 
>>>>> ttm_resource_manager *man,
>>>>>                          min_block_size,
>>>>>                          &vres->blocks,
>>>>>                          vres->flags);
>>>>> -        if (unlikely(r))
>>>>> +        if (unlikely(r)) {
>>>>> +            if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>>>>> +                !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) {
>>>>> +                /* Fallback to non contiguous allocation */
>>>>> +                vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>>>>> +                bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>
>>>> Well I would say that this is a rather hacky implementation and 
>>>> should be avoided if possible.
>>> sure, I will try to rewrite the code.
>>>>
>>>>> +
>>>>> +                pages_per_block =
>>>>> + amdgpu_vram_mgr_calculate_pages_per_block(tbo,
>>>>> +                                          place,
>>>>> +                                          bo_flags);
>>>>
>>>> Rather move the AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS handling out of 
>>>> amdgpu_vram_mgr_calculate_pages_per_block().
>>> sure.
>>>
>>> Thanks,
>>> Arun.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> +                continue;
>>>>> +            }
>>>>>               goto error_free_blocks;
>>>>> +        }
>>>>>             if (size > remaining_size)
>>>>>               remaining_size = 0;
>>>>
>>>
>>


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-04-17 15:37 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-17  6:21 [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour Arunpravin Paneer Selvam
2024-04-17  6:49 ` Christian König
2024-04-17 13:27   ` Paneer Selvam, Arunpravin
2024-04-17 14:32     ` Paneer Selvam, Arunpravin
2024-04-17 15:28       ` Philip Yang
2024-04-17 15:37         ` Paneer Selvam, Arunpravin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).