All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: further lower VRAM allocation overhead
@ 2021-07-13 13:32 Christian König
  2021-07-13 16:11 ` Felix Kuehling
  0 siblings, 1 reply; 11+ messages in thread
From: Christian König @ 2021-07-13 13:32 UTC (permalink / raw)
  To: jinhuieric.huang
  Cc: felix.kuehling, amd-gfx, Luugi.Marsan, Jenny-Jing.Liu, Chris.Mason

For allocations larger than 48MiB we need more than a page for the
housekeeping in the worst case resulting in the usual vmalloc overhead.

Try to avoid this by assuming the good case and only falling back to the
worst case if this didn't work.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 80 +++++++++++++++-----
 1 file changed, 60 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 2fd77c36a1ff..ab8c5e28df7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -361,19 +361,23 @@ static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
  * @man: TTM memory type manager
  * @tbo: TTM BO we need this range for
  * @place: placement flags and restrictions
- * @mem: the resulting mem object
+ * @num_nodes: number of page nodes to use.
+ * @pages_per_node: number of pages per node to use.
+ * @res: the resulting mem object
  *
  * Allocate VRAM for the given BO.
  */
 static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 			       struct ttm_buffer_object *tbo,
 			       const struct ttm_place *place,
+			       unsigned long num_nodes,
+			       unsigned long pages_per_node,
 			       struct ttm_resource **res)
 {
-	unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
 	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
 	struct amdgpu_device *adev = to_amdgpu_device(mgr);
 	uint64_t vis_usage = 0, mem_bytes, max_bytes;
+	unsigned long lpfn, pages_left, pages;
 	struct ttm_range_mgr_node *node;
 	struct drm_mm *mm = &mgr->mm;
 	enum drm_mm_insert_mode mode;
@@ -395,21 +399,6 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 		goto error_sub;
 	}
 
-	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-		pages_per_node = ~0ul;
-		num_nodes = 1;
-	} else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		pages_per_node = HPAGE_PMD_NR;
-#else
-		/* default to 2MB */
-		pages_per_node = 2UL << (20UL - PAGE_SHIFT);
-#endif
-		pages_per_node = max_t(uint32_t, pages_per_node,
-				       tbo->page_alignment);
-		num_nodes = DIV_ROUND_UP_ULL(PFN_UP(mem_bytes), pages_per_node);
-	}
-
 	node = kvmalloc(struct_size(node, mm_nodes, num_nodes),
 			GFP_KERNEL | __GFP_ZERO);
 	if (!node) {
@@ -431,10 +420,15 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	i = 0;
 	spin_lock(&mgr->lock);
 	while (pages_left) {
-		uint32_t alignment = tbo->page_alignment;
+		unsigned long alignment = tbo->page_alignment;
+
+		if (i >= num_nodes) {
+			r = -E2BIG;
+			goto error_free;
+		}
 
 		if (pages >= pages_per_node)
-			alignment = pages_per_node;
+			alignment = max(alignment, pages_per_node);
 
 		r = drm_mm_insert_node_in_range(mm, &node->mm_nodes[i], pages,
 						alignment, 0, place->fpfn,
@@ -483,6 +477,52 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	return r;
 }
 
+/**
+ * amdgpu_vram_mgr_alloc - allocate new range
+ *
+ * @man: TTM memory type manager
+ * @tbo: TTM BO we need this range for
+ * @place: placement flags and restrictions
+ * @res: the resulting mem object
+ *
+ * Allocate VRAM for the given BO.
+ */
+static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
+				 struct ttm_buffer_object *tbo,
+				 const struct ttm_place *place,
+				 struct ttm_resource **res)
+{
+	unsigned long num_nodes, pages_per_node;
+	struct ttm_range_mgr_node *node;
+	int r;
+
+	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+		return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, res);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	pages_per_node = HPAGE_PMD_NR;
+#else
+	/* default to 2MB */
+	pages_per_node = 2UL << (20UL - PAGE_SHIFT);
+#endif
+	pages_per_node = max_t(uint32_t, pages_per_node, tbo->page_alignment);
+	num_nodes = DIV_ROUND_UP_ULL(PFN_UP(tbo->base.size), pages_per_node);
+
+	if (struct_size(node, mm_nodes, num_nodes) > PAGE_SIZE) {
+		size_t size = PAGE_SIZE;
+
+		size -= sizeof(struct ttm_range_mgr_node);
+		size /= sizeof(struct drm_mm_node);
+		r = amdgpu_vram_mgr_new(man, tbo, place, size, pages_per_node,
+					res);
+		if (r != -E2BIG)
+			return r;
+	}
+
+	return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, pages_per_node,
+				   res);
+}
+
 /**
  * amdgpu_vram_mgr_del - free ranges
  *
@@ -680,7 +720,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
 }
 
 static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
-	.alloc	= amdgpu_vram_mgr_new,
+	.alloc	= amdgpu_vram_mgr_alloc,
 	.free	= amdgpu_vram_mgr_del,
 	.debug	= amdgpu_vram_mgr_debug
 };
-- 
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdgpu: further lower VRAM allocation overhead
  2021-07-13 13:32 [PATCH] drm/amdgpu: further lower VRAM allocation overhead Christian König
@ 2021-07-13 16:11 ` Felix Kuehling
  2021-07-13 17:23   ` Eric Huang
  2021-07-14 12:08   ` Christian König
  0 siblings, 2 replies; 11+ messages in thread
From: Felix Kuehling @ 2021-07-13 16:11 UTC (permalink / raw)
  To: Christian König, jinhuieric.huang
  Cc: amd-gfx, Luugi.Marsan, Jenny-Jing.Liu, Chris.Mason

Am 2021-07-13 um 9:32 a.m. schrieb Christian König:
> For allocations larger than 48MiB we need more than a page for the
> housekeeping in the worst case resulting in the usual vmalloc overhead.
>
> Try to avoid this by assuming the good case and only falling back to the
> worst case if this didn't worked.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 80 +++++++++++++++-----
>  1 file changed, 60 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 2fd77c36a1ff..ab8c5e28df7b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -361,19 +361,23 @@ static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
>   * @man: TTM memory type manager
>   * @tbo: TTM BO we need this range for
>   * @place: placement flags and restrictions
> - * @mem: the resulting mem object
> + * @num_nodes: number of page nodes to use.
> + * @pages_per_node: number of pages per node to use.
> + * @res: the resulting mem object
>   *
>   * Allocate VRAM for the given BO.
>   */
>  static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>  			       struct ttm_buffer_object *tbo,
>  			       const struct ttm_place *place,
> +			       unsigned long num_nodes,
> +			       unsigned long pages_per_node,
>  			       struct ttm_resource **res)
>  {
> -	unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
>  	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>  	struct amdgpu_device *adev = to_amdgpu_device(mgr);
>  	uint64_t vis_usage = 0, mem_bytes, max_bytes;
> +	unsigned long lpfn, pages_left, pages;
>  	struct ttm_range_mgr_node *node;
>  	struct drm_mm *mm = &mgr->mm;
>  	enum drm_mm_insert_mode mode;
> @@ -395,21 +399,6 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>  		goto error_sub;
>  	}
>  
> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
> -		pages_per_node = ~0ul;
> -		num_nodes = 1;
> -	} else {
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -		pages_per_node = HPAGE_PMD_NR;
> -#else
> -		/* default to 2MB */
> -		pages_per_node = 2UL << (20UL - PAGE_SHIFT);
> -#endif
> -		pages_per_node = max_t(uint32_t, pages_per_node,
> -				       tbo->page_alignment);
> -		num_nodes = DIV_ROUND_UP_ULL(PFN_UP(mem_bytes), pages_per_node);
> -	}
> -
>  	node = kvmalloc(struct_size(node, mm_nodes, num_nodes),
>  			GFP_KERNEL | __GFP_ZERO);
>  	if (!node) {
> @@ -431,10 +420,15 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>  	i = 0;
>  	spin_lock(&mgr->lock);
>  	while (pages_left) {
> -		uint32_t alignment = tbo->page_alignment;
> +		unsigned long alignment = tbo->page_alignment;
> +
> +		if (i >= num_nodes) {
> +			r = -E2BIG;
> +			goto error_free;
> +		}
>  
>  		if (pages >= pages_per_node)
> -			alignment = pages_per_node;
> +			alignment = max(alignment, pages_per_node);

I don't understand this change. Is this an unrelated fix? pages_per_node
is already bumped up to tbo->page_alignment in amdgpu_vram_mgr_alloc. So
this "max" operation here seems redundant.

Other than that, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>

@JinHuiEric, can you confirm the performance improvement?

Thanks,
  Felix


>  
>  		r = drm_mm_insert_node_in_range(mm, &node->mm_nodes[i], pages,
>  						alignment, 0, place->fpfn,
> @@ -483,6 +477,52 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>  	return r;
>  }
>  
> +/**
> + * amdgpu_vram_mgr_alloc - allocate new range
> + *
> + * @man: TTM memory type manager
> + * @tbo: TTM BO we need this range for
> + * @place: placement flags and restrictions
> + * @res: the resulting mem object
> + *
> + * Allocate VRAM for the given BO.
> + */
> +static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
> +				 struct ttm_buffer_object *tbo,
> +				 const struct ttm_place *place,
> +				 struct ttm_resource **res)
> +{
> +	unsigned long num_nodes, pages_per_node;
> +	struct ttm_range_mgr_node *node;
> +	int r;
> +
> +	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
> +		return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, res);
> +
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	pages_per_node = HPAGE_PMD_NR;
> +#else
> +	/* default to 2MB */
> +	pages_per_node = 2UL << (20UL - PAGE_SHIFT);
> +#endif
> +	pages_per_node = max_t(uint32_t, pages_per_node, tbo->page_alignment);
> +	num_nodes = DIV_ROUND_UP_ULL(PFN_UP(tbo->base.size), pages_per_node);
> +
> +	if (struct_size(node, mm_nodes, num_nodes) > PAGE_SIZE) {
> +		size_t size = PAGE_SIZE;
> +
> +		size -= sizeof(struct ttm_range_mgr_node);
> +		size /= sizeof(struct drm_mm_node);
> +		r = amdgpu_vram_mgr_new(man, tbo, place, size, pages_per_node,
> +					res);
> +		if (r != -E2BIG)
> +			return r;
> +	}
> +
> +	return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, pages_per_node,
> +				   res);
> +}
> +
>  /**
>   * amdgpu_vram_mgr_del - free ranges
>   *
> @@ -680,7 +720,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
>  }
>  
>  static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
> -	.alloc	= amdgpu_vram_mgr_new,
> +	.alloc	= amdgpu_vram_mgr_alloc,
>  	.free	= amdgpu_vram_mgr_del,
>  	.debug	= amdgpu_vram_mgr_debug
>  };
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdgpu: further lower VRAM allocation overhead
  2021-07-13 16:11 ` Felix Kuehling
@ 2021-07-13 17:23   ` Eric Huang
  2021-07-14 12:08   ` Christian König
  1 sibling, 0 replies; 11+ messages in thread
From: Eric Huang @ 2021-07-13 17:23 UTC (permalink / raw)
  To: Felix Kuehling, Christian König
  Cc: amd-gfx, Luugi.Marsan, Jenny-Jing.Liu, Chris.Mason

I am converting the code into amd-staging-drm-next. Theoretically it will 
improve a lot on the latency, the size of the array allocated is 24 
(PAGE_SIZE/struct drm_mm_node) with this patch, and it was 8192 before. 
So the latency should be reduced by 98 us.

Regards,
Eric

On 2021-07-13 12:11 p.m., Felix Kuehling wrote:
> Am 2021-07-13 um 9:32 a.m. schrieb Christian König:
>> For allocations larger than 48MiB we need more than a page for the
>> housekeeping in the worst case resulting in the usual vmalloc overhead.
>>
>> Try to avoid this by assuming the good case and only falling back to the
>> worst case if this didn't worked.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 80 +++++++++++++++-----
>>   1 file changed, 60 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 2fd77c36a1ff..ab8c5e28df7b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -361,19 +361,23 @@ static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
>>    * @man: TTM memory type manager
>>    * @tbo: TTM BO we need this range for
>>    * @place: placement flags and restrictions
>> - * @mem: the resulting mem object
>> + * @num_nodes: number of page nodes to use.
>> + * @pages_per_node: number of pages per node to use.
>> + * @res: the resulting mem object
>>    *
>>    * Allocate VRAM for the given BO.
>>    */
>>   static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   			       struct ttm_buffer_object *tbo,
>>   			       const struct ttm_place *place,
>> +			       unsigned long num_nodes,
>> +			       unsigned long pages_per_node,
>>   			       struct ttm_resource **res)
>>   {
>> -	unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
>>   	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>   	struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>   	uint64_t vis_usage = 0, mem_bytes, max_bytes;
>> +	unsigned long lpfn, pages_left, pages;
>>   	struct ttm_range_mgr_node *node;
>>   	struct drm_mm *mm = &mgr->mm;
>>   	enum drm_mm_insert_mode mode;
>> @@ -395,21 +399,6 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   		goto error_sub;
>>   	}
>>   
>> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>> -		pages_per_node = ~0ul;
>> -		num_nodes = 1;
>> -	} else {
>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -		pages_per_node = HPAGE_PMD_NR;
>> -#else
>> -		/* default to 2MB */
>> -		pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>> -#endif
>> -		pages_per_node = max_t(uint32_t, pages_per_node,
>> -				       tbo->page_alignment);
>> -		num_nodes = DIV_ROUND_UP_ULL(PFN_UP(mem_bytes), pages_per_node);
>> -	}
>> -
>>   	node = kvmalloc(struct_size(node, mm_nodes, num_nodes),
>>   			GFP_KERNEL | __GFP_ZERO);
>>   	if (!node) {
>> @@ -431,10 +420,15 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	i = 0;
>>   	spin_lock(&mgr->lock);
>>   	while (pages_left) {
>> -		uint32_t alignment = tbo->page_alignment;
>> +		unsigned long alignment = tbo->page_alignment;
>> +
>> +		if (i >= num_nodes) {
>> +			r = -E2BIG;
>> +			goto error_free;
>> +		}
>>   
>>   		if (pages >= pages_per_node)
>> -			alignment = pages_per_node;
>> +			alignment = max(alignment, pages_per_node);
> I don't understand this change. Is this an unrelated fix? pages_per_node
> is already bumped up to tbo->page_alignment in amdgpu_vram_mgr_alloc. So
> this "max" operation here seems redundant.
>
> Other than that, the patch is
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>
> @JinHuiEric, can you confirm the performance improvement?
>
> Thanks,
>    Felix
>
>
>>   
>>   		r = drm_mm_insert_node_in_range(mm, &node->mm_nodes[i], pages,
>>   						alignment, 0, place->fpfn,
>> @@ -483,6 +477,52 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	return r;
>>   }
>>   
>> +/**
>> + * amdgpu_vram_mgr_alloc - allocate new range
>> + *
>> + * @man: TTM memory type manager
>> + * @tbo: TTM BO we need this range for
>> + * @place: placement flags and restrictions
>> + * @res: the resulting mem object
>> + *
>> + * Allocate VRAM for the given BO.
>> + */
>> +static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
>> +				 struct ttm_buffer_object *tbo,
>> +				 const struct ttm_place *place,
>> +				 struct ttm_resource **res)
>> +{
>> +	unsigned long num_nodes, pages_per_node;
>> +	struct ttm_range_mgr_node *node;
>> +	int r;
>> +
>> +	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +		return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, res);
>> +
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +	pages_per_node = HPAGE_PMD_NR;
>> +#else
>> +	/* default to 2MB */
>> +	pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>> +#endif
>> +	pages_per_node = max_t(uint32_t, pages_per_node, tbo->page_alignment);
>> +	num_nodes = DIV_ROUND_UP_ULL(PFN_UP(tbo->base.size), pages_per_node);
>> +
>> +	if (struct_size(node, mm_nodes, num_nodes) > PAGE_SIZE) {
>> +		size_t size = PAGE_SIZE;
>> +
>> +		size -= sizeof(struct ttm_range_mgr_node);
>> +		size /= sizeof(struct drm_mm_node);
>> +		r = amdgpu_vram_mgr_new(man, tbo, place, size, pages_per_node,
>> +					res);
>> +		if (r != -E2BIG)
>> +			return r;
>> +	}
>> +
>> +	return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, pages_per_node,
>> +				   res);
>> +}
>> +
>>   /**
>>    * amdgpu_vram_mgr_del - free ranges
>>    *
>> @@ -680,7 +720,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
>>   }
>>   
>>   static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
>> -	.alloc	= amdgpu_vram_mgr_new,
>> +	.alloc	= amdgpu_vram_mgr_alloc,
>>   	.free	= amdgpu_vram_mgr_del,
>>   	.debug	= amdgpu_vram_mgr_debug
>>   };

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdgpu: further lower VRAM allocation overhead
  2021-07-13 16:11 ` Felix Kuehling
  2021-07-13 17:23   ` Eric Huang
@ 2021-07-14 12:08   ` Christian König
  1 sibling, 0 replies; 11+ messages in thread
From: Christian König @ 2021-07-14 12:08 UTC (permalink / raw)
  To: Felix Kuehling, jinhuieric.huang
  Cc: amd-gfx, Luugi.Marsan, Jenny-Jing.Liu, Chris.Mason

Am 13.07.21 um 18:11 schrieb Felix Kuehling:
> Am 2021-07-13 um 9:32 a.m. schrieb Christian König:
>> For allocations larger than 48MiB we need more than a page for the
>> housekeeping in the worst case resulting in the usual vmalloc overhead.
>>
>> Try to avoid this by assuming the good case and only falling back to the
>> worst case if this didn't worked.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 80 +++++++++++++++-----
>>   1 file changed, 60 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 2fd77c36a1ff..ab8c5e28df7b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -361,19 +361,23 @@ static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
>>    * @man: TTM memory type manager
>>    * @tbo: TTM BO we need this range for
>>    * @place: placement flags and restrictions
>> - * @mem: the resulting mem object
>> + * @num_nodes: number of page nodes to use.
>> + * @pages_per_node: number of pages per node to use.
>> + * @res: the resulting mem object
>>    *
>>    * Allocate VRAM for the given BO.
>>    */
>>   static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   			       struct ttm_buffer_object *tbo,
>>   			       const struct ttm_place *place,
>> +			       unsigned long num_nodes,
>> +			       unsigned long pages_per_node,
>>   			       struct ttm_resource **res)
>>   {
>> -	unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
>>   	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>   	struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>   	uint64_t vis_usage = 0, mem_bytes, max_bytes;
>> +	unsigned long lpfn, pages_left, pages;
>>   	struct ttm_range_mgr_node *node;
>>   	struct drm_mm *mm = &mgr->mm;
>>   	enum drm_mm_insert_mode mode;
>> @@ -395,21 +399,6 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   		goto error_sub;
>>   	}
>>   
>> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>> -		pages_per_node = ~0ul;
>> -		num_nodes = 1;
>> -	} else {
>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -		pages_per_node = HPAGE_PMD_NR;
>> -#else
>> -		/* default to 2MB */
>> -		pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>> -#endif
>> -		pages_per_node = max_t(uint32_t, pages_per_node,
>> -				       tbo->page_alignment);
>> -		num_nodes = DIV_ROUND_UP_ULL(PFN_UP(mem_bytes), pages_per_node);
>> -	}
>> -
>>   	node = kvmalloc(struct_size(node, mm_nodes, num_nodes),
>>   			GFP_KERNEL | __GFP_ZERO);
>>   	if (!node) {
>> @@ -431,10 +420,15 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	i = 0;
>>   	spin_lock(&mgr->lock);
>>   	while (pages_left) {
>> -		uint32_t alignment = tbo->page_alignment;
>> +		unsigned long alignment = tbo->page_alignment;
>> +
>> +		if (i >= num_nodes) {
>> +			r = -E2BIG;
>> +			goto error_free;
>> +		}
>>   
>>   		if (pages >= pages_per_node)
>> -			alignment = pages_per_node;
>> +			alignment = max(alignment, pages_per_node);
> I don't understand this change. Is this an unrelated fix? pages_per_node
> is already bumped up to tbo->page_alignment in amdgpu_vram_mgr_alloc. So
> this "max" operation here seems redundant.

Oh, yes good point. I've totally missed that and was wondering why this 
here isn't a problem.

>
> Other than that, the patch is
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>

Thanks,
Christian.

>
> @JinHuiEric, can you confirm the performance improvement?
>
> Thanks,
>    Felix
>
>
>>   
>>   		r = drm_mm_insert_node_in_range(mm, &node->mm_nodes[i], pages,
>>   						alignment, 0, place->fpfn,
>> @@ -483,6 +477,52 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	return r;
>>   }
>>   
>> +/**
>> + * amdgpu_vram_mgr_alloc - allocate new range
>> + *
>> + * @man: TTM memory type manager
>> + * @tbo: TTM BO we need this range for
>> + * @place: placement flags and restrictions
>> + * @res: the resulting mem object
>> + *
>> + * Allocate VRAM for the given BO.
>> + */
>> +static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
>> +				 struct ttm_buffer_object *tbo,
>> +				 const struct ttm_place *place,
>> +				 struct ttm_resource **res)
>> +{
>> +	unsigned long num_nodes, pages_per_node;
>> +	struct ttm_range_mgr_node *node;
>> +	int r;
>> +
>> +	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +		return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, res);
>> +
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +	pages_per_node = HPAGE_PMD_NR;
>> +#else
>> +	/* default to 2MB */
>> +	pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>> +#endif
>> +	pages_per_node = max_t(uint32_t, pages_per_node, tbo->page_alignment);
>> +	num_nodes = DIV_ROUND_UP_ULL(PFN_UP(tbo->base.size), pages_per_node);
>> +
>> +	if (struct_size(node, mm_nodes, num_nodes) > PAGE_SIZE) {
>> +		size_t size = PAGE_SIZE;
>> +
>> +		size -= sizeof(struct ttm_range_mgr_node);
>> +		size /= sizeof(struct drm_mm_node);
>> +		r = amdgpu_vram_mgr_new(man, tbo, place, size, pages_per_node,
>> +					res);
>> +		if (r != -E2BIG)
>> +			return r;
>> +	}
>> +
>> +	return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, pages_per_node,
>> +				   res);
>> +}
>> +
>>   /**
>>    * amdgpu_vram_mgr_del - free ranges
>>    *
>> @@ -680,7 +720,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
>>   }
>>   
>>   static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
>> -	.alloc	= amdgpu_vram_mgr_new,
>> +	.alloc	= amdgpu_vram_mgr_alloc,
>>   	.free	= amdgpu_vram_mgr_del,
>>   	.debug	= amdgpu_vram_mgr_debug
>>   };

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdgpu: further lower VRAM allocation overhead
  2021-07-15 13:50     ` Eric Huang
@ 2021-07-15 14:38       ` Felix Kuehling
  0 siblings, 0 replies; 11+ messages in thread
From: Felix Kuehling @ 2021-07-15 14:38 UTC (permalink / raw)
  To: Eric Huang, Christian König, amd-gfx
  Cc: Christian König, Liu, Jenny (Jing), Marsan, Luugi

Looking at the ticket, it's getting a segfault in user mode. The only
possible mechanism I can see where this change causes that segfault, is
a badly handled memory allocation failure. But I don't have an
explanation for how this patch introduces a memory allocation failure
that wasn't there before (other than maybe some very obscure internal
detail of the drm_mm node manager).

I see one potential problem here on the error handling path that we're
more likely to go to now. But I think it should only affect 32-bit kernels:

> error: while (i--) drm_mm_remove_node(&nodes[i]);
> spin_unlock(&mgr->lock); atomic64_sub(mem->num_pages << PAGE_SHIFT,
> &mgr->usage);

We need a (u64) cast here to avoid the left-shift overflowing.
mem->num_pages is unsigned long, so that's already 64 bits on a 64-bit
kernel.

> kvfree(nodes); return r; }

Regards,
  Felix

Am 2021-07-15 um 9:50 a.m. schrieb Eric Huang:
> Hi Christian,
>
> I have pushed it into amd-staging-dkms-5.11, but it causes a
> regression with test TransferBench on MI200. Jira is here:
> https://ontrack-internal.amd.com/browse/SWDEV-295245
>
> Can you please take a look? Thanks!
>
> Regards,
> Eric
>
> On 2021-07-14 4:33 a.m., Christian König wrote:
>> Hi Eric,
>>
>> feel free to push into amd-staging-dkms-5.11, but please don't push
>> it into amd-staging-drm-next.
>>
>> The latter will just cause a merge failure which Alex needs to resolve
>> manually.
>>
>> I can take care of pushing to amd-staging-drm-next as soon as that is
>> rebased on latest upstream.
>>
>> Regards,
>> Christian.
>>
>> Am 13.07.21 um 21:19 schrieb Eric Huang:
>>> Hi Christian/Felix,
>>>
>>> If you don't have objection, it will be pushed into
>>> amd-staging-dkms-5.11 and amd-staging-drm-next.
>>>
>>> Thanks,
>>> Eric
>>>
>>> On 2021-07-13 3:17 p.m., Eric Huang wrote:
>>>> For allocations larger than 48MiB we need more than a page for the
>>>> housekeeping in the worst case resulting in the usual vmalloc
>>>> overhead.
>>>>
>>>> Try to avoid this by assuming the good case and only falling back
>>>> to the
>>>> worst case if this didn't worked.
>>>>
>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>> Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
>>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 71
>>>> +++++++++++++++-----
>>>>   1 file changed, 53 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>>> index be4261c4512e..ecbe05e1db66 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>>> @@ -361,9 +361,11 @@ static void amdgpu_vram_mgr_virt_start(struct
>>>> ttm_resource *mem,
>>>>   static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>>>                      struct ttm_buffer_object *tbo,
>>>>                      const struct ttm_place *place,
>>>> +                   unsigned long num_nodes,
>>>> +                   unsigned long pages_per_node,
>>>>                      struct ttm_resource *mem)
>>>>   {
>>>> -    unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
>>>> +    unsigned long lpfn, pages_left, pages;
>>>>       struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>>>       struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>>>       uint64_t vis_usage = 0, mem_bytes, max_bytes;
>>>> @@ -393,21 +395,6 @@ static int amdgpu_vram_mgr_new(struct
>>>> ttm_resource_manager *man,
>>>>           return -ENOSPC;
>>>>       }
>>>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>>>> -        pages_per_node = ~0ul;
>>>> -        num_nodes = 1;
>>>> -    } else {
>>>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> -        pages_per_node = HPAGE_PMD_NR;
>>>> -#else
>>>> -        /* default to 2MB */
>>>> -        pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>>>> -#endif
>>>> -        pages_per_node = max_t(uint32_t, pages_per_node,
>>>> -                       mem->page_alignment);
>>>> -        num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
>>>> -    }
>>>> -
>>>>       nodes = kvmalloc_array((uint32_t)num_nodes, sizeof(*nodes),
>>>>                      GFP_KERNEL | __GFP_ZERO);
>>>>       if (!nodes) {
>>>> @@ -435,7 +422,12 @@ static int amdgpu_vram_mgr_new(struct
>>>> ttm_resource_manager *man,
>>>>       i = 0;
>>>>       spin_lock(&mgr->lock);
>>>>       while (pages_left) {
>>>> -        uint32_t alignment = mem->page_alignment;
>>>> +        unsigned long alignment = mem->page_alignment;
>>>> +
>>>> +        if (i >= num_nodes) {
>>>> +            r = -E2BIG;
>>>> +            goto error;
>>>> +        }
>>>>             if (pages >= pages_per_node)
>>>>               alignment = pages_per_node;
>>>> @@ -492,6 +484,49 @@ static int amdgpu_vram_mgr_new(struct
>>>> ttm_resource_manager *man,
>>>>       return r;
>>>>   }
>>>>   +/**
>>>> + * amdgpu_vram_mgr_alloc - allocate new range
>>>> + *
>>>> + * @man: TTM memory type manager
>>>> + * @tbo: TTM BO we need this range for
>>>> + * @place: placement flags and restrictions
>>>> + * @mem: the resulting mem object
>>>> + *
>>>> + * Allocate VRAM for the given BO.
>>>> + */
>>>> +static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
>>>> +                 struct ttm_buffer_object *tbo,
>>>> +                 const struct ttm_place *place,
>>>> +                 struct ttm_resource *mem)
>>>> +{
>>>> +    unsigned long num_nodes, pages_per_node;
>>>> +    int r;
>>>> +
>>>> +    if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>>> +        return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, mem);
>>>> +
>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> +    pages_per_node = HPAGE_PMD_NR;
>>>> +#else
>>>> +    /* default to 2MB */
>>>> +    pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>>>> +#endif
>>>> +    pages_per_node = max_t(uint32_t, pages_per_node,
>>>> +                   mem->page_alignment);
>>>> +    num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
>>>> +
>>>> +    if (sizeof(struct drm_mm_node) * num_nodes > PAGE_SIZE) {
>>>> +        r = amdgpu_vram_mgr_new(man, tbo, place,
>>>> +                PAGE_SIZE / sizeof(struct drm_mm_node),
>>>> +                pages_per_node,    mem);
>>>> +        if (r != -E2BIG)
>>>> +            return r;
>>>> +    }
>>>> +
>>>> +    return amdgpu_vram_mgr_new(man, tbo, place, num_nodes,
>>>> pages_per_node,
>>>> +                   mem);
>>>> +}
>>>> +
>>>>   /**
>>>>    * amdgpu_vram_mgr_del - free ranges
>>>>    *
>>>> @@ -693,7 +728,7 @@ static void amdgpu_vram_mgr_debug(struct
>>>> ttm_resource_manager *man,
>>>>   }
>>>>     static const struct ttm_resource_manager_func
>>>> amdgpu_vram_mgr_func = {
>>>> -    .alloc    = amdgpu_vram_mgr_new,
>>>> +    .alloc    = amdgpu_vram_mgr_alloc,
>>>>       .free    = amdgpu_vram_mgr_del,
>>>>       .debug    = amdgpu_vram_mgr_debug
>>>>   };
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdgpu: further lower VRAM allocation overhead
  2021-07-14  8:33   ` Christian König
  2021-07-14  9:41     ` Pan, Xinhui
@ 2021-07-15 13:50     ` Eric Huang
  2021-07-15 14:38       ` Felix Kuehling
  1 sibling, 1 reply; 11+ messages in thread
From: Eric Huang @ 2021-07-15 13:50 UTC (permalink / raw)
  To: Christian König, amd-gfx
  Cc: Felix Kuehling, Christian König, Liu, Jenny (Jing), Marsan, Luugi

Hi Christian,

I have pushed it into amd-staging-dkms-5.11, but it causes a regression 
with test TransferBench on MI200. Jira is here:
https://ontrack-internal.amd.com/browse/SWDEV-295245

Can you please take a look? Thanks!

Regards,
Eric

On 2021-07-14 4:33 a.m., Christian König wrote:
> Hi Eric,
>
> feel free to push into amd-staging-dkms-5.11, but please don't push it 
> into amd-staging-drm-next.
>
> The latter will just cause a merge failure which Alex needs to resolve 
> manually.
>
> I can take care of pushing to amd-staging-drm-next as soon as that is 
> rebased on latest upstream.
>
> Regards,
> Christian.
>
> Am 13.07.21 um 21:19 schrieb Eric Huang:
>> Hi Christian/Felix,
>>
>> If you don't have objection, it will be pushed into 
>> amd-staging-dkms-5.11 and amd-staging-drm-next.
>>
>> Thanks,
>> Eric
>>
>> On 2021-07-13 3:17 p.m., Eric Huang wrote:
>>> For allocations larger than 48MiB we need more than a page for the
>>> housekeeping in the worst case resulting in the usual vmalloc overhead.
>>>
>>> Try to avoid this by assuming the good case and only falling back to 
>>> the
>>> worst case if this didn't work.
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 71 
>>> +++++++++++++++-----
>>>   1 file changed, 53 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> index be4261c4512e..ecbe05e1db66 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> @@ -361,9 +361,11 @@ static void amdgpu_vram_mgr_virt_start(struct 
>>> ttm_resource *mem,
>>>   static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>>                      struct ttm_buffer_object *tbo,
>>>                      const struct ttm_place *place,
>>> +                   unsigned long num_nodes,
>>> +                   unsigned long pages_per_node,
>>>                      struct ttm_resource *mem)
>>>   {
>>> -    unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
>>> +    unsigned long lpfn, pages_left, pages;
>>>       struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>>       struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>>       uint64_t vis_usage = 0, mem_bytes, max_bytes;
>>> @@ -393,21 +395,6 @@ static int amdgpu_vram_mgr_new(struct 
>>> ttm_resource_manager *man,
>>>           return -ENOSPC;
>>>       }
>>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>>> -        pages_per_node = ~0ul;
>>> -        num_nodes = 1;
>>> -    } else {
>>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> -        pages_per_node = HPAGE_PMD_NR;
>>> -#else
>>> -        /* default to 2MB */
>>> -        pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>>> -#endif
>>> -        pages_per_node = max_t(uint32_t, pages_per_node,
>>> -                       mem->page_alignment);
>>> -        num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
>>> -    }
>>> -
>>>       nodes = kvmalloc_array((uint32_t)num_nodes, sizeof(*nodes),
>>>                      GFP_KERNEL | __GFP_ZERO);
>>>       if (!nodes) {
>>> @@ -435,7 +422,12 @@ static int amdgpu_vram_mgr_new(struct 
>>> ttm_resource_manager *man,
>>>       i = 0;
>>>       spin_lock(&mgr->lock);
>>>       while (pages_left) {
>>> -        uint32_t alignment = mem->page_alignment;
>>> +        unsigned long alignment = mem->page_alignment;
>>> +
>>> +        if (i >= num_nodes) {
>>> +            r = -E2BIG;
>>> +            goto error;
>>> +        }
>>>             if (pages >= pages_per_node)
>>>               alignment = pages_per_node;
>>> @@ -492,6 +484,49 @@ static int amdgpu_vram_mgr_new(struct 
>>> ttm_resource_manager *man,
>>>       return r;
>>>   }
>>>   +/**
>>> + * amdgpu_vram_mgr_alloc - allocate new range
>>> + *
>>> + * @man: TTM memory type manager
>>> + * @tbo: TTM BO we need this range for
>>> + * @place: placement flags and restrictions
>>> + * @mem: the resulting mem object
>>> + *
>>> + * Allocate VRAM for the given BO.
>>> + */
>>> +static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
>>> +                 struct ttm_buffer_object *tbo,
>>> +                 const struct ttm_place *place,
>>> +                 struct ttm_resource *mem)
>>> +{
>>> +    unsigned long num_nodes, pages_per_node;
>>> +    int r;
>>> +
>>> +    if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>> +        return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, mem);
>>> +
>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> +    pages_per_node = HPAGE_PMD_NR;
>>> +#else
>>> +    /* default to 2MB */
>>> +    pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>>> +#endif
>>> +    pages_per_node = max_t(uint32_t, pages_per_node,
>>> +                   mem->page_alignment);
>>> +    num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
>>> +
>>> +    if (sizeof(struct drm_mm_node) * num_nodes > PAGE_SIZE) {
>>> +        r = amdgpu_vram_mgr_new(man, tbo, place,
>>> +                PAGE_SIZE / sizeof(struct drm_mm_node),
>>> +                pages_per_node,    mem);
>>> +        if (r != -E2BIG)
>>> +            return r;
>>> +    }
>>> +
>>> +    return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, 
>>> pages_per_node,
>>> +                   mem);
>>> +}
>>> +
>>>   /**
>>>    * amdgpu_vram_mgr_del - free ranges
>>>    *
>>> @@ -693,7 +728,7 @@ static void amdgpu_vram_mgr_debug(struct 
>>> ttm_resource_manager *man,
>>>   }
>>>     static const struct ttm_resource_manager_func 
>>> amdgpu_vram_mgr_func = {
>>> -    .alloc    = amdgpu_vram_mgr_new,
>>> +    .alloc    = amdgpu_vram_mgr_alloc,
>>>       .free    = amdgpu_vram_mgr_del,
>>>       .debug    = amdgpu_vram_mgr_debug
>>>   };
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Cjinhuieric.huang%40amd.com%7C1f368defa88042677f4008d946a217c8%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637618484270197021%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=wvuHNuK1gQBYbErvhBO%2FambKpNzjHL2A9ea22fvQMkY%3D&amp;reserved=0 
>>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdgpu: further lower VRAM allocation overhead
  2021-07-14  9:41     ` Pan, Xinhui
@ 2021-07-14 12:17       ` Christian König
  0 siblings, 0 replies; 11+ messages in thread
From: Christian König @ 2021-07-14 12:17 UTC (permalink / raw)
  To: Pan, Xinhui
  Cc: Huang, JinHuiEric, Kuehling, Felix, Koenig, Christian, amd-gfx

Am 14.07.21 um 11:41 schrieb Pan, Xinhui:
>> 2021年7月14日 16:33,Christian König <ckoenig.leichtzumerken@gmail.com> 写道:
>>
>> Hi Eric,
>>
>> feel free to push into amd-staging-dkms-5.11, but please don't push it into amd-staging-drm-next.
>>
>> The latter will just cause a merge failure which Alex needs to resolve manually.
>>
>> I can take care of pushing to amd-staging-drm-next as soon as that is rebased on latest upstream.
>>
>> Regards,
>> Christian.
>>
>> Am 13.07.21 um 21:19 schrieb Eric Huang:
>>> Hi Christian/Felix,
>>>
>>> If you don't have objection, it will be pushed into amd-staging-dkms-5.11 and amd-staging-drm-next.
>>>
>>> Thanks,
>>> Eric
>>>
>>> On 2021-07-13 3:17 p.m., Eric Huang wrote:
>>>> For allocations larger than 48MiB we need more than a page for the
>>>> housekeeping in the worst case resulting in the usual vmalloc overhead.
>>>>
>>>> Try to avoid this by assuming the good case and only falling back to the
>>>> worst case if this didn't work.
>>>>
>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>> Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
>>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 71 +++++++++++++++-----
>>>>    1 file changed, 53 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>>> index be4261c4512e..ecbe05e1db66 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>>> @@ -361,9 +361,11 @@ static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
>>>>    static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>>>                       struct ttm_buffer_object *tbo,
>>>>                       const struct ttm_place *place,
>>>> +                   unsigned long num_nodes,
>>>> +                   unsigned long pages_per_node,
>>>>                       struct ttm_resource *mem)
>>>>    {
>>>> -    unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
>>>> +    unsigned long lpfn, pages_left, pages;
>>>>        struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>>>        struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>>>        uint64_t vis_usage = 0, mem_bytes, max_bytes;
>>>> @@ -393,21 +395,6 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>>>            return -ENOSPC;
>>>>        }
>>>>    -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>>>> -        pages_per_node = ~0ul;
>>>> -        num_nodes = 1;
>>>> -    } else {
>>>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> -        pages_per_node = HPAGE_PMD_NR;
>>>> -#else
>>>> -        /* default to 2MB */
>>>> -        pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>>>> -#endif
>>>> -        pages_per_node = max_t(uint32_t, pages_per_node,
>>>> -                       mem->page_alignment);
>>>> -        num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
>>>> -    }
>>>> -
>>>>        nodes = kvmalloc_array((uint32_t)num_nodes, sizeof(*nodes),
>>>>                       GFP_KERNEL | __GFP_ZERO);
>>>>        if (!nodes) {
>>>> @@ -435,7 +422,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>>>        i = 0;
>>>>        spin_lock(&mgr->lock);
>>>>        while (pages_left) {
>>>> -        uint32_t alignment = mem->page_alignment;
>>>> +        unsigned long alignment = mem->page_alignment;
>>>> +
>>>> +        if (i >= num_nodes) {
>>>> +            r = -E2BIG;
>>>> +            goto error;
>>>> +        }
>>>>              if (pages >= pages_per_node)
>>>>                alignment = pages_per_node;
>>>> @@ -492,6 +484,49 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>>>        return r;
>>>>    }
>>>>    +/**
>>>> + * amdgpu_vram_mgr_alloc - allocate new range
>>>> + *
>>>> + * @man: TTM memory type manager
>>>> + * @tbo: TTM BO we need this range for
>>>> + * @place: placement flags and restrictions
>>>> + * @mem: the resulting mem object
>>>> + *
>>>> + * Allocate VRAM for the given BO.
>>>> + */
>>>> +static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
>>>> +                 struct ttm_buffer_object *tbo,
>>>> +                 const struct ttm_place *place,
>>>> +                 struct ttm_resource *mem)
>>>> +{
>>>> +    unsigned long num_nodes, pages_per_node;
>>>> +    int r;
>>>> +
>>>> +    if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>>> +        return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, mem);
>>>> +
>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> +    pages_per_node = HPAGE_PMD_NR;
>>>> +#else
>>>> +    /* default to 2MB */
>>>> +    pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>>>> +#endif
>>>> +    pages_per_node = max_t(uint32_t, pages_per_node,
>>>> +                   mem->page_alignment);
>>>> +    num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
>>>> +
>>>> +    if (sizeof(struct drm_mm_node) * num_nodes > PAGE_SIZE) {
> I think this should be < PAGE_SIZE? Otherwise amdgpu_vram_mgr_new always return -E2BIG.  Or I am missing something?

Sounds like you are missing something. The idea here is that we don't 
always allocate for the worst case, but try with just one page first.

> But you want one page to save all drm mm nodes in the good case. What if the user just creates a bunch of small VRAM BOs, say, 1 thousand 4KB VRAM BOs.
> the system memory usage would change from 24KB to 4MB. I have no strong objection to it as time is more expensive in reality.

The system memory usage will stay the same because we only apply this if 
sizeof(struct drm_mm_node) * num_nodes is larger than a page size.

Regards,
Christian.

>
>
>
>>>> +        r = amdgpu_vram_mgr_new(man, tbo, place,
>>>> +                PAGE_SIZE / sizeof(struct drm_mm_node),
>>>> +                pages_per_node,    mem);
>>>> +        if (r != -E2BIG)
>>>> +            return r;
>>>> +    }
>>>> +
>>>> +    return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, pages_per_node,
>>>> +                   mem);
>>>> +}
>>>> +
>>>>    /**
>>>>     * amdgpu_vram_mgr_del - free ranges
>>>>     *
>>>> @@ -693,7 +728,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
>>>>    }
>>>>      static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
>>>> -    .alloc    = amdgpu_vram_mgr_new,
>>>> +    .alloc    = amdgpu_vram_mgr_alloc,
>>>>        .free    = amdgpu_vram_mgr_del,
>>>>        .debug    = amdgpu_vram_mgr_debug
>>>>    };
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Cxinhui.pan%40amd.com%7C66465cc2d6b9468a2d8208d946a218ba%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637618484289336937%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=U1xfDEKdKmDr%2FeFF%2BXoLaZVcuTzcoFnoZOD%2Fwo4redo%3D&amp;reserved=0
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Cxinhui.pan%40amd.com%7C66465cc2d6b9468a2d8208d946a218ba%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637618484289336937%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=U1xfDEKdKmDr%2FeFF%2BXoLaZVcuTzcoFnoZOD%2Fwo4redo%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdgpu: further lower VRAM allocation overhead
  2021-07-14  8:33   ` Christian König
@ 2021-07-14  9:41     ` Pan, Xinhui
  2021-07-14 12:17       ` Christian König
  2021-07-15 13:50     ` Eric Huang
  1 sibling, 1 reply; 11+ messages in thread
From: Pan, Xinhui @ 2021-07-14  9:41 UTC (permalink / raw)
  To: Christian König
  Cc: Huang, JinHuiEric, Kuehling, Felix, Pan, Xinhui, Koenig,
	Christian, amd-gfx



> 2021年7月14日 16:33,Christian König <ckoenig.leichtzumerken@gmail.com> 写道:
> 
> Hi Eric,
> 
> feel free to push into amd-staging-dkms-5.11, but please don't push it into amd-staging-drm-next.
> 
> The later will just cause a merge failure which Alex needs to resolve manually.
> 
> I can take care of pushing to amd-staging-drm-next as soon as that is rebased on latest upstream.
> 
> Regards,
> Christian.
> 
> Am 13.07.21 um 21:19 schrieb Eric Huang:
>> Hi Christian/Felix,
>> 
>> If you don't have objection, it will be pushed into amd-staging-dkms-5.11 and amd-staging-drm-next.
>> 
>> Thanks,
>> Eric
>> 
>> On 2021-07-13 3:17 p.m., Eric Huang wrote:
>>> For allocations larger than 48MiB we need more than a page for the
>>> housekeeping in the worst case resulting in the usual vmalloc overhead.
>>> 
>>> Try to avoid this by assuming the good case and only falling back to the
>>> worst case if this didn't work.
>>> 
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 71 +++++++++++++++-----
>>>   1 file changed, 53 insertions(+), 18 deletions(-)
>>> 
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> index be4261c4512e..ecbe05e1db66 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> @@ -361,9 +361,11 @@ static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
>>>   static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>>                      struct ttm_buffer_object *tbo,
>>>                      const struct ttm_place *place,
>>> +                   unsigned long num_nodes,
>>> +                   unsigned long pages_per_node,
>>>                      struct ttm_resource *mem)
>>>   {
>>> -    unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
>>> +    unsigned long lpfn, pages_left, pages;
>>>       struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>>       struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>>       uint64_t vis_usage = 0, mem_bytes, max_bytes;
>>> @@ -393,21 +395,6 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>>           return -ENOSPC;
>>>       }
>>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>>> -        pages_per_node = ~0ul;
>>> -        num_nodes = 1;
>>> -    } else {
>>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> -        pages_per_node = HPAGE_PMD_NR;
>>> -#else
>>> -        /* default to 2MB */
>>> -        pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>>> -#endif
>>> -        pages_per_node = max_t(uint32_t, pages_per_node,
>>> -                       mem->page_alignment);
>>> -        num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
>>> -    }
>>> -
>>>       nodes = kvmalloc_array((uint32_t)num_nodes, sizeof(*nodes),
>>>                      GFP_KERNEL | __GFP_ZERO);
>>>       if (!nodes) {
>>> @@ -435,7 +422,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>>       i = 0;
>>>       spin_lock(&mgr->lock);
>>>       while (pages_left) {
>>> -        uint32_t alignment = mem->page_alignment;
>>> +        unsigned long alignment = mem->page_alignment;
>>> +
>>> +        if (i >= num_nodes) {
>>> +            r = -E2BIG;
>>> +            goto error;
>>> +        }
>>>             if (pages >= pages_per_node)
>>>               alignment = pages_per_node;
>>> @@ -492,6 +484,49 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>>       return r;
>>>   }
>>>   +/**
>>> + * amdgpu_vram_mgr_alloc - allocate new range
>>> + *
>>> + * @man: TTM memory type manager
>>> + * @tbo: TTM BO we need this range for
>>> + * @place: placement flags and restrictions
>>> + * @mem: the resulting mem object
>>> + *
>>> + * Allocate VRAM for the given BO.
>>> + */
>>> +static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
>>> +                 struct ttm_buffer_object *tbo,
>>> +                 const struct ttm_place *place,
>>> +                 struct ttm_resource *mem)
>>> +{
>>> +    unsigned long num_nodes, pages_per_node;
>>> +    int r;
>>> +
>>> +    if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>> +        return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, mem);
>>> +
>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> +    pages_per_node = HPAGE_PMD_NR;
>>> +#else
>>> +    /* default to 2MB */
>>> +    pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>>> +#endif
>>> +    pages_per_node = max_t(uint32_t, pages_per_node,
>>> +                   mem->page_alignment);
>>> +    num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
>>> +
>>> +    if (sizeof(struct drm_mm_node) * num_nodes > PAGE_SIZE) {

I think this should be < PAGE_SIZE? Otherwise amdgpu_vram_mgr_new always return -E2BIG.  Or I am missing something?

But you want one page to save all drm mm nodes in the good case. What if the user just creates a bunch of small VRAM BOs, say, 1 thousand 4KB VRAM BOs.
the system memory usage would change from 24KB to 4MB. I have no strong objection to it as time is more expensive in reality.



>>> +        r = amdgpu_vram_mgr_new(man, tbo, place,
>>> +                PAGE_SIZE / sizeof(struct drm_mm_node),
>>> +                pages_per_node,    mem);
>>> +        if (r != -E2BIG)
>>> +            return r;
>>> +    }
>>> +
>>> +    return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, pages_per_node,
>>> +                   mem);
>>> +}
>>> +
>>>   /**
>>>    * amdgpu_vram_mgr_del - free ranges
>>>    *
>>> @@ -693,7 +728,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
>>>   }
>>>     static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
>>> -    .alloc    = amdgpu_vram_mgr_new,
>>> +    .alloc    = amdgpu_vram_mgr_alloc,
>>>       .free    = amdgpu_vram_mgr_del,
>>>       .debug    = amdgpu_vram_mgr_debug
>>>   };
>> 
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Cxinhui.pan%40amd.com%7C66465cc2d6b9468a2d8208d946a218ba%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637618484289336937%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=U1xfDEKdKmDr%2FeFF%2BXoLaZVcuTzcoFnoZOD%2Fwo4redo%3D&amp;reserved=0
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Cxinhui.pan%40amd.com%7C66465cc2d6b9468a2d8208d946a218ba%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637618484289336937%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=U1xfDEKdKmDr%2FeFF%2BXoLaZVcuTzcoFnoZOD%2Fwo4redo%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdgpu: further lower VRAM allocation overhead
  2021-07-13 19:19 ` Eric Huang
@ 2021-07-14  8:33   ` Christian König
  2021-07-14  9:41     ` Pan, Xinhui
  2021-07-15 13:50     ` Eric Huang
  0 siblings, 2 replies; 11+ messages in thread
From: Christian König @ 2021-07-14  8:33 UTC (permalink / raw)
  To: Eric Huang, amd-gfx; +Cc: Felix Kuehling, Christian König

Hi Eric,

feel free to push into amd-staging-dkms-5.11, but please don't push it 
into amd-staging-drm-next.

The latter will just cause a merge failure which Alex needs to resolve 
manually.

I can take care of pushing to amd-staging-drm-next as soon as that is 
rebased on latest upstream.

Regards,
Christian.

Am 13.07.21 um 21:19 schrieb Eric Huang:
> Hi Christian/Felix,
>
> If you don't have objection, it will be pushed into 
> amd-staging-dkms-5.11 and amd-staging-drm-next.
>
> Thanks,
> Eric
>
> On 2021-07-13 3:17 p.m., Eric Huang wrote:
>> For allocations larger than 48MiB we need more than a page for the
>> housekeeping in the worst case resulting in the usual vmalloc overhead.
>>
>> Try to avoid this by assuming the good case and only falling back to the
>> worst case if this didn't work.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 71 +++++++++++++++-----
>>   1 file changed, 53 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index be4261c4512e..ecbe05e1db66 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -361,9 +361,11 @@ static void amdgpu_vram_mgr_virt_start(struct 
>> ttm_resource *mem,
>>   static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>                      struct ttm_buffer_object *tbo,
>>                      const struct ttm_place *place,
>> +                   unsigned long num_nodes,
>> +                   unsigned long pages_per_node,
>>                      struct ttm_resource *mem)
>>   {
>> -    unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
>> +    unsigned long lpfn, pages_left, pages;
>>       struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>       struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>       uint64_t vis_usage = 0, mem_bytes, max_bytes;
>> @@ -393,21 +395,6 @@ static int amdgpu_vram_mgr_new(struct 
>> ttm_resource_manager *man,
>>           return -ENOSPC;
>>       }
>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>> -        pages_per_node = ~0ul;
>> -        num_nodes = 1;
>> -    } else {
>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -        pages_per_node = HPAGE_PMD_NR;
>> -#else
>> -        /* default to 2MB */
>> -        pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>> -#endif
>> -        pages_per_node = max_t(uint32_t, pages_per_node,
>> -                       mem->page_alignment);
>> -        num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
>> -    }
>> -
>>       nodes = kvmalloc_array((uint32_t)num_nodes, sizeof(*nodes),
>>                      GFP_KERNEL | __GFP_ZERO);
>>       if (!nodes) {
>> @@ -435,7 +422,12 @@ static int amdgpu_vram_mgr_new(struct 
>> ttm_resource_manager *man,
>>       i = 0;
>>       spin_lock(&mgr->lock);
>>       while (pages_left) {
>> -        uint32_t alignment = mem->page_alignment;
>> +        unsigned long alignment = mem->page_alignment;
>> +
>> +        if (i >= num_nodes) {
>> +            r = -E2BIG;
>> +            goto error;
>> +        }
>>             if (pages >= pages_per_node)
>>               alignment = pages_per_node;
>> @@ -492,6 +484,49 @@ static int amdgpu_vram_mgr_new(struct 
>> ttm_resource_manager *man,
>>       return r;
>>   }
>>   +/**
>> + * amdgpu_vram_mgr_alloc - allocate new range
>> + *
>> + * @man: TTM memory type manager
>> + * @tbo: TTM BO we need this range for
>> + * @place: placement flags and restrictions
>> + * @mem: the resulting mem object
>> + *
>> + * Allocate VRAM for the given BO.
>> + */
>> +static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
>> +                 struct ttm_buffer_object *tbo,
>> +                 const struct ttm_place *place,
>> +                 struct ttm_resource *mem)
>> +{
>> +    unsigned long num_nodes, pages_per_node;
>> +    int r;
>> +
>> +    if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +        return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, mem);
>> +
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +    pages_per_node = HPAGE_PMD_NR;
>> +#else
>> +    /* default to 2MB */
>> +    pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>> +#endif
>> +    pages_per_node = max_t(uint32_t, pages_per_node,
>> +                   mem->page_alignment);
>> +    num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
>> +
>> +    if (sizeof(struct drm_mm_node) * num_nodes > PAGE_SIZE) {
>> +        r = amdgpu_vram_mgr_new(man, tbo, place,
>> +                PAGE_SIZE / sizeof(struct drm_mm_node),
>> +                pages_per_node,    mem);
>> +        if (r != -E2BIG)
>> +            return r;
>> +    }
>> +
>> +    return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, 
>> pages_per_node,
>> +                   mem);
>> +}
>> +
>>   /**
>>    * amdgpu_vram_mgr_del - free ranges
>>    *
>> @@ -693,7 +728,7 @@ static void amdgpu_vram_mgr_debug(struct 
>> ttm_resource_manager *man,
>>   }
>>     static const struct ttm_resource_manager_func 
>> amdgpu_vram_mgr_func = {
>> -    .alloc    = amdgpu_vram_mgr_new,
>> +    .alloc    = amdgpu_vram_mgr_alloc,
>>       .free    = amdgpu_vram_mgr_del,
>>       .debug    = amdgpu_vram_mgr_debug
>>   };
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdgpu: further lower VRAM allocation overhead
  2021-07-13 19:17 Eric Huang
@ 2021-07-13 19:19 ` Eric Huang
  2021-07-14  8:33   ` Christian König
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Huang @ 2021-07-13 19:19 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix Kuehling, Christian König

Hi Christian/Felix,

If you don't have objection, it will be pushed into 
amd-staging-dkms-5.11 and amd-staging-drm-next.

Thanks,
Eric

On 2021-07-13 3:17 p.m., Eric Huang wrote:
> For allocations larger than 48MiB we need more than a page for the
> housekeeping in the worst case resulting in the usual vmalloc overhead.
>
> Try to avoid this by assuming the good case and only falling back to the
> worst case if this didn't work.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 71 +++++++++++++++-----
>   1 file changed, 53 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index be4261c4512e..ecbe05e1db66 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -361,9 +361,11 @@ static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
>   static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   			       struct ttm_buffer_object *tbo,
>   			       const struct ttm_place *place,
> +			       unsigned long num_nodes,
> +			       unsigned long pages_per_node,
>   			       struct ttm_resource *mem)
>   {
> -	unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
> +	unsigned long lpfn, pages_left, pages;
>   	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>   	struct amdgpu_device *adev = to_amdgpu_device(mgr);
>   	uint64_t vis_usage = 0, mem_bytes, max_bytes;
> @@ -393,21 +395,6 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   		return -ENOSPC;
>   	}
>   
> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
> -		pages_per_node = ~0ul;
> -		num_nodes = 1;
> -	} else {
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -		pages_per_node = HPAGE_PMD_NR;
> -#else
> -		/* default to 2MB */
> -		pages_per_node = 2UL << (20UL - PAGE_SHIFT);
> -#endif
> -		pages_per_node = max_t(uint32_t, pages_per_node,
> -				       mem->page_alignment);
> -		num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
> -	}
> -
>   	nodes = kvmalloc_array((uint32_t)num_nodes, sizeof(*nodes),
>   			       GFP_KERNEL | __GFP_ZERO);
>   	if (!nodes) {
> @@ -435,7 +422,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   	i = 0;
>   	spin_lock(&mgr->lock);
>   	while (pages_left) {
> -		uint32_t alignment = mem->page_alignment;
> +		unsigned long alignment = mem->page_alignment;
> +
> +		if (i >= num_nodes) {
> +			r = -E2BIG;
> +			goto error;
> +		}
>   
>   		if (pages >= pages_per_node)
>   			alignment = pages_per_node;
> @@ -492,6 +484,49 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   	return r;
>   }
>   
> +/**
> + * amdgpu_vram_mgr_alloc - allocate new range
> + *
> + * @man: TTM memory type manager
> + * @tbo: TTM BO we need this range for
> + * @place: placement flags and restrictions
> + * @mem: the resulting mem object
> + *
> + * Allocate VRAM for the given BO.
> + */
> +static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
> +				 struct ttm_buffer_object *tbo,
> +				 const struct ttm_place *place,
> +				 struct ttm_resource *mem)
> +{
> +	unsigned long num_nodes, pages_per_node;
> +	int r;
> +
> +	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
> +		return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, mem);
> +
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	pages_per_node = HPAGE_PMD_NR;
> +#else
> +	/* default to 2MB */
> +	pages_per_node = 2UL << (20UL - PAGE_SHIFT);
> +#endif
> +	pages_per_node = max_t(uint32_t, pages_per_node,
> +			       mem->page_alignment);
> +	num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
> +
> +	if (sizeof(struct drm_mm_node) * num_nodes > PAGE_SIZE) {
> +		r = amdgpu_vram_mgr_new(man, tbo, place,
> +				PAGE_SIZE / sizeof(struct drm_mm_node),
> +				pages_per_node,	mem);
> +		if (r != -E2BIG)
> +			return r;
> +	}
> +
> +	return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, pages_per_node,
> +				   mem);
> +}
> +
>   /**
>    * amdgpu_vram_mgr_del - free ranges
>    *
> @@ -693,7 +728,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
>   }
>   
>   static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
> -	.alloc	= amdgpu_vram_mgr_new,
> +	.alloc	= amdgpu_vram_mgr_alloc,
>   	.free	= amdgpu_vram_mgr_del,
>   	.debug	= amdgpu_vram_mgr_debug
>   };

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] drm/amdgpu: further lower VRAM allocation overhead
@ 2021-07-13 19:17 Eric Huang
  2021-07-13 19:19 ` Eric Huang
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Huang @ 2021-07-13 19:17 UTC (permalink / raw)
  To: amd-gfx; +Cc: Eric Huang, Felix Kuehling, Christian König

For allocations larger than 48MiB we need more than a page for the
housekeeping in the worst case resulting in the usual vmalloc overhead.

Try to avoid this by assuming the good case and only falling back to the
worst case if this didn't work.

Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 71 +++++++++++++++-----
 1 file changed, 53 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index be4261c4512e..ecbe05e1db66 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -361,9 +361,11 @@ static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
 static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 			       struct ttm_buffer_object *tbo,
 			       const struct ttm_place *place,
+			       unsigned long num_nodes,
+			       unsigned long pages_per_node,
 			       struct ttm_resource *mem)
 {
-	unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
+	unsigned long lpfn, pages_left, pages;
 	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
 	struct amdgpu_device *adev = to_amdgpu_device(mgr);
 	uint64_t vis_usage = 0, mem_bytes, max_bytes;
@@ -393,21 +395,6 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 		return -ENOSPC;
 	}
 
-	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-		pages_per_node = ~0ul;
-		num_nodes = 1;
-	} else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		pages_per_node = HPAGE_PMD_NR;
-#else
-		/* default to 2MB */
-		pages_per_node = 2UL << (20UL - PAGE_SHIFT);
-#endif
-		pages_per_node = max_t(uint32_t, pages_per_node,
-				       mem->page_alignment);
-		num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
-	}
-
 	nodes = kvmalloc_array((uint32_t)num_nodes, sizeof(*nodes),
 			       GFP_KERNEL | __GFP_ZERO);
 	if (!nodes) {
@@ -435,7 +422,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	i = 0;
 	spin_lock(&mgr->lock);
 	while (pages_left) {
-		uint32_t alignment = mem->page_alignment;
+		unsigned long alignment = mem->page_alignment;
+
+		if (i >= num_nodes) {
+			r = -E2BIG;
+			goto error;
+		}
 
 		if (pages >= pages_per_node)
 			alignment = pages_per_node;
@@ -492,6 +484,49 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	return r;
 }
 
+/**
+ * amdgpu_vram_mgr_alloc - allocate new range
+ *
+ * @man: TTM memory type manager
+ * @tbo: TTM BO we need this range for
+ * @place: placement flags and restrictions
+ * @mem: the resulting mem object
+ *
+ * Allocate VRAM for the given BO.
+ */
+static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
+				 struct ttm_buffer_object *tbo,
+				 const struct ttm_place *place,
+				 struct ttm_resource *mem)
+{
+	unsigned long num_nodes, pages_per_node;
+	int r;
+
+	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+		return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, mem);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	pages_per_node = HPAGE_PMD_NR;
+#else
+	/* default to 2MB */
+	pages_per_node = 2UL << (20UL - PAGE_SHIFT);
+#endif
+	pages_per_node = max_t(uint32_t, pages_per_node,
+			       mem->page_alignment);
+	num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
+
+	if (sizeof(struct drm_mm_node) * num_nodes > PAGE_SIZE) {
+		r = amdgpu_vram_mgr_new(man, tbo, place,
+				PAGE_SIZE / sizeof(struct drm_mm_node),
+				pages_per_node,	mem);
+		if (r != -E2BIG)
+			return r;
+	}
+
+	return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, pages_per_node,
+				   mem);
+}
+
 /**
  * amdgpu_vram_mgr_del - free ranges
  *
@@ -693,7 +728,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
 }
 
 static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
-	.alloc	= amdgpu_vram_mgr_new,
+	.alloc	= amdgpu_vram_mgr_alloc,
 	.free	= amdgpu_vram_mgr_del,
 	.debug	= amdgpu_vram_mgr_debug
 };
-- 
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2021-07-15 14:38 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-13 13:32 [PATCH] drm/amdgpu: further lower VRAM allocation overhead Christian König
2021-07-13 16:11 ` Felix Kuehling
2021-07-13 17:23   ` Eric Huang
2021-07-14 12:08   ` Christian König
2021-07-13 19:17 Eric Huang
2021-07-13 19:19 ` Eric Huang
2021-07-14  8:33   ` Christian König
2021-07-14  9:41     ` Pan, Xinhui
2021-07-14 12:17       ` Christian König
2021-07-15 13:50     ` Eric Huang
2021-07-15 14:38       ` Felix Kuehling

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.