* [PATCH 1/1] intel: align reuse buffer's size on page size instead
@ 2018-03-02 17:53 James Xiong
  2018-03-03  9:45 ` Chris Wilson
  0 siblings, 1 reply; 3+ messages in thread
From: James Xiong @ 2018-03-02 17:53 UTC (permalink / raw)
  To: dri-devel, intel-gfx; +Cc: james.xiong

From: "Xiong, James" <james.xiong@intel.com>

With gem_reuse enabled, when a buffer's size differs from the bucket
sizes, it is rounded up to the next bucket's size, which means up to
about 25% more memory than requested is allocated in the worst-case
scenario. For example:

Original size   Actual size
32KB+1Byte      40KB
.
.
.
8MB+1Byte       10MB
.
.
.
96MB+1Byte      112MB

This is very expensive in memory and makes the reuse feature less
favorable than it deserves to be.
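
(For reference only, not part of the patch: a minimal standalone sketch
of the old rounding, assuming the usual power-of-two-plus-quarter-step
bucket layout set up by init_cache_buckets. It reproduces the
32KB+1Byte -> 40KB case above.)

#include <stdio.h>

/* Old-style bucket rounding: the three smallest buckets are 4KB, 8KB and
 * 12KB; above that the buckets are powers of two with three quarter steps
 * in between (16KB, 20KB, 24KB, 28KB, 32KB, 40KB, 48KB, ...).
 */
static unsigned long old_bucket_size(unsigned long size)
{
	unsigned long base;
	int i;

	if (size <= 3 * 4096UL)
		return (size + 4095UL) & ~4095UL;

	for (base = 4 * 4096UL; base != 0; base *= 2)
		for (i = 0; i < 4; i++) {
			unsigned long bucket = base + base * i / 4;
			if (bucket >= size)
				return bucket;
		}

	return size;	/* larger than any bucket: not cached, not rounded */
}

int main(void)
{
	unsigned long req = 32 * 1024UL + 1;		/* 32KB + 1 byte */
	unsigned long got = old_bucket_size(req);	/* rounds to 40KB */

	printf("requested %lu, allocated %lu, overhead %.1f%%\n",
	       req, got, 100.0 * (got - req) / req);
	return 0;
}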

This commit aligns the reuse buffer's size to the page size instead:
a buffer whose size falls between bucket[n] and bucket[n+1] is put in
bucket[n] when it is released. When searching for a cached buffer to
reuse, the allocator walks the bucket's list of cached buffers until
it finds one whose size is larger than or equal to the requested size.
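
(Purely for illustration, not in the patch: a self-contained toy model
of the new lookup policy; the toy_* names are made up here.)

#include <stdbool.h>
#include <stddef.h>

#define TOY_PAGE_SIZE	4096UL
#define TOY_ALIGN(v, a)	(((v) + (a) - 1) & ~((a) - 1))

struct toy_bo {
	unsigned long size;
	bool busy;
	struct toy_bo *next;	/* oldest-first list within a bucket */
};

struct toy_bucket {
	unsigned long size;	/* lower bound of this bucket's range */
	struct toy_bo *head;
};

/* Return the first idle cached BO that is large enough, or NULL so the
 * caller falls back to allocating a fresh, page-aligned BO.
 */
static struct toy_bo *
toy_find_cached(struct toy_bucket *buckets, int num_buckets,
		unsigned long size)
{
	struct toy_bo *bo;
	int n;

	size = TOY_ALIGN(size, TOY_PAGE_SIZE);

	/* bucket[n] holds buffers with bucket[n] <= bo->size < bucket[n+1] */
	for (n = 0; n < num_buckets - 1; n++) {
		if (size >= buckets[n].size && size < buckets[n + 1].size) {
			for (bo = buckets[n].head; bo != NULL; bo = bo->next)
				if (bo->size >= size && !bo->busy)
					return bo;
			break;
		}
	}

	return NULL;
}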

In gfxbench tests, performance and reuse ratios (reuse count /
allocation count) were the same as before, while memory usage dropped
by 1% ~ 7% (gl_manhattan: peak allocated memory was reduced from
448401408 to 419078144 bytes).

Signed-off-by: Xiong, James <james.xiong@intel.com>
---
 intel/intel_bufmgr_gem.c | 180 +++++++++++++++++++++++++----------------------
 libdrm_lists.h           |   6 ++
 2 files changed, 101 insertions(+), 85 deletions(-)

diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 386da30..5b2d0d0 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -402,11 +402,10 @@ drm_intel_gem_bo_bucket_for_size(drm_intel_bufmgr_gem *bufmgr_gem,
 {
 	int i;
 
-	for (i = 0; i < bufmgr_gem->num_buckets; i++) {
-		struct drm_intel_gem_bo_bucket *bucket =
-		    &bufmgr_gem->cache_bucket[i];
-		if (bucket->size >= size) {
-			return bucket;
+	for (i = 0; i < bufmgr_gem->num_buckets - 1; i++) {
+		if (size >= bufmgr_gem->cache_bucket[i].size &&
+		    size < bufmgr_gem->cache_bucket[i+1].size) {
+			return &bufmgr_gem->cache_bucket[i];
 		}
 	}
 
@@ -685,25 +684,95 @@ drm_intel_gem_bo_madvise(drm_intel_bo *bo, int madv)
 		 madv);
 }
 
-/* drop the oldest entries that have been purged by the kernel */
+/* drop the entries that are older than the given time */
 static void
 drm_intel_gem_bo_cache_purge_bucket(drm_intel_bufmgr_gem *bufmgr_gem,
-				    struct drm_intel_gem_bo_bucket *bucket)
+				    struct drm_intel_gem_bo_bucket *bucket,
+				    time_t time)
 {
-	while (!DRMLISTEMPTY(&bucket->head)) {
-		drm_intel_bo_gem *bo_gem;
-
-		bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-				      bucket->head.next, head);
-		if (drm_intel_gem_bo_madvise_internal
-		    (bufmgr_gem, bo_gem, I915_MADV_DONTNEED))
-			break;
-
-		DRMLISTDEL(&bo_gem->head);
-		drm_intel_gem_bo_free(&bo_gem->bo);
+	drm_intel_bo_gem *bo_gem, *temp;
+	DRMLISTFOREACHENTRYSAFE(bo_gem, temp, &bucket->head, head) {
+		if (bo_gem->free_time >= time) {
+			drm_intel_gem_bo_madvise_internal
+				(bufmgr_gem, bo_gem, I915_MADV_DONTNEED);
+			DRMLISTDEL(&bo_gem->head);
+			drm_intel_gem_bo_free(&bo_gem->bo);
+		}
 	}
 }
 
+static drm_intel_bo_gem *
+drm_intel_gem_bo_cached_for_size(drm_intel_bufmgr_gem *bufmgr_gem,
+                                 unsigned long size,
+                                 uint32_t tiling_mode,
+                                 unsigned long stride,
+                                 unsigned long alignment,
+                                 bool for_render)
+{
+	struct drm_intel_gem_bo_bucket *bucket =
+		drm_intel_gem_bo_bucket_for_size(bufmgr_gem, size);
+
+	if(bucket != NULL) {
+		drm_intel_bo_gem *bo_gem, *temp_bo_gem;
+retry:
+		bo_gem = NULL;
+		if (for_render) {
+			/* Allocate new render-target BOs from the tail (MRU)
+			 * of the list, as it will likely be hot in the GPU
+			 * cache and in the aperture for us.
+			 */
+			DRMLISTFOREACHENTRYREVERSE(temp_bo_gem, &bucket->head, head) {
+				if (temp_bo_gem->bo.size >= size) {
+					bo_gem = temp_bo_gem;
+					DRMLISTDEL(&bo_gem->head);
+					bo_gem->bo.align = alignment;
+					break;
+				}
+			}
+		} else {
+			assert(alignment == 0);
+			/* For non-render-target BOs (where we're probably
+			 * going to map it first thing in order to fill it
+			 * with data), check if the last BO in the cache is
+			 * unbusy, and only reuse in that case. Otherwise,
+			 * allocating a new buffer is probably faster than
+			 * waiting for the GPU to finish.
+			 */
+			DRMLISTFOREACHENTRY(temp_bo_gem, &bucket->head, head) {
+				if (temp_bo_gem->bo.size >= size &&
+				    !drm_intel_gem_bo_busy(&temp_bo_gem->bo)) {
+					bo_gem = temp_bo_gem;
+					DRMLISTDEL(&bo_gem->head);
+					break;
+				}
+			}
+		}
+
+		if(bo_gem) {
+			if (!drm_intel_gem_bo_madvise_internal(bufmgr_gem, bo_gem,
+							       I915_MADV_WILLNEED)) {
+				/* Failed to reuse the cached buffer. Buffers
+				 * older than this one cannot be reused either
+				 * and can be purged from the bucket.
+				 */
+				drm_intel_gem_bo_cache_purge_bucket(bufmgr_gem, bucket,
+								    bo_gem->free_time);
+				drm_intel_gem_bo_free(&bo_gem->bo);
+				return NULL;
+			}
+
+			if (drm_intel_gem_bo_set_tiling_internal(&bo_gem->bo,
+								 tiling_mode,
+								 stride)) {
+				drm_intel_gem_bo_free(&bo_gem->bo);
+				goto retry;
+			}
+		}
+		return bo_gem;
+	}
+	return NULL;
+}
+
 static drm_intel_bo *
 drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
 				const char *name,
@@ -715,81 +784,22 @@ drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
 {
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
 	drm_intel_bo_gem *bo_gem;
-	unsigned int page_size = getpagesize();
 	int ret;
-	struct drm_intel_gem_bo_bucket *bucket;
-	bool alloc_from_cache;
-	unsigned long bo_size;
 	bool for_render = false;
 
 	if (flags & BO_ALLOC_FOR_RENDER)
 		for_render = true;
 
-	/* Round the allocated size up to a power of two number of pages. */
-	bucket = drm_intel_gem_bo_bucket_for_size(bufmgr_gem, size);
-
-	/* If we don't have caching at this size, don't actually round the
-	 * allocation up.
-	 */
-	if (bucket == NULL) {
-		bo_size = size;
-		if (bo_size < page_size)
-			bo_size = page_size;
-	} else {
-		bo_size = bucket->size;
-	}
+	/* first align the size on page boundary */
+	size = ALIGN(size, getpagesize());
 
 	pthread_mutex_lock(&bufmgr_gem->lock);
+
 	/* Get a buffer out of the cache if available */
-retry:
-	alloc_from_cache = false;
-	if (bucket != NULL && !DRMLISTEMPTY(&bucket->head)) {
-		if (for_render) {
-			/* Allocate new render-target BOs from the tail (MRU)
-			 * of the list, as it will likely be hot in the GPU
-			 * cache and in the aperture for us.
-			 */
-			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-					      bucket->head.prev, head);
-			DRMLISTDEL(&bo_gem->head);
-			alloc_from_cache = true;
-			bo_gem->bo.align = alignment;
-		} else {
-			assert(alignment == 0);
-			/* For non-render-target BOs (where we're probably
-			 * going to map it first thing in order to fill it
-			 * with data), check if the last BO in the cache is
-			 * unbusy, and only reuse in that case. Otherwise,
-			 * allocating a new buffer is probably faster than
-			 * waiting for the GPU to finish.
-			 */
-			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-					      bucket->head.next, head);
-			if (!drm_intel_gem_bo_busy(&bo_gem->bo)) {
-				alloc_from_cache = true;
-				DRMLISTDEL(&bo_gem->head);
-			}
-		}
+	bo_gem = drm_intel_gem_bo_cached_for_size(bufmgr_gem, size, tiling_mode,
+						  stride, alignment, for_render);
 
-		if (alloc_from_cache) {
-			if (!drm_intel_gem_bo_madvise_internal
-			    (bufmgr_gem, bo_gem, I915_MADV_WILLNEED)) {
-				drm_intel_gem_bo_free(&bo_gem->bo);
-				drm_intel_gem_bo_cache_purge_bucket(bufmgr_gem,
-								    bucket);
-				goto retry;
-			}
-
-			if (drm_intel_gem_bo_set_tiling_internal(&bo_gem->bo,
-								 tiling_mode,
-								 stride)) {
-				drm_intel_gem_bo_free(&bo_gem->bo);
-				goto retry;
-			}
-		}
-	}
-
-	if (!alloc_from_cache) {
+	if (bo_gem == NULL) {
 		struct drm_i915_gem_create create;
 
 		bo_gem = calloc(1, sizeof(*bo_gem));
@@ -800,10 +810,10 @@ retry:
 		   list (vma_list), so better set the list head here */
 		DRMINITLISTHEAD(&bo_gem->vma_list);
 
-		bo_gem->bo.size = bo_size;
+		bo_gem->bo.size = size;
 
 		memclear(create);
-		create.size = bo_size;
+		create.size = size;
 
 		ret = drmIoctl(bufmgr_gem->fd,
 			       DRM_IOCTL_I915_GEM_CREATE,
diff --git a/libdrm_lists.h b/libdrm_lists.h
index 8926d8d..400c731 100644
--- a/libdrm_lists.h
+++ b/libdrm_lists.h
@@ -101,6 +101,12 @@ typedef struct _drmMMListHead
 	     (__item) = DRMLISTENTRY(typeof(*__item),                          \
 				     (__item)->__head.next, __head))
 
+#define DRMLISTFOREACHENTRYREVERSE(__item, __list, __head)                     \
+	for ((__item) = DRMLISTENTRY(typeof(*__item), (__list)->prev, __head); \
+	     &(__item)->__head != (__list);                                    \
+	     (__item) = DRMLISTENTRY(typeof(*__item),                          \
+				     (__item)->__head.prev, __head))
+
 #define DRMLISTFOREACHENTRYSAFE(__item, __temp, __list, __head)                \
 	for ((__item) = DRMLISTENTRY(typeof(*__item), (__list)->next, __head), \
 	     (__temp) = DRMLISTENTRY(typeof(*__item),                          \
-- 
2.7.4


* Re: [PATCH 1/1] intel: align reuse buffer's size on page size instead
  2018-03-02 17:53 [PATCH 1/1] intel: align reuse buffer's size on page size instead James Xiong
@ 2018-03-03  9:45 ` Chris Wilson
  2018-03-15 16:29   ` Xiong, James
  0 siblings, 1 reply; 3+ messages in thread
From: Chris Wilson @ 2018-03-03  9:45 UTC (permalink / raw)
  To: dri-devel, intel-gfx; +Cc: james.xiong

Quoting James Xiong (2018-03-02 17:53:04)
> From: "Xiong, James" <james.xiong@intel.com>
> 
> With gem_reuse enabled, when a buffer's size differs from the bucket
> sizes, it is rounded up to the next bucket's size, which means up to
> about 25% more memory than requested is allocated in the worst-case
> scenario. For example:
> 
> Original size   Actual size
> 32KB+1Byte      40KB
> .
> .
> .
> 8MB+1Byte       10MB
> .
> .
> .
> 96MB+1Byte      112MB
> 
> This is very expensive in memory and makes the reuse feature less
> favorable than it deserves to be.
> 
> This commit aligns the reuse buffer's size to the page size instead:
> a buffer whose size falls between bucket[n] and bucket[n+1] is put in
> bucket[n] when it is released. When searching for a cached buffer to
> reuse, the allocator walks the bucket's list of cached buffers until
> it finds one whose size is larger than or equal to the requested size.
> 
> In gfxbench tests, performance and reuse ratios (reuse count /
> allocation count) were the same as before, while memory usage dropped
> by 1% ~ 7% (gl_manhattan: peak allocated memory was reduced from
> 448401408 to 419078144 bytes).

Apart from the fact that GL doesn't use this... And what is the impact
on !llc, where buffer reuse is more important? (Every new buffer
requires clflushing.)

> Signed-off-by: Xiong, James <james.xiong@intel.com>
> ---
>  intel/intel_bufmgr_gem.c | 180 +++++++++++++++++++++++++----------------------
>  libdrm_lists.h           |   6 ++
>  2 files changed, 101 insertions(+), 85 deletions(-)
> 
> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> index 386da30..5b2d0d0 100644
> --- a/intel/intel_bufmgr_gem.c
> +++ b/intel/intel_bufmgr_gem.c
> @@ -402,11 +402,10 @@ drm_intel_gem_bo_bucket_for_size(drm_intel_bufmgr_gem *bufmgr_gem,
>  {
>         int i;
>  
> -       for (i = 0; i < bufmgr_gem->num_buckets; i++) {
> -               struct drm_intel_gem_bo_bucket *bucket =
> -                   &bufmgr_gem->cache_bucket[i];
> -               if (bucket->size >= size) {
> -                       return bucket;
> +       for (i = 0; i < bufmgr_gem->num_buckets - 1; i++) {
> +               if (size >= bufmgr_gem->cache_bucket[i].size &&
> +                   size < bufmgr_gem->cache_bucket[i+1].size) {
> +                       return &bufmgr_gem->cache_bucket[i];

Are your buckets not ordered correctly?

Please reduce this patch to a series of small functional changes required to
bring into effect having mixed, page-aligned bo->size within a bucket.
-Chris

* Re: [PATCH 1/1] intel: align reuse buffer's size on page size instead
  2018-03-03  9:45 ` Chris Wilson
@ 2018-03-15 16:29   ` Xiong, James
  0 siblings, 0 replies; 3+ messages in thread
From: Xiong, James @ 2018-03-15 16:29 UTC (permalink / raw)
  To: Chris Wilson, dri-devel, intel-gfx

Thanks for the review, Chris. Sorry for the late response.

>-----Original Message-----
>From: dri-devel [mailto:dri-devel-bounces@lists.freedesktop.org] On Behalf Of Chris Wilson
>Sent: Saturday, March 3, 2018 1:46 AM
>To: Xiong, James <james.xiong@intel.com>; dri-devel@lists.freedesktop.org; intel-gfx@lists.freedesktop.org
>Cc: Xiong, James <james.xiong@intel.com>
>Subject: Re: [Intel-gfx] [PATCH 1/1] intel: align reuse buffer's size on page size instead
>
>Quoting James Xiong (2018-03-02 17:53:04)
>> From: "Xiong, James" <james.xiong@intel.com>
>> 
>> With gem_reuse enabled, when a buffer's size differs from the bucket
>> sizes, it is rounded up to the next bucket's size, which means up to
>> about 25% more memory than requested is allocated in the worst-case
>> scenario. For example:
>> 
>> Original size   Actual size
>> 32KB+1Byte      40KB
>> .
>> .
>> .
>> 8MB+1Byte       10MB
>> .
>> .
>> .
>> 96MB+1Byte      112MB
>> 
>> This is very expensive in memory and makes the reuse feature less
>> favorable than it deserves to be.
>> 
>> This commit aligns the reuse buffer's size to the page size instead:
>> a buffer whose size falls between bucket[n] and bucket[n+1] is put in
>> bucket[n] when it is released. When searching for a cached buffer to
>> reuse, the allocator walks the bucket's list of cached buffers until
>> it finds one whose size is larger than or equal to the requested size.
>> 
>> In gfxbench tests, performance and reuse ratios (reuse count /
>> allocation count) were the same as before, while memory usage dropped
>> by 1% ~ 7% (gl_manhattan: peak allocated memory was reduced from
>> 448401408 to 419078144 bytes).
>
>Apart from the fact that GL doesn't use this... And what is the impact on !llc, where buffer reuse is more important? (Every new buffer requires clflushing.)
The tests were run on a Gen9 platform that doesn't support LLC. Please let me know if you have any performance tests you'd like me to run.
>
>
>
>> Signed-off-by: Xiong, James <james.xiong@intel.com>
>> ---
>>  intel/intel_bufmgr_gem.c | 180 +++++++++++++++++++++++++----------------------
>>  libdrm_lists.h           |   6 ++
>>  2 files changed, 101 insertions(+), 85 deletions(-)
>> 
>> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c index 
>> 386da30..5b2d0d0 100644
>> --- a/intel/intel_bufmgr_gem.c
>> +++ b/intel/intel_bufmgr_gem.c
>> @@ -402,11 +402,10 @@ 
>> drm_intel_gem_bo_bucket_for_size(drm_intel_bufmgr_gem *bufmgr_gem,  {
>>         int i;
>>  
>> -       for (i = 0; i < bufmgr_gem->num_buckets; i++) {
>> -               struct drm_intel_gem_bo_bucket *bucket =
>> -                   &bufmgr_gem->cache_bucket[i];
>> -               if (bucket->size >= size) {
>> -                       return bucket;
>> +       for (i = 0; i < bufmgr_gem->num_buckets - 1; i++) {
>> +               if (size >= bufmgr_gem->cache_bucket[i].size &&
>> +                   size < bufmgr_gem->cache_bucket[i+1].size) {
>> +                       return &bufmgr_gem->cache_bucket[i];
>
>Are your buckets not ordered correctly?
The order is the same as before, and so are the buckets' sizes.
>
>Please reduce this patch to a series of small functional changes required to bring into effect having mixed, page-aligned bo->size within a bucket.
Will do.
>-Chris