* [PATCH v3 1/1] drm/mm: optimize rb_hole_addr rbtree search
From: Nirmoy Das @ 2020-05-04 10:42 UTC
To: dri-devel
Cc: kbuild test robot, nirmoy.aiemd, Nirmoy Das, christian.koenig, chris
Userspace can severely fragment the rb_hole_addr rbtree by manipulating
alignment while allocating buffers. A fragmented rb_hole_addr rbtree
results in large delays while allocating a buffer object for a
userspace application: it takes a long time to find a suitable hole
because, if the first attempt fails, we look for neighbouring nodes
using rb_prev()/rb_next(), and traversing the rbtree that way can take
a very long time if the tree is fragmented.
This patch improves searches in fragmented rb_hole_addr rbtree by
modifying it to an augmented rbtree which will store an extra field
in drm_mm_node, subtree_max_hole. Each drm_mm_node now stores maximum
hole size for its subtree in drm_mm_node->subtree_max_hole. Using
drm_mm_node->subtree_max_hole, it is possible to eliminate complete subtree
if that subtree is unable to serve a request, hence reducing the number
of rb_prev()/rb_next() calls.
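To illustrate the pruning, here is a simplified sketch of an
augmented-tree search (illustrative only, with made-up names such as
struct node and first_fit(); not the actual drm_mm code):

typedef unsigned long long u64;

struct node {
	struct node *left, *right;
	u64 hole_size;        /* size of the hole behind this node */
	u64 subtree_max_hole; /* max hole_size anywhere in this subtree */
};

/* Find the lowest-addressed hole of at least @size, skipping every
 * subtree whose subtree_max_hole is too small to contain a match. */
static struct node *first_fit(struct node *n, u64 size)
{
	while (n) {
		if (n->left && n->left->subtree_max_hole >= size)
			n = n->left;	/* a lower-addressed fit exists */
		else if (n->hole_size >= size)
			return n;	/* this node's own hole fits */
		else if (n->right && n->right->subtree_max_hole >= size)
			n = n->right;	/* only the right side can fit */
		else
			return NULL;	/* whole subtree pruned */
	}
	return NULL;
}

Without subtree_max_hole the only fallback is the node-by-node
rb_prev()/rb_next() walk, which is what degrades on a fragmented tree.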
With this patch applied, 1 million bo allocations on amdgpu took ~8 sec and
without the patch the test code took 28 sec for only 50k bo allocs.
partial test code:

/* assumed scaffolding, not part of the original snippet */
#include <fcntl.h>      /* open(), O_RDWR, O_CLOEXEC */
#include <stdint.h>
#include "amdgpu.h"     /* libdrm_amdgpu */
#include "amdgpu_drm.h"

#define MAX_ALLOC 1000000 /* value assumed; the runs above used 50k and 1M */

int test_fragmentation(void)
{
int i = 0;
uint32_t minor_version;
uint32_t major_version;
struct amdgpu_bo_alloc_request request = {};
amdgpu_bo_handle vram_handle[MAX_ALLOC] = {};
amdgpu_device_handle device_handle;
request.alloc_size = 4096;
request.phys_alignment = 8192;
request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
int fd = open("/dev/dri/card0", O_RDWR | O_CLOEXEC);
amdgpu_device_initialize(fd, &major_version, &minor_version,
&device_handle);
for (i = 0; i < MAX_ALLOC; i++) {
amdgpu_bo_alloc(device_handle, &request, &vram_handle[i]);
}
for (i = 0; i < MAX_ALLOC; i++)
amdgpu_bo_free(vram_handle[i]);
return 0;
}
v2:
Use RB_DECLARE_CALLBACKS_MAX to maintain subtree_max_hole
v3:
insert_hole_addr() should be a static function
fix return value of next_hole_high_addr()/next_hole_low_addr()
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/drm_mm.c | 133 +++++++++++++++++++++++++++++++++------
include/drm/drm_mm.h | 1 +
2 files changed, 115 insertions(+), 19 deletions(-)
diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c
index 8981abe8b7c9..f4ca1ff80af9 100644
--- a/drivers/gpu/drm/drm_mm.c
+++ b/drivers/gpu/drm/drm_mm.c
@@ -212,20 +212,6 @@ static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node,
&drm_mm_interval_tree_augment);
}
-#define RB_INSERT(root, member, expr) do { \
- struct rb_node **link = &root.rb_node, *rb = NULL; \
- u64 x = expr(node); \
- while (*link) { \
- rb = *link; \
- if (x < expr(rb_entry(rb, struct drm_mm_node, member))) \
- link = &rb->rb_left; \
- else \
- link = &rb->rb_right; \
- } \
- rb_link_node(&node->member, rb, link); \
- rb_insert_color(&node->member, &root); \
-} while (0)
-
#define HOLE_SIZE(NODE) ((NODE)->hole_size)
#define HOLE_ADDR(NODE) (__drm_mm_hole_node_start(NODE))
@@ -255,16 +241,42 @@ static void insert_hole_size(struct rb_root_cached *root,
rb_insert_color_cached(&node->rb_hole_size, root, first);
}
+RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks,
+ struct drm_mm_node, rb_hole_addr,
+ u64, subtree_max_hole, HOLE_SIZE)
+
+static void insert_hole_addr(struct rb_root *root, struct drm_mm_node *node)
+{
+ struct rb_node **link = &root->rb_node, *rb_parent = NULL;
+ u64 start = HOLE_ADDR(node), subtree_max_hole = node->subtree_max_hole;
+ struct drm_mm_node *parent;
+
+ while (*link) {
+ rb_parent = *link;
+ parent = rb_entry(rb_parent, struct drm_mm_node, rb_hole_addr);
+ if (parent->subtree_max_hole < subtree_max_hole)
+ parent->subtree_max_hole = subtree_max_hole;
+ if (start < HOLE_ADDR(parent))
+ link = &parent->rb_hole_addr.rb_left;
+ else
+ link = &parent->rb_hole_addr.rb_right;
+ }
+
+ rb_link_node(&node->rb_hole_addr, rb_parent, link);
+ rb_insert_augmented(&node->rb_hole_addr, root, &augment_callbacks);
+}
+
static void add_hole(struct drm_mm_node *node)
{
struct drm_mm *mm = node->mm;
node->hole_size =
__drm_mm_hole_node_end(node) - __drm_mm_hole_node_start(node);
+ node->subtree_max_hole = node->hole_size;
DRM_MM_BUG_ON(!drm_mm_hole_follows(node));
insert_hole_size(&mm->holes_size, node);
- RB_INSERT(mm->holes_addr, rb_hole_addr, HOLE_ADDR);
+ insert_hole_addr(&mm->holes_addr, node);
list_add(&node->hole_stack, &mm->hole_stack);
}
@@ -275,8 +287,10 @@ static void rm_hole(struct drm_mm_node *node)
list_del(&node->hole_stack);
rb_erase_cached(&node->rb_hole_size, &node->mm->holes_size);
- rb_erase(&node->rb_hole_addr, &node->mm->holes_addr);
+ rb_erase_augmented(&node->rb_hole_addr, &node->mm->holes_addr,
+ &augment_callbacks);
node->hole_size = 0;
+ node->subtree_max_hole = 0;
DRM_MM_BUG_ON(drm_mm_hole_follows(node));
}
@@ -361,9 +375,90 @@ first_hole(struct drm_mm *mm,
}
}
+/**
+ * next_hole_high_addr - returns next hole for a DRM_MM_INSERT_HIGH mode request
+ * @entry: previously selected drm_mm_node
+ * @size: size of the hole needed for the request
+ *
+ * This function will verify whether the left subtree of @entry has a hole big
+ * enough to fit the requested size. If so, it will return the previous node
+ * of @entry; otherwise it will return the parent node of @entry.
+ *
+ * It will also skip the complete left subtree if the subtree_max_hole of that
+ * subtree is the same as the subtree_max_hole of @entry.
+ *
+ * Returns:
+ * the previous node of @entry if the left subtree of @entry can serve the
+ * request, or else the parent of @entry
+ */
+static struct drm_mm_node *
+next_hole_high_addr(struct drm_mm_node *entry, u64 size)
+{
+ struct rb_node *rb_node, *left_rb_node, *parent_rb_node;
+ struct drm_mm_node *left_node;
+
+ if (!entry)
+ return NULL;
+
+ rb_node = &entry->rb_hole_addr;
+ if (rb_node->rb_left) {
+ left_rb_node = rb_node->rb_left;
+ parent_rb_node = rb_parent(rb_node);
+ left_node = rb_entry(left_rb_node,
+ struct drm_mm_node, rb_hole_addr);
+ if ((left_node->subtree_max_hole < size ||
+ entry->size == entry->subtree_max_hole) &&
+ parent_rb_node && parent_rb_node->rb_left != rb_node)
+ return rb_hole_addr_to_node(parent_rb_node);
+ }
+
+ return rb_hole_addr_to_node(rb_prev(rb_node));
+}
+
+/**
+ * next_hole_low_addr - returns next hole for a DRM_MM_INSERT_LOW mode request
+ * @entry: previously selected drm_mm_node
+ * @size: size of the hole needed for the request
+ *
+ * This function will verify whether the right subtree of @entry has a hole big
+ * enough to fit the requested size. If so, it will return the next node
+ * of @entry; otherwise it will return the parent node of @entry.
+ *
+ * It will also skip the complete right subtree if the subtree_max_hole of that
+ * subtree is the same as the subtree_max_hole of @entry.
+ *
+ * Returns:
+ * the next node of @entry if the right subtree of @entry can serve the
+ * request, or else the parent of @entry
+ */
+static struct drm_mm_node *
+next_hole_low_addr(struct drm_mm_node *entry, u64 size)
+{
+ struct rb_node *rb_node, *right_rb_node, *parent_rb_node;
+ struct drm_mm_node *right_node;
+
+ if (!entry)
+ return NULL;
+
+ rb_node = &entry->rb_hole_addr;
+ if (rb_node->rb_right) {
+ right_rb_node = rb_node->rb_right;
+ parent_rb_node = rb_parent(rb_node);
+ right_node = rb_entry(right_rb_node,
+ struct drm_mm_node, rb_hole_addr);
+ if ((right_node->subtree_max_hole < size ||
+ entry->size == entry->subtree_max_hole) &&
+ parent_rb_node && parent_rb_node->rb_right != rb_node)
+ return rb_hole_addr_to_node(parent_rb_node);
+ }
+
+ return rb_hole_addr_to_node(rb_next(rb_node));
+}
+
static struct drm_mm_node *
next_hole(struct drm_mm *mm,
struct drm_mm_node *node,
+ u64 size,
enum drm_mm_insert_mode mode)
{
switch (mode) {
@@ -372,10 +467,10 @@ next_hole(struct drm_mm *mm,
return rb_hole_size_to_node(rb_prev(&node->rb_hole_size));
case DRM_MM_INSERT_LOW:
- return rb_hole_addr_to_node(rb_next(&node->rb_hole_addr));
+ return next_hole_low_addr(node, size);
case DRM_MM_INSERT_HIGH:
- return rb_hole_addr_to_node(rb_prev(&node->rb_hole_addr));
+ return next_hole_high_addr(node, size);
case DRM_MM_INSERT_EVICT:
node = list_next_entry(node, hole_stack);
@@ -489,7 +584,7 @@ int drm_mm_insert_node_in_range(struct drm_mm * const mm,
remainder_mask = is_power_of_2(alignment) ? alignment - 1 : 0;
for (hole = first_hole(mm, range_start, range_end, size, mode);
hole;
- hole = once ? NULL : next_hole(mm, hole, mode)) {
+ hole = once ? NULL : next_hole(mm, hole, size, mode)) {
u64 hole_start = __drm_mm_hole_node_start(hole);
u64 hole_end = hole_start + hole->hole_size;
u64 adj_start, adj_end;
diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h
index ee8b0e80ca90..a01bc6fac83c 100644
--- a/include/drm/drm_mm.h
+++ b/include/drm/drm_mm.h
@@ -168,6 +168,7 @@ struct drm_mm_node {
struct rb_node rb_hole_addr;
u64 __subtree_last;
u64 hole_size;
+ u64 subtree_max_hole;
unsigned long flags;
#define DRM_MM_NODE_ALLOCATED_BIT 0
#define DRM_MM_NODE_SCANNED_BIT 1
--
2.26.1
* Re: [PATCH v3 1/1] drm/mm: optimize rb_hole_addr rbtree search
From: Christian König @ 2020-05-04 13:35 UTC
To: Nirmoy Das, dri-devel; +Cc: nirmoy.aiemd, kbuild test robot, chris
On 04.05.20 12:42, Nirmoy Das wrote:
> Userspace can severely fragment the rb_hole_addr rbtree by manipulating
> alignment while allocating buffers. A fragmented rb_hole_addr rbtree
> results in large delays while allocating a buffer object for a
> userspace application: it takes a long time to find a suitable hole
> because, if the first attempt fails, we look for neighbouring nodes
> using rb_prev()/rb_next(), and traversing the rbtree that way can take
> a very long time if the tree is fragmented.
>
> This patch improves searches in fragmented rb_hole_addr rbtree by
> modifying it to an augmented rbtree which will store an extra field
> in drm_mm_node, subtree_max_hole. Each drm_mm_node now stores maximum
> hole size for its subtree in drm_mm_node->subtree_max_hole. Using
> drm_mm_node->subtree_max_hole, it is possible to eliminate complete subtree
"eliminate a complete subtree", but I'm not a native speaker either so
take this with a grain of salt.
> if that subtree is unable to serve a request, hence reducing the number
> of rb_prev()/rb_next() calls.
>
> With this patch applied, 1 million bo allocations on amdgpu took ~8 sec and
"1 million bo allocs on amdgpu took ~8 sec, compared to 50k bo allocs
took 28 sec without".
E.g. keep the numbers in the same order for easy compare.
> without the patch the test code took 28 sec for only 50k bo allocs.
>
> partial test code:
> int test_fragmentation(void)
> {
>
> int i = 0;
> uint32_t minor_version;
> uint32_t major_version;
>
> struct amdgpu_bo_alloc_request request = {};
> amdgpu_bo_handle vram_handle[MAX_ALLOC] = {};
> amdgpu_device_handle device_handle;
>
> request.alloc_size = 4096;
> request.phys_alignment = 8192;
> request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
>
> int fd = open("/dev/dri/card0", O_RDWR | O_CLOEXEC);
> amdgpu_device_initialize(fd, &major_version, &minor_version,
> &device_handle);
>
> for (i = 0; i < MAX_ALLOC; i++) {
> amdgpu_bo_alloc(device_handle, &request, &vram_handle[i]);
> }
>
> for (i = 0; i < MAX_ALLOC; i++)
> amdgpu_bo_free(vram_handle[i]);
>
> return 0;
> }
>
> v2:
> Use RB_DECLARE_CALLBACKS_MAX to maintain subtree_max_hole
> v3:
> insert_hole_addr() should be a static function
> fix return value of next_hole_high_addr()/next_hole_low_addr()
> Reported-by: kbuild test robot <lkp@intel.com>
>
> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
> drivers/gpu/drm/drm_mm.c | 133 +++++++++++++++++++++++++++++++++------
> include/drm/drm_mm.h | 1 +
> 2 files changed, 115 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c
> index 8981abe8b7c9..f4ca1ff80af9 100644
> --- a/drivers/gpu/drm/drm_mm.c
> +++ b/drivers/gpu/drm/drm_mm.c
> @@ -212,20 +212,6 @@ static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node,
> &drm_mm_interval_tree_augment);
> }
>
> -#define RB_INSERT(root, member, expr) do { \
> - struct rb_node **link = &root.rb_node, *rb = NULL; \
> - u64 x = expr(node); \
> - while (*link) { \
> - rb = *link; \
> - if (x < expr(rb_entry(rb, struct drm_mm_node, member))) \
> - link = &rb->rb_left; \
> - else \
> - link = &rb->rb_right; \
> - } \
> - rb_link_node(&node->member, rb, link); \
> - rb_insert_color(&node->member, &root); \
> -} while (0)
> -
> #define HOLE_SIZE(NODE) ((NODE)->hole_size)
> #define HOLE_ADDR(NODE) (__drm_mm_hole_node_start(NODE))
>
> @@ -255,16 +241,42 @@ static void insert_hole_size(struct rb_root_cached *root,
> rb_insert_color_cached(&node->rb_hole_size, root, first);
> }
>
> +RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks,
> + struct drm_mm_node, rb_hole_addr,
> + u64, subtree_max_hole, HOLE_SIZE)
Well that is a really nice solution. I've reviewed the implementation
and couldn't find anything obviously wrong.
But I'm neither an expert on the drm_mm code nor the augmented RB tree
implementation. So Acked-by: Christian König <christian.koenig@amd.com>
for this patch.
As a follow up I think we should extend this to keep track of
subtree_max_hole_align, with HOLE_SIZE_ALIGN(NODE) defined as
(((NODE)->hole_size) << 6) | ffs(HOLE_ADDR(NODE)).
This way we can also handle the ugly case of 4K allocations which are
8K aligned.
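A rough sketch of that key (hypothetical and untested; a real version
would need a 64-bit ffs variant and must make sure the alignment bits
fit into the low 6 bits):

/*
 * Pack the natural alignment of the hole's start address (position of
 * its lowest set bit) into the low 6 bits of the key and shift the
 * size above them. A search for size S with alignment A can then prune
 * every subtree whose maximum key is below (S << 6) | ffs(A).
 */
#define HOLE_SIZE_ALIGN(NODE) \
	((HOLE_SIZE(NODE) << 6) | ffs(HOLE_ADDR(NODE)))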
Regards,
Christian.
> +
> +static void insert_hole_addr(struct rb_root *root, struct drm_mm_node *node)
> +{
> + struct rb_node **link = &root->rb_node, *rb_parent = NULL;
> + u64 start = HOLE_ADDR(node), subtree_max_hole = node->subtree_max_hole;
> + struct drm_mm_node *parent;
> +
> + while (*link) {
> + rb_parent = *link;
> + parent = rb_entry(rb_parent, struct drm_mm_node, rb_hole_addr);
> + if (parent->subtree_max_hole < subtree_max_hole)
> + parent->subtree_max_hole = subtree_max_hole;
> + if (start < HOLE_ADDR(parent))
> + link = &parent->rb_hole_addr.rb_left;
> + else
> + link = &parent->rb_hole_addr.rb_right;
> + }
> +
> + rb_link_node(&node->rb_hole_addr, rb_parent, link);
> + rb_insert_augmented(&node->rb_hole_addr, root, &augment_callbacks);
> +}
> +
> static void add_hole(struct drm_mm_node *node)
> {
> struct drm_mm *mm = node->mm;
>
> node->hole_size =
> __drm_mm_hole_node_end(node) - __drm_mm_hole_node_start(node);
> + node->subtree_max_hole = node->hole_size;
> DRM_MM_BUG_ON(!drm_mm_hole_follows(node));
>
> insert_hole_size(&mm->holes_size, node);
> - RB_INSERT(mm->holes_addr, rb_hole_addr, HOLE_ADDR);
> + insert_hole_addr(&mm->holes_addr, node);
>
> list_add(&node->hole_stack, &mm->hole_stack);
> }
> @@ -275,8 +287,10 @@ static void rm_hole(struct drm_mm_node *node)
>
> list_del(&node->hole_stack);
> rb_erase_cached(&node->rb_hole_size, &node->mm->holes_size);
> - rb_erase(&node->rb_hole_addr, &node->mm->holes_addr);
> + rb_erase_augmented(&node->rb_hole_addr, &node->mm->holes_addr,
> + &augment_callbacks);
> node->hole_size = 0;
> + node->subtree_max_hole = 0;
>
> DRM_MM_BUG_ON(drm_mm_hole_follows(node));
> }
> @@ -361,9 +375,90 @@ first_hole(struct drm_mm *mm,
> }
> }
>
> +/**
> + * next_hole_high_addr - returns next hole for a DRM_MM_INSERT_HIGH mode request
> + * @entry: previously selected drm_mm_node
> + * @size: size of the hole needed for the request
> + *
> + * This function will verify whether the left subtree of @entry has a hole big
> + * enough to fit the requested size. If so, it will return the previous node
> + * of @entry; otherwise it will return the parent node of @entry.
> + *
> + * It will also skip the complete left subtree if the subtree_max_hole of that
> + * subtree is the same as the subtree_max_hole of @entry.
> + *
> + * Returns:
> + * the previous node of @entry if the left subtree of @entry can serve the
> + * request, or else the parent of @entry
> + */
> +static struct drm_mm_node *
> +next_hole_high_addr(struct drm_mm_node *entry, u64 size)
> +{
> + struct rb_node *rb_node, *left_rb_node, *parent_rb_node;
> + struct drm_mm_node *left_node;
> +
> + if (!entry)
> + return NULL;
> +
> + rb_node = &entry->rb_hole_addr;
> + if (rb_node->rb_left) {
> + left_rb_node = rb_node->rb_left;
> + parent_rb_node = rb_parent(rb_node);
> + left_node = rb_entry(left_rb_node,
> + struct drm_mm_node, rb_hole_addr);
> + if ((left_node->subtree_max_hole < size ||
> + entry->size == entry->subtree_max_hole) &&
> + parent_rb_node && parent_rb_node->rb_left != rb_node)
> + return rb_hole_addr_to_node(parent_rb_node);
> + }
> +
> + return rb_hole_addr_to_node(rb_prev(rb_node));
> +}
> +
> +/**
> + * next_hole_low_addr - returns next hole for a DRM_MM_INSERT_LOW mode request
> + * @entry: previously selected drm_mm_node
> + * @size: size of the hole needed for the request
> + *
> + * This function will verify whether the right subtree of @entry has a hole big
> + * enough to fit the requested size. If so, it will return the next node
> + * of @entry; otherwise it will return the parent node of @entry.
> + *
> + * It will also skip the complete right subtree if the subtree_max_hole of that
> + * subtree is the same as the subtree_max_hole of @entry.
> + *
> + * Returns:
> + * the next node of @entry if the right subtree of @entry can serve the
> + * request, or else the parent of @entry
> + */
> +static struct drm_mm_node *
> +next_hole_low_addr(struct drm_mm_node *entry, u64 size)
> +{
> + struct rb_node *rb_node, *right_rb_node, *parent_rb_node;
> + struct drm_mm_node *right_node;
> +
> + if (!entry)
> + return NULL;
> +
> + rb_node = &entry->rb_hole_addr;
> + if (rb_node->rb_right) {
> + right_rb_node = rb_node->rb_right;
> + parent_rb_node = rb_parent(rb_node);
> + right_node = rb_entry(right_rb_node,
> + struct drm_mm_node, rb_hole_addr);
> + if ((right_node->subtree_max_hole < size ||
> + entry->size == entry->subtree_max_hole) &&
> + parent_rb_node && parent_rb_node->rb_right != rb_node)
> + return rb_hole_addr_to_node(parent_rb_node);
> + }
> +
> + return rb_hole_addr_to_node(rb_next(rb_node));
> +}
> +
> static struct drm_mm_node *
> next_hole(struct drm_mm *mm,
> struct drm_mm_node *node,
> + u64 size,
> enum drm_mm_insert_mode mode)
> {
> switch (mode) {
> @@ -372,10 +467,10 @@ next_hole(struct drm_mm *mm,
> return rb_hole_size_to_node(rb_prev(&node->rb_hole_size));
>
> case DRM_MM_INSERT_LOW:
> - return rb_hole_addr_to_node(rb_next(&node->rb_hole_addr));
> + return next_hole_low_addr(node, size);
>
> case DRM_MM_INSERT_HIGH:
> - return rb_hole_addr_to_node(rb_prev(&node->rb_hole_addr));
> + return next_hole_high_addr(node, size);
>
> case DRM_MM_INSERT_EVICT:
> node = list_next_entry(node, hole_stack);
> @@ -489,7 +584,7 @@ int drm_mm_insert_node_in_range(struct drm_mm * const mm,
> remainder_mask = is_power_of_2(alignment) ? alignment - 1 : 0;
> for (hole = first_hole(mm, range_start, range_end, size, mode);
> hole;
> - hole = once ? NULL : next_hole(mm, hole, mode)) {
> + hole = once ? NULL : next_hole(mm, hole, size, mode)) {
> u64 hole_start = __drm_mm_hole_node_start(hole);
> u64 hole_end = hole_start + hole->hole_size;
> u64 adj_start, adj_end;
> diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h
> index ee8b0e80ca90..a01bc6fac83c 100644
> --- a/include/drm/drm_mm.h
> +++ b/include/drm/drm_mm.h
> @@ -168,6 +168,7 @@ struct drm_mm_node {
> struct rb_node rb_hole_addr;
> u64 __subtree_last;
> u64 hole_size;
> + u64 subtree_max_hole;
> unsigned long flags;
> #define DRM_MM_NODE_ALLOCATED_BIT 0
> #define DRM_MM_NODE_SCANNED_BIT 1
* Re: [PATCH v3 1/1] drm/mm: optimize rb_hole_addr rbtree search
From: Nirmoy @ 2020-05-04 15:33 UTC
To: Christian König, Nirmoy Das, dri-devel; +Cc: kbuild test robot, chris
On 5/4/20 3:35 PM, Christian König wrote:
> On 04.05.20 12:42, Nirmoy Das wrote:
>> Userspace can severely fragment the rb_hole_addr rbtree by manipulating
>> alignment while allocating buffers. A fragmented rb_hole_addr rbtree
>> results in large delays while allocating a buffer object for a
>> userspace application: it takes a long time to find a suitable hole
>> because, if the first attempt fails, we look for neighbouring nodes
>> using rb_prev()/rb_next(), and traversing the rbtree that way can take
>> a very long time if the tree is fragmented.
>>
>> This patch improves searches in fragmented rb_hole_addr rbtree by
>> modifying it to an augmented rbtree which will store an extra field
>> in drm_mm_node, subtree_max_hole. Each drm_mm_node now stores maximum
>> hole size for its subtree in drm_mm_node->subtree_max_hole. Using
>> drm_mm_node->subtree_max_hole, it is possible to eliminate complete
>> subtree
>
> "eliminate a complete subtree", but I'm not a native speaker either so
> take this with a grain of salt.
No, you are right.
>
>> if that subtree is unable to serve a request, hence reducing the number
>> of rb_prev()/rb_next() calls.
>>
>> With this patch applied, 1 million bo allocations on amdgpu took ~8
>> sec and
>
> "1 million bo allocs on amdgpu took ~8 sec, compared to 50k bo allocs
> took 28 sec without".
>
> E.g. keep the numbers in the same order for easy comparison.
Thanks, this looks easier to understand.
>
>> without the patch the test code took 28 sec for only 50k bo allocs.
>>
>> partial test code:
>> int test_fragmentation(void)
>> {
>>
>> int i = 0;
>> uint32_t minor_version;
>> uint32_t major_version;
>>
>> struct amdgpu_bo_alloc_request request = {};
>> amdgpu_bo_handle vram_handle[MAX_ALLOC] = {};
>> amdgpu_device_handle device_handle;
>>
>> request.alloc_size = 4096;
>> request.phys_alignment = 8192;
>> request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
>>
>> int fd = open("/dev/dri/card0", O_RDWR | O_CLOEXEC);
>> amdgpu_device_initialize(fd, &major_version, &minor_version,
>> &device_handle);
>>
>> for (i = 0; i < MAX_ALLOC; i++) {
>> amdgpu_bo_alloc(device_handle, &request,
>> &vram_handle[i]);
>> }
>>
>> for (i = 0; i < MAX_ALLOC; i++)
>> amdgpu_bo_free(vram_handle[i]);
>>
>> return 0;
>> }
>>
>> v2:
>> Use RB_DECLARE_CALLBACKS_MAX to maintain subtree_max_hole
>> v3:
>> insert_hole_addr() should be a static function
>> fix return value of next_hole_high_addr()/next_hole_low_addr()
>> Reported-by: kbuild test robot <lkp@intel.com>
>>
>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
>> ---
>> drivers/gpu/drm/drm_mm.c | 133 +++++++++++++++++++++++++++++++++------
>> include/drm/drm_mm.h | 1 +
>> 2 files changed, 115 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c
>> index 8981abe8b7c9..f4ca1ff80af9 100644
>> --- a/drivers/gpu/drm/drm_mm.c
>> +++ b/drivers/gpu/drm/drm_mm.c
>> @@ -212,20 +212,6 @@ static void drm_mm_interval_tree_add_node(struct
>> drm_mm_node *hole_node,
>> &drm_mm_interval_tree_augment);
>> }
>> -#define RB_INSERT(root, member, expr) do { \
>> - struct rb_node **link = &root.rb_node, *rb = NULL; \
>> - u64 x = expr(node); \
>> - while (*link) { \
>> - rb = *link; \
>> - if (x < expr(rb_entry(rb, struct drm_mm_node, member))) \
>> - link = &rb->rb_left; \
>> - else \
>> - link = &rb->rb_right; \
>> - } \
>> - rb_link_node(&node->member, rb, link); \
>> - rb_insert_color(&node->member, &root); \
>> -} while (0)
>> -
>> #define HOLE_SIZE(NODE) ((NODE)->hole_size)
>> #define HOLE_ADDR(NODE) (__drm_mm_hole_node_start(NODE))
>> @@ -255,16 +241,42 @@ static void insert_hole_size(struct
>> rb_root_cached *root,
>> rb_insert_color_cached(&node->rb_hole_size, root, first);
>> }
>> +RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks,
>> + struct drm_mm_node, rb_hole_addr,
>> + u64, subtree_max_hole, HOLE_SIZE)
>
> Well that is a really nice solution. I've reviewed the implementation
> and couldn't find anything obviously wrong.
>
> But I'm neither an expert on the drm_mm code nor the augmented RB tree
> implementation. So Acked-by: Christian König
> <christian.koenig@amd.com> for this patch.
Thanks, I will resend after updating the commit message.
>
> As a follow up I think we should extend this to keep track of
> subtree_max_hole_align, with HOLE_SIZE_ALIGN(NODE) defined as
> (((NODE)->hole_size) << 6) | ffs(HOLE_ADDR(NODE)).
This should make it even faster.
Regards,
Nirmoy
>
> This way we can also handle the ugly case of 4K allocations which are
> 8K aligned.
>
> Regards,
> Christian.
>
>> +
>> +static void insert_hole_addr(struct rb_root *root, struct
>> drm_mm_node *node)
>> +{
>> + struct rb_node **link = &root->rb_node, *rb_parent = NULL;
>> + u64 start = HOLE_ADDR(node), subtree_max_hole =
>> node->subtree_max_hole;
>> + struct drm_mm_node *parent;
>> +
>> + while (*link) {
>> + rb_parent = *link;
>> + parent = rb_entry(rb_parent, struct drm_mm_node, rb_hole_addr);
>> + if (parent->subtree_max_hole < subtree_max_hole)
>> + parent->subtree_max_hole = subtree_max_hole;
>> + if (start < HOLE_ADDR(parent))
>> + link = &parent->rb_hole_addr.rb_left;
>> + else
>> + link = &parent->rb_hole_addr.rb_right;
>> + }
>> +
>> + rb_link_node(&node->rb_hole_addr, rb_parent, link);
>> + rb_insert_augmented(&node->rb_hole_addr, root, &augment_callbacks);
>> +}
>> +
>> static void add_hole(struct drm_mm_node *node)
>> {
>> struct drm_mm *mm = node->mm;
>> node->hole_size =
>> __drm_mm_hole_node_end(node) - __drm_mm_hole_node_start(node);
>> + node->subtree_max_hole = node->hole_size;
>> DRM_MM_BUG_ON(!drm_mm_hole_follows(node));
>> insert_hole_size(&mm->holes_size, node);
>> - RB_INSERT(mm->holes_addr, rb_hole_addr, HOLE_ADDR);
>> + insert_hole_addr(&mm->holes_addr, node);
>> list_add(&node->hole_stack, &mm->hole_stack);
>> }
>> @@ -275,8 +287,10 @@ static void rm_hole(struct drm_mm_node *node)
>> list_del(&node->hole_stack);
>> rb_erase_cached(&node->rb_hole_size, &node->mm->holes_size);
>> - rb_erase(&node->rb_hole_addr, &node->mm->holes_addr);
>> + rb_erase_augmented(&node->rb_hole_addr, &node->mm->holes_addr,
>> + &augment_callbacks);
>> node->hole_size = 0;
>> + node->subtree_max_hole = 0;
>> DRM_MM_BUG_ON(drm_mm_hole_follows(node));
>> }
>> @@ -361,9 +375,90 @@ first_hole(struct drm_mm *mm,
>> }
>> }
>> +/**
>> + * next_hole_high_addr - returns next hole for a DRM_MM_INSERT_HIGH
>> mode request
>> + * @entry: previously selected drm_mm_node
>> + * @size: size of the hole needed for the request
>> + *
>> + * This function will verify whether the left subtree of @entry has a
>> hole big enough
>> + * to fit the requested size. If so, it will return the previous node
>> of @entry;
>> + * otherwise it will return the parent node of @entry.
>> + *
>> + * It will also skip the complete left subtree if the subtree_max_hole
>> of that
>> + * subtree is the same as the subtree_max_hole of @entry.
>> + *
>> + * Returns:
>> + * the previous node of @entry if the left subtree of @entry can serve
>> the request,
>> + * or else the parent of @entry
>> + */
>> +static struct drm_mm_node *
>> +next_hole_high_addr(struct drm_mm_node *entry, u64 size)
>> +{
>> + struct rb_node *rb_node, *left_rb_node, *parent_rb_node;
>> + struct drm_mm_node *left_node;
>> +
>> + if (!entry)
>> + return NULL;
>> +
>> + rb_node = &entry->rb_hole_addr;
>> + if (rb_node->rb_left) {
>> + left_rb_node = rb_node->rb_left;
>> + parent_rb_node = rb_parent(rb_node);
>> + left_node = rb_entry(left_rb_node,
>> + struct drm_mm_node, rb_hole_addr);
>> + if ((left_node->subtree_max_hole < size ||
>> + entry->size == entry->subtree_max_hole) &&
>> + parent_rb_node && parent_rb_node->rb_left != rb_node)
>> + return rb_hole_addr_to_node(parent_rb_node);
>> + }
>> +
>> + return rb_hole_addr_to_node(rb_prev(rb_node));
>> +}
>> +
>> +/**
>> + * next_hole_low_addr - returns next hole for a DRM_MM_INSERT_LOW
>> mode request
>> + * @entry: previously selected drm_mm_node
>> + * @size: size of the hole needed for the request
>> + *
>> + * This function will verify whether the right subtree of @entry has
>> a hole big enough
>> + * to fit the requested size. If so, it will return the next node of
>> @entry;
>> + * otherwise it will return the parent node of @entry.
>> + *
>> + * It will also skip the complete right subtree if the subtree_max_hole
>> of that
>> + * subtree is the same as the subtree_max_hole of @entry.
>> + *
>> + * Returns:
>> + * the next node of @entry if the right subtree of @entry can serve
>> the request,
>> + * or else the parent of @entry
>> + */
>> +static struct drm_mm_node *
>> +next_hole_low_addr(struct drm_mm_node *entry, u64 size)
>> +{
>> + struct rb_node *rb_node, *right_rb_node, *parent_rb_node;
>> + struct drm_mm_node *right_node;
>> +
>> + if (!entry)
>> + return NULL;
>> +
>> + rb_node = &entry->rb_hole_addr;
>> + if (rb_node->rb_right) {
>> + right_rb_node = rb_node->rb_right;
>> + parent_rb_node = rb_parent(rb_node);
>> + right_node = rb_entry(right_rb_node,
>> + struct drm_mm_node, rb_hole_addr);
>> + if ((right_node->subtree_max_hole < size ||
>> + entry->size == entry->subtree_max_hole) &&
>> + parent_rb_node && parent_rb_node->rb_right != rb_node)
>> + return rb_hole_addr_to_node(parent_rb_node);
>> + }
>> +
>> + return rb_hole_addr_to_node(rb_next(rb_node));
>> +}
>> +
>> static struct drm_mm_node *
>> next_hole(struct drm_mm *mm,
>> struct drm_mm_node *node,
>> + u64 size,
>> enum drm_mm_insert_mode mode)
>> {
>> switch (mode) {
>> @@ -372,10 +467,10 @@ next_hole(struct drm_mm *mm,
>> return rb_hole_size_to_node(rb_prev(&node->rb_hole_size));
>> case DRM_MM_INSERT_LOW:
>> - return rb_hole_addr_to_node(rb_next(&node->rb_hole_addr));
>> + return next_hole_low_addr(node, size);
>> case DRM_MM_INSERT_HIGH:
>> - return rb_hole_addr_to_node(rb_prev(&node->rb_hole_addr));
>> + return next_hole_high_addr(node, size);
>> case DRM_MM_INSERT_EVICT:
>> node = list_next_entry(node, hole_stack);
>> @@ -489,7 +584,7 @@ int drm_mm_insert_node_in_range(struct drm_mm *
>> const mm,
>> remainder_mask = is_power_of_2(alignment) ? alignment - 1 : 0;
>> for (hole = first_hole(mm, range_start, range_end, size, mode);
>> hole;
>> - hole = once ? NULL : next_hole(mm, hole, mode)) {
>> + hole = once ? NULL : next_hole(mm, hole, size, mode)) {
>> u64 hole_start = __drm_mm_hole_node_start(hole);
>> u64 hole_end = hole_start + hole->hole_size;
>> u64 adj_start, adj_end;
>> diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h
>> index ee8b0e80ca90..a01bc6fac83c 100644
>> --- a/include/drm/drm_mm.h
>> +++ b/include/drm/drm_mm.h
>> @@ -168,6 +168,7 @@ struct drm_mm_node {
>> struct rb_node rb_hole_addr;
>> u64 __subtree_last;
>> u64 hole_size;
>> + u64 subtree_max_hole;
>> unsigned long flags;
>> #define DRM_MM_NODE_ALLOCATED_BIT 0
>> #define DRM_MM_NODE_SCANNED_BIT 1
>