* [PATCH 0/5] rb_first to rb_first_cached conversion
@ 2018-08-22 19:51 Liu Bo
  2018-08-22 19:51 ` [PATCH 1/5] Btrfs: href_root: use rb_first_cached Liu Bo
                   ` (9 more replies)
  0 siblings, 10 replies; 16+ messages in thread
From: Liu Bo @ 2018-08-22 19:51 UTC (permalink / raw)
  To: linux-btrfs

Several structs in btrfs call rb_first() in a while loop; it'd be
more efficient to do this with rb_first_cached(), which has O(1)
complexity.

This patch set updates five structs that may have a large rb tree in
practice.
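
For reference, the conversion follows the pattern sketched below; this is
a minimal before/after of the rbtree API usage with generic names, not
code taken from any particular patch:

	/* before: plain rb_root */
	struct rb_root root = RB_ROOT;

	rb_link_node(node, parent, p);
	rb_insert_color(node, &root);
	first = rb_first(&root);		/* walks the left spine, O(log N) */
	rb_erase(node, &root);

	/* after: rb_root_cached additionally tracks the leftmost node */
	struct rb_root_cached croot = RB_ROOT_CACHED;

	rb_link_node(node, parent, p);
	/* leftmost == true if the new node became the leftmost one */
	rb_insert_color_cached(node, &croot, leftmost);
	first = rb_first_cached(&croot);	/* reads the cached pointer, O(1) */
	rb_erase_cached(node, &croot);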

Liu Bo (5):
  Btrfs: href_root: use rb_first_cached
  Btrfs: href->ref_tree: use rb_first_cached
  Btrfs: delayed_items: use rb_first_cached
  Btrfs: extent_map: use rb_first_cached
  Btrfs: preftree: use rb_first_cached

 fs/btrfs/backref.c                | 34 +++++++++++++-------------
 fs/btrfs/delayed-inode.c          | 29 +++++++++++++----------
 fs/btrfs/delayed-inode.h          |  4 ++--
 fs/btrfs/delayed-ref.c            | 50 +++++++++++++++++++++++----------------
 fs/btrfs/delayed-ref.h            |  4 ++--
 fs/btrfs/disk-io.c                |  8 +++----
 fs/btrfs/extent-tree.c            | 19 ++++++++-------
 fs/btrfs/extent_map.c             | 27 +++++++++++----------
 fs/btrfs/extent_map.h             |  2 +-
 fs/btrfs/inode.c                  |  4 ++--
 fs/btrfs/tests/extent-map-tests.c |  4 ++--
 fs/btrfs/transaction.c            |  5 ++--
 fs/btrfs/volumes.c                |  5 ++--
 13 files changed, 107 insertions(+), 88 deletions(-)

-- 
1.8.3.1


* [PATCH 1/5] Btrfs: href_root: use rb_first_cached
  2018-08-22 19:51 [PATCH 0/5] rb_first to rb_first_cached conversion Liu Bo
@ 2018-08-22 19:51 ` Liu Bo
  2018-08-22 19:51 ` [PATCH 2/5] Btrfs: href->ref_tree: " Liu Bo
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 16+ messages in thread
From: Liu Bo @ 2018-08-22 19:51 UTC (permalink / raw)
  To: linux-btrfs

rb_first_cached() trades an extra pointer "leftmost" for doing the
same job as rb_first() but in O(1).
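
For context, rb_root_cached (include/linux/rbtree.h) is just a plain
rb_root plus the cached leftmost pointer, and rb_first_cached() reads
that pointer directly:

	struct rb_root_cached {
		struct rb_root rb_root;
		struct rb_node *rb_leftmost;
	};

	/* O(1): no walk down the left spine */
	#define rb_first_cached(root) (root)->rb_leftmost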

Functions manipulating href_root need to get the first entry; this
converts href_root to use rb_first_cached().

Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
---
 fs/btrfs/delayed-ref.c | 26 +++++++++++++++-----------
 fs/btrfs/delayed-ref.h |  2 +-
 fs/btrfs/disk-io.c     |  4 ++--
 fs/btrfs/extent-tree.c |  6 +++---
 fs/btrfs/transaction.c |  5 +++--
 5 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 62ff545ba1f7..4ef4a30ccc3d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,14 +101,15 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
 }
 
 /* insert a new ref to head ref rbtree */
-static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
 						   struct rb_node *node)
 {
-	struct rb_node **p = &root->rb_node;
+	struct rb_node **p = &root->rb_root.rb_node;
 	struct rb_node *parent_node = NULL;
 	struct btrfs_delayed_ref_head *entry;
 	struct btrfs_delayed_ref_head *ins;
 	u64 bytenr;
+	bool leftmost = true;
 
 	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
 	bytenr = ins->bytenr;
@@ -117,16 +118,18 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
 				 href_node);
 
-		if (bytenr < entry->bytenr)
+		if (bytenr < entry->bytenr) {
 			p = &(*p)->rb_left;
-		else if (bytenr > entry->bytenr)
+		} else if (bytenr > entry->bytenr) {
 			p = &(*p)->rb_right;
-		else
+			leftmost = false;
+		} else {
 			return entry;
+		}
 	}
 
 	rb_link_node(node, parent_node, p);
-	rb_insert_color(node, root);
+	rb_insert_color_cached(node, root, leftmost);
 	return NULL;
 }
 
@@ -165,9 +168,10 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
  * match is found.
  */
 static struct btrfs_delayed_ref_head *
-find_ref_head(struct rb_root *root, u64 bytenr,
+find_ref_head(struct btrfs_delayed_ref_root *dr, u64 bytenr,
 	      int return_bigger)
 {
+	struct rb_root *root = &dr->href_root.rb_root;
 	struct rb_node *n;
 	struct btrfs_delayed_ref_head *entry;
 
@@ -187,7 +191,7 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
 		if (bytenr > entry->bytenr) {
 			n = rb_next(&entry->href_node);
 			if (!n)
-				n = rb_first(root);
+				n = rb_first_cached(&dr->href_root);
 			entry = rb_entry(n, struct btrfs_delayed_ref_head,
 					 href_node);
 			return entry;
@@ -357,12 +361,12 @@ struct btrfs_delayed_ref_head *
 
 again:
 	start = delayed_refs->run_delayed_start;
-	head = find_ref_head(&delayed_refs->href_root, start, 1);
+	head = find_ref_head(delayed_refs, start, 1);
 	if (!head && !loop) {
 		delayed_refs->run_delayed_start = 0;
 		start = 0;
 		loop = true;
-		head = find_ref_head(&delayed_refs->href_root, start, 1);
+		head = find_ref_head(delayed_refs, start, 1);
 		if (!head)
 			return NULL;
 	} else if (!head && loop) {
@@ -903,7 +907,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
 {
-	return find_ref_head(&delayed_refs->href_root, bytenr, 0);
+	return find_ref_head(delayed_refs, bytenr, 0);
 }
 
 void __cold btrfs_delayed_ref_exit(void)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d9f2a4ebd5db..88438b6cee45 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -148,7 +148,7 @@ struct btrfs_delayed_data_ref {
 
 struct btrfs_delayed_ref_root {
 	/* head ref rbtree */
-	struct rb_root href_root;
+	struct rb_root_cached href_root;
 
 	/* dirty extent records */
 	struct rb_root dirty_extent_root;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5124c15705ce..dee7bcc13ffa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4203,7 +4203,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 		return ret;
 	}
 
-	while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+	while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
 		struct btrfs_delayed_ref_head *head;
 		struct rb_node *n;
 		bool pin_bytes = false;
@@ -4239,7 +4239,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 		if (head->processing == 0)
 			delayed_refs->num_heads_ready--;
 		atomic_dec(&delayed_refs->num_entries);
-		rb_erase(&head->href_node, &delayed_refs->href_root);
+		rb_erase_cached(&head->href_node, &delayed_refs->href_root);
 		RB_CLEAR_NODE(&head->href_node);
 		spin_unlock(&head->lock);
 		spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index de6f75f5547b..0b0472bc209d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2454,7 +2454,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 		return 1;
 	}
 	delayed_refs->num_heads--;
-	rb_erase(&head->href_node, &delayed_refs->href_root);
+	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
 	RB_CLEAR_NODE(&head->href_node);
 	spin_unlock(&head->lock);
 	spin_unlock(&delayed_refs->lock);
@@ -2940,7 +2940,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			btrfs_create_pending_block_groups(trans);
 
 		spin_lock(&delayed_refs->lock);
-		node = rb_first(&delayed_refs->href_root);
+		node = rb_first_cached(&delayed_refs->href_root);
 		if (!node) {
 			spin_unlock(&delayed_refs->lock);
 			goto out;
@@ -6947,7 +6947,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	 * at this point we have a head with no other entries.  Go
 	 * ahead and process it.
 	 */
-	rb_erase(&head->href_node, &delayed_refs->href_root);
+	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
 	RB_CLEAR_NODE(&head->href_node);
 	atomic_dec(&delayed_refs->num_entries);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b84f5015029..a5c0c2629c8c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -44,7 +44,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 	WARN_ON(refcount_read(&transaction->use_count) == 0);
 	if (refcount_dec_and_test(&transaction->use_count)) {
 		BUG_ON(!list_empty(&transaction->list));
-		WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+		WARN_ON(!RB_EMPTY_ROOT(
+				&transaction->delayed_refs.href_root.rb_root));
 		if (transaction->delayed_refs.pending_csums)
 			btrfs_err(transaction->fs_info,
 				  "pending csums is %llu",
@@ -245,7 +246,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
 
 	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
 
-	cur_trans->delayed_refs.href_root = RB_ROOT;
+	cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
 	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
 	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
 
-- 
1.8.3.1


* [PATCH 2/5] Btrfs: href->ref_tree: use rb_first_cached
  2018-08-22 19:51 [PATCH 0/5] rb_first to rb_first_cached conversion Liu Bo
  2018-08-22 19:51 ` [PATCH 1/5] Btrfs: href_root: use rb_first_cached Liu Bo
@ 2018-08-22 19:51 ` Liu Bo
  2018-08-22 19:51 ` [PATCH 3/5] Btrfs: delayed_items: " Liu Bo
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 16+ messages in thread
From: Liu Bo @ 2018-08-22 19:51 UTC (permalink / raw)
  To: linux-btrfs

rb_first_cached() trades an extra pointer "leftmost" for doing the same
job as rb_first() but in O(1).

Functions manipulating href->ref_tree need to get the first entry; this
converts href->ref_tree to use rb_first_cached().

Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
---
 fs/btrfs/backref.c     |  2 +-
 fs/btrfs/delayed-ref.c | 24 ++++++++++++++----------
 fs/btrfs/delayed-ref.h |  2 +-
 fs/btrfs/disk-io.c     |  4 ++--
 fs/btrfs/extent-tree.c | 13 +++++++------
 5 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ae750b1574a2..bc1ea6e9ea4f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -769,7 +769,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 		btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
 
 	spin_lock(&head->lock);
-	for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
+	for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
 		node = rb_entry(n, struct btrfs_delayed_ref_node,
 				ref_node);
 		if (node->seq > seq)
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 4ef4a30ccc3d..5f4d46e0f2c0 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -133,13 +133,14 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
 	return NULL;
 }
 
-static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_node *tree_insert(struct rb_root_cached *root,
 		struct btrfs_delayed_ref_node *ins)
 {
-	struct rb_node **p = &root->rb_node;
+	struct rb_node **p = &root->rb_root.rb_node;
 	struct rb_node *node = &ins->ref_node;
 	struct rb_node *parent_node = NULL;
 	struct btrfs_delayed_ref_node *entry;
+	bool leftmost = true;
 
 	while (*p) {
 		int comp;
@@ -148,16 +149,18 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
 				 ref_node);
 		comp = comp_refs(ins, entry, true);
-		if (comp < 0)
+		if (comp < 0) {
 			p = &(*p)->rb_left;
-		else if (comp > 0)
+		} else if (comp > 0) {
 			p = &(*p)->rb_right;
-		else
+			leftmost = false;
+		} else {
 			return entry;
+		}
 	}
 
 	rb_link_node(node, parent_node, p);
-	rb_insert_color(node, root);
+	rb_insert_color_cached(node, root, leftmost);
 	return NULL;
 }
 
@@ -231,7 +234,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
 				    struct btrfs_delayed_ref_node *ref)
 {
 	lockdep_assert_held(&head->lock);
-	rb_erase(&ref->ref_node, &head->ref_tree);
+	rb_erase_cached(&ref->ref_node, &head->ref_tree);
 	RB_CLEAR_NODE(&ref->ref_node);
 	if (!list_empty(&ref->add_list))
 		list_del(&ref->add_list);
@@ -300,7 +303,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
 
 	lockdep_assert_held(&head->lock);
 
-	if (RB_EMPTY_ROOT(&head->ref_tree))
+	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
 		return;
 
 	/* We don't have too many refs to merge for data. */
@@ -318,7 +321,8 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
 	spin_unlock(&fs_info->tree_mod_seq_lock);
 
 again:
-	for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+	for (node = rb_first_cached(&head->ref_tree); node;
+	     node = rb_next(node)) {
 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
 		if (seq && ref->seq >= seq)
 			continue;
@@ -573,7 +577,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
 	head_ref->must_insert_reserved = must_insert_reserved;
 	head_ref->is_data = is_data;
 	head_ref->is_system = is_system;
-	head_ref->ref_tree = RB_ROOT;
+	head_ref->ref_tree = RB_ROOT_CACHED;
 	INIT_LIST_HEAD(&head_ref->ref_add_list);
 	RB_CLEAR_NODE(&head_ref->href_node);
 	head_ref->processing = 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 88438b6cee45..c3e3486a126c 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -79,7 +79,7 @@ struct btrfs_delayed_ref_head {
 	struct mutex mutex;
 
 	spinlock_t lock;
-	struct rb_root ref_tree;
+	struct rb_root_cached ref_tree;
 	/* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
 	struct list_head ref_add_list;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dee7bcc13ffa..95a3d33a0454 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4221,11 +4221,11 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 			continue;
 		}
 		spin_lock(&head->lock);
-		while ((n = rb_first(&head->ref_tree)) != NULL) {
+		while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
 			ref = rb_entry(n, struct btrfs_delayed_ref_node,
 				       ref_node);
 			ref->in_tree = 0;
-			rb_erase(&ref->ref_node, &head->ref_tree);
+			rb_erase_cached(&ref->ref_node, &head->ref_tree);
 			RB_CLEAR_NODE(&ref->ref_node);
 			if (!list_empty(&ref->add_list))
 				list_del(&ref->add_list);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0b0472bc209d..e3ee3b7746a8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2374,7 +2374,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_delayed_ref_node *ref;
 
-	if (RB_EMPTY_ROOT(&head->ref_tree))
+	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
 		return NULL;
 
 	/*
@@ -2387,7 +2387,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 		return list_first_entry(&head->ref_add_list,
 				struct btrfs_delayed_ref_node, add_list);
 
-	ref = rb_entry(rb_first(&head->ref_tree),
+	ref = rb_entry(rb_first_cached(&head->ref_tree),
 		       struct btrfs_delayed_ref_node, ref_node);
 	ASSERT(list_empty(&ref->add_list));
 	return ref;
@@ -2448,7 +2448,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 	spin_unlock(&head->lock);
 	spin_lock(&delayed_refs->lock);
 	spin_lock(&head->lock);
-	if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
+	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
 		spin_unlock(&head->lock);
 		spin_unlock(&delayed_refs->lock);
 		return 1;
@@ -2597,7 +2597,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 
 		actual_count++;
 		ref->in_tree = 0;
-		rb_erase(&ref->ref_node, &locked_ref->ref_tree);
+		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
 		RB_CLEAR_NODE(&ref->ref_node);
 		if (!list_empty(&ref->add_list))
 			list_del(&ref->add_list);
@@ -3040,7 +3040,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
 	 * XXX: We should replace this with a proper search function in the
 	 * future.
 	 */
-	for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+	for (node = rb_first_cached(&head->ref_tree); node;
+	     node = rb_next(node)) {
 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
 		/* If it's a shared ref we know a cross reference exists */
 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -6926,7 +6927,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 		goto out_delayed_unlock;
 
 	spin_lock(&head->lock);
-	if (!RB_EMPTY_ROOT(&head->ref_tree))
+	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
 		goto out;
 
 	if (head->extent_op) {
-- 
1.8.3.1


* [PATCH 3/5] Btrfs: delayed_items: use rb_first_cached
  2018-08-22 19:51 [PATCH 0/5] rb_first to rb_first_cached conversion Liu Bo
  2018-08-22 19:51 ` [PATCH 1/5] Btrfs: href_root: use rb_first_cached Liu Bo
  2018-08-22 19:51 ` [PATCH 2/5] Btrfs: href->ref_tree: " Liu Bo
@ 2018-08-22 19:51 ` Liu Bo
  2018-08-22 19:51 ` [PATCH 4/5] Btrfs: extent_map: " Liu Bo
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 16+ messages in thread
From: Liu Bo @ 2018-08-22 19:51 UTC (permalink / raw)
  To: linux-btrfs

rb_first_cached() trades an extra pointer "leftmost" for doing the same
job as rb_first() but in O(1).

Functions manipulating delayed_item need to get the first entry; this
converts it to use rb_first_cached().

Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
---
 fs/btrfs/delayed-inode.c | 29 ++++++++++++++++-------------
 fs/btrfs/delayed-inode.h |  4 ++--
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index db9f45082c61..72569849828e 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -42,8 +42,8 @@ static inline void btrfs_init_delayed_node(
 	delayed_node->root = root;
 	delayed_node->inode_id = inode_id;
 	refcount_set(&delayed_node->refs, 0);
-	delayed_node->ins_root = RB_ROOT;
-	delayed_node->del_root = RB_ROOT;
+	delayed_node->ins_root = RB_ROOT_CACHED;
+	delayed_node->del_root = RB_ROOT_CACHED;
 	mutex_init(&delayed_node->mutex);
 	INIT_LIST_HEAD(&delayed_node->n_list);
 	INIT_LIST_HEAD(&delayed_node->p_list);
@@ -390,7 +390,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
 					struct btrfs_delayed_node *delayed_node,
 					struct btrfs_key *key)
 {
-	return __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+	return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
 					   NULL, NULL);
 }
 
@@ -400,9 +400,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
 {
 	struct rb_node **p, *node;
 	struct rb_node *parent_node = NULL;
-	struct rb_root *root;
+	struct rb_root_cached *root;
 	struct btrfs_delayed_item *item;
 	int cmp;
+	bool leftmost = true;
 
 	if (action == BTRFS_DELAYED_INSERTION_ITEM)
 		root = &delayed_node->ins_root;
@@ -410,7 +411,7 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
 		root = &delayed_node->del_root;
 	else
 		BUG();
-	p = &root->rb_node;
+	p = &root->rb_root.rb_node;
 	node = &ins->rb_node;
 
 	while (*p) {
@@ -419,16 +420,18 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
 				 rb_node);
 
 		cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
-		if (cmp < 0)
+		if (cmp < 0) {
 			p = &(*p)->rb_right;
-		else if (cmp > 0)
+			leftmost = false;
+		} else if (cmp > 0) {
 			p = &(*p)->rb_left;
-		else
+		} else {
 			return -EEXIST;
+		}
 	}
 
 	rb_link_node(node, parent_node, p);
-	rb_insert_color(node, root);
+	rb_insert_color_cached(node, root, leftmost);
 	ins->delayed_node = delayed_node;
 	ins->ins_or_del = action;
 
@@ -468,7 +471,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
 
 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
 {
-	struct rb_root *root;
+	struct rb_root_cached *root;
 	struct btrfs_delayed_root *delayed_root;
 
 	delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
@@ -482,7 +485,7 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
 	else
 		root = &delayed_item->delayed_node->del_root;
 
-	rb_erase(&delayed_item->rb_node, root);
+	rb_erase_cached(&delayed_item->rb_node, root);
 	delayed_item->delayed_node->count--;
 
 	finish_one_item(delayed_root);
@@ -503,7 +506,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
 	struct rb_node *p;
 	struct btrfs_delayed_item *item = NULL;
 
-	p = rb_first(&delayed_node->ins_root);
+	p = rb_first_cached(&delayed_node->ins_root);
 	if (p)
 		item = rb_entry(p, struct btrfs_delayed_item, rb_node);
 
@@ -516,7 +519,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
 	struct rb_node *p;
 	struct btrfs_delayed_item *item = NULL;
 
-	p = rb_first(&delayed_node->del_root);
+	p = rb_first_cached(&delayed_node->del_root);
 	if (p)
 		item = rb_entry(p, struct btrfs_delayed_item, rb_node);
 
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 33536cd681d4..74ae226ffaf0 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -50,8 +50,8 @@ struct btrfs_delayed_node {
 	 * is waiting to be dealt with by the async worker.
 	 */
 	struct list_head p_list;
-	struct rb_root ins_root;
-	struct rb_root del_root;
+	struct rb_root_cached ins_root;
+	struct rb_root_cached del_root;
 	struct mutex mutex;
 	struct btrfs_inode_item inode_item;
 	refcount_t refs;
-- 
1.8.3.1


* [PATCH 4/5] Btrfs: extent_map: use rb_first_cached
  2018-08-22 19:51 [PATCH 0/5] rb_first to rb_first_cached conversion Liu Bo
                   ` (2 preceding siblings ...)
  2018-08-22 19:51 ` [PATCH 3/5] Btrfs: delayed_items: " Liu Bo
@ 2018-08-22 19:51 ` Liu Bo
  2018-08-22 19:51 ` [PATCH 5/5] Btrfs: preftree: " Liu Bo
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 16+ messages in thread
From: Liu Bo @ 2018-08-22 19:51 UTC (permalink / raw)
  To: linux-btrfs

rb_first_cached() trades an extra pointer "leftmost" for doing the
same job as rb_first() but in O(1).

As evict_inode_truncate_pages() removes all extent mappings by always
looking for the first rb entry, it's helpful to use rb_first_cached()
instead.

Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
---
 fs/btrfs/extent_map.c             | 27 +++++++++++++++------------
 fs/btrfs/extent_map.h             |  2 +-
 fs/btrfs/inode.c                  |  4 ++--
 fs/btrfs/tests/extent-map-tests.c |  4 ++--
 fs/btrfs/volumes.c                |  5 +++--
 5 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6648d55e5339..819420eab91d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -34,7 +34,7 @@ void __cold extent_map_exit(void)
  */
 void extent_map_tree_init(struct extent_map_tree *tree)
 {
-	tree->map = RB_ROOT;
+	tree->map = RB_ROOT_CACHED;
 	INIT_LIST_HEAD(&tree->modified_extents);
 	rwlock_init(&tree->lock);
 }
@@ -90,24 +90,27 @@ static u64 range_end(u64 start, u64 len)
 	return start + len;
 }
 
-static int tree_insert(struct rb_root *root, struct extent_map *em)
+static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
 {
-	struct rb_node **p = &root->rb_node;
+	struct rb_node **p = &root->rb_root.rb_node;
 	struct rb_node *parent = NULL;
 	struct extent_map *entry = NULL;
 	struct rb_node *orig_parent = NULL;
 	u64 end = range_end(em->start, em->len);
+	bool leftmost = true;
 
 	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct extent_map, rb_node);
 
-		if (em->start < entry->start)
+		if (em->start < entry->start) {
 			p = &(*p)->rb_left;
-		else if (em->start >= extent_map_end(entry))
+		} else if (em->start >= extent_map_end(entry)) {
 			p = &(*p)->rb_right;
-		else
+			leftmost = false;
+		} else {
 			return -EEXIST;
+		}
 	}
 
 	orig_parent = parent;
@@ -130,7 +133,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
 			return -EEXIST;
 
 	rb_link_node(&em->rb_node, orig_parent, p);
-	rb_insert_color(&em->rb_node, root);
+	rb_insert_color_cached(&em->rb_node, root, leftmost);
 	return 0;
 }
 
@@ -242,7 +245,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->mod_start = merge->mod_start;
 			em->generation = max(em->generation, merge->generation);
 
-			rb_erase(&merge->rb_node, &tree->map);
+			rb_erase_cached(&merge->rb_node, &tree->map);
 			RB_CLEAR_NODE(&merge->rb_node);
 			free_extent_map(merge);
 		}
@@ -254,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 	if (rb && mergable_maps(em, merge)) {
 		em->len += merge->len;
 		em->block_len += merge->block_len;
-		rb_erase(&merge->rb_node, &tree->map);
+		rb_erase_cached(&merge->rb_node, &tree->map);
 		RB_CLEAR_NODE(&merge->rb_node);
 		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
 		em->generation = max(em->generation, merge->generation);
@@ -367,7 +370,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	struct rb_node *next = NULL;
 	u64 end = range_end(start, len);
 
-	rb_node = __tree_search(&tree->map, start, &prev, &next);
+	rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
 	if (!rb_node) {
 		if (prev)
 			rb_node = prev;
@@ -433,7 +436,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 	int ret = 0;
 
 	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
-	rb_erase(&em->rb_node, &tree->map);
+	rb_erase_cached(&em->rb_node, &tree->map);
 	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
 		list_del_init(&em->list);
 	RB_CLEAR_NODE(&em->rb_node);
@@ -449,7 +452,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
 	ASSERT(extent_map_in_tree(cur));
 	if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
 		list_del_init(&cur->list);
-	rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+	rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
 	RB_CLEAR_NODE(&cur->rb_node);
 
 	setup_extent_mapping(tree, new, modified);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 25d985e7532a..6afe786bf6d0 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -49,7 +49,7 @@ struct extent_map {
 };
 
 struct extent_map_tree {
-	struct rb_root map;
+	struct rb_root_cached map;
 	struct list_head modified_extents;
 	rwlock_t lock;
 };
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6e761cec1f2a..10e86f7b5398 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5252,10 +5252,10 @@ static void evict_inode_truncate_pages(struct inode *inode)
 	truncate_inode_pages_final(&inode->i_data);
 
 	write_lock(&map_tree->lock);
-	while (!RB_EMPTY_ROOT(&map_tree->map)) {
+	while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
 		struct extent_map *em;
 
-		node = rb_first(&map_tree->map);
+		node = rb_first_cached(&map_tree->map);
 		em = rb_entry(node, struct extent_map, rb_node);
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 385a5316e4bf..bf15d3a7f20e 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -12,8 +12,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
 	struct extent_map *em;
 	struct rb_node *node;
 
-	while (!RB_EMPTY_ROOT(&em_tree->map)) {
-		node = rb_first(&em_tree->map);
+	while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
+		node = rb_first_cached(&em_tree->map);
 		em = rb_entry(node, struct extent_map, rb_node);
 		remove_extent_mapping(em_tree, em);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index da86706123ff..24d37d22a361 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1613,7 +1613,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
 
 	em_tree = &fs_info->mapping_tree.map_tree;
 	read_lock(&em_tree->lock);
-	n = rb_last(&em_tree->map);
+	n = rb_last(&em_tree->map.rb_root);
 	if (n) {
 		em = rb_entry(n, struct extent_map, rb_node);
 		ret = em->start + em->len;
@@ -7433,7 +7433,8 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
 	int ret = 0;
 
 	read_lock(&em_tree->lock);
-	for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
+	for (node = rb_first_cached(&em_tree->map); node;
+	     node = rb_next(node)) {
 		em = rb_entry(node, struct extent_map, rb_node);
 		if (em->map_lookup->num_stripes !=
 		    em->map_lookup->verified_stripes) {
-- 
1.8.3.1


* [PATCH 5/5] Btrfs: preftree: use rb_first_cached
  2018-08-22 19:51 [PATCH 0/5] rb_first to rb_first_cached conversion Liu Bo
                   ` (3 preceding siblings ...)
  2018-08-22 19:51 ` [PATCH 4/5] Btrfs: extent_map: " Liu Bo
@ 2018-08-22 19:51 ` Liu Bo
  2018-08-23  0:27 ` [PATCH 0/5] rb_first to rb_first_cached conversion Qu Wenruo
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 16+ messages in thread
From: Liu Bo @ 2018-08-22 19:51 UTC (permalink / raw)
  To: linux-btrfs

rb_first_cached() trades an extra pointer "leftmost" for doing the same
job as rb_first() but in O(1).

While resolving indirect refs and missing refs, the code always looks
for the first rb entry in a while loop, so it's helpful to use
rb_first_cached() instead.

Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
---
 fs/btrfs/backref.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index bc1ea6e9ea4f..49f2188e728b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -112,11 +112,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
 }
 
 struct preftree {
-	struct rb_root root;
+	struct rb_root_cached root;
 	unsigned int count;
 };
 
-#define PREFTREE_INIT	{ .root = RB_ROOT, .count = 0 }
+#define PREFTREE_INIT	{ .root = RB_ROOT_CACHED, .count = 0 }
 
 struct preftrees {
 	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
@@ -225,14 +225,15 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
 			      struct prelim_ref *newref,
 			      struct share_check *sc)
 {
-	struct rb_root *root;
+	struct rb_root_cached *root;
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct prelim_ref *ref;
 	int result;
+	bool leftmost = true;
 
 	root = &preftree->root;
-	p = &root->rb_node;
+	p = &root->rb_root.rb_node;
 
 	while (*p) {
 		parent = *p;
@@ -242,6 +243,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
 			p = &(*p)->rb_left;
 		} else if (result > 0) {
 			p = &(*p)->rb_right;
+			leftmost = false;
 		} else {
 			/* Identical refs, merge them and free @newref */
 			struct extent_inode_elem *eie = ref->inode_list;
@@ -272,7 +274,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
 	preftree->count++;
 	trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
 	rb_link_node(&newref->rbnode, parent, p);
-	rb_insert_color(&newref->rbnode, root);
+	rb_insert_color_cached(&newref->rbnode, root, leftmost);
 }
 
 /*
@@ -283,11 +285,11 @@ static void prelim_release(struct preftree *preftree)
 {
 	struct prelim_ref *ref, *next_ref;
 
-	rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
-					     rbnode)
+	rbtree_postorder_for_each_entry_safe(ref, next_ref,
+					     &preftree->root.rb_root, rbnode)
 		free_pref(ref);
 
-	preftree->root = RB_ROOT;
+	preftree->root = RB_ROOT_CACHED;
 	preftree->count = 0;
 }
 
@@ -627,7 +629,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 	 * freeing the entire indirect tree when we're done.  In some test
 	 * cases, the tree can grow quite large (~200k objects).
 	 */
-	while ((rnode = rb_first(&preftrees->indirect.root))) {
+	while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
 		struct prelim_ref *ref;
 
 		ref = rb_entry(rnode, struct prelim_ref, rbnode);
@@ -637,7 +639,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			goto out;
 		}
 
-		rb_erase(&ref->rbnode, &preftrees->indirect.root);
+		rb_erase_cached(&ref->rbnode, &preftrees->indirect.root);
 		preftrees->indirect.count--;
 
 		if (ref->count == 0) {
@@ -717,9 +719,9 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 	struct preftree *tree = &preftrees->indirect_missing_keys;
 	struct rb_node *node;
 
-	while ((node = rb_first(&tree->root))) {
+	while ((node = rb_first_cached(&tree->root))) {
 		ref = rb_entry(node, struct prelim_ref, rbnode);
-		rb_erase(node, &tree->root);
+		rb_erase_cached(node, &tree->root);
 
 		BUG_ON(ref->parent);	/* should not be a direct ref */
 		BUG_ON(ref->key_for_search.type);
@@ -1229,14 +1231,14 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto out;
 
-	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
+	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
 
 	ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
 				    extent_item_pos, total_refs, sc, ignore_offset);
 	if (ret)
 		goto out;
 
-	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
+	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root));
 
 	/*
 	 * This walks the tree of merged and resolved refs. Tree blocks are
@@ -1245,7 +1247,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	 *
 	 * We release the entire tree in one go before returning.
 	 */
-	node = rb_first(&preftrees.direct.root);
+	node = rb_first_cached(&preftrees.direct.root);
 	while (node) {
 		ref = rb_entry(node, struct prelim_ref, rbnode);
 		node = rb_next(&ref->rbnode);
-- 
1.8.3.1


* Re: [PATCH 0/5] rb_first to rb_first_cached conversion
  2018-08-22 19:51 [PATCH 0/5] rb_first to rb_first_cached conversion Liu Bo
                   ` (4 preceding siblings ...)
  2018-08-22 19:51 ` [PATCH 5/5] Btrfs: preftree: " Liu Bo
@ 2018-08-23  0:27 ` Qu Wenruo
  2018-08-23 10:21 ` Holger Hoffstätte
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 16+ messages in thread
From: Qu Wenruo @ 2018-08-23  0:27 UTC (permalink / raw)
  To: Liu Bo, linux-btrfs



On 2018/8/23 3:51 AM, Liu Bo wrote:
> Several structs in btrfs are using rb_first() in a while loop, it'd be
> more efficient to do this with rb_first_cached() which has the O(1)
> complexity.

I'm a little curious about this rb_first()-inside-a-loop usage.

What I can find is mostly deletion + iteration code for an rb tree,
like btrfs_free_qgroup_config() or btrfs_qgroup_account_extents().
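
The shape of that code, as a simplified sketch (not the exact body of
btrfs_free_qgroup_config()):

	while ((node = rb_first(&fs_info->qgroup_tree))) {
		qgroup = rb_entry(node, struct btrfs_qgroup, node);
		rb_erase(node, &fs_info->qgroup_tree);
		kfree(qgroup);
	}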

I'm wondering if there is any special way to do it faster for such
usage, or whether rb_root_cached is the only solution here.

Thanks,
Qu

> 
> This patch set updates five structs which may have a large rb tree in
> practice
> 
> Liu Bo (5):
>   Btrfs: href_root: use rb_first_cached
>   Btrfs: href->ref_tree: use rb_first_cached
>   Btrfs: delayed_items: use rb_first_cached
>   Btrfs: extent_map: use rb_first_cached
>   Btrfs: preftree: use rb_first_cached
> 
>  fs/btrfs/backref.c                | 34 +++++++++++++-------------
>  fs/btrfs/delayed-inode.c          | 29 +++++++++++++----------
>  fs/btrfs/delayed-inode.h          |  4 ++--
>  fs/btrfs/delayed-ref.c            | 50 +++++++++++++++++++++++----------------
>  fs/btrfs/delayed-ref.h            |  4 ++--
>  fs/btrfs/disk-io.c                |  8 +++----
>  fs/btrfs/extent-tree.c            | 19 ++++++++-------
>  fs/btrfs/extent_map.c             | 27 +++++++++++----------
>  fs/btrfs/extent_map.h             |  2 +-
>  fs/btrfs/inode.c                  |  4 ++--
>  fs/btrfs/tests/extent-map-tests.c |  4 ++--
>  fs/btrfs/transaction.c            |  5 ++--
>  fs/btrfs/volumes.c                |  5 ++--
>  13 files changed, 107 insertions(+), 88 deletions(-)
> 



* Re: [PATCH 0/5] rb_first to rb_first_cached conversion
  2018-08-22 19:51 [PATCH 0/5] rb_first to rb_first_cached conversion Liu Bo
                   ` (5 preceding siblings ...)
  2018-08-23  0:27 ` [PATCH 0/5] rb_first to rb_first_cached conversion Qu Wenruo
@ 2018-08-23 10:21 ` Holger Hoffstätte
  2018-08-23 11:26 ` Nikolay Borisov
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 16+ messages in thread
From: Holger Hoffstätte @ 2018-08-23 10:21 UTC (permalink / raw)
  To: Liu Bo, linux-btrfs

On 08/22/18 21:51, Liu Bo wrote:
> Several structs in btrfs are using rb_first() in a while loop, it'd be
> more efficient to do this with rb_first_cached() which has the O(1)
> complexity.
> 
> This patch set updates five structs which may have a large rb tree in
> practice

Great idea, works for me with no bad side effects so far. So:

Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>

cheers
Holger


* Re: [PATCH 0/5] rb_first to rb_first_cached conversion
  2018-08-22 19:51 [PATCH 0/5] rb_first to rb_first_cached conversion Liu Bo
                   ` (6 preceding siblings ...)
  2018-08-23 10:21 ` Holger Hoffstätte
@ 2018-08-23 11:26 ` Nikolay Borisov
  2018-09-11 15:34 ` David Sterba
  2018-09-20 16:49 ` David Sterba
  9 siblings, 0 replies; 16+ messages in thread
From: Nikolay Borisov @ 2018-08-23 11:26 UTC (permalink / raw)
  To: Liu Bo, linux-btrfs



On 22.08.2018 22:51, Liu Bo wrote:
> Several structs in btrfs are using rb_first() in a while loop, it'd be
> more efficient to do this with rb_first_cached() which has the O(1)
> complexity.
> 
> This patch set updates five structs which may have a large rb tree in
> practice
> 
> Liu Bo (5):
>   Btrfs: href_root: use rb_first_cached
>   Btrfs: href->ref_tree: use rb_first_cached
>   Btrfs: delayed_items: use rb_first_cached
>   Btrfs: extent_map: use rb_first_cached
>   Btrfs: preftree: use rb_first_cached
> 
>  fs/btrfs/backref.c                | 34 +++++++++++++-------------
>  fs/btrfs/delayed-inode.c          | 29 +++++++++++++----------
>  fs/btrfs/delayed-inode.h          |  4 ++--
>  fs/btrfs/delayed-ref.c            | 50 +++++++++++++++++++++++----------------
>  fs/btrfs/delayed-ref.h            |  4 ++--
>  fs/btrfs/disk-io.c                |  8 +++----
>  fs/btrfs/extent-tree.c            | 19 ++++++++-------
>  fs/btrfs/extent_map.c             | 27 +++++++++++----------
>  fs/btrfs/extent_map.h             |  2 +-
>  fs/btrfs/inode.c                  |  4 ++--
>  fs/btrfs/tests/extent-map-tests.c |  4 ++--
>  fs/btrfs/transaction.c            |  5 ++--
>  fs/btrfs/volumes.c                |  5 ++--
>  13 files changed, 107 insertions(+), 88 deletions(-)
> 

Do you have any performance data proving this is in fact helping?


* Re: [PATCH 0/5] rb_first to rb_first_cached conversion
  2018-08-22 19:51 [PATCH 0/5] rb_first to rb_first_cached conversion Liu Bo
                   ` (7 preceding siblings ...)
  2018-08-23 11:26 ` Nikolay Borisov
@ 2018-09-11 15:34 ` David Sterba
  2018-09-11 18:31   ` Liu Bo
  2018-09-20 16:49 ` David Sterba
  9 siblings, 1 reply; 16+ messages in thread
From: David Sterba @ 2018-09-11 15:34 UTC (permalink / raw)
  To: Liu Bo; +Cc: linux-btrfs

On Thu, Aug 23, 2018 at 03:51:48AM +0800, Liu Bo wrote:
> Several structs in btrfs are using rb_first() in a while loop, it'd be
> more efficient to do this with rb_first_cached() which has the O(1)
> complexity.

We'd like to see some numbers, though just by reading the code I think
there's going to be a noticeable improvement for some workloads.

There's a common pattern:

while(node = rb_first) {
	entry = rb_entry(node)
	next = rb_next(node)
	rb_erase(node)
	cleanup(entry)
}

rb_first needs to traverse the tree up to logN depth, and rb_erase can
completely reshuffle the tree. With the caching we'll skip the traversal
in rb_first. That's a cached memory access vs. looped pointer
dereference trade-off that IMHO has a clear winner.

As the pattern in many cases traverses the whole tree and removes all
entries, ideally we could get rid of the rebalancing too. The entries
would be cleaned up in a postorder way, i.e. reset the data and kfree at
the end. This requires that the order of the freeing does not matter,
which might not apply to some trees.

I did not find suitable rbtree functions or helpers for that;
rbtree_postorder_for_each_entry_safe does not allow rb_erase, and
rb_erase itself forces the rebalancing internally. But I think we can
implement that.
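
A minimal sketch of what such a destruction could look like, assuming the
order of freeing does not matter and nothing else touches the tree in the
meantime (struct foo, rbnode and cleanup() are placeholders, not existing
btrfs code):

	struct foo *entry, *next;

	rbtree_postorder_for_each_entry_safe(entry, next, &root, rbnode) {
		/* no rb_erase, so no rebalancing at all */
		cleanup(entry);
		kfree(entry);
	}
	/* everything is freed, just reset the root */
	root = RB_ROOT;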

We could possibly use rb_next_postorder, which makes sure that all
children are traversed before the parent, so this is at least something
that could be considered.

In cases where the order of rbnode processing must be preserved, i.e.
the rb_erase must be there, the cached rb tree is all we can do.


* Re: [PATCH 0/5] rb_first to rb_first_cached conversion
  2018-09-11 15:34 ` David Sterba
@ 2018-09-11 18:31   ` Liu Bo
  2018-09-14 15:11     ` David Sterba
  0 siblings, 1 reply; 16+ messages in thread
From: Liu Bo @ 2018-09-11 18:31 UTC (permalink / raw)
  To: dsterba, linux-btrfs

On Tue, Sep 11, 2018 at 05:34:03PM +0200, David Sterba wrote:
> On Thu, Aug 23, 2018 at 03:51:48AM +0800, Liu Bo wrote:
> > Several structs in btrfs are using rb_first() in a while loop, it'd be
> > more efficient to do this with rb_first_cached() which has the O(1)
> > complexity.
> 
> We'd like to see some numbers, though just by reading the code I think
> there's going to be a noticeable improvement for some workloads.
> 
> There's a common pattern:
> 
> while(node = rb_first) {
> 	entry = rb_entry(node)
> 	next = rb_next(node)
> 	rb_erase(node)
> 	cleanup(entry)
> }
> 
> rb_first needs to traverse the tree up to logN depth, rb_erase can
> completely reshuffle the tree. With the caching we'll skip the traversal
> in rb_first. That's a cached memory access vs looped pointer
> dereference trade-off that IMHO has a clear winner.
> 
> As the pattern in many cases traverses the whole tree and removes all
> entries, ideally we could get rid of the rebalancing too. The entries
> would be cleaned up in postorder way, ie. reset the data and kfree in
> the end. This requires that order of the freeing does not matter, which
> might no apply to some trees.

The order of freeing does not matter, but we need the tree to return
the correct next node to free; IOW, we have to maintain the rbtree
until the last two nodes.

> 
> I did not find suitable rbtree functions or helpers for that,
> rbtree_postorder_for_each_entry_safe does not allow rb_erase and
> rb_erase itself forces the rebalancing internally. But I think we can
> implement that.
> 
> We could possibly use rb_next_postorder that makes sure that all
> children are traversed before the parent so this is at least something
> that could considerd.
> 

Qu was inquiring about the same thing; I haven't had time to dig into
it further.

The question we need to answer is whether we care about how fast the
destruction can be, as some of these paths are critical while some are
not.

thanks,
-liubo

> In cases where the order of rbnode processing must be preserved, ie. the
> rb_erase must be there, the cached rb tree is all we can do.


* Re: [PATCH 0/5] rb_first to rb_first_cached conversion
  2018-09-11 18:31   ` Liu Bo
@ 2018-09-14 15:11     ` David Sterba
  2018-09-14 19:14       ` Liu Bo
  0 siblings, 1 reply; 16+ messages in thread
From: David Sterba @ 2018-09-14 15:11 UTC (permalink / raw)
  To: Liu Bo; +Cc: dsterba, linux-btrfs

On Tue, Sep 11, 2018 at 11:31:49AM -0700, Liu Bo wrote:
> On Tue, Sep 11, 2018 at 05:34:03PM +0200, David Sterba wrote:
> > On Thu, Aug 23, 2018 at 03:51:48AM +0800, Liu Bo wrote:
> > > Several structs in btrfs are using rb_first() in a while loop, it'd be
> > > more efficient to do this with rb_first_cached() which has the O(1)
> > > complexity.
> > 
> > We'd like to see some numbers, though just by reading the code I think
> > there's going to be a noticeable improvement for some workloads.
> > 
> > There's a common pattern:
> > 
> > while(node = rb_first) {
> > 	entry = rb_entry(node)
> > 	next = rb_next(node)
> > 	rb_erase(node)
> > 	cleanup(entry)
> > }
> > 
> > rb_first needs to traverse the tree up to logN depth, rb_erase can
> > completely reshuffle the tree. With the caching we'll skip the traversal
> > in rb_first. That's a cached memory access vs looped pointer
> > dereference trade-off that IMHO has a clear winner.
> > 
> > As the pattern in many cases traverses the whole tree and removes all
> > entries, ideally we could get rid of the rebalancing too. The entries
> > would be cleaned up in postorder way, ie. reset the data and kfree in
> > the end. This requires that order of the freeing does not matter, which
> > might no apply to some trees.
> 
> The order of freeing does not matter, but we need the tree to return
> the correct next node to free, IOW, we have to maintain the rbtree
> until the last two nodes.
> 
> > 
> > I did not find suitable rbtree functions or helpers for that,
> > rbtree_postorder_for_each_entry_safe does not allow rb_erase and
> > rb_erase itself forces the rebalancing internally. But I think we can
> > implement that.
> > 
> > We could possibly use rb_next_postorder that makes sure that all
> > children are traversed before the parent so this is at least something
> > that could considerd.
> > 
> 
> Qu was inquiring about the same thing, I haven't got time to dig it
> further.
> 
> The question we need to answer is that whether we care about how fast
> destruction can make, as some are in critical paths while some are
> not.

Yeah, I take __btrfs_return_cluster_to_free_space as an example where
the more efficient tree destruction would shorten the time during which
a lock is held.

In other cases it might complicate the code too much, though the
performance is not critical, e.g. at umount time. Although it'd be good
to optimize that too now that we know how.

And in the remaining cases it's not possible to avoid rb_erase. I had a
closer look at btrfs_cleanup_defrag_inodes and btrfs_put_tree_mod_seq.
Based on that I'd like to add this series to for-next; the first-node
caching is clear enough and safe. We can do the other optimizations
later.


* Re: [PATCH 0/5] rb_first to rb_first_cached conversion
  2018-09-14 15:11     ` David Sterba
@ 2018-09-14 19:14       ` Liu Bo
  2018-09-20 15:56         ` David Sterba
  0 siblings, 1 reply; 16+ messages in thread
From: Liu Bo @ 2018-09-14 19:14 UTC (permalink / raw)
  To: dsterba, linux-btrfs

On Fri, Sep 14, 2018 at 05:11:02PM +0200, David Sterba wrote:
> On Tue, Sep 11, 2018 at 11:31:49AM -0700, Liu Bo wrote:
> > On Tue, Sep 11, 2018 at 05:34:03PM +0200, David Sterba wrote:
> > > On Thu, Aug 23, 2018 at 03:51:48AM +0800, Liu Bo wrote:
> > > > Several structs in btrfs are using rb_first() in a while loop, it'd be
> > > > more efficient to do this with rb_first_cached() which has the O(1)
> > > > complexity.
> > > 
> > > We'd like to see some numbers, though just by reading the code I think
> > > there's going to be a noticeable improvement for some workloads.
> > > 
> > > There's a common pattern:
> > > 
> > > while(node = rb_first) {
> > > 	entry = rb_entry(node)
> > > 	next = rb_next(node)
> > > 	rb_erase(node)
> > > 	cleanup(entry)
> > > }
> > > 
> > > rb_first needs to traverse the tree up to logN depth, rb_erase can
> > > completely reshuffle the tree. With the caching we'll skip the traversal
> > > in rb_first. That's a cached memory access vs looped pointer
> > > dereference trade-off that IMHO has a clear winner.
> > > 
> > > As the pattern in many cases traverses the whole tree and removes all
> > > entries, ideally we could get rid of the rebalancing too. The entries
> > > would be cleaned up in postorder way, ie. reset the data and kfree in
> > > the end. This requires that order of the freeing does not matter, which
> > > might no apply to some trees.
> > 
> > The order of freeing does not matter, but we need the tree to return
> > the correct next node to free, IOW, we have to maintain the rbtree
> > until the last two nodes.
> > 
> > > 
> > > I did not find suitable rbtree functions or helpers for that,
> > > rbtree_postorder_for_each_entry_safe does not allow rb_erase and
> > > rb_erase itself forces the rebalancing internally. But I think we can
> > > implement that.
> > > 
> > > We could possibly use rb_next_postorder that makes sure that all
> > > children are traversed before the parent so this is at least something
> > > that could considerd.
> > > 
> > 
> > Qu was inquiring about the same thing, I haven't got time to dig it
> > further.
> > 
> > The question we need to answer is that whether we care about how fast
> > destruction can make, as some are in critical paths while some are
> > not.
> 
> Yeah, I take __btrfs_return_cluster_to_free_space as an example where
> the more efficient tree destruction would shorten the time where a lock
> is held.
> 
> In other cases it might complicate the code too much, though the
> performance is not critical, eg. at umount time. Although, it'd be good
> to optimize that too now that we know how.
> 
> And in the remaining cases it's not possible to avoid rb_erase. I had a
> closer look at btrfs_cleanup_defrag_inodes and btrfs_put_tree_mod_seq.
> Based on that I'd like to add this series to for-next, the first node
> caching is clear enough and safe. We can do the other optimizations
> later.

This needs more fine-grained debugging to see where the cost lies.

About the perf numbers for the cached rb tree, I measured the time
spent in the extent map loop in btrfs_evict_inode(),

evict_inode_truncate_pages()
    while (!RB_EMPTY_ROOT(&map_tree->map)) {
         rb_first();
	 rb_erase();
    }


for an em tree containing 10,000 ems,
- with rb_first_cached, the loop takes 4880454 ns,
- with rb_first, the loop takes 4584148 ns,

It looks like the difference is not that much, ~6%. After all it's about
scalability: the bigger the rb tree gets, the more rb_first_cached() wins.
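
(A sketch of the kind of instrumentation assumed for the numbers above;
the actual measurement code is not part of this series and may have
differed:)

	u64 start, elapsed;

	start = ktime_get_ns();
	while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
		node = rb_first_cached(&map_tree->map);
		em = rb_entry(node, struct extent_map, rb_node);
		remove_extent_mapping(map_tree, em);
		free_extent_map(em);
	}
	elapsed = ktime_get_ns() - start;
	trace_printk("em eviction loop took %llu ns\n", elapsed);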

thanks,
-liubo


* Re: [PATCH 0/5] rb_first to rb_first_cached conversion
  2018-09-14 19:14       ` Liu Bo
@ 2018-09-20 15:56         ` David Sterba
  0 siblings, 0 replies; 16+ messages in thread
From: David Sterba @ 2018-09-20 15:56 UTC (permalink / raw)
  To: Liu Bo; +Cc: dsterba, linux-btrfs

On Fri, Sep 14, 2018 at 12:14:57PM -0700, Liu Bo wrote:
> On Fri, Sep 14, 2018 at 05:11:02PM +0200, David Sterba wrote:
> > On Tue, Sep 11, 2018 at 11:31:49AM -0700, Liu Bo wrote:
> > > On Tue, Sep 11, 2018 at 05:34:03PM +0200, David Sterba wrote:
> > > > On Thu, Aug 23, 2018 at 03:51:48AM +0800, Liu Bo wrote:
> > > > > Several structs in btrfs are using rb_first() in a while loop, it'd be
> > > > > more efficient to do this with rb_first_cached() which has the O(1)
> > > > > complexity.
> > > > 
> > > > We'd like to see some numbers, though just by reading the code I think
> > > > there's going to be a noticeable improvement for some workloads.
> > > > 
> > > > There's a common pattern:
> > > > 
> > > > while(node = rb_first) {
> > > > 	entry = rb_entry(node)
> > > > 	next = rb_next(node)
> > > > 	rb_erase(node)
> > > > 	cleanup(entry)
> > > > }
> > > > 
> > > > rb_first needs to traverse the tree up to logN depth, rb_erase can
> > > > completely reshuffle the tree. With the caching we'll skip the traversal
> > > > in rb_first. That's a cached memory access vs looped pointer
> > > > dereference trade-off that IMHO has a clear winner.
> > > > 
> > > > As the pattern in many cases traverses the whole tree and removes all
> > > > entries, ideally we could get rid of the rebalancing too. The entries
> > > > would be cleaned up in postorder way, ie. reset the data and kfree in
> > > > the end. This requires that order of the freeing does not matter, which
> > > > might no apply to some trees.
> > > 
> > > The order of freeing does not matter, but we need the tree to return
> > > the correct next node to free, IOW, we have to maintain the rbtree
> > > until the last two nodes.
> > > 
> > > > 
> > > > I did not find suitable rbtree functions or helpers for that,
> > > > rbtree_postorder_for_each_entry_safe does not allow rb_erase and
> > > > rb_erase itself forces the rebalancing internally. But I think we can
> > > > implement that.
> > > > 
> > > > We could possibly use rb_next_postorder that makes sure that all
> > > > children are traversed before the parent so this is at least something
> > > > that could considerd.
> > > > 
> > > 
> > > Qu was inquiring about the same thing, I haven't got time to dig it
> > > further.
> > > 
> > > The question we need to answer is that whether we care about how fast
> > > destruction can make, as some are in critical paths while some are
> > > not.
> > 
> > Yeah, I take __btrfs_return_cluster_to_free_space as an example where
> > the more efficient tree destruction would shorten the time where a lock
> > is held.
> > 
> > In other cases it might complicate the code too much, though the
> > performance is not critical, eg. at umount time. Although, it'd be good
> > to optimize that too now that we know how.
> > 
> > And in the remaining cases it's not possible to avoid rb_erase. I had a
> > closer look at btrfs_cleanup_defrag_inodes and btrfs_put_tree_mod_seq.
> > Based on that I'd like to add this series to for-next, the first node
> > caching is clear enough and safe. We can do the other optimizations
> > later.
> 
> This needs more fine-grained debugging to see what's the cost
> lies on.
> 
> About the perf. number of rb_cached_node, I measured the time spent in
> extent map loop in btrfs_evict_inode(),
> 
> evict_inode_truncate_pages()
>     while (!RB_EMPTY_ROOT(&map_tree->map)) {
>          rb_first();
> 	 rb_erase();
>     }
> 
> 
> for a em tree containing 10,000 em,

The maximum depth of the rbtree for this number of nodes is roughly
2 * lg(10000) = 2 * 14 = 28. This is not an average case, so it's going
to be better in practice, but still 10+ pointer dereferences can be
exchanged for a pointer update.

> - with rb_first_cached, the loop takes 4880454 ns,
> - with rb_first, the loop takes 4584148 ns,
> 
> looks like the difference is not that much, ~6%.  After all it's about
> scalability, the more rb tree gets, the more rb_first_cached() wins.

Yeah, there's some difference but I think the cache effects will make
the measurements hard to estimate, so I take the numbers as a sort of
confirmation that there's not going to be a big difference.

The delayed refs and delayed inode can get some speedup from that as
there's usually only rb_first without rb_erase.

I'm going to update the first patch with the analysis we did and
reference the patch from the others so we'll have a starting point for
further optimizations and can verify if the described expected speedups
happen.


* Re: [PATCH 0/5] rb_first to rb_first_cached conversion
  2018-08-22 19:51 [PATCH 0/5] rb_first to rb_first_cached conversion Liu Bo
                   ` (8 preceding siblings ...)
  2018-09-11 15:34 ` David Sterba
@ 2018-09-20 16:49 ` David Sterba
  2018-09-20 17:40   ` Liu Bo
  9 siblings, 1 reply; 16+ messages in thread
From: David Sterba @ 2018-09-20 16:49 UTC (permalink / raw)
  To: Liu Bo; +Cc: linux-btrfs

On Thu, Aug 23, 2018 at 03:51:48AM +0800, Liu Bo wrote:
> Several structs in btrfs are using rb_first() in a while loop, it'd be
> more efficient to do this with rb_first_cached() which has the O(1)
> complexity.
> 
> This patch set updates five structs which may have a large rb tree in
> practice
> 
> Liu Bo (5):
>   Btrfs: href_root: use rb_first_cached
>   Btrfs: href->ref_tree: use rb_first_cached
>   Btrfs: delayed_items: use rb_first_cached
>   Btrfs: extent_map: use rb_first_cached
>   Btrfs: preftree: use rb_first_cached

For the record, patches have been merged to misc-next. I've changed the
subject titles to

Btrfs: delayed-refs: use rb_first_cached for href_root
Btrfs: delayed-refs: use rb_first_cached for ref_tree
Btrfs: delayed-inode: use rb_first_cached for ins_root and del_root
Btrfs: extent_map: use rb_first_cached
Btrfs: preftree: use rb_first_cached

added Holger's tested-by and my reviewed-by.


* Re: [PATCH 0/5] rb_first to rb_first_cached conversion
  2018-09-20 16:49 ` David Sterba
@ 2018-09-20 17:40   ` Liu Bo
  0 siblings, 0 replies; 16+ messages in thread
From: Liu Bo @ 2018-09-20 17:40 UTC (permalink / raw)
  To: dsterba, linux-btrfs

On Thu, Sep 20, 2018 at 06:49:59PM +0200, David Sterba wrote:
> On Thu, Aug 23, 2018 at 03:51:48AM +0800, Liu Bo wrote:
> > Several structs in btrfs are using rb_first() in a while loop, it'd be
> > more efficient to do this with rb_first_cached() which has the O(1)
> > complexity.
> > 
> > This patch set updates five structs which may have a large rb tree in
> > practice
> > 
> > Liu Bo (5):
> >   Btrfs: href_root: use rb_first_cached
> >   Btrfs: href->ref_tree: use rb_first_cached
> >   Btrfs: delayed_items: use rb_first_cached
> >   Btrfs: extent_map: use rb_first_cached
> >   Btrfs: preftree: use rb_first_cached
> 
> For the record, patches have been merged to misc-next. I've changed the
> subject titles to
> 
> Btrfs: delayed-refs: use rb_first_cached for href_root
> Btrfs: delayed-refs: use rb_first_cached for ref_tree
> Btrfs: delayed-inode: use rb_first_cached for ins_root and del_root
> Btrfs: extent_map: use rb_first_cached
> Btrfs: preftree: use rb_first_cached
> 
> added Holger's tested-by and my reviewed-by.

Thanks so much for the efforts.

thanks,
-liubo

