Linux-BTRFS Archive on lore.kernel.org
 help / color / Atom feed
From: Qu Wenruo <wqu@suse.com>
To: linux-btrfs@vger.kernel.org
Subject: [PATCH v4 5/7] btrfs: qgroup: Introduce per-root swapped blocks infrastructure
Date: Tue, 15 Jan 2019 16:16:02 +0800
Message-ID: <20190115081604.785-6-wqu@suse.com> (raw)
In-Reply-To: <20190115081604.785-1-wqu@suse.com>

To allow delayed subtree swap rescan, btrfs needs to record per-root
info about which tree blocks get swapped.

So this patch introduces per-root btrfs_qgroup_swapped_blocks structure,
which records which tree blocks get swapped.

The designed workflow will be:

1) Record the subtree root block get swapped.

   During subtree swap:
   O = Old tree blocks
   N = New tree blocks
         reloc tree                         subvolume tree X
            Root                               Root
           /    \                             /    \
         NA     OB                          OA      OB
       /  |     |  \                      /  |      |  \
     NC  ND     OE  OF                   OC  OD     OE  OF

  In these case, NA and OA is going to be swapped, record (NA, OA) into
  subvolume tree X.

2) After subtree swap.
         reloc tree                         subvolume tree X
            Root                               Root
           /    \                             /    \
         OA     OB                          NA      OB
       /  |     |  \                      /  |      |  \
     OC  OD     OE  OF                   NC  ND     OE  OF

3a) CoW happens for OB
    If we are going to CoW tree block OB, we check OB's bytenr against
    tree X's swapped_blocks structure.
    It doesn't fit any one, nothing will happen.

3b) CoW happens for NA
    Check NA's bytenr against tree X's swapped_blocks, and get a hit.
    Then we do subtree scan on both subtree OA and NA.
    Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).

    Then no matter what we do to subvolume tree X, qgroup numbers will
    still be correct.
    Then NA's record get removed from X's swapped_blocks.

4)  Transaction commit
    Any record in X's swapped_blocks get removed, since there is no
    modification to swapped subtrees, no need to trigger heavy qgroup
    subtree rescan for them.

This will introduce 128 bytes overhead for each btrfs_root even qgroup
is not enabled.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/ctree.h       |  14 +++++
 fs/btrfs/disk-io.c     |   1 +
 fs/btrfs/qgroup.c      | 130 +++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/qgroup.h      |  99 +++++++++++++++++++++++++++++++
 fs/btrfs/relocation.c  |   7 +++
 fs/btrfs/transaction.c |   1 +
 6 files changed, 252 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c374dfb6aec..b93ca931af08 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1204,6 +1204,17 @@ enum {
 	BTRFS_ROOT_DEAD_RELOC_TREE,
 };
 
+/*
+ * Record swapped tree blocks of a file/subvolume tree for delayed subtree
+ * trace code. For detail check comment in fs/btrfs/qgroup.c.
+ */
+struct btrfs_qgroup_swapped_blocks {
+	spinlock_t lock;
+	struct rb_root blocks[BTRFS_MAX_LEVEL];
+	/* RM_EMPTY_ROOT() of above blocks[] */
+	bool swapped;
+};
+
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
@@ -1339,6 +1350,9 @@ struct btrfs_root {
 	/* Number of active swapfiles */
 	atomic_t nr_swapfiles;
 
+	/* Record pairs of swapped blocks for qgroup */
+	struct btrfs_qgroup_swapped_blocks swapped_blocks;
+
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 	u64 alloc_bytenr;
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bfefa1de0455..31b2facdfc1e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1219,6 +1219,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->anon_dev = 0;
 
 	spin_lock_init(&root->root_item_lock);
+	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
 }
 
 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 8fe6ebe9aef8..001830328960 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3797,3 +3797,133 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
 	}
 	extent_changeset_release(&changeset);
 }
+
+/*
+ * Delete all swapped blocks record of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Get called when commit one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+	struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+	int i;
+
+	swapped_blocks = &root->swapped_blocks;
+
+	spin_lock(&swapped_blocks->lock);
+	if (!swapped_blocks->swapped)
+		goto out;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		struct rb_root *cur_root = &swapped_blocks->blocks[i];
+		struct btrfs_qgroup_swapped_block *entry;
+		struct btrfs_qgroup_swapped_block *next;
+
+		rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+						     node)
+			kfree(entry);
+		swapped_blocks->blocks[i] = RB_ROOT;
+	}
+	swapped_blocks->swapped = false;
+out:
+	spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Adding subtree roots record into @subv_root.
+ *
+ * @subv_root:		tree root of the subvolume tree get swapped
+ * @bg:			block group under balance
+ * @subv_parent/slot:	pointer to the subtree root in file tree
+ * @reloc_parent/slot:	pointer to the subtree root in reloc tree
+ *			BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot:	last snapshot generation of the file tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+		struct btrfs_root *subv_root,
+		struct btrfs_block_group_cache *bg,
+		struct extent_buffer *subv_parent, int subv_slot,
+		struct extent_buffer *reloc_parent, int reloc_slot,
+		u64 last_snapshot)
+{
+	int level = btrfs_header_level(subv_parent) - 1;
+	struct btrfs_qgroup_swapped_blocks *blocks = &subv_root->swapped_blocks;
+	struct btrfs_fs_info *fs_info = subv_root->fs_info;
+	struct btrfs_qgroup_swapped_block *block;
+	struct rb_node **p = &blocks->blocks[level].rb_node;
+	struct rb_node *parent = NULL;
+	int ret = 0;
+
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+		return 0;
+
+	if (btrfs_node_ptr_generation(subv_parent, subv_slot) >
+		btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+		btrfs_err_rl(fs_info,
+		"%s: bad parameter order, subv_gen=%llu reloc_gen=%llu",
+			__func__,
+			btrfs_node_ptr_generation(subv_parent, subv_slot),
+			btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+		return -EUCLEAN;
+	}
+
+	block = kmalloc(sizeof(*block), GFP_NOFS);
+	if (!block) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * @reloc_parent/slot is still *BEFORE* swap, while @block is going to
+	 * record the bytenr *AFTER* swap, so we do the swap here.
+	 */
+	block->subv_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+	block->subv_generation = btrfs_node_ptr_generation(reloc_parent,
+							   reloc_slot);
+	block->reloc_bytenr = btrfs_node_blockptr(subv_parent, subv_slot);
+	block->reloc_generation = btrfs_node_ptr_generation(subv_parent,
+							    subv_slot);
+	block->last_snapshot = last_snapshot;
+	block->level = level;
+	if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+		block->trace_leaf = true;
+	else
+		block->trace_leaf = false;
+	btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+	/* Insert @block into @blocks */
+	spin_lock(&blocks->lock);
+	while (*p) {
+		struct btrfs_qgroup_swapped_block *entry;
+
+		parent = *p;
+		entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+				 node);
+
+		if (entry->subv_bytenr < block->subv_bytenr)
+			p = &(*p)->rb_left;
+		else if (entry->subv_bytenr > block->subv_bytenr)
+			p = &(*p)->rb_right;
+		else {
+			if (entry->subv_generation != block->subv_generation ||
+			    entry->reloc_bytenr != block->reloc_bytenr ||
+			    entry->reloc_generation !=
+			    block->reloc_generation) {
+				WARN_ON_ONCE(1);
+				ret = -EEXIST;
+			}
+			kfree(block);
+			goto out_unlock;
+		}
+	}
+	rb_link_node(&block->node, parent, p);
+	rb_insert_color(&block->node, &blocks->blocks[level]);
+	blocks->swapped = true;
+out_unlock:
+	spin_unlock(&blocks->lock);
+out:
+	if (ret < 0)
+		fs_info->qgroup_flags |=
+			BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	return ret;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 226956f0bd73..93d7f0998f03 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,6 +6,8 @@
 #ifndef BTRFS_QGROUP_H
 #define BTRFS_QGROUP_H
 
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
 #include "ulist.h"
 #include "delayed-ref.h"
 
@@ -37,6 +39,66 @@
  *    Normally at qgroup rescan and transaction commit time.
  */
 
+/*
+ * Special performance hack for balance.
+ *
+ * For balance, we need to swap subtree of file and reloc tree.
+ * In theory, we need to trace all subtree blocks of both file and reloc tree,
+ * since their owner has changed during such swap.
+ *
+ * However since balance has ensured that both subtrees are containing the
+ * same contents and have the same tree structures, such swap won't cause
+ * qgroup number change.
+ *
+ * But there is a race window between subtree swap and transaction commit,
+ * during that window, if we increase/decrease tree level or merge/split tree
+ * blocks, we still needs to trace original subtrees.
+ *
+ * So for balance, we use a delayed subtree trace, whose workflow is:
+ *
+ * 1) Record the subtree root block get swapped.
+ *
+ *    During subtree swap:
+ *    O = Old tree blocks
+ *    N = New tree blocks
+ *          reloc tree                         file tree X
+ *             Root                               Root
+ *            /    \                             /    \
+ *          NA     OB                          OA      OB
+ *        /  |     |  \                      /  |      |  \
+ *      NC  ND     OE  OF                   OC  OD     OE  OF
+ *
+ *   In these case, NA and OA is going to be swapped, record (NA, OA) into
+ *   file tree X.
+ *
+ * 2) After subtree swap.
+ *          reloc tree                         file tree X
+ *             Root                               Root
+ *            /    \                             /    \
+ *          OA     OB                          NA      OB
+ *        /  |     |  \                      /  |      |  \
+ *      OC  OD     OE  OF                   NC  ND     OE  OF
+ *
+ * 3a) CoW happens for OB
+ *     If we are going to CoW tree block OB, we check OB's bytenr against
+ *     tree X's swapped_blocks structure.
+ *     It doesn't fit any one, nothing will happen.
+ *
+ * 3b) CoW happens for NA
+ *     Check NA's bytenr against tree X's swapped_blocks, and get a hit.
+ *     Then we do subtree scan on both subtree OA and NA.
+ *     Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
+ *
+ *     Then no matter what we do to file tree X, qgroup numbers will
+ *     still be correct.
+ *     Then NA's record get removed from X's swapped_blocks.
+ *
+ * 4)  Transaction commit
+ *     Any record in X's swapped_blocks get removed, since there is no
+ *     modification to swapped subtrees, no need to trigger heavy qgroup
+ *     subtree rescan for them.
+ */
+
 /*
  * Record a dirty extent, and info qgroup to update quota on it
  * TODO: Use kmem cache to alloc it.
@@ -59,6 +121,24 @@ struct btrfs_qgroup_extent_record {
 	struct ulist *old_roots;
 };
 
+struct btrfs_qgroup_swapped_block {
+	struct rb_node node;
+
+	bool trace_leaf;
+	int level;
+
+	/* bytenr/generation of the tree block in subvolume tree after swap */
+	u64 subv_bytenr;
+	u64 subv_generation;
+
+	/* bytenr/generation of the tree block in reloc tree after swap */
+	u64 reloc_bytenr;
+	u64 reloc_generation;
+
+	u64 last_snapshot;
+	struct btrfs_key first_key;
+};
+
 /*
  * Qgroup reservation types:
  *
@@ -301,4 +381,23 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
 
 void btrfs_qgroup_check_reserved_leak(struct inode *inode);
 
+/* btrfs_qgroup_swapped_blocks related functions */
+static inline void btrfs_qgroup_init_swapped_blocks(
+		struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+	int i;
+
+	spin_lock_init(&swapped_blocks->lock);
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+		swapped_blocks->blocks[i] = RB_ROOT;
+	swapped_blocks->swapped = false;
+}
+
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+		struct btrfs_root *subv_root,
+		struct btrfs_block_group_cache *bg,
+		struct extent_buffer *subv_parent, int subv_slot,
+		struct extent_buffer *reloc_parent, int reloc_slot,
+		u64 last_snapshot);
 #endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 1d5cfceb46c1..65f09b53a67d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1898,6 +1898,13 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
 		if (ret < 0)
 			break;
 
+		btrfs_node_key_to_cpu(parent, &first_key, slot);
+		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+				rc->block_group, parent, slot,
+				path->nodes[level], path->slots[level],
+				last_snapshot);
+		if (ret < 0)
+			break;
 		/*
 		 * swap blocks in fs tree and reloc tree.
 		 */
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 127fa1535f58..22b0dacae003 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
 		if (is_fstree(root->root_key.objectid))
 			btrfs_unpin_free_ino(root);
 		clear_btree_io_tree(&root->dirty_log_pages);
+		btrfs_qgroup_clean_swapped_blocks(root);
 	}
 
 	/* We can free old roots now. */
-- 
2.20.1


  parent reply index

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-01-15  8:15 [PATCH v4 0/7] btrfs: qgroup: Delay subtree scan to reduce overhead Qu Wenruo
2019-01-15  8:15 ` [PATCH v4 1/7] btrfs: qgroup: Move reserved data account from btrfs_delayed_ref_head to btrfs_qgroup_extent_record Qu Wenruo
2019-01-15  8:15 ` [PATCH v4 2/7] btrfs: qgroup: Don't trigger backref walk at delayed ref insert time Qu Wenruo
2019-01-15  8:16 ` [PATCH v4 3/7] btrfs: relocation: Delay reloc tree deletion after merge_reloc_roots() Qu Wenruo
2019-01-22 16:32   ` David Sterba
2019-01-23  6:01     ` Qu Wenruo
2019-01-15  8:16 ` [PATCH v4 4/7] btrfs: qgroup: Refactor btrfs_qgroup_trace_subtree_swap() Qu Wenruo
2019-01-22 16:38   ` David Sterba
2019-01-15  8:16 ` Qu Wenruo [this message]
2019-01-22 16:55   ` [PATCH v4 5/7] btrfs: qgroup: Introduce per-root swapped blocks infrastructure David Sterba
2019-01-22 23:07     ` Qu Wenruo
2019-01-24 18:44       ` David Sterba
2019-01-15  8:16 ` [PATCH v4 6/7] btrfs: qgroup: Use delayed subtree rescan for balance Qu Wenruo
2019-01-22 17:05   ` David Sterba
2019-01-15  8:16 ` [PATCH v4 7/7] btrfs: qgroup: Cleanup old subtree swap code Qu Wenruo
2019-01-15 17:26 ` [PATCH v4 0/7] btrfs: qgroup: Delay subtree scan to reduce overhead Josef Bacik
2019-01-16  0:31   ` Qu Wenruo
2019-01-16  1:07     ` Qu Wenruo
2019-01-16  1:15       ` Josef Bacik
2019-01-16  1:29         ` Qu Wenruo
2019-01-16  1:34           ` Josef Bacik
2019-01-16  1:36             ` Qu Wenruo
2019-01-22 16:28 ` David Sterba
2019-01-23  5:43   ` Qu Wenruo

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190115081604.785-6-wqu@suse.com \
    --to=wqu@suse.com \
    --cc=linux-btrfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-BTRFS Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-btrfs/0 linux-btrfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-btrfs linux-btrfs/ https://lore.kernel.org/linux-btrfs \
		linux-btrfs@vger.kernel.org linux-btrfs@archiver.kernel.org
	public-inbox-index linux-btrfs


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-btrfs


AGPL code for this site: git clone https://public-inbox.org/ public-inbox