All of lore.kernel.org
 help / color / mirror / Atom feed
From: Naohiro Aota <naohiro.aota@wdc.com>
To: linux-btrfs@vger.kernel.org, dsterba@suse.com
Cc: hare@suse.com, linux-fsdevel@vger.kernel.org,
	Naohiro Aota <naohiro.aota@wdc.com>
Subject: [PATCH v8 31/41] btrfs: mark block groups to copy for device-replace
Date: Fri,  2 Oct 2020 03:36:38 +0900	[thread overview]
Message-ID: <83652d36a020f8c11e601d969cc8940a829020e9.1601574234.git.naohiro.aota@wdc.com> (raw)
In-Reply-To: <dece91bca322ce44bed19f2b0f460fa5ded2e512.1601574234.git.naohiro.aota@wdc.com>

This is patch 1/4 to support device-replace in ZONED mode.

We have two types of I/Os during the device-replace process. One is an I/O
to "copy" (by the scrub functions) all the device extents on the source
device to the destination device.  The other one is an I/O to "clone" (by
handle_ops_on_dev_replace()) new incoming write I/Os from users to the
source device into the target device.

Cloning incoming I/Os can break the sequential write rule in the target
device. When a write is mapped to the middle of a block group, the I/O is
directed to the middle of a target device zone, which breaks the sequential
write rule.

However, the cloning function cannot be merely disabled since incoming I/Os
targeting already copied device extents must be cloned so that the I/O is
executed on the target device.

We cannot use dev_replace->cursor_{left,right} to determine whether a bio
is going to a not-yet-copied region, because there is a time gap between
finishing btrfs_scrub_dev() and rewriting the mapping tree in
btrfs_dev_replace_finishing(). During that gap, we can have a newly
allocated device extent which is neither cloned nor copied.

So the point is to copy only already existing device extents. This patch
introduces mark_block_group_to_copy() to mark existing block groups as a
target of copying. Then, handle_ops_on_dev_replace() and dev-replace can
check the flag to do their job.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/block-group.h |   1 +
 fs/btrfs/dev-replace.c | 175 +++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/dev-replace.h |   3 +
 fs/btrfs/scrub.c       |  17 ++++
 4 files changed, 196 insertions(+)

diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index b2a8a3beceac..e91123495d68 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -95,6 +95,7 @@ struct btrfs_block_group {
 	unsigned int iref:1;
 	unsigned int has_caching_ctl:1;
 	unsigned int removed:1;
+	unsigned int to_copy:1;
 
 	int disk_cache_state;
 
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5e3554482af1..e86aff38aea4 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -22,6 +22,7 @@
 #include "dev-replace.h"
 #include "sysfs.h"
 #include "zoned.h"
+#include "block-group.h"
 
 /*
  * Device replace overview
@@ -437,6 +438,176 @@ static char* btrfs_dev_name(struct btrfs_device *device)
 		return rcu_str_deref(device->name);
 }
 
+static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
+				    struct btrfs_device *src_dev)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_root *root = fs_info->dev_root;
+	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_block_group *cache;
+	struct extent_buffer *l;
+	struct btrfs_trans_handle *trans;
+	int slot;
+	int ret = 0;
+	u64 chunk_offset, length;
+
+	/* Do not use "to_copy" on non-ZONED for now */
+	if (!btrfs_fs_incompat(fs_info, ZONED))
+		return 0;
+
+	mutex_lock(&fs_info->chunk_mutex);
+
+	/* Ensure we don't have a pending new block group */
+	while (fs_info->running_transaction &&
+	       !list_empty(&fs_info->running_transaction->dev_update_list)) {
+		mutex_unlock(&fs_info->chunk_mutex);
+		trans = btrfs_attach_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			mutex_lock(&fs_info->chunk_mutex);
+			if (ret == -ENOENT)
+				continue;
+			else
+				goto out;
+		}
+
+		ret = btrfs_commit_transaction(trans);
+		mutex_lock(&fs_info->chunk_mutex);
+		if (ret)
+			goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	path->reada = READA_FORWARD;
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+
+	key.objectid = src_dev->devid;
+	key.offset = 0ull;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			break;
+		if (ret > 0) {
+			if (path->slots[0] >=
+			    btrfs_header_nritems(path->nodes[0])) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret < 0)
+					break;
+				if (ret > 0) {
+					ret = 0;
+					break;
+				}
+			} else {
+				ret = 0;
+			}
+		}
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(l, &found_key, slot);
+
+		if (found_key.objectid != src_dev->devid)
+			break;
+
+		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
+			break;
+
+		if (found_key.offset < key.offset)
+			break;
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		length = btrfs_dev_extent_length(l, dev_extent);
+
+		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+
+		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+		if (!cache)
+			goto skip;
+
+		spin_lock(&cache->lock);
+		cache->to_copy = 1;
+		spin_unlock(&cache->lock);
+
+		btrfs_put_block_group(cache);
+
+skip:
+		key.offset = found_key.offset + length;
+		btrfs_release_path(path);
+	}
+
+	btrfs_free_path(path);
+out:
+	mutex_unlock(&fs_info->chunk_mutex);
+
+	return ret;
+}
+
+bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
+				      struct btrfs_block_group *cache,
+				      u64 physical)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct extent_map *em;
+	struct map_lookup *map;
+	u64 chunk_offset = cache->start;
+	int num_extents, cur_extent;
+	int i;
+
+	/* Do not use "to_copy" on non-ZONED for now */
+	if (!btrfs_fs_incompat(fs_info, ZONED))
+		return true;
+
+	spin_lock(&cache->lock);
+	if (cache->removed) {
+		spin_unlock(&cache->lock);
+		return true;
+	}
+	spin_unlock(&cache->lock);
+
+	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+	BUG_ON(IS_ERR(em));
+	map = em->map_lookup;
+
+	num_extents = cur_extent = 0;
+	for (i = 0; i < map->num_stripes; i++) {
+		/* Count only the stripes located on the source device */
+		if (srcdev != map->stripes[i].dev)
+			continue;
+
+		num_extents++;
+		if (physical == map->stripes[i].physical)
+			cur_extent = i;
+	}
+
+	free_extent_map(em);
+
+	if (num_extents > 1 && cur_extent < num_extents - 1) {
+		/*
+		 * Has more stripes on this device. Keep this BG
+		 * readonly until we finish all the stripes.
+		 */
+		return false;
+	}
+
+	/* last stripe on this device */
+	spin_lock(&cache->lock);
+	cache->to_copy = 0;
+	spin_unlock(&cache->lock);
+
+	return true;
+}
+
 static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
 		int read_src)
@@ -478,6 +649,10 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 	if (ret)
 		return ret;
 
+	ret = mark_block_group_to_copy(fs_info, src_device);
+	if (ret)
+		return ret;
+
 	down_write(&dev_replace->rwsem);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 60b70dacc299..3911049a5f23 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -18,5 +18,8 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
 void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
 int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
 int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
+				      struct btrfs_block_group *cache,
+				      u64 physical);
 
 #endif
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index aa1b36cf5c88..d0d7db3c8b0b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3500,6 +3500,17 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		if (!cache)
 			goto skip;
 
+
+		if (sctx->is_dev_replace && btrfs_fs_incompat(fs_info, ZONED)) {
+			spin_lock(&cache->lock);
+			if (!cache->to_copy) {
+				spin_unlock(&cache->lock);
+				ro_set = 0;
+				goto done;
+			}
+			spin_unlock(&cache->lock);
+		}
+
 		/*
 		 * Make sure that while we are scrubbing the corresponding block
 		 * group doesn't get its logical address and its device extents
@@ -3631,6 +3642,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 
 		scrub_pause_off(fs_info);
 
+		if (sctx->is_dev_replace &&
+		    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
+						      cache, found_key.offset))
+			ro_set = 0;
+
+done:
 		down_write(&dev_replace->rwsem);
 		dev_replace->cursor_left = dev_replace->cursor_right;
 		dev_replace->item_needs_writeback = 1;
-- 
2.27.0


  parent reply	other threads:[~2020-10-01 18:39 UTC|newest]

Thread overview: 61+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-10-01 18:36 [PATCH v8 00/41] btrfs: zoned block device support Naohiro Aota
2020-10-01 18:36 ` [PATCH v8 01/41] block: add bio_add_zone_append_page Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 02/41] btrfs: introduce ZONED feature flag Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 03/41] btrfs: Get zone information of zoned block devices Naohiro Aota
2020-10-13 15:53     ` David Sterba
2020-10-15  7:45       ` Johannes Thumshirn
2020-10-01 18:36   ` [PATCH v8 04/41] btrfs: Check and enable ZONED mode Naohiro Aota
2020-10-13 15:56     ` David Sterba
2020-10-28 14:49       ` Johannes Thumshirn
2020-10-13 16:13     ` David Sterba
2020-10-01 18:36   ` [PATCH v8 05/41] btrfs: introduce max_zone_append_size Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 06/41] btrfs: disallow space_cache in ZONED mode Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 07/41] btrfs: disallow NODATACOW " Naohiro Aota
2020-10-13 15:39     ` David Sterba
2020-10-21  8:53       ` Johannes Thumshirn
2020-10-01 18:36   ` [PATCH v8 08/41] btrfs: disable fallocate " Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 09/41] btrfs: disallow mixed-bg " Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 10/41] btrfs: disallow inode_cache " Naohiro Aota
2020-10-13 15:41     ` David Sterba
2020-10-22  6:48       ` Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 11/41] btrfs: implement log-structured superblock for " Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 12/41] btrfs: implement zoned chunk allocator Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 13/41] btrfs: verify device extent is aligned to zone Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 14/41] btrfs: load zone's alloction offset Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 15/41] btrfs: emulate write pointer for conventional zones Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 16/41] btrfs: track unusable bytes for zones Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 17/41] btrfs: do sequential extent allocation in ZONED mode Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 18/41] btrfs: reset zones of unused block groups Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 19/41] btrfs: redirty released extent buffers in ZONED mode Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 20/41] btrfs: extract page adding function Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 21/41] btrfs: use bio_add_zone_append_page for zoned btrfs Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 22/41] btrfs: handle REQ_OP_ZONE_APPEND as writing Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 23/41] btrfs: split ordered extent when bio is sent Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 24/41] btrfs: extend btrfs_rmap_block for specifying a device Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 25/41] btrfs: use ZONE_APPEND write for ZONED btrfs Naohiro Aota
2020-10-13 16:45     ` David Sterba
2020-10-01 18:36   ` [PATCH v8 26/41] btrfs: enable zone append writing for direct IO Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 27/41] btrfs: introduce dedicated data write path for ZONED mode Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 28/41] btrfs: serialize meta IOs on " Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 29/41] btrfs: wait existing extents before truncating Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 30/41] btrfs: avoid async metadata checksum on ZONED mode Naohiro Aota
2020-10-01 18:36   ` Naohiro Aota [this message]
2020-10-01 18:36   ` [PATCH v8 32/41] btrfs: implement cloning for ZONED device-replace Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 33/41] btrfs: implement copying " Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 34/41] btrfs: support dev-replace in ZONED mode Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 35/41] btrfs: enable relocation " Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 36/41] btrfs: relocate block group to repair IO failure in ZONED Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 37/41] btrfs: split alloc_log_tree() Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 38/41] btrfs: extend zoned allocator to use dedicated tree-log block group Naohiro Aota
2020-10-13 16:26     ` David Sterba
2020-10-15  7:21       ` Johannes Thumshirn
2020-10-01 18:36   ` [PATCH v8 39/41] btrfs: serialize log transaction on ZONED mode Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 40/41] btrfs: reorder log node allocation Naohiro Aota
2020-10-01 18:36   ` [PATCH v8 41/41] btrfs: enable to mount ZONED incompat flag Naohiro Aota
2020-10-02 13:39   ` [PATCH v8 01/41] block: add bio_add_zone_append_page Martin K. Petersen
2020-10-05  1:46     ` Damien Le Moal
2020-10-05 13:43     ` Christoph Hellwig
2020-10-06  1:26       ` Martin K. Petersen
2020-10-06  5:12         ` Damien Le Moal
2020-10-09 15:40 ` [PATCH v8 00/41] btrfs: zoned block device support Josef Bacik
2020-10-12  9:17   ` Naohiro Aota

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=83652d36a020f8c11e601d969cc8940a829020e9.1601574234.git.naohiro.aota@wdc.com \
    --to=naohiro.aota@wdc.com \
    --cc=dsterba@suse.com \
    --cc=hare@suse.com \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.