Linux-BTRFS Archive on lore.kernel.org
 help / color / Atom feed
From: Naohiro Aota <naohiro.aota@wdc.com>
To: linux-btrfs@vger.kernel.org, David Sterba <dsterba@suse.com>
Cc: Chris Mason <clm@fb.com>, Josef Bacik <josef@toxicpanda.com>,
	Nikolay Borisov <nborisov@suse.com>,
	Damien Le Moal <damien.lemoal@wdc.com>,
	Matias Bjorling <Matias.Bjorling@wdc.com>,
	Johannes Thumshirn <jthumshirn@suse.de>,
	Hannes Reinecke <hare@suse.com>,
	linux-fsdevel@vger.kernel.org,
	Naohiro Aota <naohiro.aota@wdc.com>
Subject: [PATCH v3 09/27] btrfs: align device extent allocation to zone boundary
Date: Thu,  8 Aug 2019 18:30:20 +0900
Message-ID: <20190808093038.4163421-10-naohiro.aota@wdc.com> (raw)
In-Reply-To: <20190808093038.4163421-1-naohiro.aota@wdc.com>

In HMZONED mode, align the device extents to zone boundaries so that a zone
reset affects only the device extent and does not change the state of
blocks in the neighbor device extents. Also, check that a region allocation
is always over empty same-type zones and it is not over any locations of
super block copies.

This patch also add a verification in verify_one_dev_extent() to check if
the device extent is align to zone boundary.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/extent-tree.c |  6 ++++
 fs/btrfs/hmzoned.c     | 56 ++++++++++++++++++++++++++++++++
 fs/btrfs/hmzoned.h     | 10 ++++++
 fs/btrfs/volumes.c     | 72 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 144 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d3b58e388535..3a36646dfaa8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7637,6 +7637,12 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
 		min_free = div64_u64(min_free, dev_min);
 	}
 
+	/* We cannot allocate size less than zone_size anyway */
+	if (index == BTRFS_RAID_DUP)
+		min_free = max_t(u64, min_free, 2 * fs_info->zone_size);
+	else
+		min_free = max_t(u64, min_free, fs_info->zone_size);
+
 	mutex_lock(&fs_info->chunk_mutex);
 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
 		u64 dev_offset;
diff --git a/fs/btrfs/hmzoned.c b/fs/btrfs/hmzoned.c
index e07e76af1e82..7d334b236cd3 100644
--- a/fs/btrfs/hmzoned.c
+++ b/fs/btrfs/hmzoned.c
@@ -12,6 +12,7 @@
 #include "volumes.h"
 #include "hmzoned.h"
 #include "rcu-string.h"
+#include "disk-io.h"
 
 /* Maximum number of zones to report per blkdev_report_zones() call */
 #define BTRFS_REPORT_NR_ZONES   4096
@@ -264,3 +265,58 @@ int btrfs_check_mountopts_hmzoned(struct btrfs_fs_info *info)
 
 	return 0;
 }
+
+/*
+ * btrfs_check_allocatable_zones - check if spcecifeid region is
+ *                                 suitable for allocation
+ * @device:	the device to allocate a region
+ * @pos:	the position of the region
+ * @num_bytes:	the size of the region
+ *
+ * In non-ZONED device, anywhere is suitable for allocation. In ZONED
+ * device, check if
+ * 1) the region is not on non-empty zones,
+ * 2) all zones in the region have the same zone type,
+ * 3) it does not contain super block location, if the zones are
+ *    sequential.
+ */
+bool btrfs_check_allocatable_zones(struct btrfs_device *device, u64 pos,
+				   u64 num_bytes)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	u64 nzones, begin, end;
+	u64 sb_pos;
+	u8 shift;
+	int i;
+
+	if (!zinfo)
+		return true;
+
+	shift = zinfo->zone_size_shift;
+	nzones = num_bytes >> shift;
+	begin = pos >> shift;
+	end = begin + nzones;
+
+	ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+
+	if (end > zinfo->nr_zones)
+		return false;
+
+	/* check if zones in the region are all empty */
+	if (find_next_zero_bit(zinfo->empty_zones, end, begin) != end)
+		return false;
+
+	if (btrfs_dev_is_sequential(device, pos)) {
+		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+			sb_pos = btrfs_sb_offset(i);
+			if (!(sb_pos + BTRFS_SUPER_INFO_SIZE <= pos ||
+			      pos + end <= sb_pos))
+				return false;
+		}
+
+		return find_next_zero_bit(zinfo->seq_zones, end, begin) == end;
+	}
+
+	return find_next_bit(zinfo->seq_zones, end, begin) == end;
+}
diff --git a/fs/btrfs/hmzoned.h b/fs/btrfs/hmzoned.h
index 83579b2dc0a4..396ece5f9410 100644
--- a/fs/btrfs/hmzoned.h
+++ b/fs/btrfs/hmzoned.h
@@ -29,6 +29,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device);
 void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
 int btrfs_check_hmzoned_mode(struct btrfs_fs_info *fs_info);
 int btrfs_check_mountopts_hmzoned(struct btrfs_fs_info *info);
+bool btrfs_check_allocatable_zones(struct btrfs_device *device, u64 pos,
+				   u64 num_bytes);
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
 {
@@ -95,4 +97,12 @@ static inline bool btrfs_check_device_zone_type(struct btrfs_fs_info *fs_info,
 	return bdev_zoned_model(bdev) != BLK_ZONED_HM;
 }
 
+static inline u64 btrfs_zone_align(struct btrfs_device *device, u64 pos)
+{
+	if (!device->zone_info)
+		return pos;
+
+	return ALIGN(pos, device->zone_info->zone_size);
+}
+
 #endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 755b2ec1e0de..265a1496e459 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1572,6 +1572,7 @@ int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes,
 	u64 max_hole_size;
 	u64 extent_end;
 	u64 search_end = device->total_bytes;
+	u64 zone_size = 0;
 	int ret;
 	int slot;
 	struct extent_buffer *l;
@@ -1582,6 +1583,14 @@ int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes,
 	 * at an offset of at least 1MB.
 	 */
 	search_start = max_t(u64, search_start, SZ_1M);
+	/*
+	 * For a zoned block device, skip the first zone of the device
+	 * entirely.
+	 */
+	if (device->zone_info)
+		zone_size = device->zone_info->zone_size;
+	search_start = max_t(u64, search_start, zone_size);
+	search_start = btrfs_zone_align(device, search_start);
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1646,12 +1655,21 @@ int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes,
 			 */
 			if (contains_pending_extent(device, &search_start,
 						    hole_size)) {
+				search_start = btrfs_zone_align(device,
+								search_start);
 				if (key.offset >= search_start)
 					hole_size = key.offset - search_start;
 				else
 					hole_size = 0;
 			}
 
+			if (!btrfs_check_allocatable_zones(device, search_start,
+							   num_bytes)) {
+				search_start += zone_size;
+				btrfs_release_path(path);
+				goto again;
+			}
+
 			if (hole_size > max_hole_size) {
 				max_hole_start = search_start;
 				max_hole_size = hole_size;
@@ -1691,6 +1709,14 @@ int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes,
 		hole_size = search_end - search_start;
 
 		if (contains_pending_extent(device, &search_start, hole_size)) {
+			search_start = btrfs_zone_align(device, search_start);
+			btrfs_release_path(path);
+			goto again;
+		}
+
+		if (!btrfs_check_allocatable_zones(device, search_start,
+						   num_bytes)) {
+			search_start += zone_size;
 			btrfs_release_path(path);
 			goto again;
 		}
@@ -1708,6 +1734,7 @@ int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes,
 		ret = 0;
 
 out:
+	ASSERT(zone_size == 0 || IS_ALIGNED(max_hole_start, zone_size));
 	btrfs_free_path(path);
 	*start = max_hole_start;
 	if (len)
@@ -4964,6 +4991,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int i;
 	int j;
 	int index;
+	int hmzoned = btrfs_fs_incompat(info, HMZONED);
 
 	BUG_ON(!alloc_profile_is_valid(type, 0));
 
@@ -5004,10 +5032,20 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		BUG();
 	}
 
+	if (hmzoned) {
+		max_stripe_size = info->zone_size;
+		max_chunk_size = round_down(max_chunk_size, info->zone_size);
+	}
+
 	/* We don't want a chunk larger than 10% of writable space */
 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
 			     max_chunk_size);
 
+	if (hmzoned)
+		max_chunk_size = max(round_down(max_chunk_size,
+						info->zone_size),
+				     info->zone_size);
+
 	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
 			       GFP_NOFS);
 	if (!devices_info)
@@ -5042,6 +5080,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		if (total_avail == 0)
 			continue;
 
+		if (hmzoned && total_avail < max_stripe_size * dev_stripes)
+			continue;
+
 		ret = find_free_dev_extent(device,
 					   max_stripe_size * dev_stripes,
 					   &dev_offset, &max_avail);
@@ -5060,6 +5101,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
+		if (hmzoned && max_avail < max_stripe_size * dev_stripes)
+			continue;
+
 		if (ndevs == fs_devices->rw_devices) {
 			WARN(1, "%s: found more than %llu devices\n",
 			     __func__, fs_devices->rw_devices);
@@ -5093,6 +5137,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
 	ndevs = min(ndevs, devs_max);
 
+again:
 	/*
 	 * The primary goal is to maximize the number of stripes, so use as
 	 * many devices as possible, even if the stripes are not maximum sized.
@@ -5116,6 +5161,17 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	 * we try to reduce stripe_size.
 	 */
 	if (stripe_size * data_stripes > max_chunk_size) {
+		if (hmzoned) {
+			/*
+			 * stripe_size is fixed in HMZONED. Reduce ndevs
+			 * instead.
+			 */
+			ASSERT(nparity == 0);
+			ndevs = div_u64(max_chunk_size * ncopies,
+					stripe_size * dev_stripes);
+			goto again;
+		}
+
 		/*
 		 * Reduce stripe_size, round it up to a 16MB boundary again and
 		 * then use it, unless it ends up being even bigger than the
@@ -5129,6 +5185,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	/* align to BTRFS_STRIPE_LEN */
 	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
 
+	ASSERT(!hmzoned || stripe_size == info->zone_size);
+
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
 		ret = -ENOMEM;
@@ -7755,6 +7813,20 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
 		ret = -EUCLEAN;
 		goto out;
 	}
+
+	if (dev->zone_info) {
+		u64 zone_size = dev->zone_info->zone_size;
+
+		if (!IS_ALIGNED(physical_offset, zone_size) ||
+		    !IS_ALIGNED(physical_len, zone_size)) {
+			btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
+				  devid, physical_offset, physical_len);
+			ret = -EUCLEAN;
+			goto out;
+		}
+	}
+
 out:
 	free_extent_map(em);
 	return ret;
-- 
2.22.0


  parent reply index

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-08-08  9:30 [PATCH v3 00/27] btrfs zoned block device support Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 01/27] btrfs: introduce HMZONED feature flag Naohiro Aota
2019-08-16  4:49   ` Anand Jain
2019-08-08  9:30 ` [PATCH v3 02/27] btrfs: Get zone information of zoned block devices Naohiro Aota
2019-08-16  4:44   ` Anand Jain
2019-08-16 14:19     ` Damien Le Moal
2019-08-16 23:47       ` Anand Jain
2019-08-16 23:55         ` Damien Le Moal
2019-08-08  9:30 ` [PATCH v3 03/27] btrfs: Check and enable HMZONED mode Naohiro Aota
2019-08-16  5:46   ` Anand Jain
2019-08-16 14:23     ` Damien Le Moal
2019-08-16 23:56       ` Anand Jain
2019-08-17  0:05         ` Damien Le Moal
2019-08-20  5:07         ` Naohiro Aota
2019-08-20 13:05           ` David Sterba
2019-08-08  9:30 ` [PATCH v3 04/27] btrfs: disallow RAID5/6 in " Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 05/27] btrfs: disallow space_cache " Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 06/27] btrfs: disallow NODATACOW " Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 07/27] btrfs: disable tree-log " Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 08/27] btrfs: disable fallocate " Naohiro Aota
2019-08-08  9:30 ` Naohiro Aota [this message]
2019-08-08  9:30 ` [PATCH v3 10/27] btrfs: do sequential extent allocation " Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 11/27] btrfs: make unmirroed BGs readonly only if we have at least one writable BG Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 12/27] btrfs: ensure metadata space available on/after degraded mount in HMZONED Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 13/27] btrfs: reset zones of unused block groups Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 14/27] btrfs: limit super block locations in HMZONED mode Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 15/27] btrfs: redirty released extent buffers in sequential BGs Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 16/27] btrfs: serialize data allocation and submit IOs Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 17/27] btrfs: implement atomic compressed IO submission Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 18/27] btrfs: support direct write IO in HMZONED Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 19/27] btrfs: serialize meta IOs on HMZONED mode Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 20/27] btrfs: wait existing extents before truncating Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 21/27] btrfs: avoid async checksum/submit on HMZONED mode Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 22/27] btrfs: disallow mixed-bg in " Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 23/27] btrfs: disallow inode_cache " Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 24/27] btrfs: support dev-replace " Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 25/27] btrfs: enable relocation " Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 26/27] btrfs: relocate block group to repair IO failure in HMZONED Naohiro Aota
2019-08-08  9:30 ` [PATCH v3 27/27] btrfs: enable to mount HMZONED incompat flag Naohiro Aota

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190808093038.4163421-10-naohiro.aota@wdc.com \
    --to=naohiro.aota@wdc.com \
    --cc=Matias.Bjorling@wdc.com \
    --cc=clm@fb.com \
    --cc=damien.lemoal@wdc.com \
    --cc=dsterba@suse.com \
    --cc=hare@suse.com \
    --cc=josef@toxicpanda.com \
    --cc=jthumshirn@suse.de \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=nborisov@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-BTRFS Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-btrfs/0 linux-btrfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-btrfs linux-btrfs/ https://lore.kernel.org/linux-btrfs \
		linux-btrfs@vger.kernel.org linux-btrfs@archiver.kernel.org
	public-inbox-index linux-btrfs

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-btrfs


AGPL code for this site: git clone https://public-inbox.org/ public-inbox