Linux-BTRFS Archive on lore.kernel.org
 help / color / Atom feed
From: Naohiro Aota <naohiro.aota@wdc.com>
To: linux-btrfs@vger.kernel.org, David Sterba <dsterba@suse.com>
Cc: "Chris Mason" <clm@fb.com>, "Josef Bacik" <josef@toxicpanda.com>,
	"Qu Wenruo" <wqu@suse.com>, "Nikolay Borisov" <nborisov@suse.com>,
	linux-kernel@vger.kernel.org, "Hannes Reinecke" <hare@suse.com>,
	linux-fsdevel@vger.kernel.org,
	"Damien Le Moal" <damien.lemoal@wdc.com>,
	"Matias Bjørling" <mb@lightnvm.io>,
	"Johannes Thumshirn" <jthumshirn@suse.de>,
	"Bart Van Assche" <bvanassche@acm.org>,
	"Naohiro Aota" <naohiro.aota@wdc.com>
Subject: [PATCH 06/19] btrfs: align dev extent allocation to zone boundary
Date: Fri,  7 Jun 2019 22:10:12 +0900
Message-ID: <20190607131025.31996-7-naohiro.aota@wdc.com> (raw)
In-Reply-To: <20190607131025.31996-1-naohiro.aota@wdc.com>

In HMZONED mode, align the device extents to zone boundaries so that a zone
reset affects only the device extent being reset and does not change the
state of blocks in the neighboring device extents. Also, check that a region
being allocated always consists only of empty zones of the same type.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/extent-tree.c |   6 +++
 fs/btrfs/volumes.c     | 100 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 103 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1aee51a9f3bf..363db58f56b8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9884,6 +9884,12 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
 		min_free = div64_u64(min_free, dev_min);
 	}
 
+	/* We cannot allocate size less than zone_size anyway */
+	if (index == BTRFS_RAID_DUP)
+		min_free = max_t(u64, min_free, 2 * fs_info->zone_size);
+	else
+		min_free = max_t(u64, min_free, fs_info->zone_size);
+
 	mutex_lock(&fs_info->chunk_mutex);
 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
 		u64 dev_offset;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b6f367d19dc9..c1ed3b6e3cfd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1737,6 +1737,46 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
 	return false;
 }
 
+static u64 dev_zone_align(struct btrfs_device *device, u64 pos)
+{
+	if (device->zone_size)
+		return ALIGN(pos, device->zone_size);
+	return pos;
+}
+
+/*
+ * is_allocatable_region - check if specified region is suitable for allocation
+ * @device:	the device to allocate a region
+ * @pos:	the position of the region
+ * @num_bytes:	the size of the region
+ *
+ * On a non-zoned device, any region is suitable for allocation. On a zoned
+ * device, check that every zone in the region is empty and that all zones
+ * in the region have the same zone type.
+ */
+static bool is_allocatable_region(struct btrfs_device *device, u64 pos,
+				  u64 num_bytes)
+{
+	int is_sequential;
+
+	if (device->zone_size == 0)
+		return true;
+
+	WARN_ON(!IS_ALIGNED(pos, device->zone_size));
+	WARN_ON(!IS_ALIGNED(num_bytes, device->zone_size));
+
+	is_sequential = btrfs_dev_is_sequential(device, pos);
+
+	while (num_bytes > 0) {
+		if (!btrfs_dev_is_empty_zone(device, pos) ||
+		    (is_sequential != btrfs_dev_is_sequential(device, pos)))
+			return false;
+		pos += device->zone_size;
+		num_bytes -= device->zone_size;
+	}
+
+	return true;
+}
 
 /*
  * find_free_dev_extent_start - find free space in the specified device
@@ -1779,9 +1819,14 @@ int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes,
 	/*
 	 * We don't want to overwrite the superblock on the drive nor any area
 	 * used by the boot loader (grub for example), so we make sure to start
-	 * at an offset of at least 1MB.
+	 * at an offset of at least 1MB on a regular disk. For a zoned block
+	 * device, skip the first zone of the device entirely.
 	 */
-	search_start = max_t(u64, search_start, SZ_1M);
+	if (device->zone_size)
+		search_start = max_t(u64, dev_zone_align(device, search_start),
+				     device->zone_size);
+	else
+		search_start = max_t(u64, search_start, SZ_1M);
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1846,12 +1891,22 @@ int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes,
 			 */
 			if (contains_pending_extent(device, &search_start,
 						    hole_size)) {
+				search_start = dev_zone_align(device,
+							      search_start);
 				if (key.offset >= search_start)
 					hole_size = key.offset - search_start;
 				else
 					hole_size = 0;
 			}
 
+			if (!is_allocatable_region(device, search_start,
+						   num_bytes)) {
+				search_start = dev_zone_align(device,
+							      search_start+1);
+				btrfs_release_path(path);
+				goto again;
+			}
+
 			if (hole_size > max_hole_size) {
 				max_hole_start = search_start;
 				max_hole_size = hole_size;
@@ -1876,7 +1931,7 @@ int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes,
 		extent_end = key.offset + btrfs_dev_extent_length(l,
 								  dev_extent);
 		if (extent_end > search_start)
-			search_start = extent_end;
+			search_start = dev_zone_align(device, extent_end);
 next:
 		path->slots[0]++;
 		cond_resched();
@@ -1891,6 +1946,14 @@ int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes,
 		hole_size = search_end - search_start;
 
 		if (contains_pending_extent(device, &search_start, hole_size)) {
+			search_start = dev_zone_align(device,
+						      search_start);
+			btrfs_release_path(path);
+			goto again;
+		}
+
+		if (!is_allocatable_region(device, search_start, num_bytes)) {
+			search_start = dev_zone_align(device, search_start+1);
 			btrfs_release_path(path);
 			goto again;
 		}
@@ -5177,6 +5240,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int i;
 	int j;
 	int index;
+	int hmzoned = btrfs_fs_incompat(info, HMZONED);
 
 	BUG_ON(!alloc_profile_is_valid(type, 0));
 
@@ -5221,10 +5285,20 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		BUG();
 	}
 
+	if (hmzoned) {
+		max_stripe_size = info->zone_size;
+		max_chunk_size = round_down(max_chunk_size, info->zone_size);
+	}
+
 	/* We don't want a chunk larger than 10% of writable space */
 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
 			     max_chunk_size);
 
+	if (hmzoned)
+		max_chunk_size = max(round_down(max_chunk_size,
+						info->zone_size),
+				     info->zone_size);
+
 	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
 			       GFP_NOFS);
 	if (!devices_info)
@@ -5259,6 +5333,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		if (total_avail == 0)
 			continue;
 
+		if (hmzoned && total_avail < max_stripe_size * dev_stripes)
+			continue;
+
 		ret = find_free_dev_extent(device,
 					   max_stripe_size * dev_stripes,
 					   &dev_offset, &max_avail);
@@ -5277,6 +5354,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
+		if (hmzoned && max_avail < max_stripe_size * dev_stripes)
+			continue;
+
 		if (ndevs == fs_devices->rw_devices) {
 			WARN(1, "%s: found more than %llu devices\n",
 			     __func__, fs_devices->rw_devices);
@@ -5310,6 +5390,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
 	ndevs = min(ndevs, devs_max);
 
+again:
 	/*
 	 * The primary goal is to maximize the number of stripes, so use as
 	 * many devices as possible, even if the stripes are not maximum sized.
@@ -5333,6 +5414,17 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	 * we try to reduce stripe_size.
 	 */
 	if (stripe_size * data_stripes > max_chunk_size) {
+		if (hmzoned) {
+			/*
+			 * stripe_size is fixed in HMZONED. Reduce ndevs
+			 * instead.
+			 */
+			WARN_ON(nparity != 0);
+			ndevs = div_u64(max_chunk_size * ncopies,
+					stripe_size * dev_stripes);
+			goto again;
+		}
+
 		/*
 		 * Reduce stripe_size, round it up to a 16MB boundary again and
 		 * then use it, unless it ends up being even bigger than the
@@ -5346,6 +5438,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	/* align to BTRFS_STRIPE_LEN */
 	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
 
+	WARN_ON(hmzoned && stripe_size != info->zone_size);
+
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
 		ret = -ENOMEM;
-- 
2.21.0


  parent reply index

Thread overview: 79+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-06-07 13:10 [PATCH v2 00/19] btrfs zoned block device support Naohiro Aota
2019-06-07 13:10 ` [PATCH 01/19] btrfs: introduce HMZONED feature flag Naohiro Aota
2019-06-07 13:10 ` [PATCH 02/19] btrfs: Get zone information of zoned block devices Naohiro Aota
2019-06-13 13:58   ` Josef Bacik
2019-06-18  6:04     ` Naohiro Aota
2019-06-13 13:58   ` Josef Bacik
2019-06-17 18:57   ` David Sterba
2019-06-18  6:42     ` Naohiro Aota
2019-06-27 15:11       ` David Sterba
2019-06-07 13:10 ` [PATCH 03/19] btrfs: Check and enable HMZONED mode Naohiro Aota
2019-06-13 13:57   ` Josef Bacik
2019-06-18  6:43     ` Naohiro Aota
2019-06-07 13:10 ` [PATCH 04/19] btrfs: disable fallocate in " Naohiro Aota
2019-06-07 13:10 ` [PATCH 05/19] btrfs: disable direct IO " Naohiro Aota
2019-06-13 14:00   ` Josef Bacik
2019-06-18  8:17     ` Naohiro Aota
2019-06-07 13:10 ` Naohiro Aota [this message]
2019-06-07 13:10 ` [PATCH 07/19] btrfs: do sequential extent allocation " Naohiro Aota
2019-06-13 14:07   ` Josef Bacik
2019-06-18  8:28     ` Naohiro Aota
2019-06-18 13:37       ` Josef Bacik
2019-06-17 22:30   ` David Sterba
2019-06-18  8:49     ` Naohiro Aota
2019-06-27 15:28       ` David Sterba
2019-06-07 13:10 ` [PATCH 08/19] btrfs: make unmirroed BGs readonly only if we have at least one writable BG Naohiro Aota
2019-06-13 14:09   ` Josef Bacik
2019-06-18  7:42     ` Naohiro Aota
2019-06-18 13:35       ` Josef Bacik
2019-06-07 13:10 ` [PATCH 09/19] btrfs: limit super block locations in HMZONED mode Naohiro Aota
2019-06-13 14:12   ` Josef Bacik
2019-06-18  8:51     ` Naohiro Aota
2019-06-17 22:53   ` David Sterba
2019-06-18  9:01     ` Naohiro Aota
2019-06-27 15:35       ` David Sterba
2019-06-28  3:55   ` Anand Jain
2019-06-28  6:39     ` Naohiro Aota
2019-06-28  6:52       ` Anand Jain
2019-06-07 13:10 ` [PATCH 10/19] btrfs: rename btrfs_map_bio() Naohiro Aota
2019-06-07 13:10 ` [PATCH 11/19] btrfs: introduce submit buffer Naohiro Aota
2019-06-13 14:14   ` Josef Bacik
2019-06-17  3:16     ` Damien Le Moal
2019-06-18  0:00       ` David Sterba
2019-06-18  4:04         ` Damien Le Moal
2019-06-18 13:33       ` Josef Bacik
2019-06-19 10:32         ` Damien Le Moal
2019-06-07 13:10 ` [PATCH 12/19] btrfs: expire submit buffer on timeout Naohiro Aota
2019-06-13 14:15   ` Josef Bacik
2019-06-17  3:19     ` Damien Le Moal
2019-06-07 13:10 ` [PATCH 13/19] btrfs: avoid sync IO prioritization on checksum in HMZONED mode Naohiro Aota
2019-06-13 14:17   ` Josef Bacik
2019-06-07 13:10 ` [PATCH 14/19] btrfs: redirty released extent buffers in sequential BGs Naohiro Aota
2019-06-13 14:24   ` Josef Bacik
2019-06-18  9:09     ` Naohiro Aota
2019-06-07 13:10 ` [PATCH 15/19] btrfs: reset zones of unused block groups Naohiro Aota
2019-06-07 13:10 ` [PATCH 16/19] btrfs: wait existing extents before truncating Naohiro Aota
2019-06-13 14:25   ` Josef Bacik
2019-06-07 13:10 ` [PATCH 17/19] btrfs: shrink delayed allocation size in HMZONED mode Naohiro Aota
2019-06-13 14:27   ` Josef Bacik
2019-06-07 13:10 ` [PATCH 18/19] btrfs: support dev-replace " Naohiro Aota
2019-06-13 14:33   ` Josef Bacik
2019-06-18  9:14     ` Naohiro Aota
2019-06-07 13:10 ` [PATCH 19/19] btrfs: enable to mount HMZONED incompat flag Naohiro Aota
2019-06-07 13:17 ` [PATCH 01/12] btrfs-progs: build: Check zoned block device support Naohiro Aota
2019-06-07 13:17   ` [PATCH 02/12] btrfs-progs: utils: Introduce queue_param Naohiro Aota
2019-06-07 13:17   ` [PATCH 03/12] btrfs-progs: add new HMZONED feature flag Naohiro Aota
2019-06-07 13:17   ` [PATCH 04/12] btrfs-progs: Introduce zone block device helper functions Naohiro Aota
2019-06-07 13:17   ` [PATCH 05/12] btrfs-progs: load and check zone information Naohiro Aota
2019-06-07 13:17   ` [PATCH 06/12] btrfs-progs: avoid writing super block to sequential zones Naohiro Aota
2019-06-07 13:17   ` [PATCH 07/12] btrfs-progs: support discarding zoned device Naohiro Aota
2019-06-07 13:17   ` [PATCH 08/12] btrfs-progs: volume: align chunk allocation to zones Naohiro Aota
2019-06-07 13:17   ` [PATCH 09/12] btrfs-progs: do sequential allocation Naohiro Aota
2019-06-07 13:17   ` [PATCH 10/12] btrfs-progs: mkfs: Zoned block device support Naohiro Aota
2019-06-07 13:17   ` [PATCH 11/12] btrfs-progs: device-add: support HMZONED device Naohiro Aota
2019-06-07 13:17   ` [PATCH 12/12] btrfs-progs: introduce support for dev-place " Naohiro Aota
2019-06-12 17:51 ` [PATCH v2 00/19] btrfs zoned block device support David Sterba
2019-06-13  4:59   ` Naohiro Aota
2019-06-13 13:46     ` David Sterba
2019-06-14  2:07       ` Naohiro Aota
2019-06-17  2:44       ` Damien Le Moal

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190607131025.31996-7-naohiro.aota@wdc.com \
    --to=naohiro.aota@wdc.com \
    --cc=bvanassche@acm.org \
    --cc=clm@fb.com \
    --cc=damien.lemoal@wdc.com \
    --cc=dsterba@suse.com \
    --cc=hare@suse.com \
    --cc=josef@toxicpanda.com \
    --cc=jthumshirn@suse.de \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mb@lightnvm.io \
    --cc=nborisov@suse.com \
    --cc=wqu@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-BTRFS Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-btrfs/0 linux-btrfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-btrfs linux-btrfs/ https://lore.kernel.org/linux-btrfs \
		linux-btrfs@vger.kernel.org linux-btrfs@archiver.kernel.org
	public-inbox-index linux-btrfs


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-btrfs


AGPL code for this site: git clone https://public-inbox.org/ public-inbox