From: Naohiro Aota <naohiro.aota@wdc.com>
To: linux-btrfs@vger.kernel.org, David Sterba <dsterba@suse.com>
Cc: Chris Mason <clm@fb.com>, Josef Bacik <josef@toxicpanda.com>,
	Nikolay Borisov <nborisov@suse.com>,
	Damien Le Moal <damien.lemoal@wdc.com>,
	Johannes Thumshirn <jthumshirn@suse.de>,
	Hannes Reinecke <hare@suse.com>,
	Anand Jain <anand.jain@oracle.com>,
	linux-fsdevel@vger.kernel.org,
	Naohiro Aota <naohiro.aota@wdc.com>
Subject: [PATCH v5 11/15] btrfs-progs: do sequential allocation in HMZONED mode
Date: Wed,  4 Dec 2019 17:25:09 +0900
Message-ID: <20191204082513.857320-12-naohiro.aota@wdc.com>
In-Reply-To: <20191204082513.857320-1-naohiro.aota@wdc.com>

On HMZONED drives, writes must always be sequential and must be
directed at the write pointer position of a block group's zone. Thus,
block allocation within a block group must also be done sequentially,
using an allocation pointer equal to the block group's zone write
pointer plus the number of blocks already allocated but not yet
written.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 common/hmzoned.c | 406 +++++++++++++++++++++++++++++++++++++++++++++++
 common/hmzoned.h |   7 +
 ctree.h          |   6 +
 extent-tree.c    |  16 ++
 4 files changed, 435 insertions(+)

diff --git a/common/hmzoned.c b/common/hmzoned.c
index 2cbf2fc88cb0..f268f360d8f7 100644
--- a/common/hmzoned.c
+++ b/common/hmzoned.c
@@ -29,6 +29,11 @@
 
 #define BTRFS_REPORT_NR_ZONES	8192
 
+/* Invalid allocation pointer value for missing devices */
+#define WP_MISSING_DEV ((u64)-1)
+/* Pseudo write pointer value for conventional zone */
+#define WP_CONVENTIONAL ((u64)-2)
+
 enum btrfs_zoned_model zoned_model(const char *file)
 {
 	char model[32];
@@ -505,6 +510,407 @@ bool btrfs_check_allocatable_zones(struct btrfs_device *device, u64 pos,
 	return true;
 }
 
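+/*
+ * Emulate a write pointer from the extent tree. Search for the key just
+ * past the end of the block group, step back to the last item before it,
+ * and derive the offset of the first free byte in the group from that
+ * item's position and length.
+ */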
+static int emulate_write_pointer(struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_group_cache *cache,
+				 u64 *offset_ret)
+{
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key search_key;
+	struct btrfs_key found_key;
+	int slot;
+	int ret;
+	u64 length;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	search_key.objectid = cache->key.objectid + cache->key.offset;
+	search_key.type = 0;
+	search_key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	ASSERT(ret != 0);
+	slot = path->slots[0];
+	leaf = path->nodes[0];
+	ASSERT(slot != 0);
+	slot--;
+	btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+	if (found_key.objectid < cache->key.objectid) {
+		*offset_ret = 0;
+	} else if (found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+		struct btrfs_key extent_item_key;
+
+		if (found_key.objectid != cache->key.objectid) {
+			ret = -EUCLEAN;
+			goto out;
+		}
+
+		length = 0;
+
+		/* metadata extents may use METADATA_ITEM_KEY, not EXTENT_ITEM_KEY */
+		if (slot == 0) {
+			ret = btrfs_prev_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			if (ret == 0) {
+				/* the path now points at the previous leaf */
+				leaf = path->nodes[0];
+				slot = btrfs_header_nritems(leaf) - 1;
+				btrfs_item_key_to_cpu(leaf, &extent_item_key,
+						      slot);
+			}
+		} else {
+			btrfs_item_key_to_cpu(leaf, &extent_item_key, slot - 1);
+			ret = 0;
+		}
+
+		if (ret == 0 &&
+		    extent_item_key.objectid == cache->key.objectid) {
+			if (extent_item_key.type == BTRFS_METADATA_ITEM_KEY)
+				length = fs_info->nodesize;
+			else if (extent_item_key.type == BTRFS_EXTENT_ITEM_KEY)
+				length = extent_item_key.offset;
+			else {
+				ret = -EUCLEAN;
+				goto out;
+			}
+		}
+
+		*offset_ret = length;
+	} else if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
+		   found_key.type == BTRFS_METADATA_ITEM_KEY) {
+
+		if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
+			length = found_key.offset;
+		else
+			length = fs_info->nodesize;
+
+		if (!(found_key.objectid >= cache->key.objectid &&
+		       found_key.objectid + length <=
+		       cache->key.objectid + cache->key.offset)) {
+			ret = -EUCLEAN;
+			goto out;
+		}
+		*offset_ret = found_key.objectid + length - cache->key.objectid;
+	} else {
+		ret = -ENOENT;
+		goto out;
+	}
+	ret = 0;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
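+/*
+ * Map the emulated logical allocation offset of a striped block group
+ * (RAID0/RAID10) to an allocation offset within one device stripe, using
+ * the write pointers of the following stripes to decide whether this
+ * stripe is fully written in the last (partial) stripe row.
+ */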
+static u64 offset_in_dev_extent(struct map_lookup *map, u64 *alloc_offsets,
+				u64 logical, int idx)
+{
+	u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+	u64 stripe_nr = logical / map->stripe_len;
+	u64 full_stripes_cnt = stripe_nr / map->num_stripes;
+	u32 rest_stripes_cnt = stripe_nr % map->num_stripes;
+	u64 stripe_start, offset;
+	int data_stripes = map->num_stripes / map->sub_stripes;
+	int stripe_idx;
+	int i;
+
+	ASSERT(profile == BTRFS_BLOCK_GROUP_RAID0 ||
+	       profile == BTRFS_BLOCK_GROUP_RAID10);
+
+	stripe_idx = idx / map->sub_stripes;
+
+	if (stripe_idx < rest_stripes_cnt)
+		return map->stripe_len * (full_stripes_cnt + 1);
+
+	for (i = idx + map->sub_stripes; i < map->num_stripes;
+	     i += map->sub_stripes) {
+		if (alloc_offsets[i] != WP_CONVENTIONAL &&
+		    alloc_offsets[i] > map->stripe_len * full_stripes_cnt)
+			return map->stripe_len * (full_stripes_cnt + 1);
+	}
+
+	stripe_start = (full_stripes_cnt * data_stripes + stripe_idx) *
+		map->stripe_len;
+	if (stripe_start >= logical)
+		return full_stripes_cnt * map->stripe_len;
+	offset = min_t(u64, logical - stripe_start, map->stripe_len);
+
+	return full_stripes_cnt * map->stripe_len + offset;
+}
+
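+/*
+ * Derive cache->alloc_offset from the write pointers of the device zones
+ * backing each stripe of the block group. Write pointers of conventional
+ * zones are emulated from the extent tree, and the per-profile code below
+ * checks that all stripes agree on a consistent allocation position.
+ */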
+int btrfs_load_block_group_zone_info(struct btrfs_fs_info *fs_info,
+				     struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_device *device;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct cache_extent *ce;
+	struct map_lookup *map;
+	u64 logical = cache->key.objectid;
+	u64 length = cache->key.offset;
+	u64 physical = 0;
+	int ret = 0;
+	int i, j;
+	u64 zone_size = fs_info->fs_devices->zone_size;
+	u64 *alloc_offsets = NULL;
+	u64 emulated_offset = 0;
+	u32 num_sequential = 0, num_conventional = 0;
+
+	if (!btrfs_fs_incompat(fs_info, HMZONED))
+		return 0;
+
+	/* Sanity check */
+	if (logical == BTRFS_BLOCK_RESERVED_1M_FOR_SUPER) {
+		if (length + SZ_1M != zone_size) {
+			error("unaligned initial system block group");
+			return -EIO;
+		}
+	} else if (!IS_ALIGNED(length, zone_size)) {
+		error("unaligned block group at %llu + %llu", logical, length);
+		return -EIO;
+	}
+
+	/* Get the chunk mapping */
+	ce = search_cache_extent(&map_tree->cache_tree, logical);
+	if (!ce) {
+		error("failed to find block group at %llu", logical);
+		return -ENOENT;
+	}
+	map = container_of(ce, struct map_lookup, ce);
+
+	/*
+	 * Get the zone type: if the group is mapped to a non-sequential zone,
+	 * there is no need for an allocation offset (regular fit allocation
+	 * can be used).
+	 */
+	alloc_offsets = calloc(map->num_stripes, sizeof(*alloc_offsets));
+	if (!alloc_offsets) {
+		error("failed to allocate alloc_offsets");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < map->num_stripes; i++) {
+		bool is_sequential;
+		struct blk_zone zone;
+
+		device = map->stripes[i].dev;
+		physical = map->stripes[i].physical;
+
+		if (device->fd == -1) {
+			alloc_offsets[i] = WP_MISSING_DEV;
+			continue;
+		}
+
+		is_sequential = btrfs_dev_is_sequential(device, physical);
+		if (is_sequential)
+			num_sequential++;
+		else
+			num_conventional++;
+
+		if (!is_sequential) {
+			alloc_offsets[i] = WP_CONVENTIONAL;
+			continue;
+		}
+
+		/*
+		 * The group is mapped to a sequential zone. Get the zone write
+		 * pointer to determine the allocation offset within the zone.
+		 */
+		WARN_ON(!IS_ALIGNED(physical, zone_size));
+		zone = device->zone_info->zones[physical / zone_size];
+
+		switch (zone.cond) {
+		case BLK_ZONE_COND_OFFLINE:
+		case BLK_ZONE_COND_READONLY:
+			error("Offline/readonly zone %llu",
+			      physical / fs_info->fs_devices->zone_size);
+			ret = -EIO;
+			goto out;
+		case BLK_ZONE_COND_EMPTY:
+			alloc_offsets[i] = 0;
+			break;
+		case BLK_ZONE_COND_FULL:
+			alloc_offsets[i] = zone_size;
+			break;
+		default:
+			/* Partially used zone */
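+			/* wp and start are in 512-byte sectors */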
+			alloc_offsets[i] = ((zone.wp - zone.start) << 9);
+			break;
+		}
+	}
+
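+	/*
+	 * If any stripe is on a conventional zone, emulate its write pointer
+	 * from the extent tree. If every stripe is conventional, use the
+	 * emulated offset directly as the allocation offset.
+	 */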
+	if (num_conventional > 0) {
+		ret = emulate_write_pointer(fs_info, cache, &emulated_offset);
+		if (ret || map->num_stripes == num_conventional) {
+			if (!ret)
+				cache->alloc_offset = emulated_offset;
+			goto out;
+		}
+	}
+
+	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+	case 0: /* single */
+	case BTRFS_BLOCK_GROUP_DUP:
+	case BTRFS_BLOCK_GROUP_RAID1:
+		cache->alloc_offset = WP_MISSING_DEV;
+		for (i = 0; i < map->num_stripes; i++) {
+			if (alloc_offsets[i] == WP_MISSING_DEV ||
+			    alloc_offsets[i] == WP_CONVENTIONAL)
+				continue;
+			if (cache->alloc_offset == WP_MISSING_DEV)
+				cache->alloc_offset = alloc_offsets[i];
+			if (alloc_offsets[i] == cache->alloc_offset)
+				continue;
+
+			error("write pointer mismatch: block group %llu",
+			      logical);
+			ret = -EIO;
+			goto out;
+		}
+		break;
+	case BTRFS_BLOCK_GROUP_RAID0:
+		cache->alloc_offset = 0;
+		for (i = 0; i < map->num_stripes; i++) {
+			if (alloc_offsets[i] == WP_MISSING_DEV) {
+				error(
+			"cannot recover write pointer: block group %llu",
+				      logical);
+				ret = -EIO;
+				goto out;
+			}
+
+			if (alloc_offsets[i] == WP_CONVENTIONAL)
+				alloc_offsets[i] =
+					offset_in_dev_extent(map, alloc_offsets,
+							     emulated_offset,
+							     i);
+
+			/*
+			 * Sanity check: with round-robin striping, write
+			 * pointers of consecutive stripes must be
+			 * non-increasing, must differ by at most one stripe
+			 * length, and at most one of the pair may be
+			 * unaligned.
+			 */
+			if (i > 0) {
+				if ((alloc_offsets[i] % BTRFS_STRIPE_LEN != 0 &&
+				     alloc_offsets[i - 1] %
+					     BTRFS_STRIPE_LEN != 0) ||
+				    (alloc_offsets[i - 1] < alloc_offsets[i]) ||
+				    (alloc_offsets[i - 1] - alloc_offsets[i] >
+						BTRFS_STRIPE_LEN)) {
+					error(
+				"write pointer mismatch at %d: block group %llu",
+					      i, logical);
+					ret = -EIO;
+					goto out;
+				}
+			}
+
+			cache->alloc_offset += alloc_offsets[i];
+		}
+		break;
+	case BTRFS_BLOCK_GROUP_RAID10:
+		/*
+		 * Pass 1: check write pointers at the RAID1 level: all write
+		 * pointers within a sub-stripe group must be equal.
+		 */
+		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
+			int base = i * map->sub_stripes;
+			u64 offset = WP_MISSING_DEV;
+			int fill = 0, sub_conventional = 0;
+
+			for (j = 0; j < map->sub_stripes; j++) {
+				if (alloc_offsets[base+j] == WP_MISSING_DEV) {
+					fill++;
+					continue;
+				}
+				if (alloc_offsets[base+j] == WP_CONVENTIONAL) {
+					fill++;
+					sub_conventional++;
+					continue;
+				}
+				if (offset == WP_MISSING_DEV)
+					offset = alloc_offsets[base + j];
+				if (alloc_offsets[base + j] == offset)
+					continue;
+
+				error(
+				"write pointer mismatch: block group %llu",
+				      logical);
+				ret = -EIO;
+				goto out;
+			}
+			if (!fill)
+				continue;
+			/* this RAID0 stripe lies entirely on conventional zones */
+			if (sub_conventional == map->sub_stripes)
+				offset = WP_CONVENTIONAL;
+			/* fill WP_MISSING_DEV or WP_CONVENTIONAL */
+			for (j = 0; j < map->sub_stripes; j++)
+				alloc_offsets[base + j] = offset;
+		}
+
+		/* Pass 2: check write pointers at the RAID0 level */
+		cache->alloc_offset = 0;
+		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
+			int base = i * map->sub_stripes;
+
+			if (alloc_offsets[base] == WP_MISSING_DEV) {
+				error(
+			"cannot recover write pointer: block group %llu",
+				      logical);
+				ret = -EIO;
+				goto out;
+			}
+
+			if (alloc_offsets[base] == WP_CONVENTIONAL)
+				alloc_offsets[base] =
+					offset_in_dev_extent(map, alloc_offsets,
+							     emulated_offset,
+							     base);
+
+			/* sanity check */
+			if (i > 0) {
+				int prev = base - map->sub_stripes;
+
+				if ((alloc_offsets[base] %
+					     BTRFS_STRIPE_LEN != 0 &&
+				     alloc_offsets[prev] %
+					     BTRFS_STRIPE_LEN != 0) ||
+				    (alloc_offsets[prev] <
+					     alloc_offsets[base]) ||
+				    (alloc_offsets[prev] - alloc_offsets[base] >
+						BTRFS_STRIPE_LEN)) {
+					error(
+				"write pointer mismatch: block group %llu",
+					      logical);
+					ret = -EIO;
+					goto out;
+				}
+			}
+
+			cache->alloc_offset += alloc_offsets[base];
+		}
+		break;
+	case BTRFS_BLOCK_GROUP_RAID5:
+	case BTRFS_BLOCK_GROUP_RAID6:
+		/* RAID5/6 is not supported yet */
+	default:
+		error("Unsupported profile %llu",
+		      map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+		ret = -EINVAL;
+		goto out;
+	}
+
+out:
+	/* an extent allocated past the write pointer indicates corruption */
+	if (num_conventional && emulated_offset > cache->alloc_offset)
+		ret = -EIO;
+
+	free(alloc_offsets);
+	return ret;
+}
+
 #endif
 
 int btrfs_get_zone_info(int fd, const char *file, bool hmzoned,
diff --git a/common/hmzoned.h b/common/hmzoned.h
index 3444e2c1b0f5..a6b16d0ed35a 100644
--- a/common/hmzoned.h
+++ b/common/hmzoned.h
@@ -70,6 +70,8 @@ static inline size_t sbwrite(int fd, void *buf, off_t offset)
 int btrfs_wipe_sb_zones(int fd, struct btrfs_zoned_device_info *zinfo);
 bool btrfs_check_allocatable_zones(struct btrfs_device *device, u64 pos,
 				   u64 num_bytes);
+int btrfs_load_block_group_zone_info(struct btrfs_fs_info *fs_info,
+				     struct btrfs_block_group_cache *cache);
 #else
 static inline bool zone_is_sequential(struct btrfs_zoned_device_info *zinfo,
 				      u64 bytenr)
@@ -105,6 +107,11 @@ static inline bool btrfs_check_allocatable_zones(struct btrfs_device *device,
 {
 	return true;
 }
+static inline int btrfs_load_block_group_zone_info(
+	struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache)
+{
+	return 0;
+}
 
 #endif /* BTRFS_ZONED */
 
diff --git a/ctree.h b/ctree.h
index 34fd7d00cabf..fe72bd8921b0 100644
--- a/ctree.h
+++ b/ctree.h
@@ -1119,6 +1119,12 @@ struct btrfs_block_group_cache {
          */
         u32 bitmap_low_thresh;
 
+	/*
+	 * Allocation offset for the block group to implement
+	 * sequential allocation. This is used only with HMZONED mode
+	 * enabled.
+	 */
+	u64 alloc_offset;
 };
 
 struct btrfs_device;
diff --git a/extent-tree.c b/extent-tree.c
index 53be4f4c7369..89a8b935b602 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -30,6 +30,7 @@
 #include "volumes.h"
 #include "free-space-cache.h"
 #include "free-space-tree.h"
+#include "common/hmzoned.h"
 #include "common/utils.h"
 
 #define PENDING_EXTENT_INSERT 0
@@ -258,6 +259,14 @@ again:
 	if (cache->ro || !block_group_bits(cache, data))
 		goto new_group;
 
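+	/* HMZONED: allocate sequentially at the block group allocation pointer */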
+	if (root->fs_info->fs_devices->hmzoned) {
+		if (cache->key.offset - cache->alloc_offset < num)
+			goto new_group;
+		*start_ret = cache->key.objectid + cache->alloc_offset;
+		cache->alloc_offset += num;
+		return 0;
+	}
+
 	while(1) {
 		ret = find_first_extent_bit(&root->fs_info->free_space_cache,
 					    last, &start, &end, EXTENT_DIRTY);
@@ -2720,6 +2729,10 @@ static int read_one_block_group(struct btrfs_fs_info *fs_info,
 	}
 	cache->space_info = space_info;
 
+	ret = btrfs_load_block_group_zone_info(fs_info, cache);
+	if (ret)
+		return ret;
+
 	set_extent_bits(block_group_cache, cache->key.objectid,
 			cache->key.objectid + cache->key.offset - 1,
 			bit | EXTENT_LOCKED);
@@ -2785,6 +2798,9 @@ btrfs_add_block_group(struct btrfs_fs_info *fs_info, u64 bytes_used, u64 type,
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
 
+	ret = btrfs_load_block_group_zone_info(fs_info, cache);
+	BUG_ON(ret);
+
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	cache->used = bytes_used;
 	cache->flags = type;
-- 
2.24.0

