Linux-BTRFS Archive on lore.kernel.org
 help / color / Atom feed
From: Naohiro Aota <naohiro.aota@wdc.com>
To: linux-btrfs@vger.kernel.org, David Sterba <dsterba@suse.com>
Cc: "Chris Mason" <clm@fb.com>, "Josef Bacik" <josef@toxicpanda.com>,
	"Qu Wenruo" <wqu@suse.com>, "Nikolay Borisov" <nborisov@suse.com>,
	linux-kernel@vger.kernel.org, "Hannes Reinecke" <hare@suse.com>,
	linux-fsdevel@vger.kernel.org,
	"Damien Le Moal" <damien.lemoal@wdc.com>,
	"Matias Bjørling" <mb@lightnvm.io>,
	"Johannes Thumshirn" <jthumshirn@suse.de>,
	"Bart Van Assche" <bvanassche@acm.org>,
	"Naohiro Aota" <naohiro.aota@wdc.com>
Subject: [PATCH 09/12] btrfs-progs: do sequential allocation
Date: Fri,  7 Jun 2019 22:17:48 +0900
Message-ID: <20190607131751.5359-9-naohiro.aota@wdc.com> (raw)
In-Reply-To: <20190607131751.5359-1-naohiro.aota@wdc.com>

Ensure that block allocation in sequential-write-required zones is always
done sequentially, using an allocation pointer equal to the zone write
pointer plus the number of blocks already allocated but not yet written.
For conventional zones, the legacy first-fit behavior is kept.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 ctree.h       |  17 +++++
 extent-tree.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++
 transaction.c |  16 +++++
 3 files changed, 219 insertions(+)

diff --git a/ctree.h b/ctree.h
index 9f79686690e0..2e828bf1250e 100644
--- a/ctree.h
+++ b/ctree.h
@@ -1068,15 +1068,32 @@ struct btrfs_space_info {
 	struct list_head list;
 };
 
+/* Allocation policy applied to a block group */
+enum btrfs_alloc_type {
+
+	/* Regular first-fit allocation; used for conventional zones */
+	BTRFS_ALLOC_FIT		= 0,
+
+	/*
+	 * Sequential allocation, used in HMZONED mode: space is
+	 * handed out at the block group's alloc_offset only, so
+	 * free space located before that offset is ignored.
+	 */
+	BTRFS_ALLOC_SEQ		= 1,
+};
+
 struct btrfs_block_group_cache {
 	struct cache_extent cache;
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	struct btrfs_space_info *space_info;
 	struct btrfs_free_space_ctl *free_space_ctl;
+	enum btrfs_alloc_type alloc_type;
 	u64 bytes_super;
 	u64 pinned;
 	u64 flags;
+	u64 alloc_offset;
+	u64 write_offset;
 	int cached;
 	int ro;
 	/*
diff --git a/extent-tree.c b/extent-tree.c
index e62ee8c2ba13..528c6875c8fb 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -251,6 +251,14 @@ again:
 	if (cache->ro || !block_group_bits(cache, data))
 		goto new_group;
 
+	if (cache->alloc_type == BTRFS_ALLOC_SEQ) {
+		if (cache->key.offset - cache->alloc_offset < num)
+			goto new_group;
+		*start_ret = cache->key.objectid + cache->alloc_offset;
+		cache->alloc_offset += num;
+		return 0;
+	}
+
 	while(1) {
 		ret = find_first_extent_bit(&root->fs_info->free_space_cache,
 					    last, &start, &end, EXTENT_DIRTY);
@@ -277,6 +285,7 @@ out:
 			(unsigned long long)search_start);
 		return -ENOENT;
 	}
+	printf("nospace\n");
 	return -ENOSPC;
 
 new_group:
@@ -3039,6 +3048,196 @@ error:
 	return ret;
 }
 
+#ifdef BTRFS_ZONED
+/*
+ * Set up the allocation state of a block group on an HMZONED filesystem.
+ *
+ * Looks up the chunk mapping of @cache and checks the zone type under each
+ * stripe. When the stripes are on sequential zones, cache->alloc_type becomes
+ * BTRFS_ALLOC_SEQ and cache->alloc_offset is derived from the zones' write
+ * pointers according to the RAID profile. On every path that reaches the end
+ * of the function, cache->write_offset is set to cache->alloc_offset.
+ * Nothing is touched when the HMZONED incompat flag is not set.
+ *
+ * Returns 0 on success, -EIO for an unaligned block group, mixed zone types,
+ * an offline/read-only zone or mismatching write pointers, -ENOENT when the
+ * chunk mapping is missing, -ENOMEM when the offset array cannot be
+ * allocated, and -EINVAL for an unsupported profile.
+ */
+static int
+btrfs_get_block_group_alloc_offset(struct btrfs_fs_info *fs_info,
+				   struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_device *device;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct cache_extent *ce;
+	struct map_lookup *map;
+	u64 logical = cache->key.objectid;
+	u64 length = cache->key.offset;
+	u64 physical = 0;
+	int ret = 0;
+	int i;
+	u64 zone_size = fs_info->fs_devices->zone_size;
+	u64 *alloc_offsets = NULL;
+
+	if (!btrfs_fs_incompat(fs_info, HMZONED))
+		return 0;
+
+	/* Sanity check */
+	if (!IS_ALIGNED(length, zone_size)) {
+		fprintf(stderr, "unaligned block group at %llu\n", logical);
+		return -EIO;
+	}
+
+	/* Get the chunk mapping */
+	ce = search_cache_extent(&map_tree->cache_tree, logical);
+	if (!ce) {
+		fprintf(stderr, "failed to find block group at %llu\n",
+			logical);
+		return -ENOENT;
+	}
+	map = container_of(ce, struct map_lookup, ce);
+
+	/*
+	 * Get the zone type: if the group is mapped to a non-sequential zone,
+	 * there is no need for the allocation offset (fit allocation is OK).
+	 */
+	device = map->stripes[0].dev;
+	physical = map->stripes[0].physical;
+	if (!zone_is_random_write(&device->zinfo, physical))
+		cache->alloc_type = BTRFS_ALLOC_SEQ;
+
+	/* check block group mapping */
+	alloc_offsets = calloc(map->num_stripes, sizeof(*alloc_offsets));
+	if (!alloc_offsets)
+		return -ENOMEM;
+
+	for (i = 0; i < map->num_stripes; i++) {
+		int is_sequential;
+		struct blk_zone zone;
+
+		device = map->stripes[i].dev;
+		physical = map->stripes[i].physical;
+
+		is_sequential = !zone_is_random_write(&device->zinfo, physical);
+		if ((is_sequential && cache->alloc_type != BTRFS_ALLOC_SEQ) ||
+		    (!is_sequential && cache->alloc_type == BTRFS_ALLOC_SEQ)) {
+			fprintf(stderr,
+				"found block group of mixed zone types\n");
+			ret = -EIO;
+			goto out;
+		}
+
+		if (!is_sequential)
+			continue;
+
+		WARN_ON(!IS_ALIGNED(physical, zone_size));
+		zone = device->zinfo.zones[physical / zone_size];
+
+		/*
+		 * The group is mapped to a sequential zone. Get the zone write
+		 * pointer to determine the allocation offset within the zone.
+		 */
+		switch (zone.cond) {
+		case BLK_ZONE_COND_OFFLINE:
+		case BLK_ZONE_COND_READONLY:
+			fprintf(stderr, "Offline/readonly zone %llu\n",
+				physical / zone_size);
+			ret = -EIO;
+			goto out;
+		case BLK_ZONE_COND_EMPTY:
+			alloc_offsets[i] = 0;
+			break;
+		case BLK_ZONE_COND_FULL:
+			alloc_offsets[i] = zone_size;
+			break;
+		default:
+			/* Partially used: wp/start are in 512-byte sectors */
+			alloc_offsets[i] = ((zone.wp - zone.start) << 9);
+			break;
+		}
+	}
+
+	if (cache->alloc_type != BTRFS_ALLOC_SEQ)
+		goto out;
+
+	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+	case 0: /* single */
+	case BTRFS_BLOCK_GROUP_DUP:
+	case BTRFS_BLOCK_GROUP_RAID1:
+		for (i = 1; i < map->num_stripes; i++) {
+			if (alloc_offsets[i] != alloc_offsets[0]) {
+				fprintf(stderr,
+					"zones' write pointers mismatch\n");
+				ret = -EIO;
+				goto out;
+			}
+		}
+		cache->alloc_offset = alloc_offsets[0];
+		break;
+	case BTRFS_BLOCK_GROUP_RAID0:
+		cache->alloc_offset = alloc_offsets[0];
+		for (i = 1; i < map->num_stripes; i++) {
+			cache->alloc_offset += alloc_offsets[i];
+			if (alloc_offsets[0] < alloc_offsets[i]) {
+				fprintf(stderr,
+					"zones' write pointers mismatch\n");
+				ret = -EIO;
+				goto out;
+			}
+		}
+		break;
+	case BTRFS_BLOCK_GROUP_RAID10:
+		cache->alloc_offset = 0;
+		for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
+			int j;
+			int base;
+
+			base = i*map->sub_stripes;
+			for (j = 1; j < map->sub_stripes; j++) {
+				if (alloc_offsets[base] !=
+					alloc_offsets[base+j]) {
+					fprintf(stderr,
+						"zones' write pointer mismatch\n");
+					ret = -EIO;
+					goto out;
+				}
+			}
+
+			if (alloc_offsets[0] < alloc_offsets[base]) {
+				fprintf(stderr,
+					"zones' write pointer mismatch\n");
+				ret = -EIO;
+				goto out;
+			}
+			cache->alloc_offset += alloc_offsets[base];
+		}
+		break;
+	case BTRFS_BLOCK_GROUP_RAID5:
+	case BTRFS_BLOCK_GROUP_RAID6:
+		/* RAID5/6 is not supported yet */
+	default:
+		fprintf(stderr, "Unsupported profile %llu\n",
+			map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+		ret = -EINVAL;
+		goto out;
+	}
+
+out:
+	cache->write_offset = cache->alloc_offset;
+	free(alloc_offsets);
+	return ret;
+}
+#else
+/* Stub for builds without BTRFS_ZONED: nothing to set up. */
+static int
+btrfs_get_block_group_alloc_offset(struct btrfs_fs_info *fs_info,
+				   struct btrfs_block_group_cache *cache)
+{
+	return 0;
+}
+#endif
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
@@ -3122,6 +3301,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		BUG_ON(ret);
 		cache->space_info = space_info;
 
+		ret = btrfs_get_block_group_alloc_offset(info, cache);
+		if (ret)
+			goto error;
+
 		/* use EXTENT_LOCKED to prevent merging */
 		set_extent_bits(block_group_cache, found_key.objectid,
 				found_key.objectid + found_key.offset - 1,
@@ -3151,6 +3334,9 @@ btrfs_add_block_group(struct btrfs_fs_info *fs_info, u64 bytes_used, u64 type,
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
 
+	ret = btrfs_get_block_group_alloc_offset(fs_info, cache);
+	BUG_ON(ret);
+
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	btrfs_set_block_group_used(&cache->item, bytes_used);
 	btrfs_set_block_group_chunk_objectid(&cache->item,
diff --git a/transaction.c b/transaction.c
index 138e10f0d6cc..39a52732bc71 100644
--- a/transaction.c
+++ b/transaction.c
@@ -129,16 +129,32 @@ int __commit_transaction(struct btrfs_trans_handle *trans,
 {
 	u64 start;
 	u64 end;
+	u64 next = 0;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_buffer *eb;
 	struct extent_io_tree *tree = &fs_info->extent_cache;
+	struct btrfs_block_group_cache *bg = NULL;
 	int ret;
 
 	while(1) {
+again:
 		ret = find_first_extent_bit(tree, 0, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
+		bg = btrfs_lookup_first_block_group(fs_info, start);
+		BUG_ON(!bg);
+		if (bg->alloc_type == BTRFS_ALLOC_SEQ &&
+		    bg->key.objectid + bg->write_offset < start) {
+			next = bg->key.objectid + bg->write_offset;
+			BUG_ON(next + fs_info->nodesize > start);
+			eb = btrfs_find_create_tree_block(fs_info, next);
+			btrfs_mark_buffer_dirty(eb);
+			free_extent_buffer(eb);
+			goto again;
+		}
+		if (bg->alloc_type == BTRFS_ALLOC_SEQ)
+			bg->write_offset += (end + 1 - start);
 		while(start <= end) {
 			eb = find_first_extent_buffer(tree, start);
 			BUG_ON(!eb || eb->start != start);
-- 
2.21.0


  parent reply index

Thread overview: 79+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-06-07 13:10 [PATCH v2 00/19] btrfs zoned block device support Naohiro Aota
2019-06-07 13:10 ` [PATCH 01/19] btrfs: introduce HMZONED feature flag Naohiro Aota
2019-06-07 13:10 ` [PATCH 02/19] btrfs: Get zone information of zoned block devices Naohiro Aota
2019-06-13 13:58   ` Josef Bacik
2019-06-18  6:04     ` Naohiro Aota
2019-06-13 13:58   ` Josef Bacik
2019-06-17 18:57   ` David Sterba
2019-06-18  6:42     ` Naohiro Aota
2019-06-27 15:11       ` David Sterba
2019-06-07 13:10 ` [PATCH 03/19] btrfs: Check and enable HMZONED mode Naohiro Aota
2019-06-13 13:57   ` Josef Bacik
2019-06-18  6:43     ` Naohiro Aota
2019-06-07 13:10 ` [PATCH 04/19] btrfs: disable fallocate in " Naohiro Aota
2019-06-07 13:10 ` [PATCH 05/19] btrfs: disable direct IO " Naohiro Aota
2019-06-13 14:00   ` Josef Bacik
2019-06-18  8:17     ` Naohiro Aota
2019-06-07 13:10 ` [PATCH 06/19] btrfs: align dev extent allocation to zone boundary Naohiro Aota
2019-06-07 13:10 ` [PATCH 07/19] btrfs: do sequential extent allocation in HMZONED mode Naohiro Aota
2019-06-13 14:07   ` Josef Bacik
2019-06-18  8:28     ` Naohiro Aota
2019-06-18 13:37       ` Josef Bacik
2019-06-17 22:30   ` David Sterba
2019-06-18  8:49     ` Naohiro Aota
2019-06-27 15:28       ` David Sterba
2019-06-07 13:10 ` [PATCH 08/19] btrfs: make unmirroed BGs readonly only if we have at least one writable BG Naohiro Aota
2019-06-13 14:09   ` Josef Bacik
2019-06-18  7:42     ` Naohiro Aota
2019-06-18 13:35       ` Josef Bacik
2019-06-07 13:10 ` [PATCH 09/19] btrfs: limit super block locations in HMZONED mode Naohiro Aota
2019-06-13 14:12   ` Josef Bacik
2019-06-18  8:51     ` Naohiro Aota
2019-06-17 22:53   ` David Sterba
2019-06-18  9:01     ` Naohiro Aota
2019-06-27 15:35       ` David Sterba
2019-06-28  3:55   ` Anand Jain
2019-06-28  6:39     ` Naohiro Aota
2019-06-28  6:52       ` Anand Jain
2019-06-07 13:10 ` [PATCH 10/19] btrfs: rename btrfs_map_bio() Naohiro Aota
2019-06-07 13:10 ` [PATCH 11/19] btrfs: introduce submit buffer Naohiro Aota
2019-06-13 14:14   ` Josef Bacik
2019-06-17  3:16     ` Damien Le Moal
2019-06-18  0:00       ` David Sterba
2019-06-18  4:04         ` Damien Le Moal
2019-06-18 13:33       ` Josef Bacik
2019-06-19 10:32         ` Damien Le Moal
2019-06-07 13:10 ` [PATCH 12/19] btrfs: expire submit buffer on timeout Naohiro Aota
2019-06-13 14:15   ` Josef Bacik
2019-06-17  3:19     ` Damien Le Moal
2019-06-07 13:10 ` [PATCH 13/19] btrfs: avoid sync IO prioritization on checksum in HMZONED mode Naohiro Aota
2019-06-13 14:17   ` Josef Bacik
2019-06-07 13:10 ` [PATCH 14/19] btrfs: redirty released extent buffers in sequential BGs Naohiro Aota
2019-06-13 14:24   ` Josef Bacik
2019-06-18  9:09     ` Naohiro Aota
2019-06-07 13:10 ` [PATCH 15/19] btrfs: reset zones of unused block groups Naohiro Aota
2019-06-07 13:10 ` [PATCH 16/19] btrfs: wait existing extents before truncating Naohiro Aota
2019-06-13 14:25   ` Josef Bacik
2019-06-07 13:10 ` [PATCH 17/19] btrfs: shrink delayed allocation size in HMZONED mode Naohiro Aota
2019-06-13 14:27   ` Josef Bacik
2019-06-07 13:10 ` [PATCH 18/19] btrfs: support dev-replace " Naohiro Aota
2019-06-13 14:33   ` Josef Bacik
2019-06-18  9:14     ` Naohiro Aota
2019-06-07 13:10 ` [PATCH 19/19] btrfs: enable to mount HMZONED incompat flag Naohiro Aota
2019-06-07 13:17 ` [PATCH 01/12] btrfs-progs: build: Check zoned block device support Naohiro Aota
2019-06-07 13:17   ` [PATCH 02/12] btrfs-progs: utils: Introduce queue_param Naohiro Aota
2019-06-07 13:17   ` [PATCH 03/12] btrfs-progs: add new HMZONED feature flag Naohiro Aota
2019-06-07 13:17   ` [PATCH 04/12] btrfs-progs: Introduce zone block device helper functions Naohiro Aota
2019-06-07 13:17   ` [PATCH 05/12] btrfs-progs: load and check zone information Naohiro Aota
2019-06-07 13:17   ` [PATCH 06/12] btrfs-progs: avoid writing super block to sequential zones Naohiro Aota
2019-06-07 13:17   ` [PATCH 07/12] btrfs-progs: support discarding zoned device Naohiro Aota
2019-06-07 13:17   ` [PATCH 08/12] btrfs-progs: volume: align chunk allocation to zones Naohiro Aota
2019-06-07 13:17   ` Naohiro Aota [this message]
2019-06-07 13:17   ` [PATCH 10/12] btrfs-progs: mkfs: Zoned block device support Naohiro Aota
2019-06-07 13:17   ` [PATCH 11/12] btrfs-progs: device-add: support HMZONED device Naohiro Aota
2019-06-07 13:17   ` [PATCH 12/12] btrfs-progs: introduce support for dev-place " Naohiro Aota
2019-06-12 17:51 ` [PATCH v2 00/19] btrfs zoned block device support David Sterba
2019-06-13  4:59   ` Naohiro Aota
2019-06-13 13:46     ` David Sterba
2019-06-14  2:07       ` Naohiro Aota
2019-06-17  2:44       ` Damien Le Moal

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190607131751.5359-9-naohiro.aota@wdc.com \
    --to=naohiro.aota@wdc.com \
    --cc=bvanassche@acm.org \
    --cc=clm@fb.com \
    --cc=damien.lemoal@wdc.com \
    --cc=dsterba@suse.com \
    --cc=hare@suse.com \
    --cc=josef@toxicpanda.com \
    --cc=jthumshirn@suse.de \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mb@lightnvm.io \
    --cc=nborisov@suse.com \
    --cc=wqu@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-BTRFS Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-btrfs/0 linux-btrfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-btrfs linux-btrfs/ https://lore.kernel.org/linux-btrfs \
		linux-btrfs@vger.kernel.org linux-btrfs@archiver.kernel.org
	public-inbox-index linux-btrfs


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-btrfs


AGPL code for this site: git clone https://public-inbox.org/ public-inbox