From: Naohiro Aota <naota@elisp.net>
To: David Sterba <dsterba@suse.com>, linux-btrfs@vger.kernel.org
Cc: Chris Mason <clm@fb.com>, Josef Bacik <jbacik@fb.com>,
	linux-kernel@vger.kernel.org, Hannes Reinecke <hare@suse.com>,
	Damien Le Moal <damien.lemoal@wdc.com>,
	Bart Van Assche <bart.vanassche@wdc.com>,
	Matias Bjorling <mb@lightnvm.io>, Naohiro Aota <naota@elisp.net>
Subject: [RFC PATCH 11/17] btrfs: introduce submit buffer
Date: Fri, 10 Aug 2018 03:04:44 +0900	[thread overview]
Message-ID: <20180809180450.5091-12-naota@elisp.net> (raw)
In-Reply-To: <20180809180450.5091-1-naota@elisp.net>

Sequential allocation alone is not enough to maintain sequential delivery
of write IOs to the device. Various asynchronous features of btrfs
(compression, checksumming, ...) can reorder the IOs. This patch
introduces a submit buffer that collects the WRITE bios belonging to a
block group and releases them in increasing block address order, so that
submit_stripe_bio() issues sequential write sequences.

Signed-off-by: Naohiro Aota <naota@elisp.net>
---
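A minimal userspace sketch of the buffering idea (not the kernel code;
struct wbio, submit() and issue() are hypothetical stand-ins for the
btrfs_bio machinery): writes arriving ahead of the expected offset are
parked on a list, and each in-order submission drains whatever parked
writes have become contiguous, in increasing address order.

#include <stdio.h>
#include <stdlib.h>

struct wbio {
	unsigned long long logical, len;
	struct wbio *next;
};

static struct wbio *buffer;               /* parked out-of-order writes */
static unsigned long long submit_offset;  /* next expected start address */

static void issue(struct wbio *b)         /* stands in for real submission */
{
	printf("write %llu+%llu\n", b->logical, b->len);
	submit_offset = b->logical + b->len;
	free(b);
}

static void submit(unsigned long long logical, unsigned long long len)
{
	struct wbio *b = malloc(sizeof(*b));
	int progress;

	b->logical = logical;
	b->len = len;
	b->next = NULL;
	if (logical != submit_offset) {
		/* ahead of the write pointer: park it for later */
		b->next = buffer;
		buffer = b;
		return;
	}
	issue(b);
	do {
		/* drain any parked writes that became contiguous */
		struct wbio **p = &buffer;

		progress = 0;
		while (*p) {
			if ((*p)->logical == submit_offset) {
				struct wbio *hit = *p;

				*p = hit->next;
				issue(hit);
				progress = 1;
			} else {
				p = &(*p)->next;
			}
		}
	} while (progress);
}

int main(void)
{
	submit(4096, 4096);   /* parked: write pointer is still at 0 */
	submit(8192, 4096);   /* parked */
	submit(0, 4096);      /* in order: all three go out sequentially */
	return 0;
}

Submitting 4096 and 8192 first only parks them; the write at offset 0
then flushes all three in ascending order, mirroring the send_bios loop
in __btrfs_map_bio_zoned() below.
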
 fs/btrfs/ctree.h       |   3 +
 fs/btrfs/extent-tree.c |   5 ++
 fs/btrfs/volumes.c     | 121 +++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.h     |   3 +
 4 files changed, 128 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5060bcdcb72b..ebbbf46aa540 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -696,6 +696,9 @@ struct btrfs_block_group_cache {
 	 */
 	enum btrfs_alloc_type alloc_type;
 	u64 alloc_offset;
+	spinlock_t submit_lock;
+	u64 submit_offset;
+	struct list_head submit_buffer;
 };
 
 /* delayed seq elem */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d4355b9b494e..6b7b632b0791 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -105,6 +105,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 	if (atomic_dec_and_test(&cache->count)) {
 		WARN_ON(cache->pinned > 0);
 		WARN_ON(cache->reserved > 0);
+		WARN_ON(!list_empty(&cache->submit_buffer));
 
 		/*
 		 * If not empty, someone is still holding mutex of
@@ -10059,6 +10060,8 @@ btrfs_get_block_group_alloc_offset(struct btrfs_block_group_cache *cache)
 		goto out;
 	}
 
+	cache->submit_offset = logical + cache->alloc_offset;
+
 out:
 	cache->alloc_type = alloc_type;
 	kfree(alloc_offsets);
@@ -10095,6 +10098,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
 
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
+	spin_lock_init(&cache->submit_lock);
 	init_rwsem(&cache->data_rwsem);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
@@ -10102,6 +10106,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
 	INIT_LIST_HEAD(&cache->ro_list);
 	INIT_LIST_HEAD(&cache->dirty_list);
 	INIT_LIST_HEAD(&cache->io_list);
+	INIT_LIST_HEAD(&cache->submit_buffer);
 	btrfs_init_free_space_ctl(cache);
 	atomic_set(&cache->trimming, 0);
 	mutex_init(&cache->free_space_lock);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 08d13da2553f..ca03b7136892 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -513,6 +513,8 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
 	spin_unlock(&device->io_lock);
 
 	while (pending) {
+		struct btrfs_bio *bbio;
+		struct completion *sent = NULL;
 
 		rmb();
 		/* we want to work on both lists, but do more bios on the
@@ -550,7 +552,12 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
 			sync_pending = 0;
 		}
 
+		bbio = cur->bi_private;
+		if (bbio)
+			sent = bbio->sent;
 		btrfsic_submit_bio(cur);
+		if (sent)
+			complete(sent);
 		num_run++;
 		batch_run++;
 
@@ -5542,6 +5549,7 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
 
 	atomic_set(&bbio->error, 0);
 	refcount_set(&bbio->refs, 1);
+	INIT_LIST_HEAD(&bbio->list);
 
 	return bbio;
 }
@@ -6351,7 +6359,7 @@ static void btrfs_end_bio(struct bio *bio)
  * the work struct is scheduled.
  */
 static noinline void btrfs_schedule_bio(struct btrfs_device *device,
-					struct bio *bio)
+					struct bio *bio, int need_seqwrite)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
 	int should_queue = 1;
@@ -6365,7 +6373,12 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
 
 	/* don't bother with additional async steps for reads, right now */
 	if (bio_op(bio) == REQ_OP_READ) {
+		struct btrfs_bio *bbio = bio->bi_private;
+		struct completion *sent = bbio->sent;
+
 		btrfsic_submit_bio(bio);
+		if (sent)
+			complete(sent);
 		return;
 	}
 
@@ -6373,7 +6386,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
 	bio->bi_next = NULL;
 
 	spin_lock(&device->io_lock);
-	if (op_is_sync(bio->bi_opf))
+	if (op_is_sync(bio->bi_opf) && need_seqwrite == 0)
 		pending_bios = &device->pending_sync_bios;
 	else
 		pending_bios = &device->pending_bios;
@@ -6412,8 +6425,21 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
 
 	btrfs_bio_counter_inc_noblocked(fs_info);
 
+	/* queue all bios into scheduler if sequential write is required */
+	if (bbio->need_seqwrite) {
+		if (!async) {
+			DECLARE_COMPLETION_ONSTACK(sent);
+
+			bbio->sent = &sent;
+			btrfs_schedule_bio(dev, bio, bbio->need_seqwrite);
+			wait_for_completion_io(&sent);
+		} else {
+			btrfs_schedule_bio(dev, bio, bbio->need_seqwrite);
+		}
+		return;
+	}
 	if (async)
-		btrfs_schedule_bio(dev, bio);
+		btrfs_schedule_bio(dev, bio, bbio->need_seqwrite);
 	else
 		btrfsic_submit_bio(bio);
 }
@@ -6465,6 +6491,90 @@ static void __btrfs_map_bio(struct btrfs_fs_info *fs_info, u64 logical,
 	btrfs_bio_counter_dec(fs_info);
 }
 
+static void __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info, u64 logical,
+		struct btrfs_bio *bbio, int async_submit)
+{
+	u64 length = bbio->orig_bio->bi_iter.bi_size;
+	struct btrfs_block_group_cache *cache = NULL;
+	int sent;
+	LIST_HEAD(submit_list);
+
+	WARN_ON(bio_op(bbio->orig_bio) != REQ_OP_WRITE);
+
+	cache = btrfs_lookup_block_group(fs_info, logical);
+	if (!cache || cache->alloc_type != BTRFS_ALLOC_SEQ) {
+		if (cache)
+			btrfs_put_block_group(cache);
+		__btrfs_map_bio(fs_info, logical, bbio, async_submit);
+		return;
+	}
+
+	bbio->need_seqwrite = 1;
+
+	spin_lock(&cache->submit_lock);
+	if (cache->submit_offset == logical)
+		goto send_bios;
+
+	if (cache->submit_offset > logical) {
+		btrfs_info(fs_info, "sending unaligned bio... %llu+%llu %llu",
+			logical, length, cache->submit_offset);
+		spin_unlock(&cache->submit_lock);
+		WARN_ON(1);
+		btrfs_put_block_group(cache);
+		__btrfs_map_bio(fs_info, logical, bbio, async_submit);
+		return;
+	}
+
+	/* buffer the unaligned bio */
+	list_add_tail(&bbio->list, &cache->submit_buffer);
+	spin_unlock(&cache->submit_lock);
+	btrfs_put_block_group(cache);
+
+	return;
+
+send_bios:
+	spin_unlock(&cache->submit_lock);
+	/* send this bio */
+	__btrfs_map_bio(fs_info, logical, bbio, 1);
+
+loop:
+	/* and send previously buffered following bios */
+	spin_lock(&cache->submit_lock);
+	cache->submit_offset += length;
+	length = 0;
+	INIT_LIST_HEAD(&submit_list);
+
+	/* collect sequential bios into submit_list */
+	do {
+		struct btrfs_bio *next;
+
+		sent = 0;
+		list_for_each_entry_safe(bbio, next,
+					 &cache->submit_buffer, list) {
+			struct bio *orig_bio = bbio->orig_bio;
+			u64 logical = (u64)orig_bio->bi_iter.bi_sector << 9;
+
+			if (logical == cache->submit_offset + length) {
+				sent = 1;
+				length += orig_bio->bi_iter.bi_size;
+				list_move_tail(&bbio->list, &submit_list);
+			}
+		}
+	} while (sent);
+	spin_unlock(&cache->submit_lock);
+
+	/* send the collected bios */
+	list_for_each_entry(bbio, &submit_list, list) {
+		__btrfs_map_bio(bbio->fs_info,
+				(u64)bbio->orig_bio->bi_iter.bi_sector << 9,
+				bbio, 1);
+	}
+
+	if (length)
+		goto loop;
+	btrfs_put_block_group(cache);
+}
+
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			   int mirror_num, int async_submit)
 {
@@ -6515,7 +6625,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 		BUG();
 	}
 
-	__btrfs_map_bio(fs_info, logical, bbio, async_submit);
+	if (btrfs_fs_incompat(fs_info, HMZONED) && bio_op(bio) == REQ_OP_WRITE)
+		__btrfs_map_bio_zoned(fs_info, logical, bbio, async_submit);
+	else
+		__btrfs_map_bio(fs_info, logical, bbio, async_submit);
 	return BLK_STS_OK;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 58053d2e24aa..3db90f5395cd 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -317,6 +317,9 @@ struct btrfs_bio {
 	int mirror_num;
 	int num_tgtdevs;
 	int *tgtdev_map;
+	int need_seqwrite;
+	struct list_head list;
+	struct completion *sent;
 	/*
 	 * logical block numbers for the start of each stripe
 	 * The last one or two are p/q.  These are sorted,
-- 
2.18.0

