All of lore.kernel.org
 help / color / mirror / Atom feed
From: Naohiro Aota <naohiro.aota@wdc.com>
To: linux-btrfs@vger.kernel.org, David Sterba <dsterba@suse.com>
Cc: "Chris Mason" <clm@fb.com>, "Josef Bacik" <josef@toxicpanda.com>,
	"Qu Wenruo" <wqu@suse.com>, "Nikolay Borisov" <nborisov@suse.com>,
	linux-kernel@vger.kernel.org, "Hannes Reinecke" <hare@suse.com>,
	linux-fsdevel@vger.kernel.org,
	"Damien Le Moal" <damien.lemoal@wdc.com>,
	"Matias Bjørling" <mb@lightnvm.io>,
	"Johannes Thumshirn" <jthumshirn@suse.de>,
	"Bart Van Assche" <bvanassche@acm.org>,
	"Naohiro Aota" <naohiro.aota@wdc.com>
Subject: [PATCH 12/19] btrfs: expire submit buffer on timeout
Date: Fri,  7 Jun 2019 22:10:18 +0900	[thread overview]
Message-ID: <20190607131025.31996-13-naohiro.aota@wdc.com> (raw)
In-Reply-To: <20190607131025.31996-1-naohiro.aota@wdc.com>

It is possible for bios to stall in the submit buffer because of a bug or a
device problem. In such a situation, btrfs stops working while it waits for
the buffered bios to complete. To avoid such a hang, add a worker that
cancels the stalled bios after a timeout.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/ctree.h             |  13 ++++
 fs/btrfs/disk-io.c           |   2 +
 fs/btrfs/extent-tree.c       |  16 +++-
 fs/btrfs/super.c             |  18 +++++
 fs/btrfs/volumes.c           | 146 ++++++++++++++++++++++++++++++++++-
 include/trace/events/btrfs.h |   2 +
 6 files changed, 193 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ade6d8243962..dad8ea5c3b99 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -596,6 +596,8 @@ enum btrfs_alloc_type {
 	BTRFS_ALLOC_SEQ		= 1,
 };
 
+struct expire_work;
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
@@ -721,6 +723,14 @@ struct btrfs_block_group_cache {
 	struct mutex submit_lock;
 	u64 submit_offset;
 	struct bio_list submit_buffer;
+	struct expire_work *expire_work;
+	int expired:1;
+};
+
+struct expire_work {
+	struct list_head list;		/* link in fs_info->expire_work_list */
+	struct delayed_work work;	/* handler is expire_bios_fn() */
+	struct btrfs_block_group_cache *block_group;	/* back-pointer */
 };
 
 /* delayed seq elem */
@@ -1194,6 +1204,9 @@ struct btrfs_fs_info {
 	spinlock_t ref_verify_lock;
 	struct rb_root block_tree;
 #endif
+
+	struct list_head expire_work_list;
+	struct mutex expire_work_lock;
 };
 
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ddbb02906042..56a416902ce7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2717,6 +2717,8 @@ int open_ctree(struct super_block *sb,
 	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 	spin_lock_init(&fs_info->reada_lock);
 	btrfs_init_ref_verify(fs_info);
+	INIT_LIST_HEAD(&fs_info->expire_work_list);
+	mutex_init(&fs_info->expire_work_lock);
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ebdc7a6dbe01..cb29a96c226b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -125,6 +125,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 		WARN_ON(cache->pinned > 0);
 		WARN_ON(cache->reserved > 0);
 		WARN_ON(!bio_list_empty(&cache->submit_buffer));
+		WARN_ON(cache->expire_work);
 
 		/*
 		 * If not empty, someone is still holding mutex of
@@ -10180,6 +10181,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		    block_group->cached == BTRFS_CACHE_ERROR)
 			free_excluded_extents(block_group);
 
+		if (block_group->alloc_type == BTRFS_ALLOC_SEQ) {
+			mutex_lock(&block_group->submit_lock);
+			WARN_ON(!bio_list_empty(&block_group->submit_buffer));
+			WARN_ON(block_group->expire_work != NULL);
+			mutex_unlock(&block_group->submit_lock);
+		}
+
 		btrfs_remove_free_space_cache(block_group);
 		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
 		ASSERT(list_empty(&block_group->dirty_list));
@@ -10513,6 +10521,7 @@ btrfs_get_block_group_alloc_offset(struct btrfs_block_group_cache *cache)
 	}
 
 	cache->submit_offset = logical + cache->alloc_offset;
+	cache->expired = 0;
 
 out:
 	cache->alloc_type = alloc_type;
@@ -10565,6 +10574,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
 	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
 	cache->alloc_type = BTRFS_ALLOC_FIT;
 	cache->alloc_offset = 0;
+	cache->expire_work = NULL;
 
 	if (btrfs_fs_incompat(fs_info, HMZONED)) {
 		ret = btrfs_get_block_group_alloc_offset(cache);
@@ -11329,11 +11339,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 
 		/* Don't want to race with allocators so take the groups_sem */
 		down_write(&space_info->groups_sem);
+		mutex_lock(&block_group->submit_lock);
 		spin_lock(&block_group->lock);
 		if (block_group->reserved || block_group->pinned ||
 		    btrfs_block_group_used(&block_group->item) ||
 		    block_group->ro ||
-		    list_is_singular(&block_group->list)) {
+		    list_is_singular(&block_group->list) ||
+		    !bio_list_empty(&block_group->submit_buffer)) {
 			/*
 			 * We want to bail if we made new allocations or have
 			 * outstanding allocations in this block group.  We do
@@ -11342,10 +11354,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 			 */
 			trace_btrfs_skip_unused_block_group(block_group);
 			spin_unlock(&block_group->lock);
+			mutex_unlock(&block_group->submit_lock);
 			up_write(&space_info->groups_sem);
 			goto next;
 		}
 		spin_unlock(&block_group->lock);
+		mutex_unlock(&block_group->submit_lock);
 
 		/* We don't want to force the issue, only flip if it's ok. */
 		ret = inc_block_group_ro(block_group, 0);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 740a701f16c5..343c26537999 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -154,6 +154,24 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
 	 * completes. The next time when the filesystem is mounted writable
 	 * again, the device replace operation continues.
 	 */
+
+	/* expire pending bios in submit buffer */
+	if (btrfs_fs_incompat(fs_info, HMZONED)) {
+		struct expire_work *work;
+		struct btrfs_block_group_cache *block_group;
+
+		mutex_lock(&fs_info->expire_work_lock);
+		list_for_each_entry(work, &fs_info->expire_work_list, list) {
+			block_group = work->block_group;
+			mutex_lock(&block_group->submit_lock);
+			if (block_group->expire_work)
+				mod_delayed_work(
+					system_unbound_wq,
+					&block_group->expire_work->work, 0);
+			mutex_unlock(&block_group->submit_lock);
+		};
+		mutex_unlock(&fs_info->expire_work_lock);
+	}
 }
 
 #ifdef CONFIG_PRINTK
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 26a64a53032f..a04379e440fb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6840,6 +6840,124 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 	}
 }
 
+/*
+ * Delayed work: fail any bios still parked in the block group's submit
+ * buffer with BLK_STS_IOERR and flag the block group expired; always free
+ * the work item and drop the block group reference held for it.
+ */
+static void expire_bios_fn(struct work_struct *work)
+{
+	struct expire_work *ework;
+	struct btrfs_block_group_cache *cache;
+	struct bio *bio = NULL, *next;
+
+	ework = container_of(work, struct expire_work, work.work);
+	cache = ework->block_group;
+
+	mutex_lock(&cache->fs_info->expire_work_lock);
+	mutex_lock(&cache->submit_lock);
+	list_del(&ework->list);
+	/* cancel_expire_work() detached us after losing the race: free only */
+	if (cache->expire_work != ework)
+		goto unlock;
+	cache->expire_work = NULL;
+
+	if (btrfs_fs_closing(cache->fs_info)) {
+		WARN_ON(!bio_list_empty(&cache->submit_buffer));
+	} else if (!bio_list_empty(&cache->submit_buffer)) {
+		bio = bio_list_get(&cache->submit_buffer);
+		cache->expired = 1;
+	}
+unlock:
+	/* Unlock exactly once; the fs-error path takes expire_work_lock. */
+	mutex_unlock(&cache->submit_lock);
+	mutex_unlock(&cache->fs_info->expire_work_lock);
+	kfree(ework);
+	if (bio) {
+		btrfs_handle_fs_error(cache->fs_info, -EIO,
+				      "bio submit buffer expired");
+		btrfs_err(cache->fs_info, "block group %llu submit pos %llu",
+			  cache->key.objectid, cache->submit_offset);
+	}
+	while (bio) {
+		struct map_bio_data *map_private = bio->bi_private;
+
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+		bio->bi_private = map_private->orig_bi_private;
+		kfree(map_private);
+		trace_btrfs_expire_bio(cache, bio);
+		bio->bi_status = BLK_STS_IOERR;
+		bio_endio(bio);
+		bio = next;
+	}
+	btrfs_put_block_group(cache);
+}
+
+/*
+ * Arm the block group's expire work, or push an already armed one back to
+ * a full timeout. Returns 0, or -ENOMEM if the work item can't be allocated.
+ */
+static int schedule_expire_work(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	const unsigned long timeout = 90 * HZ;
+	struct expire_work *ework;
+	int err = 0;
+
+	mutex_lock(&fs_info->expire_work_lock);
+	mutex_lock(&cache->submit_lock);
+	ework = cache->expire_work;
+	if (!ework) {
+		ework = kmalloc(sizeof(*ework), GFP_NOFS);
+		if (!ework) {
+			err = -ENOMEM;
+			goto unlock;
+		}
+		ework->block_group = cache;
+		INIT_LIST_HEAD(&ework->list);
+		INIT_DELAYED_WORK(&ework->work, expire_bios_fn);
+		cache->expire_work = ework;
+		list_add(&ework->list, &fs_info->expire_work_list);
+		/* the queued work owns a block group reference */
+		btrfs_get_block_group(cache);
+	}
+	mod_delayed_work(system_unbound_wq, &ework->work, timeout);
+unlock:
+	mutex_unlock(&cache->submit_lock);
+	mutex_unlock(&fs_info->expire_work_lock);
+	return err;
+}
+
+/*
+ * Disarm the block group's expire work. Returns false only when the work
+ * could not be cancelled; it is then left for the expire worker to release.
+ */
+static bool cancel_expire_work(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct expire_work *ework;
+	bool cancelled = true;
+
+	mutex_lock(&fs_info->expire_work_lock);
+	mutex_lock(&cache->submit_lock);
+	ework = cache->expire_work;
+	if (ework) {
+		/* detach first, whether or not the cancel succeeds */
+		cache->expire_work = NULL;
+		cancelled = cancel_delayed_work(&ework->work);
+		if (cancelled) {
+			/* still pending: tear it down in the worker's place */
+			list_del(&ework->list);
+			kfree(ework);
+			btrfs_put_block_group(cache);
+		}
+	}
+
+	mutex_unlock(&cache->submit_lock);
+	mutex_unlock(&fs_info->expire_work_lock);
+	return cancelled;
+}
 
 static blk_status_t __btrfs_map_bio(struct btrfs_fs_info *fs_info,
 				    struct bio *bio, int mirror_num,
@@ -6931,7 +7049,9 @@ static blk_status_t __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info,
 	struct btrfs_block_group_cache *cache = NULL;
 	struct map_bio_data *map_private;
 	int sent;
+	bool should_queue;
 	blk_status_t ret;
+	int ret2;
 
 	WARN_ON(bio_op(cur_bio) != REQ_OP_WRITE);
 
@@ -6944,8 +7064,20 @@ static blk_status_t __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info,
 	}
 
 	mutex_lock(&cache->submit_lock);
-	if (cache->submit_offset == logical)
+
+	if (cache->expired) {
+		trace_btrfs_bio_in_expired_block_group(cache, cur_bio);
+		mutex_unlock(&cache->submit_lock);
+		btrfs_put_block_group(cache);
+		WARN_ON_ONCE(1);
+		return BLK_STS_IOERR;
+	}
+
+	if (cache->submit_offset == logical) {
+		mutex_unlock(&cache->submit_lock);
+		cancel_expire_work(cache);
 		goto send_bios;
+	}
 
 	if (cache->submit_offset > logical) {
 		trace_btrfs_bio_before_write_pointer(cache, cur_bio);
@@ -6968,13 +7100,18 @@ static blk_status_t __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info,
 
 	bio_list_add(&cache->submit_buffer, cur_bio);
 	mutex_unlock(&cache->submit_lock);
+
+	ret2 = schedule_expire_work(cache);
+	if (ret2) {
+		btrfs_put_block_group(cache);
+		return errno_to_blk_status(ret2);
+	}
 	btrfs_put_block_group(cache);
 
 	/* mimic a good result ... */
 	return BLK_STS_OK;
 
 send_bios:
-	mutex_unlock(&cache->submit_lock);
 	/* send this bio */
 	ret = __btrfs_map_bio(fs_info, cur_bio, mirror_num, 1, 1);
 	if (ret != BLK_STS_OK) {
@@ -7013,6 +7150,7 @@ static blk_status_t __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info,
 			bio = next;
 		}
 	} while (sent);
+	should_queue = !bio_list_empty(&cache->submit_buffer);
 	mutex_unlock(&cache->submit_lock);
 
 	/* send the collected bios */
@@ -7031,8 +7169,10 @@ static blk_status_t __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info,
 
 	if (length)
 		goto loop;
-	btrfs_put_block_group(cache);
 
+	if (should_queue)
+		WARN_ON(schedule_expire_work(cache));
+	btrfs_put_block_group(cache);
 	return BLK_STS_OK;
 }
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 2b4cd791bf24..0ffb0b330b6c 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -2131,6 +2131,8 @@ DEFINE_EVENT(btrfs_hmzoned_bio_buffer_events, name,			\
 )
 
 DEFINE_BTRFS_HMZONED_BIO_BUF_EVENT(btrfs_bio_before_write_pointer);
+DEFINE_BTRFS_HMZONED_BIO_BUF_EVENT(btrfs_expire_bio);
+DEFINE_BTRFS_HMZONED_BIO_BUF_EVENT(btrfs_bio_in_expired_block_group);
 
 #endif /* _TRACE_BTRFS_H */
 
-- 
2.21.0


  parent reply	other threads:[~2019-06-07 13:11 UTC|newest]

Thread overview: 79+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-06-07 13:10 [PATCH v2 00/19] btrfs zoned block device support Naohiro Aota
2019-06-07 13:10 ` [PATCH 01/19] btrfs: introduce HMZONED feature flag Naohiro Aota
2019-06-07 13:10 ` [PATCH 02/19] btrfs: Get zone information of zoned block devices Naohiro Aota
2019-06-13 13:58   ` Josef Bacik
2019-06-18  6:04     ` Naohiro Aota
2019-06-13 13:58   ` Josef Bacik
2019-06-17 18:57   ` David Sterba
2019-06-18  6:42     ` Naohiro Aota
2019-06-27 15:11       ` David Sterba
2019-06-07 13:10 ` [PATCH 03/19] btrfs: Check and enable HMZONED mode Naohiro Aota
2019-06-13 13:57   ` Josef Bacik
2019-06-18  6:43     ` Naohiro Aota
2019-06-07 13:10 ` [PATCH 04/19] btrfs: disable fallocate in " Naohiro Aota
2019-06-07 13:10 ` [PATCH 05/19] btrfs: disable direct IO " Naohiro Aota
2019-06-13 14:00   ` Josef Bacik
2019-06-18  8:17     ` Naohiro Aota
2019-06-07 13:10 ` [PATCH 06/19] btrfs: align dev extent allocation to zone boundary Naohiro Aota
2019-06-07 13:10 ` [PATCH 07/19] btrfs: do sequential extent allocation in HMZONED mode Naohiro Aota
2019-06-13 14:07   ` Josef Bacik
2019-06-18  8:28     ` Naohiro Aota
2019-06-18 13:37       ` Josef Bacik
2019-06-17 22:30   ` David Sterba
2019-06-18  8:49     ` Naohiro Aota
2019-06-27 15:28       ` David Sterba
2019-06-07 13:10 ` [PATCH 08/19] btrfs: make unmirroed BGs readonly only if we have at least one writable BG Naohiro Aota
2019-06-13 14:09   ` Josef Bacik
2019-06-18  7:42     ` Naohiro Aota
2019-06-18 13:35       ` Josef Bacik
2019-06-07 13:10 ` [PATCH 09/19] btrfs: limit super block locations in HMZONED mode Naohiro Aota
2019-06-13 14:12   ` Josef Bacik
2019-06-18  8:51     ` Naohiro Aota
2019-06-17 22:53   ` David Sterba
2019-06-18  9:01     ` Naohiro Aota
2019-06-27 15:35       ` David Sterba
2019-06-28  3:55   ` Anand Jain
2019-06-28  6:39     ` Naohiro Aota
2019-06-28  6:52       ` Anand Jain
2019-06-07 13:10 ` [PATCH 10/19] btrfs: rename btrfs_map_bio() Naohiro Aota
2019-06-07 13:10 ` [PATCH 11/19] btrfs: introduce submit buffer Naohiro Aota
2019-06-13 14:14   ` Josef Bacik
2019-06-17  3:16     ` Damien Le Moal
2019-06-18  0:00       ` David Sterba
2019-06-18  4:04         ` Damien Le Moal
2019-06-18 13:33       ` Josef Bacik
2019-06-19 10:32         ` Damien Le Moal
2019-06-07 13:10 ` Naohiro Aota [this message]
2019-06-13 14:15   ` [PATCH 12/19] btrfs: expire submit buffer on timeout Josef Bacik
2019-06-17  3:19     ` Damien Le Moal
2019-06-07 13:10 ` [PATCH 13/19] btrfs: avoid sync IO prioritization on checksum in HMZONED mode Naohiro Aota
2019-06-13 14:17   ` Josef Bacik
2019-06-07 13:10 ` [PATCH 14/19] btrfs: redirty released extent buffers in sequential BGs Naohiro Aota
2019-06-13 14:24   ` Josef Bacik
2019-06-18  9:09     ` Naohiro Aota
2019-06-07 13:10 ` [PATCH 15/19] btrfs: reset zones of unused block groups Naohiro Aota
2019-06-07 13:10 ` [PATCH 16/19] btrfs: wait existing extents before truncating Naohiro Aota
2019-06-13 14:25   ` Josef Bacik
2019-06-07 13:10 ` [PATCH 17/19] btrfs: shrink delayed allocation size in HMZONED mode Naohiro Aota
2019-06-13 14:27   ` Josef Bacik
2019-06-07 13:10 ` [PATCH 18/19] btrfs: support dev-replace " Naohiro Aota
2019-06-13 14:33   ` Josef Bacik
2019-06-18  9:14     ` Naohiro Aota
2019-06-07 13:10 ` [PATCH 19/19] btrfs: enable to mount HMZONED incompat flag Naohiro Aota
2019-06-07 13:17 ` [PATCH 01/12] btrfs-progs: build: Check zoned block device support Naohiro Aota
2019-06-07 13:17   ` [PATCH 02/12] btrfs-progs: utils: Introduce queue_param Naohiro Aota
2019-06-07 13:17   ` [PATCH 03/12] btrfs-progs: add new HMZONED feature flag Naohiro Aota
2019-06-07 13:17   ` [PATCH 04/12] btrfs-progs: Introduce zone block device helper functions Naohiro Aota
2019-06-07 13:17   ` [PATCH 05/12] btrfs-progs: load and check zone information Naohiro Aota
2019-06-07 13:17   ` [PATCH 06/12] btrfs-progs: avoid writing super block to sequential zones Naohiro Aota
2019-06-07 13:17   ` [PATCH 07/12] btrfs-progs: support discarding zoned device Naohiro Aota
2019-06-07 13:17   ` [PATCH 08/12] btrfs-progs: volume: align chunk allocation to zones Naohiro Aota
2019-06-07 13:17   ` [PATCH 09/12] btrfs-progs: do sequential allocation Naohiro Aota
2019-06-07 13:17   ` [PATCH 10/12] btrfs-progs: mkfs: Zoned block device support Naohiro Aota
2019-06-07 13:17   ` [PATCH 11/12] btrfs-progs: device-add: support HMZONED device Naohiro Aota
2019-06-07 13:17   ` [PATCH 12/12] btrfs-progs: introduce support for dev-place " Naohiro Aota
2019-06-12 17:51 ` [PATCH v2 00/19] btrfs zoned block device support David Sterba
2019-06-13  4:59   ` Naohiro Aota
2019-06-13 13:46     ` David Sterba
2019-06-14  2:07       ` Naohiro Aota
2019-06-17  2:44       ` Damien Le Moal

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190607131025.31996-13-naohiro.aota@wdc.com \
    --to=naohiro.aota@wdc.com \
    --cc=bvanassche@acm.org \
    --cc=clm@fb.com \
    --cc=damien.lemoal@wdc.com \
    --cc=dsterba@suse.com \
    --cc=hare@suse.com \
    --cc=josef@toxicpanda.com \
    --cc=jthumshirn@suse.de \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mb@lightnvm.io \
    --cc=nborisov@suse.com \
    --cc=wqu@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.