linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Naohiro Aota <naota@elisp.net>
To: David Sterba <dsterba@suse.com>, linux-btrfs@vger.kernel.org
Cc: Chris Mason <clm@fb.com>, Josef Bacik <jbacik@fb.com>,
	linux-kernel@vger.kernel.org, Hannes Reinecke <hare@suse.com>,
	Damien Le Moal <damien.lemoal@wdc.com>,
	Bart Van Assche <bart.vanassche@wdc.com>,
	Matias Bjorling <mb@lightnvm.io>, Naohiro Aota <naota@elisp.net>
Subject: [RFC PATCH 12/17] btrfs: expire submit buffer on timeout
Date: Fri, 10 Aug 2018 03:04:45 +0900	[thread overview]
Message-ID: <20180809180450.5091-13-naota@elisp.net> (raw)
In-Reply-To: <20180809180450.5091-1-naota@elisp.net>

It is possible to have bios stalled in the submit buffer due to some bug or
device problem. In such a situation, btrfs stops working while waiting for
the buffered bios to complete. To avoid such a hang, add a worker that will
cancel the stalled bios after an expiration timeout.

Signed-off-by: Naohiro Aota <naota@elisp.net>
---
 fs/btrfs/async-thread.c |  1 +
 fs/btrfs/async-thread.h |  1 +
 fs/btrfs/ctree.h        |  5 +++
 fs/btrfs/disk-io.c      |  7 +++-
 fs/btrfs/extent-tree.c  | 20 ++++++++++
 fs/btrfs/super.c        | 20 ++++++++++
 fs/btrfs/volumes.c      | 83 ++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/volumes.h      |  1 +
 8 files changed, 136 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d522494698fa..86735dfbabcc 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -109,6 +109,7 @@ BTRFS_WORK_HELPER(scrub_helper);
 BTRFS_WORK_HELPER(scrubwrc_helper);
 BTRFS_WORK_HELPER(scrubnc_helper);
 BTRFS_WORK_HELPER(scrubparity_helper);
+BTRFS_WORK_HELPER(bio_expire_helper);
 
 static struct __btrfs_workqueue *
 __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 7861c9feba5f..2c041f0668d4 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -54,6 +54,7 @@ BTRFS_WORK_HELPER_PROTO(scrub_helper);
 BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
 BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
 BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
+BTRFS_WORK_HELPER_PROTO(bio_expire_helper);
 
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ebbbf46aa540..8f85c96cd262 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -699,6 +699,10 @@ struct btrfs_block_group_cache {
 	spinlock_t submit_lock;
 	u64 submit_offset;
 	struct list_head submit_buffer;
+	struct btrfs_work work;
+	unsigned long last_submit;
+	int expired:1;
+	struct task_struct *expire_thread;
 };
 
 /* delayed seq elem */
@@ -974,6 +978,7 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *submit_workers;
 	struct btrfs_workqueue *caching_workers;
 	struct btrfs_workqueue *readahead_workers;
+	struct btrfs_workqueue *bio_expire_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6a014632ca1e..00fa6aca9bb5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2040,6 +2040,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	 */
 	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
 	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+	btrfs_destroy_workqueue(fs_info->bio_expire_workers);
 }
 
 static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2245,6 +2246,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
 		btrfs_alloc_workqueue(fs_info, "extent-refs", flags,
 				      min_t(u64, fs_devices->num_devices,
 					    max_active), 8);
+	fs_info->bio_expire_workers =
+		btrfs_alloc_workqueue(fs_info, "bio-expire", flags,
+				      max_active, 0);
 
 	if (!(fs_info->workers && fs_info->delalloc_workers &&
 	      fs_info->submit_workers && fs_info->flush_workers &&
@@ -2256,7 +2260,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
 	      fs_info->caching_workers && fs_info->readahead_workers &&
 	      fs_info->fixup_workers && fs_info->delayed_workers &&
 	      fs_info->extent_workers &&
-	      fs_info->qgroup_rescan_workers)) {
+	      fs_info->qgroup_rescan_workers &&
+	      fs_info->bio_expire_workers)) {
 		return -ENOMEM;
 	}
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6b7b632b0791..a5f5935315c8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9745,6 +9745,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		    block_group->cached == BTRFS_CACHE_ERROR)
 			free_excluded_extents(block_group);
 
+		if (block_group->alloc_type == BTRFS_ALLOC_SEQ) {
+			spin_lock(&block_group->submit_lock);
+			if (block_group->expire_thread)
+				wake_up_process(block_group->expire_thread);
+			spin_unlock(&block_group->submit_lock);
+			flush_work(&block_group->work.normal_work);
+		}
+
 		btrfs_remove_free_space_cache(block_group);
 		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
 		ASSERT(list_empty(&block_group->dirty_list));
@@ -10061,6 +10069,10 @@ btrfs_get_block_group_alloc_offset(struct btrfs_block_group_cache *cache)
 	}
 
 	cache->submit_offset = logical + cache->alloc_offset;
+	btrfs_init_work(&cache->work, btrfs_bio_expire_helper,
+			expire_bios_fn, NULL, NULL);
+	cache->last_submit = 0;
+	cache->expired = 0;
 
 out:
 	cache->alloc_type = alloc_type;
@@ -10847,6 +10859,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		}
 		spin_unlock(&fs_info->unused_bgs_lock);
 
+		if (block_group->alloc_type == BTRFS_ALLOC_SEQ) {
+			spin_lock(&block_group->submit_lock);
+			if (block_group->expire_thread)
+				wake_up_process(block_group->expire_thread);
+			spin_unlock(&block_group->submit_lock);
+			flush_work(&block_group->work.normal_work);
+		}
+
 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
 
 		/* Don't want to race with allocators so take the groups_sem */
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index cc812e459197..4d1d6cc7cd59 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -154,6 +154,25 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
 	 * completes. The next time when the filesystem is mounted writeable
 	 * again, the device replace operation continues.
 	 */
+
+	/* expire pending bios in submit buffer */
+	if (btrfs_fs_incompat(fs_info, HMZONED)) {
+		struct btrfs_block_group_cache *block_group;
+		struct rb_node *node;
+
+		spin_lock(&fs_info->block_group_cache_lock);
+		for (node = rb_first(&fs_info->block_group_cache_tree); node;
+		     node = rb_next(node)) {
+			block_group = rb_entry(node,
+					       struct btrfs_block_group_cache,
+					       cache_node);
+			spin_lock(&block_group->submit_lock);
+			if (block_group->expire_thread)
+				wake_up_process(block_group->expire_thread);
+			spin_unlock(&block_group->submit_lock);
+		}
+		spin_unlock(&fs_info->block_group_cache_lock);
+	}
 }
 
 #ifdef CONFIG_PRINTK
@@ -1730,6 +1749,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 	btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
 				new_pool_size);
+	btrfs_workqueue_set_max(fs_info->bio_expire_workers, new_pool_size);
 }
 
 static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ca03b7136892..0e68003a429d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6498,6 +6498,7 @@ static void __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info, u64 logical,
 	struct btrfs_block_group_cache *cache = NULL;
 	int sent;
 	LIST_HEAD(submit_list);
+	int should_queue = 1;
 
 	WARN_ON(bio_op(bbio->orig_bio) != REQ_OP_WRITE);
 
@@ -6512,7 +6513,21 @@ static void __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info, u64 logical,
 	bbio->need_seqwrite = 1;
 
 	spin_lock(&cache->submit_lock);
-	if (cache->submit_offset == logical)
+
+	if (cache->expired) {
+		int i, total_devs = bbio->num_stripes;
+
+		spin_unlock(&cache->submit_lock);
+		btrfs_err(cache->fs_info,
+			  "IO in expired block group %llu+%llu",
+			  logical, length);
+		for (i = 0; i < total_devs; i++)
+			bbio_error(bbio, bbio->orig_bio, logical);
+		btrfs_put_block_group(cache);
+		return;
+	}
+
+	if (cache->submit_offset == logical || cache->expired)
 		goto send_bios;
 
 	if (cache->submit_offset > logical) {
@@ -6527,7 +6542,11 @@ static void __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info, u64 logical,
 
 	/* buffer the unaligned bio */
 	list_add_tail(&bbio->list, &cache->submit_buffer);
+	should_queue = !cache->last_submit;
+	cache->last_submit = jiffies;
 	spin_unlock(&cache->submit_lock);
+	if (should_queue)
+		btrfs_queue_work(fs_info->bio_expire_workers, &cache->work);
 	btrfs_put_block_group(cache);
 
 	return;
@@ -6561,6 +6580,14 @@ static void __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info, u64 logical,
 			}
 		}
 	} while (sent);
+
+	if (list_empty(&cache->submit_buffer)) {
+		should_queue = 0;
+		cache->last_submit = 0;
+	} else {
+		should_queue = !cache->last_submit;
+		cache->last_submit = jiffies;
+	}
 	spin_unlock(&cache->submit_lock);
 
 	/* send the collected bios */
@@ -6572,6 +6599,8 @@ static void __btrfs_map_bio_zoned(struct btrfs_fs_info *fs_info, u64 logical,
 
 	if (length)
 		goto loop;
+	if (should_queue)
+		btrfs_queue_work(fs_info->bio_expire_workers, &cache->work);
 	btrfs_put_block_group(cache);
 }
 
@@ -6632,6 +6661,58 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 	return BLK_STS_OK;
 }
 
+void expire_bios_fn(struct btrfs_work *work)
+{
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_bio *bbio, *next;
+	unsigned long expire_time, cur;
+	unsigned long expire = 90 * HZ;
+	LIST_HEAD(submit_list);
+
+	cache = container_of(work, struct btrfs_block_group_cache, work);
+	btrfs_get_block_group(cache);
+loop:
+	spin_lock(&cache->submit_lock);
+	cache->expire_thread = current;
+	if (list_empty(&cache->submit_buffer)) {
+		cache->last_submit = 0;
+		cache->expire_thread = NULL;
+		spin_unlock(&cache->submit_lock);
+		btrfs_put_block_group(cache);
+		return;
+	}
+	cur = jiffies;
+	expire_time = cache->last_submit + expire;
+	if (time_before(cur, expire_time) && !sb_rdonly(cache->fs_info->sb)) {
+		spin_unlock(&cache->submit_lock);
+		schedule_timeout_interruptible(expire_time - cur);
+		goto loop;
+	}
+
+	list_splice_init(&cache->submit_buffer, &submit_list);
+	cache->expired = 1;
+	cache->expire_thread = NULL;
+	spin_unlock(&cache->submit_lock);
+
+	btrfs_handle_fs_error(cache->fs_info, -EIO,
+			      "bio submit buffer expired");
+	btrfs_err(cache->fs_info, "block group %llu submit pos %llu",
+		  cache->key.objectid, cache->submit_offset);
+
+	list_for_each_entry_safe(bbio, next, &submit_list, list) {
+		u64 logical = (u64)bbio->orig_bio->bi_iter.bi_sector << 9;
+		int i, total_devs = bbio->num_stripes;
+
+		btrfs_err(cache->fs_info, "expiring %llu", logical);
+		list_del_init(&bbio->list);
+		for (i = 0; i < total_devs; i++)
+			bbio_error(bbio, bbio->orig_bio, logical);
+	}
+
+	cache->last_submit = 0;
+	btrfs_put_block_group(cache);
+}
+
 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
 				       u8 *uuid, u8 *fsid)
 {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3db90f5395cd..2a3c046fa31b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -415,6 +415,7 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			   int mirror_num, int async_submit);
+void expire_bios_fn(struct btrfs_work *work);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       fmode_t flags, void *holder);
 int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
-- 
2.18.0


  parent reply	other threads:[~2018-08-09 20:32 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-08-09 18:04 [RFC PATCH 00/17] btrfs zoned block device support Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 01/17] btrfs: introduce HMZONED feature flag Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 02/17] btrfs: Get zone information of zoned block devices Naohiro Aota
2018-08-10  7:41   ` Nikolay Borisov
2018-08-09 18:04 ` [RFC PATCH 03/17] btrfs: Check and enable HMZONED mode Naohiro Aota
2018-08-10 12:25   ` Hannes Reinecke
2018-08-10 13:15     ` Naohiro Aota
2018-08-10 13:41       ` Hannes Reinecke
2018-08-09 18:04 ` [RFC PATCH 04/17] btrfs: limit super block locations in " Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 05/17] btrfs: disable fallocate " Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 06/17] btrfs: disable direct IO " Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 07/17] btrfs: disable device replace " Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 08/17] btrfs: align extent allocation to zone boundary Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 09/17] btrfs: do sequential allocation on HMZONED drives Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 10/17] btrfs: split btrfs_map_bio() Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 11/17] btrfs: introduce submit buffer Naohiro Aota
2018-08-09 18:04 ` Naohiro Aota [this message]
2018-08-09 18:04 ` [RFC PATCH 13/17] btrfs: avoid sync IO prioritization on checksum in HMZONED mode Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 14/17] btrfs: redirty released extent buffers in sequential BGs Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 15/17] btrfs: reset zones of unused block groups Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 16/17] btrfs: wait existing extents before truncating Naohiro Aota
2018-08-09 18:04 ` [RFC PATCH 17/17] btrfs: enable to mount HMZONED incompat flag Naohiro Aota
2018-08-09 18:10 ` [RFC PATCH 01/12] btrfs-progs: build: Check zoned block device support Naohiro Aota
2018-08-09 18:10   ` [RFC PATCH 02/12] btrfs-progs: utils: Introduce queue_param Naohiro Aota
2018-08-09 18:10   ` [RFC PATCH 03/12] btrfs-progs: add new HMZONED feature flag Naohiro Aota
2018-08-09 18:10   ` [RFC PATCH 04/12] btrfs-progs: Introduce zone block device helper functions Naohiro Aota
2018-08-09 18:10   ` [RFC PATCH 05/12] btrfs-progs: load and check zone information Naohiro Aota
2018-08-09 18:10   ` [RFC PATCH 06/12] btrfs-progs: avoid writing super block to sequential zones Naohiro Aota
2018-08-09 18:11   ` [RFC PATCH 07/12] btrfs-progs: support discarding zoned device Naohiro Aota
2018-08-09 18:11   ` [RFC PATCH 08/12] btrfs-progs: volume: align chunk allocation to zones Naohiro Aota
2018-08-09 18:11   ` [RFC PATCH 09/12] btrfs-progs: mkfs: Zoned block device support Naohiro Aota
2018-08-09 18:11   ` [RFC PATCH 10/12] btrfs-progs: device-add: support HMZONED device Naohiro Aota
2018-08-09 18:11   ` [RFC PATCH 11/12] btrfs-progs: replace: disable in " Naohiro Aota
2018-08-09 18:11   ` [RFC PATCH 12/12] btrfs-progs: do sequential allocation Naohiro Aota
2018-08-10  7:04 ` [RFC PATCH 00/17] btrfs zoned block device support Hannes Reinecke
2018-08-10 14:24   ` Naohiro Aota
2018-08-10  7:26 ` Hannes Reinecke
2018-08-10  7:28 ` Qu Wenruo
2018-08-10 13:32   ` Hans van Kranenburg
2018-08-10 14:04     ` Qu Wenruo
2018-08-16  9:05   ` Naohiro Aota
2018-08-10  7:53 ` Nikolay Borisov
2018-08-10  7:55   ` Nikolay Borisov
2018-08-13 18:42 ` David Sterba
2018-08-13 19:20   ` Hannes Reinecke
2018-08-13 19:29     ` Austin S. Hemmelgarn
2018-08-14  7:41       ` Hannes Reinecke
2018-08-15 11:25         ` Austin S. Hemmelgarn
2018-08-28 10:33   ` Naohiro Aota

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180809180450.5091-13-naota@elisp.net \
    --to=naota@elisp.net \
    --cc=bart.vanassche@wdc.com \
    --cc=clm@fb.com \
    --cc=damien.lemoal@wdc.com \
    --cc=dsterba@suse.com \
    --cc=hare@suse.com \
    --cc=jbacik@fb.com \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mb@lightnvm.io \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).