All of lore.kernel.org
 help / color / mirror / Atom feed
* cleanup btrfs bio handling, part 2 v2
@ 2022-04-29 14:30 Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
                   ` (9 more replies)
  0 siblings, 10 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Hi all,

this series removes the need to allocate a separate object for I/O
completions for all read and some write I/Os, and reduces the memory
usage of the low-level bios cloned by btrfs_map_bio by using plain bios
instead of the much larger btrfs_bio.

Changes since v1:
 - rebased to the latest misc-next tree
 - don't call destroy_workqueue on NULL workqueues
 - drop a marginal and slightly controversial cleanup to btrfs_map_bio

Diffstat:
 compression.c |   41 +++++-------
 compression.h |    7 +-
 ctree.h       |   14 ++--
 disk-io.c     |  148 +++++-----------------------------------------
 disk-io.h     |   11 ---
 extent_io.c   |   33 +++-------
 extent_io.h   |    1 
 inode.c       |  162 +++++++++++++++++++--------------------------------
 raid56.c      |  111 +++++++++++++---------------------
 super.c       |   13 ----
 volumes.c     |  184 +++++++++++++++++++++++++++-------------------------------
 volumes.h     |    8 ++
 12 files changed, 262 insertions(+), 471 deletions(-)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 01/10] btrfs: move more work into btrfs_end_bioc
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs, Johannes Thumshirn

Assign ->mirror_num and ->bi_status in btrfs_end_bioc instead of
duplicating the logic in the callers.  Also remove the bio argument as
it always must be bioc->orig_bio and the now pointless bioc_error that
did nothing but assign bi_sector to the same value just sampled in the
caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/volumes.c | 72 ++++++++++++++--------------------------------
 1 file changed, 22 insertions(+), 50 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 748614b00ffa2..aeacb87457687 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6621,19 +6621,29 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
 }
 
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
+static inline void btrfs_end_bioc(struct btrfs_io_context *bioc)
 {
-	bio->bi_private = bioc->private;
-	bio->bi_end_io = bioc->end_io;
-	bio_endio(bio);
+	struct bio *orig_bio = bioc->orig_bio;
 
+	btrfs_bio(orig_bio)->mirror_num = bioc->mirror_num;
+	orig_bio->bi_private = bioc->private;
+	orig_bio->bi_end_io = bioc->end_io;
+
+	/*
+	 * Only send an error to the higher layers if it is beyond the tolerance
+	 * threshold.
+	 */
+	if (atomic_read(&bioc->error) > bioc->max_errors)
+		orig_bio->bi_status = BLK_STS_IOERR;
+	else
+		orig_bio->bi_status = BLK_STS_OK;
+	bio_endio(orig_bio);
 	btrfs_put_bioc(bioc);
 }
 
 static void btrfs_end_bio(struct bio *bio)
 {
 	struct btrfs_io_context *bioc = bio->bi_private;
-	int is_orig_bio = 0;
 
 	if (bio->bi_status) {
 		atomic_inc(&bioc->error);
@@ -6654,35 +6664,12 @@ static void btrfs_end_bio(struct bio *bio)
 		}
 	}
 
-	if (bio == bioc->orig_bio)
-		is_orig_bio = 1;
+	if (bio != bioc->orig_bio)
+		bio_put(bio);
 
 	btrfs_bio_counter_dec(bioc->fs_info);
-
-	if (atomic_dec_and_test(&bioc->stripes_pending)) {
-		if (!is_orig_bio) {
-			bio_put(bio);
-			bio = bioc->orig_bio;
-		}
-
-		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
-		/* only send an error to the higher layers if it is
-		 * beyond the tolerance of the btrfs bio
-		 */
-		if (atomic_read(&bioc->error) > bioc->max_errors) {
-			bio->bi_status = BLK_STS_IOERR;
-		} else {
-			/*
-			 * this bio is actually up to date, we didn't
-			 * go over the max number of errors
-			 */
-			bio->bi_status = BLK_STS_OK;
-		}
-
-		btrfs_end_bioc(bioc, bio);
-	} else if (!is_orig_bio) {
-		bio_put(bio);
-	}
+	if (atomic_dec_and_test(&bioc->stripes_pending))
+		btrfs_end_bioc(bioc);
 }
 
 static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
@@ -6720,23 +6707,6 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
 	submit_bio(bio);
 }
 
-static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
-{
-	atomic_inc(&bioc->error);
-	if (atomic_dec_and_test(&bioc->stripes_pending)) {
-		/* Should be the original bio. */
-		WARN_ON(bio != bioc->orig_bio);
-
-		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
-		bio->bi_iter.bi_sector = logical >> 9;
-		if (atomic_read(&bioc->error) > bioc->max_errors)
-			bio->bi_status = BLK_STS_IOERR;
-		else
-			bio->bi_status = BLK_STS_OK;
-		btrfs_end_bioc(bioc, bio);
-	}
-}
-
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			   int mirror_num)
 {
@@ -6795,7 +6765,9 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 						   &dev->dev_state) ||
 		    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
-			bioc_error(bioc, first_bio, logical);
+			atomic_inc(&bioc->error);
+			if (atomic_dec_and_test(&bioc->stripes_pending))
+				btrfs_end_bioc(bioc);
 			continue;
 		}
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 03/10] btrfs: split btrfs_submit_data_bio Christoph Hellwig
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs, Johannes Thumshirn

Remove the pointless goto just to return err and clean up the code flow
to be a little more straight forward.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/inode.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b7df1c0ee5ee..3be52ed6ed196 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7899,31 +7899,28 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 	bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
 	blk_status_t ret;
 
-	/* Check btrfs_submit_bio_hook() for rules about async submit. */
-	if (async_submit)
-		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
 	if (!write) {
 		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 		if (ret)
-			goto err;
+			return ret;
 	}
 
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		goto map;
 
-	if (write && async_submit) {
-		ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
-					  btrfs_submit_bio_start_direct_io);
-		goto err;
-	} else if (write) {
+	if (write) {
+		/* check btrfs_submit_data_bio() for async submit rules */
+		if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
+			return btrfs_wq_submit_bio(inode, bio, 0, 0,
+					file_offset,
+					btrfs_submit_bio_start_direct_io);
 		/*
 		 * If we aren't doing async submit, calculate the csum of the
 		 * bio now.
 		 */
 		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
 		if (ret)
-			goto err;
+			return ret;
 	} else {
 		u64 csum_offset;
 
@@ -7933,9 +7930,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 		btrfs_bio(bio)->csum = dip->csums + csum_offset;
 	}
 map:
-	ret = btrfs_map_bio(fs_info, bio, 0);
-err:
-	return ret;
+	return btrfs_map_bio(fs_info, bio, 0);
 }
 
 /*
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 03/10] btrfs: split btrfs_submit_data_bio
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 04/10] btrfs: don't double-defer bio completions for compressed reads Christoph Hellwig
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Split btrfs_submit_data_bio into one helper for reads and one for writes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/ctree.h     |   6 +-
 fs/btrfs/extent_io.c |  12 ++--
 fs/btrfs/inode.c     | 130 ++++++++++++++++++++-----------------------
 3 files changed, 73 insertions(+), 75 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 371f3f6c4ea47..40a6f61559348 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3252,8 +3252,10 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
 u64 btrfs_file_extent_end(const struct btrfs_path *path);
 
 /* inode.c */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
-			   int mirror_num, unsigned long bio_flags);
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio,
+		int mirror_num, unsigned long bio_flags);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+		int mirror_num, unsigned long bio_flags);
 unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
 				    u32 bio_offset, struct page *page,
 				    u64 start, u64 end);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f9d6dd310c42b..80b4482c477c6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -186,11 +186,15 @@ static void submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_fl
 	/* Caller should ensure the bio has at least some range added */
 	ASSERT(bio->bi_iter.bi_size);
 
-	if (is_data_inode(tree->private_data))
-		btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+	if (!is_data_inode(tree->private_data))
+		btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num);
+	else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+		btrfs_submit_data_write_bio(tree->private_data, bio, mirror_num,
 					    bio_flags);
 	else
-		btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num);
+		btrfs_submit_data_read_bio(tree->private_data, bio, mirror_num,
+					    bio_flags);
+
 	/*
 	 * Above submission hooks will handle the error by ending the bio,
 	 * which will do the cleanup properly.  So here we should not return
@@ -2773,7 +2777,7 @@ static blk_status_t submit_data_read_repair(struct inode *inode,
 		ret = btrfs_repair_one_sector(inode, failed_bio,
 				bio_offset + offset,
 				page, pgoff + offset, start + offset,
-				failed_mirror, btrfs_submit_data_bio);
+				failed_mirror, btrfs_submit_data_read_bio);
 		if (!ret) {
 			/*
 			 * We have submitted the read repair, the page release
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3be52ed6ed196..6476a5d9c09e9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2552,90 +2552,82 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
 	return errno_to_blk_status(ret);
 }
 
-/*
- * extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read.
- *
- * Rules about async/sync submit,
- * a) read:				sync submit
- *
- * b) write without checksum:		sync submit
- *
- * c) write with checksum:
- *    c-1) if bio is issued by fsync:	sync submit
- *         (sync_writers != 0)
- *
- *    c-2) if root is reloc root:	sync submit
- *         (only in case of buffered IO)
- *
- *    c-3) otherwise:			async submit
- */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio,
 			   int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
-	blk_status_t ret = 0;
-	int skip_sum;
-	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
-	skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
-		test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
-
-	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
-		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
+	struct btrfs_inode *bi = BTRFS_I(inode);
+	blk_status_t ret;
 
 	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
-		struct page *page = bio_first_bvec_all(bio)->bv_page;
-		loff_t file_offset = page_offset(page);
-
-		ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
+		ret = extract_ordered_extent(bi, bio,
+				page_offset(bio_first_bvec_all(bio)->bv_page));
 		if (ret)
 			goto out;
 	}
 
-	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
-		ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
-		if (ret)
-			goto out;
-
-		if (bio_flags & EXTENT_BIO_COMPRESSED) {
-			/*
-			 * btrfs_submit_compressed_read will handle completing
-			 * the bio if there were any errors, so just return
-			 * here.
-			 */
-			btrfs_submit_compressed_read(inode, bio, mirror_num);
-			return;
-		} else {
-			/*
-			 * Lookup bio sums does extra checks around whether we
-			 * need to csum or not, which is why we ignore skip_sum
-			 * here.
-			 */
-			ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+	/*
+	 * Rules for async/sync submit:
+	 *   a) write without checksum:			sync submit
+	 *   b) write with checksum:
+	 *      b-1) if bio is issued by fsync:		sync submit
+	 *           (sync_writers != 0)
+	 *      b-2) if root is reloc root:		sync submit
+	 *           (only in case of buffered IO)
+	 *      b-3) otherwise:				async submit
+	 */
+	if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+	    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
+		if (atomic_read(&bi->sync_writers)) {
+			ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
 			if (ret)
 				goto out;
+		} else if (btrfs_is_data_reloc_root(bi->root)) {
+			; /* csum items have already been cloned */
+		} else {
+			ret = btrfs_wq_submit_bio(inode, bio,
+					mirror_num, bio_flags, 0,
+					btrfs_submit_bio_start);
+			goto out;
 		}
-		goto mapit;
-	} else if (async && !skip_sum) {
-		/* csum items have already been cloned */
-		if (btrfs_is_data_reloc_root(root))
-			goto mapit;
-		/* we're doing a write, do the async checksumming */
-		ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
-					  0, btrfs_submit_bio_start);
+	}
+	ret = btrfs_map_bio(fs_info, bio, mirror_num);
+out:
+	if (ret) {
+		bio->bi_status = ret;
+		bio_endio(bio);
+	}
+}
+
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+			   int mirror_num, unsigned long bio_flags)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	blk_status_t ret;
+
+	ret = btrfs_bio_wq_end_io(fs_info, bio,
+			btrfs_is_free_space_inode(BTRFS_I(inode)) ?
+			BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
+	if (ret)
 		goto out;
-	} else if (!skip_sum) {
-		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
-		if (ret)
-			goto out;
+
+	if (bio_flags & EXTENT_BIO_COMPRESSED) {
+		/*
+		 * btrfs_submit_compressed_read will handle completing the bio
+		 * if there were any errors, so just return here.
+		 */
+		btrfs_submit_compressed_read(inode, bio, mirror_num);
+		return;
 	}
 
-mapit:
+	/*
+	 * Lookup bio sums does extra checks around whether we need to csum or
+	 * not, which is why we ignore skip_sum here.
+	 */
+	ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+	if (ret)
+		goto out;
 	ret = btrfs_map_bio(fs_info, bio, mirror_num);
-
 out:
 	if (ret) {
 		bio->bi_status = ret;
@@ -7909,7 +7901,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 		goto map;
 
 	if (write) {
-		/* check btrfs_submit_data_bio() for async submit rules */
+		/* check btrfs_submit_data_write_bio() for async submit rules */
 		if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
 			return btrfs_wq_submit_bio(inode, bio, 0, 0,
 					file_offset,
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 04/10] btrfs: don't double-defer bio completions for compressed reads
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 03/10] btrfs: split btrfs_submit_data_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio Christoph Hellwig
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

The bio completion handler of the bio used for the compressed data is
already run in a workqueue using btrfs_bio_wq_end_io, so don't schedule
the completion of the original bio to the same workqueue again but just
execute it directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/inode.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6476a5d9c09e9..02c4cbd557dcd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2605,12 +2605,6 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	blk_status_t ret;
 
-	ret = btrfs_bio_wq_end_io(fs_info, bio,
-			btrfs_is_free_space_inode(BTRFS_I(inode)) ?
-			BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
-	if (ret)
-		goto out;
-
 	if (bio_flags & EXTENT_BIO_COMPRESSED) {
 		/*
 		 * btrfs_submit_compressed_read will handle completing the bio
@@ -2620,6 +2614,12 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
 		return;
 	}
 
+	ret = btrfs_bio_wq_end_io(fs_info, bio,
+			btrfs_is_free_space_inode(BTRFS_I(inode)) ?
+			BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
+	if (ret)
+		goto out;
+
 	/*
 	 * Lookup bio sums does extra checks around whether we need to csum or
 	 * not, which is why we ignore skip_sum here.
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (3 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 04/10] btrfs: don't double-defer bio completions for compressed reads Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-05-01  4:40   ` Qu Wenruo
  2022-04-29 14:30 ` [PATCH 06/10] btrfs: don't use btrfs_bio_wq_end_io for compressed writes Christoph Hellwig
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Instead of attaching an extra allocation and an indirect call to each
low-level bio issued by the RAID code, add a work_struct to struct
btrfs_raid_bio and only defer the per-rbio completion action.  The
per-bio action for all the I/Os are trivial and can be safely done
from interrupt context.

As a nice side effect this also allows sharing the boilerplate code
for the per-bio completions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/ctree.h   |   2 +-
 fs/btrfs/disk-io.c |  12 ++---
 fs/btrfs/disk-io.h |   1 -
 fs/btrfs/raid56.c  | 111 ++++++++++++++++++---------------------------
 4 files changed, 49 insertions(+), 77 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 40a6f61559348..4dd0d4a2e7757 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -853,7 +853,7 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *flush_workers;
 	struct btrfs_workqueue *endio_workers;
 	struct btrfs_workqueue *endio_meta_workers;
-	struct btrfs_workqueue *endio_raid56_workers;
+	struct workqueue_struct *endio_raid56_workers;
 	struct workqueue_struct *rmw_workers;
 	struct btrfs_workqueue *endio_meta_write_workers;
 	struct btrfs_workqueue *endio_write_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 73e12ecc81be1..3c6137734d28c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -753,14 +753,10 @@ static void end_workqueue_bio(struct bio *bio)
 			wq = fs_info->endio_meta_write_workers;
 		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
 			wq = fs_info->endio_freespace_worker;
-		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
-			wq = fs_info->endio_raid56_workers;
 		else
 			wq = fs_info->endio_write_workers;
 	} else {
-		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
-			wq = fs_info->endio_raid56_workers;
-		else if (end_io_wq->metadata)
+		if (end_io_wq->metadata)
 			wq = fs_info->endio_meta_workers;
 		else
 			wq = fs_info->endio_workers;
@@ -2274,7 +2270,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	btrfs_destroy_workqueue(fs_info->hipri_workers);
 	btrfs_destroy_workqueue(fs_info->workers);
 	btrfs_destroy_workqueue(fs_info->endio_workers);
-	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+	if (fs_info->endio_raid56_workers)
+		destroy_workqueue(fs_info->endio_raid56_workers);
 	if (fs_info->rmw_workers)
 		destroy_workqueue(fs_info->rmw_workers);
 	btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2477,8 +2474,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 		btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
 				      max_active, 2);
 	fs_info->endio_raid56_workers =
-		btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
-				      max_active, 4);
+		alloc_workqueue("btrfs-endio-raid56", flags, max_active);
 	fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
 	fs_info->endio_write_workers =
 		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 9340e3266e0ac..97255e3d7e524 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -21,7 +21,6 @@ enum btrfs_wq_endio_type {
 	BTRFS_WQ_ENDIO_DATA,
 	BTRFS_WQ_ENDIO_METADATA,
 	BTRFS_WQ_ENDIO_FREE_SPACE,
-	BTRFS_WQ_ENDIO_RAID56,
 };
 
 static inline u64 btrfs_sb_offset(int mirror)
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a5b623ee6facd..1a3c1a9b10d0b 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -164,6 +164,9 @@ struct btrfs_raid_bio {
 	atomic_t stripes_pending;
 
 	atomic_t error;
+
+	struct work_struct end_io_work;
+
 	/*
 	 * these are two arrays of pointers.  We allocate the
 	 * rbio big enough to hold them both and setup their
@@ -1552,15 +1555,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
 	}
 }
 
-/*
- * end io for the read phase of the rmw cycle.  All the bios here are physical
- * stripe bios we've read from the disk so we can recalculate the parity of the
- * stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid_rmw_end_io(struct bio *bio)
+static void raid56_bio_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
 
@@ -1571,23 +1566,34 @@ static void raid_rmw_end_io(struct bio *bio)
 
 	bio_put(bio);
 
-	if (!atomic_dec_and_test(&rbio->stripes_pending))
-		return;
+	if (atomic_dec_and_test(&rbio->stripes_pending))
+		queue_work(rbio->bioc->fs_info->endio_raid56_workers,
+			   &rbio->end_io_work);
+}
 
-	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
-		goto cleanup;
+/*
+ * End io handler for the read phase of the rmw cycle.  All the bios here are
+ * physical stripe bios we've read from the disk so we can recalculate the
+ * parity of the stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_rmw_end_io_work(struct work_struct *work)
+{
+	struct btrfs_raid_bio *rbio =
+		container_of(work, struct btrfs_raid_bio, end_io_work);
+
+	if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
+		rbio_orig_end_io(rbio, BLK_STS_IOERR);
+		return;
+	}
 
 	/*
-	 * this will normally call finish_rmw to start our write
-	 * but if there are any failed stripes we'll reconstruct
-	 * from parity first
+	 * This will normally call finish_rmw to start our write but if there
+	 * are any failed stripes we'll reconstruct from parity first.
 	 */
 	validate_rbio_for_rmw(rbio);
-	return;
-
-cleanup:
-
-	rbio_orig_end_io(rbio, BLK_STS_IOERR);
 }
 
 /*
@@ -1662,11 +1668,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 	 * touch it after that.
 	 */
 	atomic_set(&rbio->stripes_pending, bios_to_read);
+	INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
 	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid_rmw_end_io;
-
-		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
-
+		bio->bi_end_io = raid56_bio_end_io;
 		submit_bio(bio);
 	}
 	/* the actual write will happen once the reads are done */
@@ -2108,25 +2112,13 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 }
 
 /*
- * This is called only for stripes we've read from disk to
- * reconstruct the parity.
+ * This is called only for stripes we've read from disk to reconstruct the
+ * parity.
  */
-static void raid_recover_end_io(struct bio *bio)
+static void raid_recover_end_io_work(struct work_struct *work)
 {
-	struct btrfs_raid_bio *rbio = bio->bi_private;
-
-	/*
-	 * we only read stripe pages off the disk, set them
-	 * up to date if there were no errors
-	 */
-	if (bio->bi_status)
-		fail_bio_stripe(rbio, bio);
-	else
-		set_bio_pages_uptodate(rbio, bio);
-	bio_put(bio);
-
-	if (!atomic_dec_and_test(&rbio->stripes_pending))
-		return;
+	struct btrfs_raid_bio *rbio =
+		container_of(work, struct btrfs_raid_bio, end_io_work);
 
 	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
 		rbio_orig_end_io(rbio, BLK_STS_IOERR);
@@ -2209,11 +2201,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 	 * touch it after that.
 	 */
 	atomic_set(&rbio->stripes_pending, bios_to_read);
+	INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
 	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid_recover_end_io;
-
-		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
-
+		bio->bi_end_io = raid56_bio_end_io;
 		submit_bio(bio);
 	}
 
@@ -2582,8 +2572,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 	atomic_set(&rbio->stripes_pending, nr_data);
 
 	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid_write_end_io;
-
+		bio->bi_end_io = raid56_bio_end_io;
 		submit_bio(bio);
 	}
 	return;
@@ -2671,24 +2660,14 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
  * This will usually kick off finish_rmw once all the bios are read in, but it
  * may trigger parity reconstruction if we had any errors along the way
  */
-static void raid56_parity_scrub_end_io(struct bio *bio)
+static void raid56_parity_scrub_end_io_work(struct work_struct *work)
 {
-	struct btrfs_raid_bio *rbio = bio->bi_private;
-
-	if (bio->bi_status)
-		fail_bio_stripe(rbio, bio);
-	else
-		set_bio_pages_uptodate(rbio, bio);
-
-	bio_put(bio);
-
-	if (!atomic_dec_and_test(&rbio->stripes_pending))
-		return;
+	struct btrfs_raid_bio *rbio =
+		container_of(work, struct btrfs_raid_bio, end_io_work);
 
 	/*
-	 * this will normally call finish_rmw to start our write
-	 * but if there are any failed stripes we'll reconstruct
-	 * from parity first
+	 * This will normally call finish_rmw to start our write, but if there
+	 * are any failed stripes we'll reconstruct from parity first
 	 */
 	validate_rbio_for_parity_scrub(rbio);
 }
@@ -2758,11 +2737,9 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
 	 * touch it after that.
 	 */
 	atomic_set(&rbio->stripes_pending, bios_to_read);
+	INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
 	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid56_parity_scrub_end_io;
-
-		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
-
+		bio->bi_end_io = raid56_bio_end_io;
 		submit_bio(bio);
 	}
 	/* the actual write will happen once the reads are done */
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 06/10] btrfs: don't use btrfs_bio_wq_end_io for compressed writes
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (4 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 07/10] btrfs: centralize setting REQ_META Christoph Hellwig
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Compressed write bio completion is the only user of btrfs_bio_wq_end_io
for writes, and the use of btrfs_bio_wq_end_io is a little suboptimal
here as we only really need user context for the final completion of a
compressed_bio structure, and not every single bio completion.

Add a work_struct to struct compressed_bio instead and use that to call
finish_compressed_bio_write.  This allows to remove all handling of
write bios in the btrfs_bio_wq_end_io infrastructure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/compression.c | 45 +++++++++++++++++++++---------------------
 fs/btrfs/compression.h |  7 +++++--
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/disk-io.c     | 30 +++++++++++-----------------
 fs/btrfs/super.c       |  2 --
 5 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f4564f32f6d93..9d5986a30a4a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -403,6 +403,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
 	kfree(cb);
 }
 
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
+{
+	struct compressed_bio *cb =
+		container_of(work, struct compressed_bio, write_end_work);
+
+	finish_compressed_bio_write(cb);
+}
+
 /*
  * Do the cleanup once all the compressed pages hit the disk.  This will clear
  * writeback on the file pages and free the compressed pages.
@@ -414,29 +422,16 @@ static void end_compressed_bio_write(struct bio *bio)
 {
 	struct compressed_bio *cb = bio->bi_private;
 
-	if (!dec_and_test_compressed_bio(cb, bio))
-		goto out;
-
-	btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+	if (dec_and_test_compressed_bio(cb, bio)) {
+		struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
 
-	finish_compressed_bio_write(cb);
-out:
+		btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+		queue_work(fs_info->compressed_write_workers,
+			   &cb->write_end_work);
+	}
 	bio_put(bio);
 }
 
-static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
-					  struct bio *bio, int mirror_num)
-{
-	blk_status_t ret;
-
-	ASSERT(bio->bi_iter.bi_size);
-	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
-	if (ret)
-		return ret;
-	ret = btrfs_map_bio(fs_info, bio, mirror_num);
-	return ret;
-}
-
 /*
  * Allocate a compressed_bio, which will be used to read/write on-disk
  * (aka, compressed) * data.
@@ -533,7 +528,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 	cb->compressed_pages = compressed_pages;
 	cb->compressed_len = compressed_len;
 	cb->writeback = writeback;
-	cb->orig_bio = NULL;
+	INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
 	cb->nr_pages = nr_pages;
 
 	if (blkcg_css)
@@ -603,7 +598,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 					goto finish_cb;
 			}
 
-			ret = submit_compressed_bio(fs_info, bio, 0);
+			ASSERT(bio->bi_iter.bi_size);
+			ret = btrfs_map_bio(fs_info, bio, 0);
 			if (ret)
 				goto finish_cb;
 			bio = NULL;
@@ -941,7 +937,12 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 						  fs_info->sectorsize);
 			sums += fs_info->csum_size * nr_sectors;
 
-			ret = submit_compressed_bio(fs_info, comp_bio, mirror_num);
+			ASSERT(comp_bio->bi_iter.bi_size);
+			ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
+						  BTRFS_WQ_ENDIO_DATA);
+			if (ret)
+				goto finish_cb;
+			ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
 			if (ret)
 				goto finish_cb;
 			comp_bio = NULL;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 2707404389a5d..4a40725cbf1db 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -61,8 +61,11 @@ struct compressed_bio {
 	blk_status_t status;
 	int mirror_num;
 
-	/* for reads, this is the bio we are copying the data into */
-	struct bio *orig_bio;
+	union {
+		/* for reads, this is the bio we are copying the data into */
+		struct bio *orig_bio;
+		struct work_struct write_end_work;
+	};
 
 	/*
 	 * the start of a variable length array of checksums only
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4dd0d4a2e7757..5ae482632327f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -855,7 +855,7 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *endio_meta_workers;
 	struct workqueue_struct *endio_raid56_workers;
 	struct workqueue_struct *rmw_workers;
-	struct btrfs_workqueue *endio_meta_write_workers;
+	struct workqueue_struct *compressed_write_workers;
 	struct btrfs_workqueue *endio_write_workers;
 	struct btrfs_workqueue *endio_freespace_worker;
 	struct btrfs_workqueue *caching_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3c6137734d28c..744a74203776e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -748,19 +748,10 @@ static void end_workqueue_bio(struct bio *bio)
 	fs_info = end_io_wq->info;
 	end_io_wq->status = bio->bi_status;
 
-	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
-		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
-			wq = fs_info->endio_meta_write_workers;
-		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
-			wq = fs_info->endio_freespace_worker;
-		else
-			wq = fs_info->endio_write_workers;
-	} else {
-		if (end_io_wq->metadata)
-			wq = fs_info->endio_meta_workers;
-		else
-			wq = fs_info->endio_workers;
-	}
+	if (end_io_wq->metadata)
+		wq = fs_info->endio_meta_workers;
+	else
+		wq = fs_info->endio_workers;
 
 	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
 	btrfs_queue_work(wq, &end_io_wq->work);
@@ -771,6 +762,9 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 {
 	struct btrfs_end_io_wq *end_io_wq;
 
+	if (WARN_ON_ONCE(btrfs_op(bio) != BTRFS_MAP_WRITE))
+		return BLK_STS_IOERR;
+
 	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
 	if (!end_io_wq)
 		return BLK_STS_RESOURCE;
@@ -2274,6 +2268,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 		destroy_workqueue(fs_info->endio_raid56_workers);
 	if (fs_info->rmw_workers)
 		destroy_workqueue(fs_info->rmw_workers);
+	if (fs_info->compressed_write_workers)
+		destroy_workqueue(fs_info->compressed_write_workers);
 	btrfs_destroy_workqueue(fs_info->endio_write_workers);
 	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
 	btrfs_destroy_workqueue(fs_info->delayed_workers);
@@ -2288,7 +2284,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	 * queues can do metadata I/O operations.
 	 */
 	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
-	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
 }
 
 static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2470,15 +2465,14 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	fs_info->endio_meta_workers =
 		btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
 				      max_active, 4);
-	fs_info->endio_meta_write_workers =
-		btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
-				      max_active, 2);
 	fs_info->endio_raid56_workers =
 		alloc_workqueue("btrfs-endio-raid56", flags, max_active);
 	fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
 	fs_info->endio_write_workers =
 		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
 				      max_active, 2);
+	fs_info->compressed_write_workers =
+		alloc_workqueue("btrfs-compressed-write", flags, max_active);
 	fs_info->endio_freespace_worker =
 		btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
 				      max_active, 0);
@@ -2493,7 +2487,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	if (!(fs_info->workers && fs_info->hipri_workers &&
 	      fs_info->delalloc_workers && fs_info->flush_workers &&
 	      fs_info->endio_workers && fs_info->endio_meta_workers &&
-	      fs_info->endio_meta_write_workers &&
+	      fs_info->compressed_write_workers &&
 	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
 	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
 	      fs_info->caching_workers && fs_info->fixup_workers &&
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b1fdc6a26c76e..9c683c466d585 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1908,8 +1908,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 	btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
-	btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
-				new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 07/10] btrfs: centralize setting REQ_META
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (5 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 06/10] btrfs: don't use btrfs_bio_wq_end_io for compressed writes Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 08/10] btrfs: remove btrfs_end_io_wq Christoph Hellwig
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Set REQ_META in btrfs_submit_metadata_bio instead of the various callers.
We'll start relying on this flag inside of btrfs in a bit, and this
ensures it is always set correctly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/disk-io.c   | 2 ++
 fs/btrfs/extent_io.c | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 744a74203776e..b8837f66e0a43 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -914,6 +914,8 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	blk_status_t ret;
 
+	bio->bi_opf |= REQ_META;
+
 	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
 		/*
 		 * called for a read, do the setup so that checksum validation
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 80b4482c477c6..a14ed9b9dc2d0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4589,7 +4589,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 	struct page *page = eb->pages[0];
-	unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+	unsigned int write_flags = wbc_to_write_flags(wbc);
 	bool no_dirty_ebs = false;
 	int ret;
 
@@ -4634,7 +4634,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 {
 	u64 disk_bytenr = eb->start;
 	int i, num_pages;
-	unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+	unsigned int write_flags = wbc_to_write_flags(wbc);
 	int ret = 0;
 
 	prepare_eb_write(eb);
@@ -6645,7 +6645,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
 	btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
 
 	btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
-	ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
+	ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
 				 page, eb->start, eb->len,
 				 eb->start - page_offset(page),
 				 end_bio_extent_readpage, mirror_num, 0,
@@ -6752,7 +6752,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
 			}
 
 			ClearPageError(page);
-			err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+			err = submit_extent_page(REQ_OP_READ, NULL,
 					 &bio_ctrl, page, page_offset(page),
 					 PAGE_SIZE, 0, end_bio_extent_readpage,
 					 mirror_num, 0, false);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 08/10] btrfs: remove btrfs_end_io_wq
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (6 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 07/10] btrfs: centralize setting REQ_META Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 09/10] btrfs: refactor btrfs_map_bio Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 10/10] btrfs: do not allocate a btrfs_bio for low-level bios Christoph Hellwig
  9 siblings, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

All read bios that go through btrfs_map_bio need to be completed in
user context.  And read I/Os are the most common and timing-critical
in almost any file system workload.

Embed a work_struct into struct btrfs_bio and use it to complete all
read bios submitted through btrfs_map_bio, using the REQ_META flag to
decide which workqueue they are placed on.

This removes the need for a separate 128 byte allocation (typically
rounded up to 192 bytes by slab) for all reads with a size increase
of 24 bytes for struct btrfs_bio.  Future patches will reorganize
struct btrfs_bio to make use of this extra space for writes as well.

(all sizes are based on a typical 64-bit non-debug build)

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/compression.c |   4 --
 fs/btrfs/ctree.h       |   4 +-
 fs/btrfs/disk-io.c     | 120 +++--------------------------------------
 fs/btrfs/disk-io.h     |  10 ----
 fs/btrfs/inode.c       |  24 +--------
 fs/btrfs/super.c       |  11 +---
 fs/btrfs/volumes.c     |  33 ++++++++++--
 fs/btrfs/volumes.h     |   3 ++
 8 files changed, 42 insertions(+), 167 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9d5986a30a4a2..c73ee8ae070d7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -938,10 +938,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			sums += fs_info->csum_size * nr_sectors;
 
 			ASSERT(comp_bio->bi_iter.bi_size);
-			ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
-						  BTRFS_WQ_ENDIO_DATA);
-			if (ret)
-				goto finish_cb;
 			ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
 			if (ret)
 				goto finish_cb;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5ae482632327f..c2c8027067c48 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -851,8 +851,8 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *hipri_workers;
 	struct btrfs_workqueue *delalloc_workers;
 	struct btrfs_workqueue *flush_workers;
-	struct btrfs_workqueue *endio_workers;
-	struct btrfs_workqueue *endio_meta_workers;
+	struct workqueue_struct *endio_workers;
+	struct workqueue_struct *endio_meta_workers;
 	struct workqueue_struct *endio_raid56_workers;
 	struct workqueue_struct *rmw_workers;
 	struct workqueue_struct *compressed_write_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b8837f66e0a43..b55761e21bde3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,7 +50,6 @@
 				 BTRFS_SUPER_FLAG_METADUMP |\
 				 BTRFS_SUPER_FLAG_METADUMP_V2)
 
-static void end_workqueue_fn(struct btrfs_work *work);
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 				      struct btrfs_fs_info *fs_info);
@@ -63,40 +62,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
 static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
 
-/*
- * btrfs_end_io_wq structs are used to do processing in task context when an IO
- * is complete.  This is used during reads to verify checksums, and it is used
- * by writes to insert metadata for new file extents after IO is complete.
- */
-struct btrfs_end_io_wq {
-	struct bio *bio;
-	bio_end_io_t *end_io;
-	void *private;
-	struct btrfs_fs_info *info;
-	blk_status_t status;
-	enum btrfs_wq_endio_type metadata;
-	struct btrfs_work work;
-};
-
-static struct kmem_cache *btrfs_end_io_wq_cache;
-
-int __init btrfs_end_io_wq_init(void)
-{
-	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
-					sizeof(struct btrfs_end_io_wq),
-					0,
-					SLAB_MEM_SPREAD,
-					NULL);
-	if (!btrfs_end_io_wq_cache)
-		return -ENOMEM;
-	return 0;
-}
-
-void __cold btrfs_end_io_wq_exit(void)
-{
-	kmem_cache_destroy(btrfs_end_io_wq_cache);
-}
-
 static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
 {
 	if (fs_info->csum_shash)
@@ -739,48 +704,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
 	return ret;
 }
 
-static void end_workqueue_bio(struct bio *bio)
-{
-	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
-	struct btrfs_fs_info *fs_info;
-	struct btrfs_workqueue *wq;
-
-	fs_info = end_io_wq->info;
-	end_io_wq->status = bio->bi_status;
-
-	if (end_io_wq->metadata)
-		wq = fs_info->endio_meta_workers;
-	else
-		wq = fs_info->endio_workers;
-
-	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
-	btrfs_queue_work(wq, &end_io_wq->work);
-}
-
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
-			enum btrfs_wq_endio_type metadata)
-{
-	struct btrfs_end_io_wq *end_io_wq;
-
-	if (WARN_ON_ONCE(btrfs_op(bio) != BTRFS_MAP_WRITE))
-		return BLK_STS_IOERR;
-
-	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
-	if (!end_io_wq)
-		return BLK_STS_RESOURCE;
-
-	end_io_wq->private = bio->bi_private;
-	end_io_wq->end_io = bio->bi_end_io;
-	end_io_wq->info = info;
-	end_io_wq->status = 0;
-	end_io_wq->bio = bio;
-	end_io_wq->metadata = metadata;
-
-	bio->bi_private = end_io_wq;
-	bio->bi_end_io = end_workqueue_bio;
-	return 0;
-}
-
 static void run_one_async_start(struct btrfs_work *work)
 {
 	struct async_submit_bio *async;
@@ -917,14 +840,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
 	bio->bi_opf |= REQ_META;
 
 	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
-		/*
-		 * called for a read, do the setup so that checksum validation
-		 * can happen in the async kernel threads
-		 */
-		ret = btrfs_bio_wq_end_io(fs_info, bio,
-					  BTRFS_WQ_ENDIO_METADATA);
-		if (!ret)
-			ret = btrfs_map_bio(fs_info, bio, mirror_num);
+		ret = btrfs_map_bio(fs_info, bio, mirror_num);
 	} else if (!should_async_write(fs_info, BTRFS_I(inode))) {
 		ret = btree_csum_one_bio(bio);
 		if (!ret)
@@ -1940,25 +1856,6 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
 	return root;
 }
 
-/*
- * called by the kthread helper functions to finally call the bio end_io
- * functions.  This is where read checksum verification actually happens
- */
-static void end_workqueue_fn(struct btrfs_work *work)
-{
-	struct bio *bio;
-	struct btrfs_end_io_wq *end_io_wq;
-
-	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
-	bio = end_io_wq->bio;
-
-	bio->bi_status = end_io_wq->status;
-	bio->bi_private = end_io_wq->private;
-	bio->bi_end_io = end_io_wq->end_io;
-	bio_endio(bio);
-	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-}
-
 static int cleaner_kthread(void *arg)
 {
 	struct btrfs_fs_info *fs_info = arg;
@@ -2265,7 +2162,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	btrfs_destroy_workqueue(fs_info->delalloc_workers);
 	btrfs_destroy_workqueue(fs_info->hipri_workers);
 	btrfs_destroy_workqueue(fs_info->workers);
-	btrfs_destroy_workqueue(fs_info->endio_workers);
+	if (fs_info->endio_workers)
+		destroy_workqueue(fs_info->endio_workers);
 	if (fs_info->endio_raid56_workers)
 		destroy_workqueue(fs_info->endio_raid56_workers);
 	if (fs_info->rmw_workers)
@@ -2285,7 +2183,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	 * the queues used for metadata I/O, since tasks from those other work
 	 * queues can do metadata I/O operations.
 	 */
-	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+	if (fs_info->endio_meta_workers)
+		destroy_workqueue(fs_info->endio_meta_workers);
 }
 
 static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2458,15 +2357,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	fs_info->fixup_workers =
 		btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
 
-	/*
-	 * endios are largely parallel and should have a very
-	 * low idle thresh
-	 */
 	fs_info->endio_workers =
-		btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
+		alloc_workqueue("btrfs-endio", flags, max_active);
 	fs_info->endio_meta_workers =
-		btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
-				      max_active, 4);
+		alloc_workqueue("btrfs-endio-meta", flags, max_active);
 	fs_info->endio_raid56_workers =
 		alloc_workqueue("btrfs-endio-raid56", flags, max_active);
 	fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 97255e3d7e524..424dbfc5fd784 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -17,12 +17,6 @@
  */
 #define BTRFS_BDEV_BLOCKSIZE	(4096)
 
-enum btrfs_wq_endio_type {
-	BTRFS_WQ_ENDIO_DATA,
-	BTRFS_WQ_ENDIO_METADATA,
-	BTRFS_WQ_ENDIO_FREE_SPACE,
-};
-
 static inline u64 btrfs_sb_offset(int mirror)
 {
 	u64 start = SZ_16K;
@@ -120,8 +114,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
 			  int atomic);
 int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
 			     int level, struct btrfs_key *first_key);
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
-			enum btrfs_wq_endio_type metadata);
 blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 dio_file_offset,
@@ -145,8 +137,6 @@ int btree_lock_page_hook(struct page *page, void *data,
 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
 int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
 int btrfs_init_root_free_objectid(struct btrfs_root *root);
-int __init btrfs_end_io_wq_init(void);
-void __cold btrfs_end_io_wq_exit(void);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_set_buffer_lockdep_class(u64 objectid,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 02c4cbd557dcd..7155f3c7d9f42 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2614,12 +2614,6 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
 		return;
 	}
 
-	ret = btrfs_bio_wq_end_io(fs_info, bio,
-			btrfs_is_free_space_inode(BTRFS_I(inode)) ?
-			BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
-	if (ret)
-		goto out;
-
 	/*
 	 * Lookup bio sums does extra checks around whether we need to csum or
 	 * not, which is why we ignore skip_sum here.
@@ -7785,9 +7779,6 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
 
 	BUG_ON(bio_op(bio) == REQ_OP_WRITE);
 
-	if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA))
-		return;
-
 	refcount_inc(&dip->refs);
 	if (btrfs_map_bio(fs_info, bio, mirror_num))
 		refcount_dec(&dip->refs);
@@ -7888,19 +7879,12 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_dio_private *dip = bio->bi_private;
-	bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
 	blk_status_t ret;
 
-	if (!write) {
-		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
-		if (ret)
-			return ret;
-	}
-
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		goto map;
 
-	if (write) {
+	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
 		/* check btrfs_submit_data_write_bio() for async submit rules */
 		if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
 			return btrfs_wq_submit_bio(inode, bio, 0, 0,
@@ -10250,12 +10234,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
 			return ret;
 	}
 
-	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
-	if (ret) {
-		btrfs_bio_free_csum(bbio);
-		return ret;
-	}
-
 	atomic_inc(&priv->pending);
 	ret = btrfs_map_bio(fs_info, bio, mirror_num);
 	if (ret) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9c683c466d585..64eb8aeed156f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1906,8 +1906,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 	btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
-	btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
-	btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
@@ -2668,13 +2666,9 @@ static int __init init_btrfs_fs(void)
 	if (err)
 		goto free_delayed_ref;
 
-	err = btrfs_end_io_wq_init();
-	if (err)
-		goto free_prelim_ref;
-
 	err = btrfs_interface_init();
 	if (err)
-		goto free_end_io_wq;
+		goto free_prelim_ref;
 
 	btrfs_print_mod_info();
 
@@ -2690,8 +2684,6 @@ static int __init init_btrfs_fs(void)
 
 unregister_ioctl:
 	btrfs_interface_exit();
-free_end_io_wq:
-	btrfs_end_io_wq_exit();
 free_prelim_ref:
 	btrfs_prelim_ref_exit();
 free_delayed_ref:
@@ -2729,7 +2721,6 @@ static void __exit exit_btrfs_fs(void)
 	extent_state_cache_exit();
 	extent_io_exit();
 	btrfs_interface_exit();
-	btrfs_end_io_wq_exit();
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 	btrfs_cleanup_fs_uuids();
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index aeacb87457687..5f18e9105fe08 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6621,11 +6621,27 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
 }
 
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc)
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc)
+{
+	if (bioc->orig_bio->bi_opf & REQ_META)
+		return bioc->fs_info->endio_meta_workers;
+	return bioc->fs_info->endio_workers;
+}
+
+static void btrfs_end_bio_work(struct work_struct *work)
+{
+	struct btrfs_bio *bbio =
+		container_of(work, struct btrfs_bio, end_io_work);
+
+	bio_endio(&bbio->bio);
+}
+
+static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
 {
 	struct bio *orig_bio = bioc->orig_bio;
+	struct btrfs_bio *bbio = btrfs_bio(orig_bio);
 
-	btrfs_bio(orig_bio)->mirror_num = bioc->mirror_num;
+	bbio->mirror_num = bioc->mirror_num;
 	orig_bio->bi_private = bioc->private;
 	orig_bio->bi_end_io = bioc->end_io;
 
@@ -6637,7 +6653,14 @@ static inline void btrfs_end_bioc(struct btrfs_io_context *bioc)
 		orig_bio->bi_status = BLK_STS_IOERR;
 	else
 		orig_bio->bi_status = BLK_STS_OK;
-	bio_endio(orig_bio);
+
+	if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) {
+		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
+		queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
+	} else {
+		bio_endio(orig_bio);
+	}
+
 	btrfs_put_bioc(bioc);
 }
 
@@ -6669,7 +6692,7 @@ static void btrfs_end_bio(struct bio *bio)
 
 	btrfs_bio_counter_dec(bioc->fs_info);
 	if (atomic_dec_and_test(&bioc->stripes_pending))
-		btrfs_end_bioc(bioc);
+		btrfs_end_bioc(bioc, true);
 }
 
 static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
@@ -6767,7 +6790,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
 			atomic_inc(&bioc->error);
 			if (atomic_dec_and_test(&bioc->stripes_pending))
-				btrfs_end_bioc(bioc);
+				btrfs_end_bioc(bioc, false);
 			continue;
 		}
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 12b2af9260e92..28e28b7c48649 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -371,6 +371,9 @@ struct btrfs_bio {
 	u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
 	struct bvec_iter iter;
 
+	/* for read end I/O handling */
+	struct work_struct end_io_work;
+
 	/*
 	 * This member must come last, bio_alloc_bioset will allocate enough
 	 * bytes for entire btrfs_bio but relies on bio being last.
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 09/10] btrfs: refactor btrfs_map_bio
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (7 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 08/10] btrfs: remove btrfs_end_io_wq Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 10/10] btrfs: do not allocate a btrfs_bio for low-level bios Christoph Hellwig
  9 siblings, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Move all per-stripe handling into submit_stripe_bio and use a label
for cleanup instead of duplicating the logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/volumes.c | 77 +++++++++++++++++++++-------------------------
 1 file changed, 35 insertions(+), 42 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5f18e9105fe08..28ac2bfb5d296 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6695,10 +6695,30 @@ static void btrfs_end_bio(struct bio *bio)
 		btrfs_end_bioc(bioc, true);
 }
 
-static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
-			      u64 physical, struct btrfs_device *dev)
+static void submit_stripe_bio(struct btrfs_io_context *bioc,
+		struct bio *orig_bio, int dev_nr, bool clone)
 {
 	struct btrfs_fs_info *fs_info = bioc->fs_info;
+	struct btrfs_device *dev = bioc->stripes[dev_nr].dev;
+	u64 physical = bioc->stripes[dev_nr].physical;
+	struct bio *bio;
+
+	if (!dev || !dev->bdev ||
+	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+	    (btrfs_op(orig_bio) == BTRFS_MAP_WRITE &&
+	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+		atomic_inc(&bioc->error);
+		if (atomic_dec_and_test(&bioc->stripes_pending))
+			btrfs_end_bioc(bioc, false);
+		return;
+	}
+
+	if (clone) {
+		bio = btrfs_bio_clone(dev->bdev, orig_bio);
+	} else {
+		bio = orig_bio;
+		bio_set_dev(bio, dev->bdev);
+	}
 
 	bio->bi_private = bioc;
 	btrfs_bio(bio)->device = dev;
@@ -6733,32 +6753,25 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			   int mirror_num)
 {
-	struct btrfs_device *dev;
-	struct bio *first_bio = bio;
 	u64 logical = bio->bi_iter.bi_sector << 9;
-	u64 length = 0;
-	u64 map_length;
+	u64 length = bio->bi_iter.bi_size;
+	u64 map_length = length;
 	int ret;
 	int dev_nr;
 	int total_devs;
 	struct btrfs_io_context *bioc = NULL;
 
-	length = bio->bi_iter.bi_size;
-	map_length = length;
-
 	btrfs_bio_counter_inc_blocked(fs_info);
 	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
 				&map_length, &bioc, mirror_num, 1);
-	if (ret) {
-		btrfs_bio_counter_dec(fs_info);
-		return errno_to_blk_status(ret);
-	}
+	if (ret)
+		goto out_dec;
 
 	total_devs = bioc->num_stripes;
-	bioc->orig_bio = first_bio;
-	bioc->private = first_bio->bi_private;
-	bioc->end_io = first_bio->bi_end_io;
-	atomic_set(&bioc->stripes_pending, bioc->num_stripes);
+	bioc->orig_bio = bio;
+	bioc->private = bio->bi_private;
+	bioc->end_io = bio->bi_end_io;
+	atomic_set(&bioc->stripes_pending, total_devs);
 
 	if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
 	    ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
@@ -6770,9 +6783,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			ret = raid56_parity_recover(bio, bioc, map_length,
 						    mirror_num, 1);
 		}
-
-		btrfs_bio_counter_dec(fs_info);
-		return errno_to_blk_status(ret);
+		goto out_dec;
 	}
 
 	if (map_length < length) {
@@ -6782,29 +6793,11 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 		BUG();
 	}
 
-	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
-		dev = bioc->stripes[dev_nr].dev;
-		if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
-						   &dev->dev_state) ||
-		    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
-		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
-			atomic_inc(&bioc->error);
-			if (atomic_dec_and_test(&bioc->stripes_pending))
-				btrfs_end_bioc(bioc, false);
-			continue;
-		}
-
-		if (dev_nr < total_devs - 1) {
-			bio = btrfs_bio_clone(dev->bdev, first_bio);
-		} else {
-			bio = first_bio;
-			bio_set_dev(bio, dev->bdev);
-		}
-
-		submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
-	}
+	for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+		submit_stripe_bio(bioc, bio, dev_nr, dev_nr < total_devs - 1);
+out_dec:
 	btrfs_bio_counter_dec(fs_info);
-	return BLK_STS_OK;
+	return errno_to_blk_status(ret);
 }
 
 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 10/10] btrfs: do not allocate a btrfs_bio for low-level bios
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (8 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 09/10] btrfs: refactor btrfs_map_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  9 siblings, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

The bios submitted from btrfs_map_bio don't really interact with the
rest of btrfs and the only btrfs_bio member actually used in the
low-level bios is the pointer to the btrfs_io_context used for the
endio handler.

Use a union in struct btrfs_io_stripe that allows the endio handler to
find the btrfs_io_context and remove the spurious ->device assignment
so that a plain fs_bio_set bio can be used for the low-level bios
allocated inside btrfs_map_bio.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/extent_io.c | 13 -------------
 fs/btrfs/extent_io.h |  1 -
 fs/btrfs/volumes.c   | 20 ++++++++++----------
 fs/btrfs/volumes.h   |  5 ++++-
 4 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a14ed9b9dc2d0..37f4eee418219 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3209,19 +3209,6 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
 	return bio;
 }
 
-struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio)
-{
-	struct btrfs_bio *bbio;
-	struct bio *new;
-
-	/* Bio allocation backed by a bioset does not fail */
-	new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset);
-	bbio = btrfs_bio(new);
-	btrfs_bio_init(bbio);
-	bbio->iter = bio->bi_iter;
-	return new;
-}
-
 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
 {
 	struct bio *bio;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b390ec79f9a86..3078e90be3a99 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -265,7 +265,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 
 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
 struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio);
 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
 
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 28ac2bfb5d296..8577893ccb4b9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6666,23 +6666,21 @@ static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
 
 static void btrfs_end_bio(struct bio *bio)
 {
-	struct btrfs_io_context *bioc = bio->bi_private;
+	struct btrfs_io_stripe *stripe = bio->bi_private;
+	struct btrfs_io_context *bioc = stripe->bioc;
 
 	if (bio->bi_status) {
 		atomic_inc(&bioc->error);
 		if (bio->bi_status == BLK_STS_IOERR ||
 		    bio->bi_status == BLK_STS_TARGET) {
-			struct btrfs_device *dev = btrfs_bio(bio)->device;
-
-			ASSERT(dev->bdev);
 			if (btrfs_op(bio) == BTRFS_MAP_WRITE)
-				btrfs_dev_stat_inc_and_print(dev,
+				btrfs_dev_stat_inc_and_print(stripe->dev,
 						BTRFS_DEV_STAT_WRITE_ERRS);
 			else if (!(bio->bi_opf & REQ_RAHEAD))
-				btrfs_dev_stat_inc_and_print(dev,
+				btrfs_dev_stat_inc_and_print(stripe->dev,
 						BTRFS_DEV_STAT_READ_ERRS);
 			if (bio->bi_opf & REQ_PREFLUSH)
-				btrfs_dev_stat_inc_and_print(dev,
+				btrfs_dev_stat_inc_and_print(stripe->dev,
 						BTRFS_DEV_STAT_FLUSH_ERRS);
 		}
 	}
@@ -6714,14 +6712,16 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc,
 	}
 
 	if (clone) {
-		bio = btrfs_bio_clone(dev->bdev, orig_bio);
+		bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS,
+				      &fs_bio_set);
 	} else {
 		bio = orig_bio;
 		bio_set_dev(bio, dev->bdev);
+		btrfs_bio(bio)->device = dev;
 	}
 
-	bio->bi_private = bioc;
-	btrfs_bio(bio)->device = dev;
+	bioc->stripes[dev_nr].bioc = bioc;
+	bio->bi_private = &bioc->stripes[dev_nr];
 	bio->bi_end_io = btrfs_end_bio;
 	bio->bi_iter.bi_sector = physical >> 9;
 	/*
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 28e28b7c48649..825e44c82f2b0 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -396,7 +396,10 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
 
 struct btrfs_io_stripe {
 	struct btrfs_device *dev;
-	u64 physical;
+	union {
+		u64 physical;			/* block mapping */
+		struct btrfs_io_context *bioc;	/* for the endio handler */
+	};
 	u64 length; /* only used for discard mappings */
 };
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-04-29 14:30 ` [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio Christoph Hellwig
@ 2022-05-01  4:40   ` Qu Wenruo
  2022-05-01  4:53     ` Qu Wenruo
  0 siblings, 1 reply; 19+ messages in thread
From: Qu Wenruo @ 2022-05-01  4:40 UTC (permalink / raw)
  To: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs



On 2022/4/29 22:30, Christoph Hellwig wrote:
> Instead of attaching an extra allocation and an indirect call to each
> low-level bio issued by the RAID code, add a work_struct to struct
> btrfs_raid_bio and only defer the per-rbio completion action.  The
> per-bio actions for all the I/Os are trivial and can be safely done
> from interrupt context.
>
> As a nice side effect this also allows sharing the boilerplate code
> for the per-bio completions
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

It looks like this patch is causing test failure in btrfs/027, at least
for subpage (64K page size, 4K sectorsize) cases.

Reproducibility is 100% (4/4 tried).

The hanging sub-test case is the replacing of a missing device in raid5.

The involved dmesg (including the hanging thread dump) is:

[  276.672541] BTRFS warning (device dm-1): read-write for sector size
4096 with page size 65536 is experimental
[  276.744316] BTRFS info (device dm-1): checking UUID tree
[  277.387701] BTRFS info (device dm-1): allowing degraded mounts
[  277.390314] BTRFS info (device dm-1): using free space tree
[  277.392108] BTRFS info (device dm-1): has skinny extents
[  277.393890] BTRFS warning (device dm-1): read-write for sector size
4096 with page size 65536 is experimental
[  277.420922] BTRFS warning (device dm-1): devid 2 uuid
4b67464d-e851-4a88-8765-67b043d4680f is missing
[  277.432694] BTRFS warning (device dm-1): devid 2 uuid
4b67464d-e851-4a88-8765-67b043d4680f is missing
[  277.648326] BTRFS info (device dm-1): dev_replace from <missing disk>
(devid 2) to /dev/mapper/test-scratch5 started
[  297.264371] task:btrfs           state:D stack:    0 pid: 7158 ppid:
  6493 flags:0x0000000c
[  297.280744] Call trace:
[  297.282351]  __switch_to+0xfc/0x160
[  297.284525]  __schedule+0x260/0x61c
[  297.286959]  schedule+0x54/0xc4
[  297.288980]  scrub_enumerate_chunks+0x610/0x760 [btrfs]
[  297.292504]  btrfs_scrub_dev+0x1a0/0x530 [btrfs]
[  297.306738]  btrfs_dev_replace_start+0x2a4/0x2d0 [btrfs]
[  297.310418]  btrfs_dev_replace_by_ioctl+0x48/0x84 [btrfs]
[  297.314026]  btrfs_ioctl_dev_replace+0x1b8/0x210 [btrfs]
[  297.328014]  btrfs_ioctl+0xa48/0x1a70 [btrfs]
[  297.330705]  __arm64_sys_ioctl+0xb4/0x100
[  297.333037]  invoke_syscall+0x50/0x120
[  297.343237]  el0_svc_common.constprop.0+0x4c/0x100
[  297.345716]  do_el0_svc+0x34/0xa0
[  297.347242]  el0_svc+0x34/0xb0
[  297.348763]  el0t_64_sync_handler+0xa8/0x130
[  297.350870]  el0t_64_sync+0x18c/0x190

Mind to take a look on that hang?

Thanks,
Qu

> ---
>   fs/btrfs/ctree.h   |   2 +-
>   fs/btrfs/disk-io.c |  12 ++---
>   fs/btrfs/disk-io.h |   1 -
>   fs/btrfs/raid56.c  | 111 ++++++++++++++++++---------------------------
>   4 files changed, 49 insertions(+), 77 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 40a6f61559348..4dd0d4a2e7757 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -853,7 +853,7 @@ struct btrfs_fs_info {
>   	struct btrfs_workqueue *flush_workers;
>   	struct btrfs_workqueue *endio_workers;
>   	struct btrfs_workqueue *endio_meta_workers;
> -	struct btrfs_workqueue *endio_raid56_workers;
> +	struct workqueue_struct *endio_raid56_workers;
>   	struct workqueue_struct *rmw_workers;
>   	struct btrfs_workqueue *endio_meta_write_workers;
>   	struct btrfs_workqueue *endio_write_workers;
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 73e12ecc81be1..3c6137734d28c 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -753,14 +753,10 @@ static void end_workqueue_bio(struct bio *bio)
>   			wq = fs_info->endio_meta_write_workers;
>   		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
>   			wq = fs_info->endio_freespace_worker;
> -		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
> -			wq = fs_info->endio_raid56_workers;
>   		else
>   			wq = fs_info->endio_write_workers;
>   	} else {
> -		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
> -			wq = fs_info->endio_raid56_workers;
> -		else if (end_io_wq->metadata)
> +		if (end_io_wq->metadata)
>   			wq = fs_info->endio_meta_workers;
>   		else
>   			wq = fs_info->endio_workers;
> @@ -2274,7 +2270,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
>   	btrfs_destroy_workqueue(fs_info->hipri_workers);
>   	btrfs_destroy_workqueue(fs_info->workers);
>   	btrfs_destroy_workqueue(fs_info->endio_workers);
> -	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
> +	if (fs_info->endio_raid56_workers)
> +		destroy_workqueue(fs_info->endio_raid56_workers);
>   	if (fs_info->rmw_workers)
>   		destroy_workqueue(fs_info->rmw_workers);
>   	btrfs_destroy_workqueue(fs_info->endio_write_workers);
> @@ -2477,8 +2474,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
>   		btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
>   				      max_active, 2);
>   	fs_info->endio_raid56_workers =
> -		btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
> -				      max_active, 4);
> +		alloc_workqueue("btrfs-endio-raid56", flags, max_active);
>   	fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
>   	fs_info->endio_write_workers =
>   		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
> index 9340e3266e0ac..97255e3d7e524 100644
> --- a/fs/btrfs/disk-io.h
> +++ b/fs/btrfs/disk-io.h
> @@ -21,7 +21,6 @@ enum btrfs_wq_endio_type {
>   	BTRFS_WQ_ENDIO_DATA,
>   	BTRFS_WQ_ENDIO_METADATA,
>   	BTRFS_WQ_ENDIO_FREE_SPACE,
> -	BTRFS_WQ_ENDIO_RAID56,
>   };
>
>   static inline u64 btrfs_sb_offset(int mirror)
> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> index a5b623ee6facd..1a3c1a9b10d0b 100644
> --- a/fs/btrfs/raid56.c
> +++ b/fs/btrfs/raid56.c
> @@ -164,6 +164,9 @@ struct btrfs_raid_bio {
>   	atomic_t stripes_pending;
>
>   	atomic_t error;
> +
> +	struct work_struct end_io_work;
> +
>   	/*
>   	 * these are two arrays of pointers.  We allocate the
>   	 * rbio big enough to hold them both and setup their
> @@ -1552,15 +1555,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
>   	}
>   }
>
> -/*
> - * end io for the read phase of the rmw cycle.  All the bios here are physical
> - * stripe bios we've read from the disk so we can recalculate the parity of the
> - * stripe.
> - *
> - * This will usually kick off finish_rmw once all the bios are read in, but it
> - * may trigger parity reconstruction if we had any errors along the way
> - */
> -static void raid_rmw_end_io(struct bio *bio)
> +static void raid56_bio_end_io(struct bio *bio)
>   {
>   	struct btrfs_raid_bio *rbio = bio->bi_private;
>
> @@ -1571,23 +1566,34 @@ static void raid_rmw_end_io(struct bio *bio)
>
>   	bio_put(bio);
>
> -	if (!atomic_dec_and_test(&rbio->stripes_pending))
> -		return;
> +	if (atomic_dec_and_test(&rbio->stripes_pending))
> +		queue_work(rbio->bioc->fs_info->endio_raid56_workers,
> +			   &rbio->end_io_work);
> +}
>
> -	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
> -		goto cleanup;
> +/*
> + * End io handler for the read phase of the rmw cycle.  All the bios here are
> + * physical stripe bios we've read from the disk so we can recalculate the
> + * parity of the stripe.
> + *
> + * This will usually kick off finish_rmw once all the bios are read in, but it
> + * may trigger parity reconstruction if we had any errors along the way
> + */
> +static void raid56_rmw_end_io_work(struct work_struct *work)
> +{
> +	struct btrfs_raid_bio *rbio =
> +		container_of(work, struct btrfs_raid_bio, end_io_work);
> +
> +	if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
> +		rbio_orig_end_io(rbio, BLK_STS_IOERR);
> +		return;
> +	}
>
>   	/*
> -	 * this will normally call finish_rmw to start our write
> -	 * but if there are any failed stripes we'll reconstruct
> -	 * from parity first
> +	 * This will normally call finish_rmw to start our write but if there
> +	 * are any failed stripes we'll reconstruct from parity first.
>   	 */
>   	validate_rbio_for_rmw(rbio);
> -	return;
> -
> -cleanup:
> -
> -	rbio_orig_end_io(rbio, BLK_STS_IOERR);
>   }
>
>   /*
> @@ -1662,11 +1668,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
>   	 * touch it after that.
>   	 */
>   	atomic_set(&rbio->stripes_pending, bios_to_read);
> +	INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
>   	while ((bio = bio_list_pop(&bio_list))) {
> -		bio->bi_end_io = raid_rmw_end_io;
> -
> -		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
> -
> +		bio->bi_end_io = raid56_bio_end_io;
>   		submit_bio(bio);
>   	}
>   	/* the actual write will happen once the reads are done */
> @@ -2108,25 +2112,13 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
>   }
>
>   /*
> - * This is called only for stripes we've read from disk to
> - * reconstruct the parity.
> + * This is called only for stripes we've read from disk to reconstruct the
> + * parity.
>    */
> -static void raid_recover_end_io(struct bio *bio)
> +static void raid_recover_end_io_work(struct work_struct *work)
>   {
> -	struct btrfs_raid_bio *rbio = bio->bi_private;
> -
> -	/*
> -	 * we only read stripe pages off the disk, set them
> -	 * up to date if there were no errors
> -	 */
> -	if (bio->bi_status)
> -		fail_bio_stripe(rbio, bio);
> -	else
> -		set_bio_pages_uptodate(rbio, bio);
> -	bio_put(bio);
> -
> -	if (!atomic_dec_and_test(&rbio->stripes_pending))
> -		return;
> +	struct btrfs_raid_bio *rbio =
> +		container_of(work, struct btrfs_raid_bio, end_io_work);
>
>   	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
>   		rbio_orig_end_io(rbio, BLK_STS_IOERR);
> @@ -2209,11 +2201,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
>   	 * touch it after that.
>   	 */
>   	atomic_set(&rbio->stripes_pending, bios_to_read);
> +	INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
>   	while ((bio = bio_list_pop(&bio_list))) {
> -		bio->bi_end_io = raid_recover_end_io;
> -
> -		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
> -
> +		bio->bi_end_io = raid56_bio_end_io;
>   		submit_bio(bio);
>   	}
>
> @@ -2582,8 +2572,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>   	atomic_set(&rbio->stripes_pending, nr_data);
>
>   	while ((bio = bio_list_pop(&bio_list))) {
> -		bio->bi_end_io = raid_write_end_io;
> -
> +		bio->bi_end_io = raid56_bio_end_io;
>   		submit_bio(bio);
>   	}
>   	return;
> @@ -2671,24 +2660,14 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
>    * This will usually kick off finish_rmw once all the bios are read in, but it
>    * may trigger parity reconstruction if we had any errors along the way
>    */
> -static void raid56_parity_scrub_end_io(struct bio *bio)
> +static void raid56_parity_scrub_end_io_work(struct work_struct *work)
>   {
> -	struct btrfs_raid_bio *rbio = bio->bi_private;
> -
> -	if (bio->bi_status)
> -		fail_bio_stripe(rbio, bio);
> -	else
> -		set_bio_pages_uptodate(rbio, bio);
> -
> -	bio_put(bio);
> -
> -	if (!atomic_dec_and_test(&rbio->stripes_pending))
> -		return;
> +	struct btrfs_raid_bio *rbio =
> +		container_of(work, struct btrfs_raid_bio, end_io_work);
>
>   	/*
> -	 * this will normally call finish_rmw to start our write
> -	 * but if there are any failed stripes we'll reconstruct
> -	 * from parity first
> +	 * This will normally call finish_rmw to start our write, but if there
> +	 * are any failed stripes we'll reconstruct from parity first
>   	 */
>   	validate_rbio_for_parity_scrub(rbio);
>   }
> @@ -2758,11 +2737,9 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
>   	 * touch it after that.
>   	 */
>   	atomic_set(&rbio->stripes_pending, bios_to_read);
> +	INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
>   	while ((bio = bio_list_pop(&bio_list))) {
> -		bio->bi_end_io = raid56_parity_scrub_end_io;
> -
> -		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
> -
> +		bio->bi_end_io = raid56_bio_end_io;
>   		submit_bio(bio);
>   	}
>   	/* the actual write will happen once the reads are done */

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-05-01  4:40   ` Qu Wenruo
@ 2022-05-01  4:53     ` Qu Wenruo
  2022-05-02 16:44       ` Christoph Hellwig
  2022-06-03 16:44       ` David Sterba
  0 siblings, 2 replies; 19+ messages in thread
From: Qu Wenruo @ 2022-05-01  4:53 UTC (permalink / raw)
  To: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs



On 2022/5/1 12:40, Qu Wenruo wrote:
>
>
> On 2022/4/29 22:30, Christoph Hellwig wrote:
>> Instead of attaching an extra allocation and an indirect call to each
>> low-level bio issued by the RAID code, add a work_struct to struct
>> btrfs_raid_bio and only defer the per-rbio completion action.  The
>> per-bio actions for all the I/Os are trivial and can be safely done
>> from interrupt context.
>>
>> As a nice side effect this also allows sharing the boilerplate code
>> for the per-bio completions
>>
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> It looks like this patch is causing test failure in btrfs/027, at least
> for subpage (64K page size, 4K sectorsize) cases.

Also confirmed the same hang on the same commit on x86_64 (4K page size
4K sectorsize).

Also 100% reproducible.

Thanks,
Qu

>
> Reproducibility is 100% (4/4 tried).
>
> The hanging sub-test case is the replacing of a missing device in raid5.
>
> The involved dmesg (including the hanging thread dump) is:
>
> [  276.672541] BTRFS warning (device dm-1): read-write for sector size
> 4096 with page size 65536 is experimental
> [  276.744316] BTRFS info (device dm-1): checking UUID tree
> [  277.387701] BTRFS info (device dm-1): allowing degraded mounts
> [  277.390314] BTRFS info (device dm-1): using free space tree
> [  277.392108] BTRFS info (device dm-1): has skinny extents
> [  277.393890] BTRFS warning (device dm-1): read-write for sector size
> 4096 with page size 65536 is experimental
> [  277.420922] BTRFS warning (device dm-1): devid 2 uuid
> 4b67464d-e851-4a88-8765-67b043d4680f is missing
> [  277.432694] BTRFS warning (device dm-1): devid 2 uuid
> 4b67464d-e851-4a88-8765-67b043d4680f is missing
> [  277.648326] BTRFS info (device dm-1): dev_replace from <missing disk>
> (devid 2) to /dev/mapper/test-scratch5 started
> [  297.264371] task:btrfs           state:D stack:    0 pid: 7158 ppid:
>   6493 flags:0x0000000c
> [  297.280744] Call trace:
> [  297.282351]  __switch_to+0xfc/0x160
> [  297.284525]  __schedule+0x260/0x61c
> [  297.286959]  schedule+0x54/0xc4
> [  297.288980]  scrub_enumerate_chunks+0x610/0x760 [btrfs]
> [  297.292504]  btrfs_scrub_dev+0x1a0/0x530 [btrfs]
> [  297.306738]  btrfs_dev_replace_start+0x2a4/0x2d0 [btrfs]
> [  297.310418]  btrfs_dev_replace_by_ioctl+0x48/0x84 [btrfs]
> [  297.314026]  btrfs_ioctl_dev_replace+0x1b8/0x210 [btrfs]
> [  297.328014]  btrfs_ioctl+0xa48/0x1a70 [btrfs]
> [  297.330705]  __arm64_sys_ioctl+0xb4/0x100
> [  297.333037]  invoke_syscall+0x50/0x120
> [  297.343237]  el0_svc_common.constprop.0+0x4c/0x100
> [  297.345716]  do_el0_svc+0x34/0xa0
> [  297.347242]  el0_svc+0x34/0xb0
> [  297.348763]  el0t_64_sync_handler+0xa8/0x130
> [  297.350870]  el0t_64_sync+0x18c/0x190
>
> Mind to take a look on that hang?
>
> Thanks,
> Qu
>
>> ---
>>   fs/btrfs/ctree.h   |   2 +-
>>   fs/btrfs/disk-io.c |  12 ++---
>>   fs/btrfs/disk-io.h |   1 -
>>   fs/btrfs/raid56.c  | 111 ++++++++++++++++++---------------------------
>>   4 files changed, 49 insertions(+), 77 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 40a6f61559348..4dd0d4a2e7757 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -853,7 +853,7 @@ struct btrfs_fs_info {
>>       struct btrfs_workqueue *flush_workers;
>>       struct btrfs_workqueue *endio_workers;
>>       struct btrfs_workqueue *endio_meta_workers;
>> -    struct btrfs_workqueue *endio_raid56_workers;
>> +    struct workqueue_struct *endio_raid56_workers;
>>       struct workqueue_struct *rmw_workers;
>>       struct btrfs_workqueue *endio_meta_write_workers;
>>       struct btrfs_workqueue *endio_write_workers;
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 73e12ecc81be1..3c6137734d28c 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -753,14 +753,10 @@ static void end_workqueue_bio(struct bio *bio)
>>               wq = fs_info->endio_meta_write_workers;
>>           else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
>>               wq = fs_info->endio_freespace_worker;
>> -        else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
>> -            wq = fs_info->endio_raid56_workers;
>>           else
>>               wq = fs_info->endio_write_workers;
>>       } else {
>> -        if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
>> -            wq = fs_info->endio_raid56_workers;
>> -        else if (end_io_wq->metadata)
>> +        if (end_io_wq->metadata)
>>               wq = fs_info->endio_meta_workers;
>>           else
>>               wq = fs_info->endio_workers;
>> @@ -2274,7 +2270,8 @@ static void btrfs_stop_all_workers(struct
>> btrfs_fs_info *fs_info)
>>       btrfs_destroy_workqueue(fs_info->hipri_workers);
>>       btrfs_destroy_workqueue(fs_info->workers);
>>       btrfs_destroy_workqueue(fs_info->endio_workers);
>> -    btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
>> +    if (fs_info->endio_raid56_workers)
>> +        destroy_workqueue(fs_info->endio_raid56_workers);
>>       if (fs_info->rmw_workers)
>>           destroy_workqueue(fs_info->rmw_workers);
>>       btrfs_destroy_workqueue(fs_info->endio_write_workers);
>> @@ -2477,8 +2474,7 @@ static int btrfs_init_workqueues(struct
>> btrfs_fs_info *fs_info)
>>           btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
>>                         max_active, 2);
>>       fs_info->endio_raid56_workers =
>> -        btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
>> -                      max_active, 4);
>> +        alloc_workqueue("btrfs-endio-raid56", flags, max_active);
>>       fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags,
>> max_active);
>>       fs_info->endio_write_workers =
>>           btrfs_alloc_workqueue(fs_info, "endio-write", flags,
>> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
>> index 9340e3266e0ac..97255e3d7e524 100644
>> --- a/fs/btrfs/disk-io.h
>> +++ b/fs/btrfs/disk-io.h
>> @@ -21,7 +21,6 @@ enum btrfs_wq_endio_type {
>>       BTRFS_WQ_ENDIO_DATA,
>>       BTRFS_WQ_ENDIO_METADATA,
>>       BTRFS_WQ_ENDIO_FREE_SPACE,
>> -    BTRFS_WQ_ENDIO_RAID56,
>>   };
>>
>>   static inline u64 btrfs_sb_offset(int mirror)
>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>> index a5b623ee6facd..1a3c1a9b10d0b 100644
>> --- a/fs/btrfs/raid56.c
>> +++ b/fs/btrfs/raid56.c
>> @@ -164,6 +164,9 @@ struct btrfs_raid_bio {
>>       atomic_t stripes_pending;
>>
>>       atomic_t error;
>> +
>> +    struct work_struct end_io_work;
>> +
>>       /*
>>        * these are two arrays of pointers.  We allocate the
>>        * rbio big enough to hold them both and setup their
>> @@ -1552,15 +1555,7 @@ static void set_bio_pages_uptodate(struct
>> btrfs_raid_bio *rbio, struct bio *bio)
>>       }
>>   }
>>
>> -/*
>> - * end io for the read phase of the rmw cycle.  All the bios here are
>> physical
>> - * stripe bios we've read from the disk so we can recalculate the
>> parity of the
>> - * stripe.
>> - *
>> - * This will usually kick off finish_rmw once all the bios are read
>> in, but it
>> - * may trigger parity reconstruction if we had any errors along the way
>> - */
>> -static void raid_rmw_end_io(struct bio *bio)
>> +static void raid56_bio_end_io(struct bio *bio)
>>   {
>>       struct btrfs_raid_bio *rbio = bio->bi_private;
>>
>> @@ -1571,23 +1566,34 @@ static void raid_rmw_end_io(struct bio *bio)
>>
>>       bio_put(bio);
>>
>> -    if (!atomic_dec_and_test(&rbio->stripes_pending))
>> -        return;
>> +    if (atomic_dec_and_test(&rbio->stripes_pending))
>> +        queue_work(rbio->bioc->fs_info->endio_raid56_workers,
>> +               &rbio->end_io_work);
>> +}
>>
>> -    if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
>> -        goto cleanup;
>> +/*
>> + * End io handler for the read phase of the rmw cycle.  All the bios
>> here are
>> + * physical stripe bios we've read from the disk so we can
>> recalculate the
>> + * parity of the stripe.
>> + *
>> + * This will usually kick off finish_rmw once all the bios are read
>> in, but it
>> + * may trigger parity reconstruction if we had any errors along the way
>> + */
>> +static void raid56_rmw_end_io_work(struct work_struct *work)
>> +{
>> +    struct btrfs_raid_bio *rbio =
>> +        container_of(work, struct btrfs_raid_bio, end_io_work);
>> +
>> +    if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
>> +        rbio_orig_end_io(rbio, BLK_STS_IOERR);
>> +        return;
>> +    }
>>
>>       /*
>> -     * this will normally call finish_rmw to start our write
>> -     * but if there are any failed stripes we'll reconstruct
>> -     * from parity first
>> +     * This will normally call finish_rmw to start our write but if
>> there
>> +     * are any failed stripes we'll reconstruct from parity first.
>>        */
>>       validate_rbio_for_rmw(rbio);
>> -    return;
>> -
>> -cleanup:
>> -
>> -    rbio_orig_end_io(rbio, BLK_STS_IOERR);
>>   }
>>
>>   /*
>> @@ -1662,11 +1668,9 @@ static int raid56_rmw_stripe(struct
>> btrfs_raid_bio *rbio)
>>        * touch it after that.
>>        */
>>       atomic_set(&rbio->stripes_pending, bios_to_read);
>> +    INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
>>       while ((bio = bio_list_pop(&bio_list))) {
>> -        bio->bi_end_io = raid_rmw_end_io;
>> -
>> -        btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio,
>> BTRFS_WQ_ENDIO_RAID56);
>> -
>> +        bio->bi_end_io = raid56_bio_end_io;
>>           submit_bio(bio);
>>       }
>>       /* the actual write will happen once the reads are done */
>> @@ -2108,25 +2112,13 @@ static void __raid_recover_end_io(struct
>> btrfs_raid_bio *rbio)
>>   }
>>
>>   /*
>> - * This is called only for stripes we've read from disk to
>> - * reconstruct the parity.
>> + * This is called only for stripes we've read from disk to
>> reconstruct the
>> + * parity.
>>    */
>> -static void raid_recover_end_io(struct bio *bio)
>> +static void raid_recover_end_io_work(struct work_struct *work)
>>   {
>> -    struct btrfs_raid_bio *rbio = bio->bi_private;
>> -
>> -    /*
>> -     * we only read stripe pages off the disk, set them
>> -     * up to date if there were no errors
>> -     */
>> -    if (bio->bi_status)
>> -        fail_bio_stripe(rbio, bio);
>> -    else
>> -        set_bio_pages_uptodate(rbio, bio);
>> -    bio_put(bio);
>> -
>> -    if (!atomic_dec_and_test(&rbio->stripes_pending))
>> -        return;
>> +    struct btrfs_raid_bio *rbio =
>> +        container_of(work, struct btrfs_raid_bio, end_io_work);
>>
>>       if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
>>           rbio_orig_end_io(rbio, BLK_STS_IOERR);
>> @@ -2209,11 +2201,9 @@ static int __raid56_parity_recover(struct
>> btrfs_raid_bio *rbio)
>>        * touch it after that.
>>        */
>>       atomic_set(&rbio->stripes_pending, bios_to_read);
>> +    INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
>>       while ((bio = bio_list_pop(&bio_list))) {
>> -        bio->bi_end_io = raid_recover_end_io;
>> -
>> -        btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio,
>> BTRFS_WQ_ENDIO_RAID56);
>> -
>> +        bio->bi_end_io = raid56_bio_end_io;
>>           submit_bio(bio);
>>       }
>>
>> @@ -2582,8 +2572,7 @@ static noinline void finish_parity_scrub(struct
>> btrfs_raid_bio *rbio,
>>       atomic_set(&rbio->stripes_pending, nr_data);
>>
>>       while ((bio = bio_list_pop(&bio_list))) {
>> -        bio->bi_end_io = raid_write_end_io;
>> -
>> +        bio->bi_end_io = raid56_bio_end_io;
>>           submit_bio(bio);
>>       }
>>       return;
>> @@ -2671,24 +2660,14 @@ static void
>> validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
>>    * This will usually kick off finish_rmw once all the bios are read
>> in, but it
>>    * may trigger parity reconstruction if we had any errors along the way
>>    */
>> -static void raid56_parity_scrub_end_io(struct bio *bio)
>> +static void raid56_parity_scrub_end_io_work(struct work_struct *work)
>>   {
>> -    struct btrfs_raid_bio *rbio = bio->bi_private;
>> -
>> -    if (bio->bi_status)
>> -        fail_bio_stripe(rbio, bio);
>> -    else
>> -        set_bio_pages_uptodate(rbio, bio);
>> -
>> -    bio_put(bio);
>> -
>> -    if (!atomic_dec_and_test(&rbio->stripes_pending))
>> -        return;
>> +    struct btrfs_raid_bio *rbio =
>> +        container_of(work, struct btrfs_raid_bio, end_io_work);
>>
>>       /*
>> -     * this will normally call finish_rmw to start our write
>> -     * but if there are any failed stripes we'll reconstruct
>> -     * from parity first
>> +     * This will normally call finish_rmw to start our write, but if
>> there
>> +     * are any failed stripes we'll reconstruct from parity first
>>        */
>>       validate_rbio_for_parity_scrub(rbio);
>>   }
>> @@ -2758,11 +2737,9 @@ static void raid56_parity_scrub_stripe(struct
>> btrfs_raid_bio *rbio)
>>        * touch it after that.
>>        */
>>       atomic_set(&rbio->stripes_pending, bios_to_read);
>> +    INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
>>       while ((bio = bio_list_pop(&bio_list))) {
>> -        bio->bi_end_io = raid56_parity_scrub_end_io;
>> -
>> -        btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio,
>> BTRFS_WQ_ENDIO_RAID56);
>> -
>> +        bio->bi_end_io = raid56_bio_end_io;
>>           submit_bio(bio);
>>       }
>>       /* the actual write will happen once the reads are done */

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-05-01  4:53     ` Qu Wenruo
@ 2022-05-02 16:44       ` Christoph Hellwig
  2022-06-03 16:44       ` David Sterba
  1 sibling, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-05-02 16:44 UTC (permalink / raw)
  To: Qu Wenruo
  Cc: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo, linux-btrfs

On Sun, May 01, 2022 at 12:53:00PM +0800, Qu Wenruo wrote:
> > It looks like this patch is causing test failure in btrfs/027, at least
> > for subapge (64K page size, 4K sectorsize) cases.
> 
> Also confirmed the same hang on the same commit on x86_64 (4K page size
> 4K sectorsize).
> 
> Also 100% reproducible.

I'll take a look.  It turns out I never actually run that test as it
doesn't run without at least 5 devices in the scratch pool, and my btrfs
config uses 4.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-05-01  4:53     ` Qu Wenruo
  2022-05-02 16:44       ` Christoph Hellwig
@ 2022-06-03 16:44       ` David Sterba
  2022-06-03 16:45         ` David Sterba
  1 sibling, 1 reply; 19+ messages in thread
From: David Sterba @ 2022-06-03 16:44 UTC (permalink / raw)
  To: Qu Wenruo
  Cc: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo, linux-btrfs

On Sun, May 01, 2022 at 12:53:00PM +0800, Qu Wenruo wrote:
> On 2022/5/1 12:40, Qu Wenruo wrote:
> > On 2022/4/29 22:30, Christoph Hellwig wrote:
> >> Instead of attaching an extra allocation and an indirect call to each
> >> low-level bio issued by the RAID code, add a work_struct to struct
> >> btrfs_raid_bio and only defer the per-rbio completion action.  The
> >> per-bio actions for all the I/Os are trivial and can be safely done
> >> from interrupt context.
> >>
> >> As a nice side effect this also allows sharing the boilerplate code
> >> for the per-bio completions
> >>
> >> Signed-off-by: Christoph Hellwig <hch@lst.de>
> >
> > It looks like this patch is causing test failure in btrfs/027, at least
> > for subpage (64K page size, 4K sectorsize) cases.
> 
> Also confirmed the same hang on the same commit on x86_64 (4K page size
> 4K sectorsize).
> 
> Also 100% reproducible.

The test passes on my VM setup, the whole suite was run on the patchset
at least twice.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-06-03 16:44       ` David Sterba
@ 2022-06-03 16:45         ` David Sterba
  0 siblings, 0 replies; 19+ messages in thread
From: David Sterba @ 2022-06-03 16:45 UTC (permalink / raw)
  To: David Sterba
  Cc: Qu Wenruo, Christoph Hellwig, David Sterba, Josef Bacik,
	Qu Wenruo, linux-btrfs

On Fri, Jun 03, 2022 at 06:44:56PM +0200, David Sterba wrote:
> The test passes on my VM setup, the whole suite was run on the patchset
> at least twice.

Oh never mind, this is some old version.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 01/10] btrfs: move more work into btrfs_end_bioc
  2022-05-26  7:36 cleanup btrfs bio handling, part 2 v4 Christoph Hellwig
@ 2022-05-26  7:36 ` Christoph Hellwig
  0 siblings, 0 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-05-26  7:36 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs, Johannes Thumshirn

Assign ->mirror_num and ->bi_status in btrfs_end_bioc instead of
duplicating the logic in the callers.  Also remove the bio argument as
it always must be bioc->orig_bio and the now pointless bioc_error that
did nothing but assign bi_sector to the same value just sampled in the
caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/volumes.c | 72 ++++++++++++++--------------------------------
 1 file changed, 22 insertions(+), 50 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a2bb0928dc066..03d941c19f4cb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6626,19 +6626,29 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
 }
 
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
+static inline void btrfs_end_bioc(struct btrfs_io_context *bioc)
 {
-	bio->bi_private = bioc->private;
-	bio->bi_end_io = bioc->end_io;
-	bio_endio(bio);
+	struct bio *orig_bio = bioc->orig_bio;
 
+	btrfs_bio(orig_bio)->mirror_num = bioc->mirror_num;
+	orig_bio->bi_private = bioc->private;
+	orig_bio->bi_end_io = bioc->end_io;
+
+	/*
+	 * Only send an error to the higher layers if it is beyond the tolerance
+	 * threshold.
+	 */
+	if (atomic_read(&bioc->error) > bioc->max_errors)
+		orig_bio->bi_status = BLK_STS_IOERR;
+	else
+		orig_bio->bi_status = BLK_STS_OK;
+	bio_endio(orig_bio);
 	btrfs_put_bioc(bioc);
 }
 
 static void btrfs_end_bio(struct bio *bio)
 {
 	struct btrfs_io_context *bioc = bio->bi_private;
-	int is_orig_bio = 0;
 
 	if (bio->bi_status) {
 		atomic_inc(&bioc->error);
@@ -6659,35 +6669,12 @@ static void btrfs_end_bio(struct bio *bio)
 		}
 	}
 
-	if (bio == bioc->orig_bio)
-		is_orig_bio = 1;
+	if (bio != bioc->orig_bio)
+		bio_put(bio);
 
 	btrfs_bio_counter_dec(bioc->fs_info);
-
-	if (atomic_dec_and_test(&bioc->stripes_pending)) {
-		if (!is_orig_bio) {
-			bio_put(bio);
-			bio = bioc->orig_bio;
-		}
-
-		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
-		/* only send an error to the higher layers if it is
-		 * beyond the tolerance of the btrfs bio
-		 */
-		if (atomic_read(&bioc->error) > bioc->max_errors) {
-			bio->bi_status = BLK_STS_IOERR;
-		} else {
-			/*
-			 * this bio is actually up to date, we didn't
-			 * go over the max number of errors
-			 */
-			bio->bi_status = BLK_STS_OK;
-		}
-
-		btrfs_end_bioc(bioc, bio);
-	} else if (!is_orig_bio) {
-		bio_put(bio);
-	}
+	if (atomic_dec_and_test(&bioc->stripes_pending))
+		btrfs_end_bioc(bioc);
 }
 
 static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
@@ -6725,23 +6712,6 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
 	submit_bio(bio);
 }
 
-static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
-{
-	atomic_inc(&bioc->error);
-	if (atomic_dec_and_test(&bioc->stripes_pending)) {
-		/* Should be the original bio. */
-		WARN_ON(bio != bioc->orig_bio);
-
-		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
-		bio->bi_iter.bi_sector = logical >> 9;
-		if (atomic_read(&bioc->error) > bioc->max_errors)
-			bio->bi_status = BLK_STS_IOERR;
-		else
-			bio->bi_status = BLK_STS_OK;
-		btrfs_end_bioc(bioc, bio);
-	}
-}
-
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			   int mirror_num)
 {
@@ -6800,7 +6770,9 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 						   &dev->dev_state) ||
 		    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
-			bioc_error(bioc, first_bio, logical);
+			atomic_inc(&bioc->error);
+			if (atomic_dec_and_test(&bioc->stripes_pending))
+				btrfs_end_bioc(bioc);
 			continue;
 		}
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 01/10] btrfs: move more work into btrfs_end_bioc
  2022-04-25  7:54 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
@ 2022-04-26  7:19   ` Johannes Thumshirn
  0 siblings, 0 replies; 19+ messages in thread
From: Johannes Thumshirn @ 2022-04-26  7:19 UTC (permalink / raw)
  To: Christoph Hellwig, Josef Bacik, David Sterba, Qu Wenruo
  Cc: Naohiro Aota, linux-btrfs

Meh I guess I need to rework some of my RAID stuff now.
Anyways, the end result looks way better with this.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 01/10] btrfs: move more work into btrfs_end_bioc
  2022-04-25  7:54 cleanup btrfs bio handling, part 2 Christoph Hellwig
@ 2022-04-25  7:54 ` Christoph Hellwig
  2022-04-26  7:19   ` Johannes Thumshirn
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2022-04-25  7:54 UTC (permalink / raw)
  To: Josef Bacik, David Sterba, Qu Wenruo; +Cc: Naohiro Aota, linux-btrfs

Assign ->mirror_num and ->bi_status in btrfs_end_bioc instead of
duplicating the logic in the callers.  Also remove the bio argument as
it always must be bioc->orig_bio and the now pointless bioc_error that
did nothing but assign bi_sector to the same value just sampled in the
caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/volumes.c | 72 ++++++++++++++--------------------------------
 1 file changed, 22 insertions(+), 50 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 748614b00ffa2..aeacb87457687 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6621,19 +6621,29 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
 }
 
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
+static inline void btrfs_end_bioc(struct btrfs_io_context *bioc)
 {
-	bio->bi_private = bioc->private;
-	bio->bi_end_io = bioc->end_io;
-	bio_endio(bio);
+	struct bio *orig_bio = bioc->orig_bio;
 
+	btrfs_bio(orig_bio)->mirror_num = bioc->mirror_num;
+	orig_bio->bi_private = bioc->private;
+	orig_bio->bi_end_io = bioc->end_io;
+
+	/*
+	 * Only send an error to the higher layers if it is beyond the tolerance
+	 * threshold.
+	 */
+	if (atomic_read(&bioc->error) > bioc->max_errors)
+		orig_bio->bi_status = BLK_STS_IOERR;
+	else
+		orig_bio->bi_status = BLK_STS_OK;
+	bio_endio(orig_bio);
 	btrfs_put_bioc(bioc);
 }
 
 static void btrfs_end_bio(struct bio *bio)
 {
 	struct btrfs_io_context *bioc = bio->bi_private;
-	int is_orig_bio = 0;
 
 	if (bio->bi_status) {
 		atomic_inc(&bioc->error);
@@ -6654,35 +6664,12 @@ static void btrfs_end_bio(struct bio *bio)
 		}
 	}
 
-	if (bio == bioc->orig_bio)
-		is_orig_bio = 1;
+	if (bio != bioc->orig_bio)
+		bio_put(bio);
 
 	btrfs_bio_counter_dec(bioc->fs_info);
-
-	if (atomic_dec_and_test(&bioc->stripes_pending)) {
-		if (!is_orig_bio) {
-			bio_put(bio);
-			bio = bioc->orig_bio;
-		}
-
-		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
-		/* only send an error to the higher layers if it is
-		 * beyond the tolerance of the btrfs bio
-		 */
-		if (atomic_read(&bioc->error) > bioc->max_errors) {
-			bio->bi_status = BLK_STS_IOERR;
-		} else {
-			/*
-			 * this bio is actually up to date, we didn't
-			 * go over the max number of errors
-			 */
-			bio->bi_status = BLK_STS_OK;
-		}
-
-		btrfs_end_bioc(bioc, bio);
-	} else if (!is_orig_bio) {
-		bio_put(bio);
-	}
+	if (atomic_dec_and_test(&bioc->stripes_pending))
+		btrfs_end_bioc(bioc);
 }
 
 static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
@@ -6720,23 +6707,6 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
 	submit_bio(bio);
 }
 
-static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
-{
-	atomic_inc(&bioc->error);
-	if (atomic_dec_and_test(&bioc->stripes_pending)) {
-		/* Should be the original bio. */
-		WARN_ON(bio != bioc->orig_bio);
-
-		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
-		bio->bi_iter.bi_sector = logical >> 9;
-		if (atomic_read(&bioc->error) > bioc->max_errors)
-			bio->bi_status = BLK_STS_IOERR;
-		else
-			bio->bi_status = BLK_STS_OK;
-		btrfs_end_bioc(bioc, bio);
-	}
-}
-
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			   int mirror_num)
 {
@@ -6795,7 +6765,9 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 						   &dev->dev_state) ||
 		    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
-			bioc_error(bioc, first_bio, logical);
+			atomic_inc(&bioc->error);
+			if (atomic_dec_and_test(&bioc->stripes_pending))
+				btrfs_end_bioc(bioc);
 			continue;
 		}
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2022-06-03 16:50 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
2022-04-29 14:30 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
2022-04-29 14:30 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
2022-04-29 14:30 ` [PATCH 03/10] btrfs: split btrfs_submit_data_bio Christoph Hellwig
2022-04-29 14:30 ` [PATCH 04/10] btrfs: don't double-defer bio completions for compressed reads Christoph Hellwig
2022-04-29 14:30 ` [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio Christoph Hellwig
2022-05-01  4:40   ` Qu Wenruo
2022-05-01  4:53     ` Qu Wenruo
2022-05-02 16:44       ` Christoph Hellwig
2022-06-03 16:44       ` David Sterba
2022-06-03 16:45         ` David Sterba
2022-04-29 14:30 ` [PATCH 06/10] btrfs: don't use btrfs_bio_wq_end_io for compressed writes Christoph Hellwig
2022-04-29 14:30 ` [PATCH 07/10] btrfs: centralize setting REQ_META Christoph Hellwig
2022-04-29 14:30 ` [PATCH 08/10] btrfs: remove btrfs_end_io_wq Christoph Hellwig
2022-04-29 14:30 ` [PATCH 09/10] btrfs: refactor btrfs_map_bio Christoph Hellwig
2022-04-29 14:30 ` [PATCH 10/10] btrfs: do not allocate a btrfs_bio for low-level bios Christoph Hellwig
  -- strict thread matches above, loose matches on Subject: below --
2022-05-26  7:36 cleanup btrfs bio handling, part 2 v4 Christoph Hellwig
2022-05-26  7:36 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
2022-04-25  7:54 cleanup btrfs bio handling, part 2 Christoph Hellwig
2022-04-25  7:54 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
2022-04-26  7:19   ` Johannes Thumshirn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.