All of lore.kernel.org
 help / color / mirror / Atom feed
* cleanup btrfs bio handling, part 2 v2
@ 2022-04-29 14:30 Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
                   ` (9 more replies)
  0 siblings, 10 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Hi all,

this series removes the need to allocate a separate object for I/O
completions for all read and some write I/Os, and reduced the memory
usage of the low-level bios cloned by btrfs_map_bio by using plain bios
instead of the much larger btrfs_bio.

Changes since v1:
 - rebased to the latests misc-next tree
 - don't call destroy_workqueue on NULL workqueues
 - drop a marginal and slightly controversial cleanup to btrfs_map_bio

Diffstat:
 compression.c |   41 +++++-------
 compression.h |    7 +-
 ctree.h       |   14 ++--
 disk-io.c     |  148 +++++-----------------------------------------
 disk-io.h     |   11 ---
 extent_io.c   |   33 +++-------
 extent_io.h   |    1 
 inode.c       |  162 +++++++++++++++++++--------------------------------
 raid56.c      |  111 +++++++++++++---------------------
 super.c       |   13 ----
 volumes.c     |  184 +++++++++++++++++++++++++++-------------------------------
 volumes.h     |    8 ++
 12 files changed, 262 insertions(+), 471 deletions(-)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 01/10] btrfs: move more work into btrfs_end_bioc
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs, Johannes Thumshirn

Assign ->mirror_num and ->bi_status in btrfs_end_bioc instead of
duplicating the logic in the callers.  Also remove the bio argument as
it always must be bioc->orig_bio and the now pointless bioc_error that
did nothing but assign bi_sector to the same value just sampled in the
caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/volumes.c | 72 ++++++++++++++--------------------------------
 1 file changed, 22 insertions(+), 50 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 748614b00ffa2..aeacb87457687 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6621,19 +6621,29 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
 }
 
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
+static inline void btrfs_end_bioc(struct btrfs_io_context *bioc)
 {
-	bio->bi_private = bioc->private;
-	bio->bi_end_io = bioc->end_io;
-	bio_endio(bio);
+	struct bio *orig_bio = bioc->orig_bio;
 
+	btrfs_bio(orig_bio)->mirror_num = bioc->mirror_num;
+	orig_bio->bi_private = bioc->private;
+	orig_bio->bi_end_io = bioc->end_io;
+
+	/*
+	 * Only send an error to the higher layers if it is beyond the tolerance
+	 * threshold.
+	 */
+	if (atomic_read(&bioc->error) > bioc->max_errors)
+		orig_bio->bi_status = BLK_STS_IOERR;
+	else
+		orig_bio->bi_status = BLK_STS_OK;
+	bio_endio(orig_bio);
 	btrfs_put_bioc(bioc);
 }
 
 static void btrfs_end_bio(struct bio *bio)
 {
 	struct btrfs_io_context *bioc = bio->bi_private;
-	int is_orig_bio = 0;
 
 	if (bio->bi_status) {
 		atomic_inc(&bioc->error);
@@ -6654,35 +6664,12 @@ static void btrfs_end_bio(struct bio *bio)
 		}
 	}
 
-	if (bio == bioc->orig_bio)
-		is_orig_bio = 1;
+	if (bio != bioc->orig_bio)
+		bio_put(bio);
 
 	btrfs_bio_counter_dec(bioc->fs_info);
-
-	if (atomic_dec_and_test(&bioc->stripes_pending)) {
-		if (!is_orig_bio) {
-			bio_put(bio);
-			bio = bioc->orig_bio;
-		}
-
-		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
-		/* only send an error to the higher layers if it is
-		 * beyond the tolerance of the btrfs bio
-		 */
-		if (atomic_read(&bioc->error) > bioc->max_errors) {
-			bio->bi_status = BLK_STS_IOERR;
-		} else {
-			/*
-			 * this bio is actually up to date, we didn't
-			 * go over the max number of errors
-			 */
-			bio->bi_status = BLK_STS_OK;
-		}
-
-		btrfs_end_bioc(bioc, bio);
-	} else if (!is_orig_bio) {
-		bio_put(bio);
-	}
+	if (atomic_dec_and_test(&bioc->stripes_pending))
+		btrfs_end_bioc(bioc);
 }
 
 static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
@@ -6720,23 +6707,6 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
 	submit_bio(bio);
 }
 
-static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
-{
-	atomic_inc(&bioc->error);
-	if (atomic_dec_and_test(&bioc->stripes_pending)) {
-		/* Should be the original bio. */
-		WARN_ON(bio != bioc->orig_bio);
-
-		btrfs_bio(bio)->mirror_num = bioc->mirror_num;
-		bio->bi_iter.bi_sector = logical >> 9;
-		if (atomic_read(&bioc->error) > bioc->max_errors)
-			bio->bi_status = BLK_STS_IOERR;
-		else
-			bio->bi_status = BLK_STS_OK;
-		btrfs_end_bioc(bioc, bio);
-	}
-}
-
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			   int mirror_num)
 {
@@ -6795,7 +6765,9 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 						   &dev->dev_state) ||
 		    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
-			bioc_error(bioc, first_bio, logical);
+			atomic_inc(&bioc->error);
+			if (atomic_dec_and_test(&bioc->stripes_pending))
+				btrfs_end_bioc(bioc);
 			continue;
 		}
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 03/10] btrfs: split btrfs_submit_data_bio Christoph Hellwig
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs, Johannes Thumshirn

Remove the pointless goto just to return err and clean up the code flow
to be a little more straight forward.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/inode.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b7df1c0ee5ee..3be52ed6ed196 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7899,31 +7899,28 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 	bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
 	blk_status_t ret;
 
-	/* Check btrfs_submit_bio_hook() for rules about async submit. */
-	if (async_submit)
-		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
 	if (!write) {
 		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 		if (ret)
-			goto err;
+			return ret;
 	}
 
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		goto map;
 
-	if (write && async_submit) {
-		ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
-					  btrfs_submit_bio_start_direct_io);
-		goto err;
-	} else if (write) {
+	if (write) {
+		/* check btrfs_submit_data_bio() for async submit rules */
+		if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
+			return btrfs_wq_submit_bio(inode, bio, 0, 0,
+					file_offset,
+					btrfs_submit_bio_start_direct_io);
 		/*
 		 * If we aren't doing async submit, calculate the csum of the
 		 * bio now.
 		 */
 		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
 		if (ret)
-			goto err;
+			return ret;
 	} else {
 		u64 csum_offset;
 
@@ -7933,9 +7930,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 		btrfs_bio(bio)->csum = dip->csums + csum_offset;
 	}
 map:
-	ret = btrfs_map_bio(fs_info, bio, 0);
-err:
-	return ret;
+	return btrfs_map_bio(fs_info, bio, 0);
 }
 
 /*
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 03/10] btrfs: split btrfs_submit_data_bio
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 04/10] btrfs: don't double-defer bio completions for compressed reads Christoph Hellwig
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Split btrfs_submit_data_bio into one helper for reads and one for writes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/ctree.h     |   6 +-
 fs/btrfs/extent_io.c |  12 ++--
 fs/btrfs/inode.c     | 130 ++++++++++++++++++++-----------------------
 3 files changed, 73 insertions(+), 75 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 371f3f6c4ea47..40a6f61559348 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3252,8 +3252,10 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
 u64 btrfs_file_extent_end(const struct btrfs_path *path);
 
 /* inode.c */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
-			   int mirror_num, unsigned long bio_flags);
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio,
+		int mirror_num, unsigned long bio_flags);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+		int mirror_num, unsigned long bio_flags);
 unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
 				    u32 bio_offset, struct page *page,
 				    u64 start, u64 end);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f9d6dd310c42b..80b4482c477c6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -186,11 +186,15 @@ static void submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_fl
 	/* Caller should ensure the bio has at least some range added */
 	ASSERT(bio->bi_iter.bi_size);
 
-	if (is_data_inode(tree->private_data))
-		btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+	if (!is_data_inode(tree->private_data))
+		btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num);
+	else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+		btrfs_submit_data_write_bio(tree->private_data, bio, mirror_num,
 					    bio_flags);
 	else
-		btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num);
+		btrfs_submit_data_read_bio(tree->private_data, bio, mirror_num,
+					    bio_flags);
+
 	/*
 	 * Above submission hooks will handle the error by ending the bio,
 	 * which will do the cleanup properly.  So here we should not return
@@ -2773,7 +2777,7 @@ static blk_status_t submit_data_read_repair(struct inode *inode,
 		ret = btrfs_repair_one_sector(inode, failed_bio,
 				bio_offset + offset,
 				page, pgoff + offset, start + offset,
-				failed_mirror, btrfs_submit_data_bio);
+				failed_mirror, btrfs_submit_data_read_bio);
 		if (!ret) {
 			/*
 			 * We have submitted the read repair, the page release
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3be52ed6ed196..6476a5d9c09e9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2552,90 +2552,82 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
 	return errno_to_blk_status(ret);
 }
 
-/*
- * extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read.
- *
- * Rules about async/sync submit,
- * a) read:				sync submit
- *
- * b) write without checksum:		sync submit
- *
- * c) write with checksum:
- *    c-1) if bio is issued by fsync:	sync submit
- *         (sync_writers != 0)
- *
- *    c-2) if root is reloc root:	sync submit
- *         (only in case of buffered IO)
- *
- *    c-3) otherwise:			async submit
- */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio,
 			   int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
-	blk_status_t ret = 0;
-	int skip_sum;
-	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
-	skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
-		test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
-
-	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
-		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
+	struct btrfs_inode *bi = BTRFS_I(inode);
+	blk_status_t ret;
 
 	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
-		struct page *page = bio_first_bvec_all(bio)->bv_page;
-		loff_t file_offset = page_offset(page);
-
-		ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
+		ret = extract_ordered_extent(bi, bio,
+				page_offset(bio_first_bvec_all(bio)->bv_page));
 		if (ret)
 			goto out;
 	}
 
-	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
-		ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
-		if (ret)
-			goto out;
-
-		if (bio_flags & EXTENT_BIO_COMPRESSED) {
-			/*
-			 * btrfs_submit_compressed_read will handle completing
-			 * the bio if there were any errors, so just return
-			 * here.
-			 */
-			btrfs_submit_compressed_read(inode, bio, mirror_num);
-			return;
-		} else {
-			/*
-			 * Lookup bio sums does extra checks around whether we
-			 * need to csum or not, which is why we ignore skip_sum
-			 * here.
-			 */
-			ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+	/*
+	 * Rules for async/sync submit:
+	 *   a) write without checksum:			sync submit
+	 *   b) write with checksum:
+	 *      b-1) if bio is issued by fsync:		sync submit
+	 *           (sync_writers != 0)
+	 *      b-2) if root is reloc root:		sync submit
+	 *           (only in case of buffered IO)
+	 *      b-3) otherwise:				async submit
+	 */
+	if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+	    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
+		if (atomic_read(&bi->sync_writers)) {
+			ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
 			if (ret)
 				goto out;
+		} else if (btrfs_is_data_reloc_root(bi->root)) {
+			; /* csum items have already been cloned */
+		} else {
+			ret = btrfs_wq_submit_bio(inode, bio,
+					mirror_num, bio_flags, 0,
+					btrfs_submit_bio_start);
+			goto out;
 		}
-		goto mapit;
-	} else if (async && !skip_sum) {
-		/* csum items have already been cloned */
-		if (btrfs_is_data_reloc_root(root))
-			goto mapit;
-		/* we're doing a write, do the async checksumming */
-		ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
-					  0, btrfs_submit_bio_start);
+	}
+	ret = btrfs_map_bio(fs_info, bio, mirror_num);
+out:
+	if (ret) {
+		bio->bi_status = ret;
+		bio_endio(bio);
+	}
+}
+
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+			   int mirror_num, unsigned long bio_flags)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	blk_status_t ret;
+
+	ret = btrfs_bio_wq_end_io(fs_info, bio,
+			btrfs_is_free_space_inode(BTRFS_I(inode)) ?
+			BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
+	if (ret)
 		goto out;
-	} else if (!skip_sum) {
-		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
-		if (ret)
-			goto out;
+
+	if (bio_flags & EXTENT_BIO_COMPRESSED) {
+		/*
+		 * btrfs_submit_compressed_read will handle completing the bio
+		 * if there were any errors, so just return here.
+		 */
+		btrfs_submit_compressed_read(inode, bio, mirror_num);
+		return;
 	}
 
-mapit:
+	/*
+	 * Lookup bio sums does extra checks around whether we need to csum or
+	 * not, which is why we ignore skip_sum here.
+	 */
+	ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+	if (ret)
+		goto out;
 	ret = btrfs_map_bio(fs_info, bio, mirror_num);
-
 out:
 	if (ret) {
 		bio->bi_status = ret;
@@ -7909,7 +7901,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 		goto map;
 
 	if (write) {
-		/* check btrfs_submit_data_bio() for async submit rules */
+		/* check btrfs_submit_data_write_bio() for async submit rules */
 		if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
 			return btrfs_wq_submit_bio(inode, bio, 0, 0,
 					file_offset,
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 04/10] btrfs: don't double-defer bio completions for compressed reads
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 03/10] btrfs: split btrfs_submit_data_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio Christoph Hellwig
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

The bio completion handler of the bio used for the compressed data is
already run in a workqueue using btrfs_bio_wq_end_io, so don't schedule
the completion of the original bio to the same workqueue again but just
execute it directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/inode.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6476a5d9c09e9..02c4cbd557dcd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2605,12 +2605,6 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	blk_status_t ret;
 
-	ret = btrfs_bio_wq_end_io(fs_info, bio,
-			btrfs_is_free_space_inode(BTRFS_I(inode)) ?
-			BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
-	if (ret)
-		goto out;
-
 	if (bio_flags & EXTENT_BIO_COMPRESSED) {
 		/*
 		 * btrfs_submit_compressed_read will handle completing the bio
@@ -2620,6 +2614,12 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
 		return;
 	}
 
+	ret = btrfs_bio_wq_end_io(fs_info, bio,
+			btrfs_is_free_space_inode(BTRFS_I(inode)) ?
+			BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
+	if (ret)
+		goto out;
+
 	/*
 	 * Lookup bio sums does extra checks around whether we need to csum or
 	 * not, which is why we ignore skip_sum here.
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (3 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 04/10] btrfs: don't double-defer bio completions for compressed reads Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-05-01  4:40   ` Qu Wenruo
  2022-04-29 14:30 ` [PATCH 06/10] btrfs: don't use btrfs_bio_wq_end_io for compressed writes Christoph Hellwig
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Instead of attaching a an extra allocation an indirect call to each
low-level bio issued by the RAID code, add a work_struct to struct
btrfs_raid_bio and only defer the per-rbio completion action.  The
per-bio action for all the I/Os are trivial and can be safely done
from interrupt context.

As a nice side effect this also allows sharing the boilerplate code
for the per-bio completions

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/ctree.h   |   2 +-
 fs/btrfs/disk-io.c |  12 ++---
 fs/btrfs/disk-io.h |   1 -
 fs/btrfs/raid56.c  | 111 ++++++++++++++++++---------------------------
 4 files changed, 49 insertions(+), 77 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 40a6f61559348..4dd0d4a2e7757 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -853,7 +853,7 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *flush_workers;
 	struct btrfs_workqueue *endio_workers;
 	struct btrfs_workqueue *endio_meta_workers;
-	struct btrfs_workqueue *endio_raid56_workers;
+	struct workqueue_struct *endio_raid56_workers;
 	struct workqueue_struct *rmw_workers;
 	struct btrfs_workqueue *endio_meta_write_workers;
 	struct btrfs_workqueue *endio_write_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 73e12ecc81be1..3c6137734d28c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -753,14 +753,10 @@ static void end_workqueue_bio(struct bio *bio)
 			wq = fs_info->endio_meta_write_workers;
 		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
 			wq = fs_info->endio_freespace_worker;
-		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
-			wq = fs_info->endio_raid56_workers;
 		else
 			wq = fs_info->endio_write_workers;
 	} else {
-		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
-			wq = fs_info->endio_raid56_workers;
-		else if (end_io_wq->metadata)
+		if (end_io_wq->metadata)
 			wq = fs_info->endio_meta_workers;
 		else
 			wq = fs_info->endio_workers;
@@ -2274,7 +2270,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	btrfs_destroy_workqueue(fs_info->hipri_workers);
 	btrfs_destroy_workqueue(fs_info->workers);
 	btrfs_destroy_workqueue(fs_info->endio_workers);
-	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+	if (fs_info->endio_raid56_workers)
+		destroy_workqueue(fs_info->endio_raid56_workers);
 	if (fs_info->rmw_workers)
 		destroy_workqueue(fs_info->rmw_workers);
 	btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2477,8 +2474,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 		btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
 				      max_active, 2);
 	fs_info->endio_raid56_workers =
-		btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
-				      max_active, 4);
+		alloc_workqueue("btrfs-endio-raid56", flags, max_active);
 	fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
 	fs_info->endio_write_workers =
 		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 9340e3266e0ac..97255e3d7e524 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -21,7 +21,6 @@ enum btrfs_wq_endio_type {
 	BTRFS_WQ_ENDIO_DATA,
 	BTRFS_WQ_ENDIO_METADATA,
 	BTRFS_WQ_ENDIO_FREE_SPACE,
-	BTRFS_WQ_ENDIO_RAID56,
 };
 
 static inline u64 btrfs_sb_offset(int mirror)
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a5b623ee6facd..1a3c1a9b10d0b 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -164,6 +164,9 @@ struct btrfs_raid_bio {
 	atomic_t stripes_pending;
 
 	atomic_t error;
+
+	struct work_struct end_io_work;
+
 	/*
 	 * these are two arrays of pointers.  We allocate the
 	 * rbio big enough to hold them both and setup their
@@ -1552,15 +1555,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
 	}
 }
 
-/*
- * end io for the read phase of the rmw cycle.  All the bios here are physical
- * stripe bios we've read from the disk so we can recalculate the parity of the
- * stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid_rmw_end_io(struct bio *bio)
+static void raid56_bio_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
 
@@ -1571,23 +1566,34 @@ static void raid_rmw_end_io(struct bio *bio)
 
 	bio_put(bio);
 
-	if (!atomic_dec_and_test(&rbio->stripes_pending))
-		return;
+	if (atomic_dec_and_test(&rbio->stripes_pending))
+		queue_work(rbio->bioc->fs_info->endio_raid56_workers,
+			   &rbio->end_io_work);
+}
 
-	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
-		goto cleanup;
+/*
+ * End io handler for the read phase of the rmw cycle.  All the bios here are
+ * physical stripe bios we've read from the disk so we can recalculate the
+ * parity of the stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_rmw_end_io_work(struct work_struct *work)
+{
+	struct btrfs_raid_bio *rbio =
+		container_of(work, struct btrfs_raid_bio, end_io_work);
+
+	if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
+		rbio_orig_end_io(rbio, BLK_STS_IOERR);
+		return;
+	}
 
 	/*
-	 * this will normally call finish_rmw to start our write
-	 * but if there are any failed stripes we'll reconstruct
-	 * from parity first
+	 * This will normally call finish_rmw to start our write but if there
+	 * are any failed stripes we'll reconstruct from parity first.
 	 */
 	validate_rbio_for_rmw(rbio);
-	return;
-
-cleanup:
-
-	rbio_orig_end_io(rbio, BLK_STS_IOERR);
 }
 
 /*
@@ -1662,11 +1668,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 	 * touch it after that.
 	 */
 	atomic_set(&rbio->stripes_pending, bios_to_read);
+	INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
 	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid_rmw_end_io;
-
-		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
-
+		bio->bi_end_io = raid56_bio_end_io;
 		submit_bio(bio);
 	}
 	/* the actual write will happen once the reads are done */
@@ -2108,25 +2112,13 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 }
 
 /*
- * This is called only for stripes we've read from disk to
- * reconstruct the parity.
+ * This is called only for stripes we've read from disk to reconstruct the
+ * parity.
  */
-static void raid_recover_end_io(struct bio *bio)
+static void raid_recover_end_io_work(struct work_struct *work)
 {
-	struct btrfs_raid_bio *rbio = bio->bi_private;
-
-	/*
-	 * we only read stripe pages off the disk, set them
-	 * up to date if there were no errors
-	 */
-	if (bio->bi_status)
-		fail_bio_stripe(rbio, bio);
-	else
-		set_bio_pages_uptodate(rbio, bio);
-	bio_put(bio);
-
-	if (!atomic_dec_and_test(&rbio->stripes_pending))
-		return;
+	struct btrfs_raid_bio *rbio =
+		container_of(work, struct btrfs_raid_bio, end_io_work);
 
 	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
 		rbio_orig_end_io(rbio, BLK_STS_IOERR);
@@ -2209,11 +2201,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 	 * touch it after that.
 	 */
 	atomic_set(&rbio->stripes_pending, bios_to_read);
+	INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
 	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid_recover_end_io;
-
-		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
-
+		bio->bi_end_io = raid56_bio_end_io;
 		submit_bio(bio);
 	}
 
@@ -2582,8 +2572,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 	atomic_set(&rbio->stripes_pending, nr_data);
 
 	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid_write_end_io;
-
+		bio->bi_end_io = raid56_bio_end_io;
 		submit_bio(bio);
 	}
 	return;
@@ -2671,24 +2660,14 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
  * This will usually kick off finish_rmw once all the bios are read in, but it
  * may trigger parity reconstruction if we had any errors along the way
  */
-static void raid56_parity_scrub_end_io(struct bio *bio)
+static void raid56_parity_scrub_end_io_work(struct work_struct *work)
 {
-	struct btrfs_raid_bio *rbio = bio->bi_private;
-
-	if (bio->bi_status)
-		fail_bio_stripe(rbio, bio);
-	else
-		set_bio_pages_uptodate(rbio, bio);
-
-	bio_put(bio);
-
-	if (!atomic_dec_and_test(&rbio->stripes_pending))
-		return;
+	struct btrfs_raid_bio *rbio =
+		container_of(work, struct btrfs_raid_bio, end_io_work);
 
 	/*
-	 * this will normally call finish_rmw to start our write
-	 * but if there are any failed stripes we'll reconstruct
-	 * from parity first
+	 * This will normally call finish_rmw to start our write, but if there
+	 * are any failed stripes we'll reconstruct from parity first
 	 */
 	validate_rbio_for_parity_scrub(rbio);
 }
@@ -2758,11 +2737,9 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
 	 * touch it after that.
 	 */
 	atomic_set(&rbio->stripes_pending, bios_to_read);
+	INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
 	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid56_parity_scrub_end_io;
-
-		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
-
+		bio->bi_end_io = raid56_bio_end_io;
 		submit_bio(bio);
 	}
 	/* the actual write will happen once the reads are done */
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 06/10] btrfs: don't use btrfs_bio_wq_end_io for compressed writes
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (4 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 07/10] btrfs: centralize setting REQ_META Christoph Hellwig
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Compressed write bio completion is the only user of btrfs_bio_wq_end_io
for writes, and the use of btrfs_bio_wq_end_io is a little suboptimal
here as we only real need user context for the final completion of a
compressed_bio structure, and not every single bio completion.

Add a work_struct to struct compressed_bio instead and use that to call
finish_compressed_bio_write.  This allows to remove all handling of
write bios in the btrfs_bio_wq_end_io infrastructure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/compression.c | 45 +++++++++++++++++++++---------------------
 fs/btrfs/compression.h |  7 +++++--
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/disk-io.c     | 30 +++++++++++-----------------
 fs/btrfs/super.c       |  2 --
 5 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f4564f32f6d93..9d5986a30a4a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -403,6 +403,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
 	kfree(cb);
 }
 
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
+{
+	struct compressed_bio *cb =
+		container_of(work, struct compressed_bio, write_end_work);
+
+	finish_compressed_bio_write(cb);
+}
+
 /*
  * Do the cleanup once all the compressed pages hit the disk.  This will clear
  * writeback on the file pages and free the compressed pages.
@@ -414,29 +422,16 @@ static void end_compressed_bio_write(struct bio *bio)
 {
 	struct compressed_bio *cb = bio->bi_private;
 
-	if (!dec_and_test_compressed_bio(cb, bio))
-		goto out;
-
-	btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+	if (dec_and_test_compressed_bio(cb, bio)) {
+		struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
 
-	finish_compressed_bio_write(cb);
-out:
+		btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+		queue_work(fs_info->compressed_write_workers,
+			   &cb->write_end_work);
+	}
 	bio_put(bio);
 }
 
-static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
-					  struct bio *bio, int mirror_num)
-{
-	blk_status_t ret;
-
-	ASSERT(bio->bi_iter.bi_size);
-	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
-	if (ret)
-		return ret;
-	ret = btrfs_map_bio(fs_info, bio, mirror_num);
-	return ret;
-}
-
 /*
  * Allocate a compressed_bio, which will be used to read/write on-disk
  * (aka, compressed) * data.
@@ -533,7 +528,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 	cb->compressed_pages = compressed_pages;
 	cb->compressed_len = compressed_len;
 	cb->writeback = writeback;
-	cb->orig_bio = NULL;
+	INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
 	cb->nr_pages = nr_pages;
 
 	if (blkcg_css)
@@ -603,7 +598,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 					goto finish_cb;
 			}
 
-			ret = submit_compressed_bio(fs_info, bio, 0);
+			ASSERT(bio->bi_iter.bi_size);
+			ret = btrfs_map_bio(fs_info, bio, 0);
 			if (ret)
 				goto finish_cb;
 			bio = NULL;
@@ -941,7 +937,12 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 						  fs_info->sectorsize);
 			sums += fs_info->csum_size * nr_sectors;
 
-			ret = submit_compressed_bio(fs_info, comp_bio, mirror_num);
+			ASSERT(comp_bio->bi_iter.bi_size);
+			ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
+						  BTRFS_WQ_ENDIO_DATA);
+			if (ret)
+				goto finish_cb;
+			ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
 			if (ret)
 				goto finish_cb;
 			comp_bio = NULL;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 2707404389a5d..4a40725cbf1db 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -61,8 +61,11 @@ struct compressed_bio {
 	blk_status_t status;
 	int mirror_num;
 
-	/* for reads, this is the bio we are copying the data into */
-	struct bio *orig_bio;
+	union {
+		/* for reads, this is the bio we are copying the data into */
+		struct bio *orig_bio;
+		struct work_struct write_end_work;
+	};
 
 	/*
 	 * the start of a variable length array of checksums only
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4dd0d4a2e7757..5ae482632327f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -855,7 +855,7 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *endio_meta_workers;
 	struct workqueue_struct *endio_raid56_workers;
 	struct workqueue_struct *rmw_workers;
-	struct btrfs_workqueue *endio_meta_write_workers;
+	struct workqueue_struct *compressed_write_workers;
 	struct btrfs_workqueue *endio_write_workers;
 	struct btrfs_workqueue *endio_freespace_worker;
 	struct btrfs_workqueue *caching_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3c6137734d28c..744a74203776e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -748,19 +748,10 @@ static void end_workqueue_bio(struct bio *bio)
 	fs_info = end_io_wq->info;
 	end_io_wq->status = bio->bi_status;
 
-	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
-		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
-			wq = fs_info->endio_meta_write_workers;
-		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
-			wq = fs_info->endio_freespace_worker;
-		else
-			wq = fs_info->endio_write_workers;
-	} else {
-		if (end_io_wq->metadata)
-			wq = fs_info->endio_meta_workers;
-		else
-			wq = fs_info->endio_workers;
-	}
+	if (end_io_wq->metadata)
+		wq = fs_info->endio_meta_workers;
+	else
+		wq = fs_info->endio_workers;
 
 	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
 	btrfs_queue_work(wq, &end_io_wq->work);
@@ -771,6 +762,9 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 {
 	struct btrfs_end_io_wq *end_io_wq;
 
+	if (WARN_ON_ONCE(btrfs_op(bio) != BTRFS_MAP_WRITE))
+		return BLK_STS_IOERR;
+
 	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
 	if (!end_io_wq)
 		return BLK_STS_RESOURCE;
@@ -2274,6 +2268,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 		destroy_workqueue(fs_info->endio_raid56_workers);
 	if (fs_info->rmw_workers)
 		destroy_workqueue(fs_info->rmw_workers);
+	if (fs_info->compressed_write_workers)
+		destroy_workqueue(fs_info->compressed_write_workers);
 	btrfs_destroy_workqueue(fs_info->endio_write_workers);
 	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
 	btrfs_destroy_workqueue(fs_info->delayed_workers);
@@ -2288,7 +2284,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	 * queues can do metadata I/O operations.
 	 */
 	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
-	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
 }
 
 static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2470,15 +2465,14 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	fs_info->endio_meta_workers =
 		btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
 				      max_active, 4);
-	fs_info->endio_meta_write_workers =
-		btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
-				      max_active, 2);
 	fs_info->endio_raid56_workers =
 		alloc_workqueue("btrfs-endio-raid56", flags, max_active);
 	fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
 	fs_info->endio_write_workers =
 		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
 				      max_active, 2);
+	fs_info->compressed_write_workers =
+		alloc_workqueue("btrfs-compressed-write", flags, max_active);
 	fs_info->endio_freespace_worker =
 		btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
 				      max_active, 0);
@@ -2493,7 +2487,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	if (!(fs_info->workers && fs_info->hipri_workers &&
 	      fs_info->delalloc_workers && fs_info->flush_workers &&
 	      fs_info->endio_workers && fs_info->endio_meta_workers &&
-	      fs_info->endio_meta_write_workers &&
+	      fs_info->compressed_write_workers &&
 	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
 	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
 	      fs_info->caching_workers && fs_info->fixup_workers &&
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b1fdc6a26c76e..9c683c466d585 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1908,8 +1908,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 	btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
-	btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
-				new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 07/10] btrfs: centralize setting REQ_META
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (5 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 06/10] btrfs: don't use btrfs_bio_wq_end_io for compressed writes Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 08/10] btrfs: remove btrfs_end_io_wq Christoph Hellwig
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Set REQ_META in btrfs_submit_metadata_bio instead of the various callers.
We'll start relying on this flag inside of btrfs in a bit, and this
ensures it is always set correctly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/disk-io.c   | 2 ++
 fs/btrfs/extent_io.c | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 744a74203776e..b8837f66e0a43 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -914,6 +914,8 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	blk_status_t ret;
 
+	bio->bi_opf |= REQ_META;
+
 	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
 		/*
 		 * called for a read, do the setup so that checksum validation
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 80b4482c477c6..a14ed9b9dc2d0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4589,7 +4589,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 	struct page *page = eb->pages[0];
-	unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+	unsigned int write_flags = wbc_to_write_flags(wbc);
 	bool no_dirty_ebs = false;
 	int ret;
 
@@ -4634,7 +4634,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 {
 	u64 disk_bytenr = eb->start;
 	int i, num_pages;
-	unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+	unsigned int write_flags = wbc_to_write_flags(wbc);
 	int ret = 0;
 
 	prepare_eb_write(eb);
@@ -6645,7 +6645,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
 	btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
 
 	btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
-	ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
+	ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
 				 page, eb->start, eb->len,
 				 eb->start - page_offset(page),
 				 end_bio_extent_readpage, mirror_num, 0,
@@ -6752,7 +6752,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
 			}
 
 			ClearPageError(page);
-			err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+			err = submit_extent_page(REQ_OP_READ, NULL,
 					 &bio_ctrl, page, page_offset(page),
 					 PAGE_SIZE, 0, end_bio_extent_readpage,
 					 mirror_num, 0, false);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 08/10] btrfs: remove btrfs_end_io_wq
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (6 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 07/10] btrfs: centralize setting REQ_META Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 09/10] btrfs: refactor btrfs_map_bio Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 10/10] btrfs: do not allocate a btrfs_bio for low-level bios Christoph Hellwig
  9 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

All reads bio that go through btrfs_map_bio need to be completed in
user context.  And read I/Os are the most common and timing critical
in almost any file system workloads.

Embedd a work_struct into struct btrfs_bio and use it to complete all
read bios submitted through btrfs_map, using the REQ_META flag to decide
which workqueue they are placed on.

This removes the need for a separate 128 byte allocation (typically
rounded up to 192 bytes by slab) for all reads with a size increase
of 24 bytes for struct btrfs_bio.  Future patches will reorgnize
struct btrfs_bio to make use of this extra space for writes as well.

(all sizes are based a on typical 64-bit non-debug build)

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/compression.c |   4 --
 fs/btrfs/ctree.h       |   4 +-
 fs/btrfs/disk-io.c     | 120 +++--------------------------------------
 fs/btrfs/disk-io.h     |  10 ----
 fs/btrfs/inode.c       |  24 +--------
 fs/btrfs/super.c       |  11 +---
 fs/btrfs/volumes.c     |  33 ++++++++++--
 fs/btrfs/volumes.h     |   3 ++
 8 files changed, 42 insertions(+), 167 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9d5986a30a4a2..c73ee8ae070d7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -938,10 +938,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			sums += fs_info->csum_size * nr_sectors;
 
 			ASSERT(comp_bio->bi_iter.bi_size);
-			ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
-						  BTRFS_WQ_ENDIO_DATA);
-			if (ret)
-				goto finish_cb;
 			ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
 			if (ret)
 				goto finish_cb;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5ae482632327f..c2c8027067c48 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -851,8 +851,8 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *hipri_workers;
 	struct btrfs_workqueue *delalloc_workers;
 	struct btrfs_workqueue *flush_workers;
-	struct btrfs_workqueue *endio_workers;
-	struct btrfs_workqueue *endio_meta_workers;
+	struct workqueue_struct *endio_workers;
+	struct workqueue_struct *endio_meta_workers;
 	struct workqueue_struct *endio_raid56_workers;
 	struct workqueue_struct *rmw_workers;
 	struct workqueue_struct *compressed_write_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b8837f66e0a43..b55761e21bde3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,7 +50,6 @@
 				 BTRFS_SUPER_FLAG_METADUMP |\
 				 BTRFS_SUPER_FLAG_METADUMP_V2)
 
-static void end_workqueue_fn(struct btrfs_work *work);
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 				      struct btrfs_fs_info *fs_info);
@@ -63,40 +62,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
 static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
 
-/*
- * btrfs_end_io_wq structs are used to do processing in task context when an IO
- * is complete.  This is used during reads to verify checksums, and it is used
- * by writes to insert metadata for new file extents after IO is complete.
- */
-struct btrfs_end_io_wq {
-	struct bio *bio;
-	bio_end_io_t *end_io;
-	void *private;
-	struct btrfs_fs_info *info;
-	blk_status_t status;
-	enum btrfs_wq_endio_type metadata;
-	struct btrfs_work work;
-};
-
-static struct kmem_cache *btrfs_end_io_wq_cache;
-
-int __init btrfs_end_io_wq_init(void)
-{
-	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
-					sizeof(struct btrfs_end_io_wq),
-					0,
-					SLAB_MEM_SPREAD,
-					NULL);
-	if (!btrfs_end_io_wq_cache)
-		return -ENOMEM;
-	return 0;
-}
-
-void __cold btrfs_end_io_wq_exit(void)
-{
-	kmem_cache_destroy(btrfs_end_io_wq_cache);
-}
-
 static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
 {
 	if (fs_info->csum_shash)
@@ -739,48 +704,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
 	return ret;
 }
 
-static void end_workqueue_bio(struct bio *bio)
-{
-	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
-	struct btrfs_fs_info *fs_info;
-	struct btrfs_workqueue *wq;
-
-	fs_info = end_io_wq->info;
-	end_io_wq->status = bio->bi_status;
-
-	if (end_io_wq->metadata)
-		wq = fs_info->endio_meta_workers;
-	else
-		wq = fs_info->endio_workers;
-
-	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
-	btrfs_queue_work(wq, &end_io_wq->work);
-}
-
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
-			enum btrfs_wq_endio_type metadata)
-{
-	struct btrfs_end_io_wq *end_io_wq;
-
-	if (WARN_ON_ONCE(btrfs_op(bio) != BTRFS_MAP_WRITE))
-		return BLK_STS_IOERR;
-
-	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
-	if (!end_io_wq)
-		return BLK_STS_RESOURCE;
-
-	end_io_wq->private = bio->bi_private;
-	end_io_wq->end_io = bio->bi_end_io;
-	end_io_wq->info = info;
-	end_io_wq->status = 0;
-	end_io_wq->bio = bio;
-	end_io_wq->metadata = metadata;
-
-	bio->bi_private = end_io_wq;
-	bio->bi_end_io = end_workqueue_bio;
-	return 0;
-}
-
 static void run_one_async_start(struct btrfs_work *work)
 {
 	struct async_submit_bio *async;
@@ -917,14 +840,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
 	bio->bi_opf |= REQ_META;
 
 	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
-		/*
-		 * called for a read, do the setup so that checksum validation
-		 * can happen in the async kernel threads
-		 */
-		ret = btrfs_bio_wq_end_io(fs_info, bio,
-					  BTRFS_WQ_ENDIO_METADATA);
-		if (!ret)
-			ret = btrfs_map_bio(fs_info, bio, mirror_num);
+		ret = btrfs_map_bio(fs_info, bio, mirror_num);
 	} else if (!should_async_write(fs_info, BTRFS_I(inode))) {
 		ret = btree_csum_one_bio(bio);
 		if (!ret)
@@ -1940,25 +1856,6 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
 	return root;
 }
 
-/*
- * called by the kthread helper functions to finally call the bio end_io
- * functions.  This is where read checksum verification actually happens
- */
-static void end_workqueue_fn(struct btrfs_work *work)
-{
-	struct bio *bio;
-	struct btrfs_end_io_wq *end_io_wq;
-
-	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
-	bio = end_io_wq->bio;
-
-	bio->bi_status = end_io_wq->status;
-	bio->bi_private = end_io_wq->private;
-	bio->bi_end_io = end_io_wq->end_io;
-	bio_endio(bio);
-	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-}
-
 static int cleaner_kthread(void *arg)
 {
 	struct btrfs_fs_info *fs_info = arg;
@@ -2265,7 +2162,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	btrfs_destroy_workqueue(fs_info->delalloc_workers);
 	btrfs_destroy_workqueue(fs_info->hipri_workers);
 	btrfs_destroy_workqueue(fs_info->workers);
-	btrfs_destroy_workqueue(fs_info->endio_workers);
+	if (fs_info->endio_workers)
+		destroy_workqueue(fs_info->endio_workers);
 	if (fs_info->endio_raid56_workers)
 		destroy_workqueue(fs_info->endio_raid56_workers);
 	if (fs_info->rmw_workers)
@@ -2285,7 +2183,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	 * the queues used for metadata I/O, since tasks from those other work
 	 * queues can do metadata I/O operations.
 	 */
-	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+	if (fs_info->endio_meta_workers)
+		destroy_workqueue(fs_info->endio_meta_workers);
 }
 
 static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2458,15 +2357,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	fs_info->fixup_workers =
 		btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
 
-	/*
-	 * endios are largely parallel and should have a very
-	 * low idle thresh
-	 */
 	fs_info->endio_workers =
-		btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
+		alloc_workqueue("btrfs-endio", flags, max_active);
 	fs_info->endio_meta_workers =
-		btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
-				      max_active, 4);
+		alloc_workqueue("btrfs-endio-meta", flags, max_active);
 	fs_info->endio_raid56_workers =
 		alloc_workqueue("btrfs-endio-raid56", flags, max_active);
 	fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 97255e3d7e524..424dbfc5fd784 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -17,12 +17,6 @@
  */
 #define BTRFS_BDEV_BLOCKSIZE	(4096)
 
-enum btrfs_wq_endio_type {
-	BTRFS_WQ_ENDIO_DATA,
-	BTRFS_WQ_ENDIO_METADATA,
-	BTRFS_WQ_ENDIO_FREE_SPACE,
-};
-
 static inline u64 btrfs_sb_offset(int mirror)
 {
 	u64 start = SZ_16K;
@@ -120,8 +114,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
 			  int atomic);
 int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
 			     int level, struct btrfs_key *first_key);
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
-			enum btrfs_wq_endio_type metadata);
 blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 dio_file_offset,
@@ -145,8 +137,6 @@ int btree_lock_page_hook(struct page *page, void *data,
 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
 int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
 int btrfs_init_root_free_objectid(struct btrfs_root *root);
-int __init btrfs_end_io_wq_init(void);
-void __cold btrfs_end_io_wq_exit(void);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_set_buffer_lockdep_class(u64 objectid,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 02c4cbd557dcd..7155f3c7d9f42 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2614,12 +2614,6 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
 		return;
 	}
 
-	ret = btrfs_bio_wq_end_io(fs_info, bio,
-			btrfs_is_free_space_inode(BTRFS_I(inode)) ?
-			BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
-	if (ret)
-		goto out;
-
 	/*
 	 * Lookup bio sums does extra checks around whether we need to csum or
 	 * not, which is why we ignore skip_sum here.
@@ -7785,9 +7779,6 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
 
 	BUG_ON(bio_op(bio) == REQ_OP_WRITE);
 
-	if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA))
-		return;
-
 	refcount_inc(&dip->refs);
 	if (btrfs_map_bio(fs_info, bio, mirror_num))
 		refcount_dec(&dip->refs);
@@ -7888,19 +7879,12 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_dio_private *dip = bio->bi_private;
-	bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
 	blk_status_t ret;
 
-	if (!write) {
-		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
-		if (ret)
-			return ret;
-	}
-
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		goto map;
 
-	if (write) {
+	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
 		/* check btrfs_submit_data_write_bio() for async submit rules */
 		if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
 			return btrfs_wq_submit_bio(inode, bio, 0, 0,
@@ -10250,12 +10234,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
 			return ret;
 	}
 
-	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
-	if (ret) {
-		btrfs_bio_free_csum(bbio);
-		return ret;
-	}
-
 	atomic_inc(&priv->pending);
 	ret = btrfs_map_bio(fs_info, bio, mirror_num);
 	if (ret) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9c683c466d585..64eb8aeed156f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1906,8 +1906,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 	btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
-	btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
-	btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
@@ -2668,13 +2666,9 @@ static int __init init_btrfs_fs(void)
 	if (err)
 		goto free_delayed_ref;
 
-	err = btrfs_end_io_wq_init();
-	if (err)
-		goto free_prelim_ref;
-
 	err = btrfs_interface_init();
 	if (err)
-		goto free_end_io_wq;
+		goto free_prelim_ref;
 
 	btrfs_print_mod_info();
 
@@ -2690,8 +2684,6 @@ static int __init init_btrfs_fs(void)
 
 unregister_ioctl:
 	btrfs_interface_exit();
-free_end_io_wq:
-	btrfs_end_io_wq_exit();
 free_prelim_ref:
 	btrfs_prelim_ref_exit();
 free_delayed_ref:
@@ -2729,7 +2721,6 @@ static void __exit exit_btrfs_fs(void)
 	extent_state_cache_exit();
 	extent_io_exit();
 	btrfs_interface_exit();
-	btrfs_end_io_wq_exit();
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 	btrfs_cleanup_fs_uuids();
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index aeacb87457687..5f18e9105fe08 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6621,11 +6621,27 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
 }
 
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc)
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc)
+{
+	if (bioc->orig_bio->bi_opf & REQ_META)
+		return bioc->fs_info->endio_meta_workers;
+	return bioc->fs_info->endio_workers;
+}
+
+static void btrfs_end_bio_work(struct work_struct *work)
+{
+	struct btrfs_bio *bbio =
+		container_of(work, struct btrfs_bio, end_io_work);
+
+	bio_endio(&bbio->bio);
+}
+
+static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
 {
 	struct bio *orig_bio = bioc->orig_bio;
+	struct btrfs_bio *bbio = btrfs_bio(orig_bio);
 
-	btrfs_bio(orig_bio)->mirror_num = bioc->mirror_num;
+	bbio->mirror_num = bioc->mirror_num;
 	orig_bio->bi_private = bioc->private;
 	orig_bio->bi_end_io = bioc->end_io;
 
@@ -6637,7 +6653,14 @@ static inline void btrfs_end_bioc(struct btrfs_io_context *bioc)
 		orig_bio->bi_status = BLK_STS_IOERR;
 	else
 		orig_bio->bi_status = BLK_STS_OK;
-	bio_endio(orig_bio);
+
+	if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) {
+		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
+		queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
+	} else {
+		bio_endio(orig_bio);
+	}
+
 	btrfs_put_bioc(bioc);
 }
 
@@ -6669,7 +6692,7 @@ static void btrfs_end_bio(struct bio *bio)
 
 	btrfs_bio_counter_dec(bioc->fs_info);
 	if (atomic_dec_and_test(&bioc->stripes_pending))
-		btrfs_end_bioc(bioc);
+		btrfs_end_bioc(bioc, true);
 }
 
 static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
@@ -6767,7 +6790,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
 			atomic_inc(&bioc->error);
 			if (atomic_dec_and_test(&bioc->stripes_pending))
-				btrfs_end_bioc(bioc);
+				btrfs_end_bioc(bioc, false);
 			continue;
 		}
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 12b2af9260e92..28e28b7c48649 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -371,6 +371,9 @@ struct btrfs_bio {
 	u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
 	struct bvec_iter iter;
 
+	/* for read end I/O handling */
+	struct work_struct end_io_work;
+
 	/*
 	 * This member must come last, bio_alloc_bioset will allocate enough
 	 * bytes for entire btrfs_bio but relies on bio being last.
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 09/10] btrfs: refactor btrfs_map_bio
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (7 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 08/10] btrfs: remove btrfs_end_io_wq Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  2022-04-29 14:30 ` [PATCH 10/10] btrfs: do not allocate a btrfs_bio for low-level bios Christoph Hellwig
  9 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

Move all per-stripe handling into submit_stripe_bio and use a label to
cleanup instead of duplicating the logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/volumes.c | 77 +++++++++++++++++++++-------------------------
 1 file changed, 35 insertions(+), 42 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5f18e9105fe08..28ac2bfb5d296 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6695,10 +6695,30 @@ static void btrfs_end_bio(struct bio *bio)
 		btrfs_end_bioc(bioc, true);
 }
 
-static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
-			      u64 physical, struct btrfs_device *dev)
+static void submit_stripe_bio(struct btrfs_io_context *bioc,
+		struct bio *orig_bio, int dev_nr, bool clone)
 {
 	struct btrfs_fs_info *fs_info = bioc->fs_info;
+	struct btrfs_device *dev = bioc->stripes[dev_nr].dev;
+	u64 physical = bioc->stripes[dev_nr].physical;
+	struct bio *bio;
+
+	if (!dev || !dev->bdev ||
+	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+	    (btrfs_op(orig_bio) == BTRFS_MAP_WRITE &&
+	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+		atomic_inc(&bioc->error);
+		if (atomic_dec_and_test(&bioc->stripes_pending))
+			btrfs_end_bioc(bioc, false);
+		return;
+	}
+
+	if (clone) {
+		bio = btrfs_bio_clone(dev->bdev, orig_bio);
+	} else {
+		bio = orig_bio;
+		bio_set_dev(bio, dev->bdev);
+	}
 
 	bio->bi_private = bioc;
 	btrfs_bio(bio)->device = dev;
@@ -6733,32 +6753,25 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			   int mirror_num)
 {
-	struct btrfs_device *dev;
-	struct bio *first_bio = bio;
 	u64 logical = bio->bi_iter.bi_sector << 9;
-	u64 length = 0;
-	u64 map_length;
+	u64 length = bio->bi_iter.bi_size;
+	u64 map_length = length;
 	int ret;
 	int dev_nr;
 	int total_devs;
 	struct btrfs_io_context *bioc = NULL;
 
-	length = bio->bi_iter.bi_size;
-	map_length = length;
-
 	btrfs_bio_counter_inc_blocked(fs_info);
 	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
 				&map_length, &bioc, mirror_num, 1);
-	if (ret) {
-		btrfs_bio_counter_dec(fs_info);
-		return errno_to_blk_status(ret);
-	}
+	if (ret)
+		goto out_dec;
 
 	total_devs = bioc->num_stripes;
-	bioc->orig_bio = first_bio;
-	bioc->private = first_bio->bi_private;
-	bioc->end_io = first_bio->bi_end_io;
-	atomic_set(&bioc->stripes_pending, bioc->num_stripes);
+	bioc->orig_bio = bio;
+	bioc->private = bio->bi_private;
+	bioc->end_io = bio->bi_end_io;
+	atomic_set(&bioc->stripes_pending, total_devs);
 
 	if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
 	    ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
@@ -6770,9 +6783,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			ret = raid56_parity_recover(bio, bioc, map_length,
 						    mirror_num, 1);
 		}
-
-		btrfs_bio_counter_dec(fs_info);
-		return errno_to_blk_status(ret);
+		goto out_dec;
 	}
 
 	if (map_length < length) {
@@ -6782,29 +6793,11 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 		BUG();
 	}
 
-	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
-		dev = bioc->stripes[dev_nr].dev;
-		if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
-						   &dev->dev_state) ||
-		    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
-		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
-			atomic_inc(&bioc->error);
-			if (atomic_dec_and_test(&bioc->stripes_pending))
-				btrfs_end_bioc(bioc, false);
-			continue;
-		}
-
-		if (dev_nr < total_devs - 1) {
-			bio = btrfs_bio_clone(dev->bdev, first_bio);
-		} else {
-			bio = first_bio;
-			bio_set_dev(bio, dev->bdev);
-		}
-
-		submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
-	}
+	for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+		submit_stripe_bio(bioc, bio, dev_nr, dev_nr < total_devs - 1);
+out_dec:
 	btrfs_bio_counter_dec(fs_info);
-	return BLK_STS_OK;
+	return errno_to_blk_status(ret);
 }
 
 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 10/10] btrfs: do not allocate a btrfs_bio for low-level bios
  2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
                   ` (8 preceding siblings ...)
  2022-04-29 14:30 ` [PATCH 09/10] btrfs: refactor btrfs_map_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
  9 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs

The bios submitted from btrfs_map_bio don't really interact with the
rest of btrfs and the only btrfs_bio member actually used in the
low-level bios is the pointer to the btrfs_io_contex used for endio
handler.

Use a union in struct btrfs_io_stripe that allows the endio handler to
find the btrfs_io_context and remove the spurious ->device assignment
so that a plain fs_bio_set bio can be used for the low-level bios
allocated inside btrfs_map_bio.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/extent_io.c | 13 -------------
 fs/btrfs/extent_io.h |  1 -
 fs/btrfs/volumes.c   | 20 ++++++++++----------
 fs/btrfs/volumes.h   |  5 ++++-
 4 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a14ed9b9dc2d0..37f4eee418219 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3209,19 +3209,6 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
 	return bio;
 }
 
-struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio)
-{
-	struct btrfs_bio *bbio;
-	struct bio *new;
-
-	/* Bio allocation backed by a bioset does not fail */
-	new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset);
-	bbio = btrfs_bio(new);
-	btrfs_bio_init(bbio);
-	bbio->iter = bio->bi_iter;
-	return new;
-}
-
 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
 {
 	struct bio *bio;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b390ec79f9a86..3078e90be3a99 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -265,7 +265,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 
 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
 struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio);
 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
 
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 28ac2bfb5d296..8577893ccb4b9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6666,23 +6666,21 @@ static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
 
 static void btrfs_end_bio(struct bio *bio)
 {
-	struct btrfs_io_context *bioc = bio->bi_private;
+	struct btrfs_io_stripe *stripe = bio->bi_private;
+	struct btrfs_io_context *bioc = stripe->bioc;
 
 	if (bio->bi_status) {
 		atomic_inc(&bioc->error);
 		if (bio->bi_status == BLK_STS_IOERR ||
 		    bio->bi_status == BLK_STS_TARGET) {
-			struct btrfs_device *dev = btrfs_bio(bio)->device;
-
-			ASSERT(dev->bdev);
 			if (btrfs_op(bio) == BTRFS_MAP_WRITE)
-				btrfs_dev_stat_inc_and_print(dev,
+				btrfs_dev_stat_inc_and_print(stripe->dev,
 						BTRFS_DEV_STAT_WRITE_ERRS);
 			else if (!(bio->bi_opf & REQ_RAHEAD))
-				btrfs_dev_stat_inc_and_print(dev,
+				btrfs_dev_stat_inc_and_print(stripe->dev,
 						BTRFS_DEV_STAT_READ_ERRS);
 			if (bio->bi_opf & REQ_PREFLUSH)
-				btrfs_dev_stat_inc_and_print(dev,
+				btrfs_dev_stat_inc_and_print(stripe->dev,
 						BTRFS_DEV_STAT_FLUSH_ERRS);
 		}
 	}
@@ -6714,14 +6712,16 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc,
 	}
 
 	if (clone) {
-		bio = btrfs_bio_clone(dev->bdev, orig_bio);
+		bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS,
+				      &fs_bio_set);
 	} else {
 		bio = orig_bio;
 		bio_set_dev(bio, dev->bdev);
+		btrfs_bio(bio)->device = dev;
 	}
 
-	bio->bi_private = bioc;
-	btrfs_bio(bio)->device = dev;
+	bioc->stripes[dev_nr].bioc = bioc;
+	bio->bi_private = &bioc->stripes[dev_nr];
 	bio->bi_end_io = btrfs_end_bio;
 	bio->bi_iter.bi_sector = physical >> 9;
 	/*
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 28e28b7c48649..825e44c82f2b0 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -396,7 +396,10 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
 
 struct btrfs_io_stripe {
 	struct btrfs_device *dev;
-	u64 physical;
+	union {
+		u64 physical;			/* block mapping */
+		struct btrfs_io_context *bioc;	/* for the endio handler */
+	};
 	u64 length; /* only used for discard mappings */
 };
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-04-29 14:30 ` [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio Christoph Hellwig
@ 2022-05-01  4:40   ` Qu Wenruo
  2022-05-01  4:53     ` Qu Wenruo
  0 siblings, 1 reply; 21+ messages in thread
From: Qu Wenruo @ 2022-05-01  4:40 UTC (permalink / raw)
  To: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs



On 2022/4/29 22:30, Christoph Hellwig wrote:
> Instead of attaching a an extra allocation an indirect call to each
> low-level bio issued by the RAID code, add a work_struct to struct
> btrfs_raid_bio and only defer the per-rbio completion action.  The
> per-bio action for all the I/Os are trivial and can be safely done
> from interrupt context.
>
> As a nice side effect this also allows sharing the boilerplate code
> for the per-bio completions
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

It looks like this patch is causing test failure in btrfs/027, at least
for subapge (64K page size, 4K sectorsize) cases.

Reproducibility is 100% (4/4 tried).

The hanging sub-test case is the repalcing of a missing device in raid5.

The involved dmesg (including the hanging thread dump) is:

[  276.672541] BTRFS warning (device dm-1): read-write for sector size
4096 with page size 65536 is experimental
[  276.744316] BTRFS info (device dm-1): checking UUID tree
[  277.387701] BTRFS info (device dm-1): allowing degraded mounts
[  277.390314] BTRFS info (device dm-1): using free space tree
[  277.392108] BTRFS info (device dm-1): has skinny extents
[  277.393890] BTRFS warning (device dm-1): read-write for sector size
4096 with page size 65536 is experimental
[  277.420922] BTRFS warning (device dm-1): devid 2 uuid
4b67464d-e851-4a88-8765-67b043d4680f is missing
[  277.432694] BTRFS warning (device dm-1): devid 2 uuid
4b67464d-e851-4a88-8765-67b043d4680f is missing
[  277.648326] BTRFS info (device dm-1): dev_replace from <missing disk>
(devid 2) to /dev/mapper/test-scratch5 started
[  297.264371] task:btrfs           state:D stack:    0 pid: 7158 ppid:
  6493 flags:0x0000000c
[  297.280744] Call trace:
[  297.282351]  __switch_to+0xfc/0x160
[  297.284525]  __schedule+0x260/0x61c
[  297.286959]  schedule+0x54/0xc4
[  297.288980]  scrub_enumerate_chunks+0x610/0x760 [btrfs]
[  297.292504]  btrfs_scrub_dev+0x1a0/0x530 [btrfs]
[  297.306738]  btrfs_dev_replace_start+0x2a4/0x2d0 [btrfs]
[  297.310418]  btrfs_dev_replace_by_ioctl+0x48/0x84 [btrfs]
[  297.314026]  btrfs_ioctl_dev_replace+0x1b8/0x210 [btrfs]
[  297.328014]  btrfs_ioctl+0xa48/0x1a70 [btrfs]
[  297.330705]  __arm64_sys_ioctl+0xb4/0x100
[  297.333037]  invoke_syscall+0x50/0x120
[  297.343237]  el0_svc_common.constprop.0+0x4c/0x100
[  297.345716]  do_el0_svc+0x34/0xa0
[  297.347242]  el0_svc+0x34/0xb0
[  297.348763]  el0t_64_sync_handler+0xa8/0x130
[  297.350870]  el0t_64_sync+0x18c/0x190

Mind to take a look on that hang?

Thanks,
Qu

> ---
>   fs/btrfs/ctree.h   |   2 +-
>   fs/btrfs/disk-io.c |  12 ++---
>   fs/btrfs/disk-io.h |   1 -
>   fs/btrfs/raid56.c  | 111 ++++++++++++++++++---------------------------
>   4 files changed, 49 insertions(+), 77 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 40a6f61559348..4dd0d4a2e7757 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -853,7 +853,7 @@ struct btrfs_fs_info {
>   	struct btrfs_workqueue *flush_workers;
>   	struct btrfs_workqueue *endio_workers;
>   	struct btrfs_workqueue *endio_meta_workers;
> -	struct btrfs_workqueue *endio_raid56_workers;
> +	struct workqueue_struct *endio_raid56_workers;
>   	struct workqueue_struct *rmw_workers;
>   	struct btrfs_workqueue *endio_meta_write_workers;
>   	struct btrfs_workqueue *endio_write_workers;
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 73e12ecc81be1..3c6137734d28c 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -753,14 +753,10 @@ static void end_workqueue_bio(struct bio *bio)
>   			wq = fs_info->endio_meta_write_workers;
>   		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
>   			wq = fs_info->endio_freespace_worker;
> -		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
> -			wq = fs_info->endio_raid56_workers;
>   		else
>   			wq = fs_info->endio_write_workers;
>   	} else {
> -		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
> -			wq = fs_info->endio_raid56_workers;
> -		else if (end_io_wq->metadata)
> +		if (end_io_wq->metadata)
>   			wq = fs_info->endio_meta_workers;
>   		else
>   			wq = fs_info->endio_workers;
> @@ -2274,7 +2270,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
>   	btrfs_destroy_workqueue(fs_info->hipri_workers);
>   	btrfs_destroy_workqueue(fs_info->workers);
>   	btrfs_destroy_workqueue(fs_info->endio_workers);
> -	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
> +	if (fs_info->endio_raid56_workers)
> +		destroy_workqueue(fs_info->endio_raid56_workers);
>   	if (fs_info->rmw_workers)
>   		destroy_workqueue(fs_info->rmw_workers);
>   	btrfs_destroy_workqueue(fs_info->endio_write_workers);
> @@ -2477,8 +2474,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
>   		btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
>   				      max_active, 2);
>   	fs_info->endio_raid56_workers =
> -		btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
> -				      max_active, 4);
> +		alloc_workqueue("btrfs-endio-raid56", flags, max_active);
>   	fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
>   	fs_info->endio_write_workers =
>   		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
> index 9340e3266e0ac..97255e3d7e524 100644
> --- a/fs/btrfs/disk-io.h
> +++ b/fs/btrfs/disk-io.h
> @@ -21,7 +21,6 @@ enum btrfs_wq_endio_type {
>   	BTRFS_WQ_ENDIO_DATA,
>   	BTRFS_WQ_ENDIO_METADATA,
>   	BTRFS_WQ_ENDIO_FREE_SPACE,
> -	BTRFS_WQ_ENDIO_RAID56,
>   };
>
>   static inline u64 btrfs_sb_offset(int mirror)
> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> index a5b623ee6facd..1a3c1a9b10d0b 100644
> --- a/fs/btrfs/raid56.c
> +++ b/fs/btrfs/raid56.c
> @@ -164,6 +164,9 @@ struct btrfs_raid_bio {
>   	atomic_t stripes_pending;
>
>   	atomic_t error;
> +
> +	struct work_struct end_io_work;
> +
>   	/*
>   	 * these are two arrays of pointers.  We allocate the
>   	 * rbio big enough to hold them both and setup their
> @@ -1552,15 +1555,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
>   	}
>   }
>
> -/*
> - * end io for the read phase of the rmw cycle.  All the bios here are physical
> - * stripe bios we've read from the disk so we can recalculate the parity of the
> - * stripe.
> - *
> - * This will usually kick off finish_rmw once all the bios are read in, but it
> - * may trigger parity reconstruction if we had any errors along the way
> - */
> -static void raid_rmw_end_io(struct bio *bio)
> +static void raid56_bio_end_io(struct bio *bio)
>   {
>   	struct btrfs_raid_bio *rbio = bio->bi_private;
>
> @@ -1571,23 +1566,34 @@ static void raid_rmw_end_io(struct bio *bio)
>
>   	bio_put(bio);
>
> -	if (!atomic_dec_and_test(&rbio->stripes_pending))
> -		return;
> +	if (atomic_dec_and_test(&rbio->stripes_pending))
> +		queue_work(rbio->bioc->fs_info->endio_raid56_workers,
> +			   &rbio->end_io_work);
> +}
>
> -	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
> -		goto cleanup;
> +/*
> + * End io handler for the read phase of the rmw cycle.  All the bios here are
> + * physical stripe bios we've read from the disk so we can recalculate the
> + * parity of the stripe.
> + *
> + * This will usually kick off finish_rmw once all the bios are read in, but it
> + * may trigger parity reconstruction if we had any errors along the way
> + */
> +static void raid56_rmw_end_io_work(struct work_struct *work)
> +{
> +	struct btrfs_raid_bio *rbio =
> +		container_of(work, struct btrfs_raid_bio, end_io_work);
> +
> +	if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
> +		rbio_orig_end_io(rbio, BLK_STS_IOERR);
> +		return;
> +	}
>
>   	/*
> -	 * this will normally call finish_rmw to start our write
> -	 * but if there are any failed stripes we'll reconstruct
> -	 * from parity first
> +	 * This will normally call finish_rmw to start our write but if there
> +	 * are any failed stripes we'll reconstruct from parity first.
>   	 */
>   	validate_rbio_for_rmw(rbio);
> -	return;
> -
> -cleanup:
> -
> -	rbio_orig_end_io(rbio, BLK_STS_IOERR);
>   }
>
>   /*
> @@ -1662,11 +1668,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
>   	 * touch it after that.
>   	 */
>   	atomic_set(&rbio->stripes_pending, bios_to_read);
> +	INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
>   	while ((bio = bio_list_pop(&bio_list))) {
> -		bio->bi_end_io = raid_rmw_end_io;
> -
> -		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
> -
> +		bio->bi_end_io = raid56_bio_end_io;
>   		submit_bio(bio);
>   	}
>   	/* the actual write will happen once the reads are done */
> @@ -2108,25 +2112,13 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
>   }
>
>   /*
> - * This is called only for stripes we've read from disk to
> - * reconstruct the parity.
> + * This is called only for stripes we've read from disk to reconstruct the
> + * parity.
>    */
> -static void raid_recover_end_io(struct bio *bio)
> +static void raid_recover_end_io_work(struct work_struct *work)
>   {
> -	struct btrfs_raid_bio *rbio = bio->bi_private;
> -
> -	/*
> -	 * we only read stripe pages off the disk, set them
> -	 * up to date if there were no errors
> -	 */
> -	if (bio->bi_status)
> -		fail_bio_stripe(rbio, bio);
> -	else
> -		set_bio_pages_uptodate(rbio, bio);
> -	bio_put(bio);
> -
> -	if (!atomic_dec_and_test(&rbio->stripes_pending))
> -		return;
> +	struct btrfs_raid_bio *rbio =
> +		container_of(work, struct btrfs_raid_bio, end_io_work);
>
>   	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
>   		rbio_orig_end_io(rbio, BLK_STS_IOERR);
> @@ -2209,11 +2201,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
>   	 * touch it after that.
>   	 */
>   	atomic_set(&rbio->stripes_pending, bios_to_read);
> +	INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
>   	while ((bio = bio_list_pop(&bio_list))) {
> -		bio->bi_end_io = raid_recover_end_io;
> -
> -		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
> -
> +		bio->bi_end_io = raid56_bio_end_io;
>   		submit_bio(bio);
>   	}
>
> @@ -2582,8 +2572,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>   	atomic_set(&rbio->stripes_pending, nr_data);
>
>   	while ((bio = bio_list_pop(&bio_list))) {
> -		bio->bi_end_io = raid_write_end_io;
> -
> +		bio->bi_end_io = raid56_bio_end_io;
>   		submit_bio(bio);
>   	}
>   	return;
> @@ -2671,24 +2660,14 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
>    * This will usually kick off finish_rmw once all the bios are read in, but it
>    * may trigger parity reconstruction if we had any errors along the way
>    */
> -static void raid56_parity_scrub_end_io(struct bio *bio)
> +static void raid56_parity_scrub_end_io_work(struct work_struct *work)
>   {
> -	struct btrfs_raid_bio *rbio = bio->bi_private;
> -
> -	if (bio->bi_status)
> -		fail_bio_stripe(rbio, bio);
> -	else
> -		set_bio_pages_uptodate(rbio, bio);
> -
> -	bio_put(bio);
> -
> -	if (!atomic_dec_and_test(&rbio->stripes_pending))
> -		return;
> +	struct btrfs_raid_bio *rbio =
> +		container_of(work, struct btrfs_raid_bio, end_io_work);
>
>   	/*
> -	 * this will normally call finish_rmw to start our write
> -	 * but if there are any failed stripes we'll reconstruct
> -	 * from parity first
> +	 * This will normally call finish_rmw to start our write, but if there
> +	 * are any failed stripes we'll reconstruct from parity first
>   	 */
>   	validate_rbio_for_parity_scrub(rbio);
>   }
> @@ -2758,11 +2737,9 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
>   	 * touch it after that.
>   	 */
>   	atomic_set(&rbio->stripes_pending, bios_to_read);
> +	INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
>   	while ((bio = bio_list_pop(&bio_list))) {
> -		bio->bi_end_io = raid56_parity_scrub_end_io;
> -
> -		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
> -
> +		bio->bi_end_io = raid56_bio_end_io;
>   		submit_bio(bio);
>   	}
>   	/* the actual write will happen once the reads are done */

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-05-01  4:40   ` Qu Wenruo
@ 2022-05-01  4:53     ` Qu Wenruo
  2022-05-02 16:44       ` Christoph Hellwig
  2022-06-03 16:44       ` David Sterba
  0 siblings, 2 replies; 21+ messages in thread
From: Qu Wenruo @ 2022-05-01  4:53 UTC (permalink / raw)
  To: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs



On 2022/5/1 12:40, Qu Wenruo wrote:
>
>
> On 2022/4/29 22:30, Christoph Hellwig wrote:
>> Instead of attaching a an extra allocation an indirect call to each
>> low-level bio issued by the RAID code, add a work_struct to struct
>> btrfs_raid_bio and only defer the per-rbio completion action.  The
>> per-bio action for all the I/Os are trivial and can be safely done
>> from interrupt context.
>>
>> As a nice side effect this also allows sharing the boilerplate code
>> for the per-bio completions
>>
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> It looks like this patch is causing test failure in btrfs/027, at least
> for subapge (64K page size, 4K sectorsize) cases.

Also confirmed the same hang on the same commit on x86_64 (4K page size
4K sectorsize).

Also 100% reproducible.

Thanks,
Qu

>
> Reproducibility is 100% (4/4 tried).
>
> The hanging sub-test case is the repalcing of a missing device in raid5.
>
> The involved dmesg (including the hanging thread dump) is:
>
> [  276.672541] BTRFS warning (device dm-1): read-write for sector size
> 4096 with page size 65536 is experimental
> [  276.744316] BTRFS info (device dm-1): checking UUID tree
> [  277.387701] BTRFS info (device dm-1): allowing degraded mounts
> [  277.390314] BTRFS info (device dm-1): using free space tree
> [  277.392108] BTRFS info (device dm-1): has skinny extents
> [  277.393890] BTRFS warning (device dm-1): read-write for sector size
> 4096 with page size 65536 is experimental
> [  277.420922] BTRFS warning (device dm-1): devid 2 uuid
> 4b67464d-e851-4a88-8765-67b043d4680f is missing
> [  277.432694] BTRFS warning (device dm-1): devid 2 uuid
> 4b67464d-e851-4a88-8765-67b043d4680f is missing
> [  277.648326] BTRFS info (device dm-1): dev_replace from <missing disk>
> (devid 2) to /dev/mapper/test-scratch5 started
> [  297.264371] task:btrfs           state:D stack:    0 pid: 7158 ppid:
>   6493 flags:0x0000000c
> [  297.280744] Call trace:
> [  297.282351]  __switch_to+0xfc/0x160
> [  297.284525]  __schedule+0x260/0x61c
> [  297.286959]  schedule+0x54/0xc4
> [  297.288980]  scrub_enumerate_chunks+0x610/0x760 [btrfs]
> [  297.292504]  btrfs_scrub_dev+0x1a0/0x530 [btrfs]
> [  297.306738]  btrfs_dev_replace_start+0x2a4/0x2d0 [btrfs]
> [  297.310418]  btrfs_dev_replace_by_ioctl+0x48/0x84 [btrfs]
> [  297.314026]  btrfs_ioctl_dev_replace+0x1b8/0x210 [btrfs]
> [  297.328014]  btrfs_ioctl+0xa48/0x1a70 [btrfs]
> [  297.330705]  __arm64_sys_ioctl+0xb4/0x100
> [  297.333037]  invoke_syscall+0x50/0x120
> [  297.343237]  el0_svc_common.constprop.0+0x4c/0x100
> [  297.345716]  do_el0_svc+0x34/0xa0
> [  297.347242]  el0_svc+0x34/0xb0
> [  297.348763]  el0t_64_sync_handler+0xa8/0x130
> [  297.350870]  el0t_64_sync+0x18c/0x190
>
> Mind to take a look on that hang?
>
> Thanks,
> Qu
>
>> ---
>>   fs/btrfs/ctree.h   |   2 +-
>>   fs/btrfs/disk-io.c |  12 ++---
>>   fs/btrfs/disk-io.h |   1 -
>>   fs/btrfs/raid56.c  | 111 ++++++++++++++++++---------------------------
>>   4 files changed, 49 insertions(+), 77 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 40a6f61559348..4dd0d4a2e7757 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -853,7 +853,7 @@ struct btrfs_fs_info {
>>       struct btrfs_workqueue *flush_workers;
>>       struct btrfs_workqueue *endio_workers;
>>       struct btrfs_workqueue *endio_meta_workers;
>> -    struct btrfs_workqueue *endio_raid56_workers;
>> +    struct workqueue_struct *endio_raid56_workers;
>>       struct workqueue_struct *rmw_workers;
>>       struct btrfs_workqueue *endio_meta_write_workers;
>>       struct btrfs_workqueue *endio_write_workers;
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 73e12ecc81be1..3c6137734d28c 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -753,14 +753,10 @@ static void end_workqueue_bio(struct bio *bio)
>>               wq = fs_info->endio_meta_write_workers;
>>           else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
>>               wq = fs_info->endio_freespace_worker;
>> -        else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
>> -            wq = fs_info->endio_raid56_workers;
>>           else
>>               wq = fs_info->endio_write_workers;
>>       } else {
>> -        if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
>> -            wq = fs_info->endio_raid56_workers;
>> -        else if (end_io_wq->metadata)
>> +        if (end_io_wq->metadata)
>>               wq = fs_info->endio_meta_workers;
>>           else
>>               wq = fs_info->endio_workers;
>> @@ -2274,7 +2270,8 @@ static void btrfs_stop_all_workers(struct
>> btrfs_fs_info *fs_info)
>>       btrfs_destroy_workqueue(fs_info->hipri_workers);
>>       btrfs_destroy_workqueue(fs_info->workers);
>>       btrfs_destroy_workqueue(fs_info->endio_workers);
>> -    btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
>> +    if (fs_info->endio_raid56_workers)
>> +        destroy_workqueue(fs_info->endio_raid56_workers);
>>       if (fs_info->rmw_workers)
>>           destroy_workqueue(fs_info->rmw_workers);
>>       btrfs_destroy_workqueue(fs_info->endio_write_workers);
>> @@ -2477,8 +2474,7 @@ static int btrfs_init_workqueues(struct
>> btrfs_fs_info *fs_info)
>>           btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
>>                         max_active, 2);
>>       fs_info->endio_raid56_workers =
>> -        btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
>> -                      max_active, 4);
>> +        alloc_workqueue("btrfs-endio-raid56", flags, max_active);
>>       fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags,
>> max_active);
>>       fs_info->endio_write_workers =
>>           btrfs_alloc_workqueue(fs_info, "endio-write", flags,
>> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
>> index 9340e3266e0ac..97255e3d7e524 100644
>> --- a/fs/btrfs/disk-io.h
>> +++ b/fs/btrfs/disk-io.h
>> @@ -21,7 +21,6 @@ enum btrfs_wq_endio_type {
>>       BTRFS_WQ_ENDIO_DATA,
>>       BTRFS_WQ_ENDIO_METADATA,
>>       BTRFS_WQ_ENDIO_FREE_SPACE,
>> -    BTRFS_WQ_ENDIO_RAID56,
>>   };
>>
>>   static inline u64 btrfs_sb_offset(int mirror)
>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>> index a5b623ee6facd..1a3c1a9b10d0b 100644
>> --- a/fs/btrfs/raid56.c
>> +++ b/fs/btrfs/raid56.c
>> @@ -164,6 +164,9 @@ struct btrfs_raid_bio {
>>       atomic_t stripes_pending;
>>
>>       atomic_t error;
>> +
>> +    struct work_struct end_io_work;
>> +
>>       /*
>>        * these are two arrays of pointers.  We allocate the
>>        * rbio big enough to hold them both and setup their
>> @@ -1552,15 +1555,7 @@ static void set_bio_pages_uptodate(struct
>> btrfs_raid_bio *rbio, struct bio *bio)
>>       }
>>   }
>>
>> -/*
>> - * end io for the read phase of the rmw cycle.  All the bios here are
>> physical
>> - * stripe bios we've read from the disk so we can recalculate the
>> parity of the
>> - * stripe.
>> - *
>> - * This will usually kick off finish_rmw once all the bios are read
>> in, but it
>> - * may trigger parity reconstruction if we had any errors along the way
>> - */
>> -static void raid_rmw_end_io(struct bio *bio)
>> +static void raid56_bio_end_io(struct bio *bio)
>>   {
>>       struct btrfs_raid_bio *rbio = bio->bi_private;
>>
>> @@ -1571,23 +1566,34 @@ static void raid_rmw_end_io(struct bio *bio)
>>
>>       bio_put(bio);
>>
>> -    if (!atomic_dec_and_test(&rbio->stripes_pending))
>> -        return;
>> +    if (atomic_dec_and_test(&rbio->stripes_pending))
>> +        queue_work(rbio->bioc->fs_info->endio_raid56_workers,
>> +               &rbio->end_io_work);
>> +}
>>
>> -    if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
>> -        goto cleanup;
>> +/*
>> + * End io handler for the read phase of the rmw cycle.  All the bios
>> here are
>> + * physical stripe bios we've read from the disk so we can
>> recalculate the
>> + * parity of the stripe.
>> + *
>> + * This will usually kick off finish_rmw once all the bios are read
>> in, but it
>> + * may trigger parity reconstruction if we had any errors along the way
>> + */
>> +static void raid56_rmw_end_io_work(struct work_struct *work)
>> +{
>> +    struct btrfs_raid_bio *rbio =
>> +        container_of(work, struct btrfs_raid_bio, end_io_work);
>> +
>> +    if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
>> +        rbio_orig_end_io(rbio, BLK_STS_IOERR);
>> +        return;
>> +    }
>>
>>       /*
>> -     * this will normally call finish_rmw to start our write
>> -     * but if there are any failed stripes we'll reconstruct
>> -     * from parity first
>> +     * This will normally call finish_rmw to start our write but if
>> there
>> +     * are any failed stripes we'll reconstruct from parity first.
>>        */
>>       validate_rbio_for_rmw(rbio);
>> -    return;
>> -
>> -cleanup:
>> -
>> -    rbio_orig_end_io(rbio, BLK_STS_IOERR);
>>   }
>>
>>   /*
>> @@ -1662,11 +1668,9 @@ static int raid56_rmw_stripe(struct
>> btrfs_raid_bio *rbio)
>>        * touch it after that.
>>        */
>>       atomic_set(&rbio->stripes_pending, bios_to_read);
>> +    INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
>>       while ((bio = bio_list_pop(&bio_list))) {
>> -        bio->bi_end_io = raid_rmw_end_io;
>> -
>> -        btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio,
>> BTRFS_WQ_ENDIO_RAID56);
>> -
>> +        bio->bi_end_io = raid56_bio_end_io;
>>           submit_bio(bio);
>>       }
>>       /* the actual write will happen once the reads are done */
>> @@ -2108,25 +2112,13 @@ static void __raid_recover_end_io(struct
>> btrfs_raid_bio *rbio)
>>   }
>>
>>   /*
>> - * This is called only for stripes we've read from disk to
>> - * reconstruct the parity.
>> + * This is called only for stripes we've read from disk to
>> reconstruct the
>> + * parity.
>>    */
>> -static void raid_recover_end_io(struct bio *bio)
>> +static void raid_recover_end_io_work(struct work_struct *work)
>>   {
>> -    struct btrfs_raid_bio *rbio = bio->bi_private;
>> -
>> -    /*
>> -     * we only read stripe pages off the disk, set them
>> -     * up to date if there were no errors
>> -     */
>> -    if (bio->bi_status)
>> -        fail_bio_stripe(rbio, bio);
>> -    else
>> -        set_bio_pages_uptodate(rbio, bio);
>> -    bio_put(bio);
>> -
>> -    if (!atomic_dec_and_test(&rbio->stripes_pending))
>> -        return;
>> +    struct btrfs_raid_bio *rbio =
>> +        container_of(work, struct btrfs_raid_bio, end_io_work);
>>
>>       if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
>>           rbio_orig_end_io(rbio, BLK_STS_IOERR);
>> @@ -2209,11 +2201,9 @@ static int __raid56_parity_recover(struct
>> btrfs_raid_bio *rbio)
>>        * touch it after that.
>>        */
>>       atomic_set(&rbio->stripes_pending, bios_to_read);
>> +    INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
>>       while ((bio = bio_list_pop(&bio_list))) {
>> -        bio->bi_end_io = raid_recover_end_io;
>> -
>> -        btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio,
>> BTRFS_WQ_ENDIO_RAID56);
>> -
>> +        bio->bi_end_io = raid56_bio_end_io;
>>           submit_bio(bio);
>>       }
>>
>> @@ -2582,8 +2572,7 @@ static noinline void finish_parity_scrub(struct
>> btrfs_raid_bio *rbio,
>>       atomic_set(&rbio->stripes_pending, nr_data);
>>
>>       while ((bio = bio_list_pop(&bio_list))) {
>> -        bio->bi_end_io = raid_write_end_io;
>> -
>> +        bio->bi_end_io = raid56_bio_end_io;
>>           submit_bio(bio);
>>       }
>>       return;
>> @@ -2671,24 +2660,14 @@ static void
>> validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
>>    * This will usually kick off finish_rmw once all the bios are read
>> in, but it
>>    * may trigger parity reconstruction if we had any errors along the way
>>    */
>> -static void raid56_parity_scrub_end_io(struct bio *bio)
>> +static void raid56_parity_scrub_end_io_work(struct work_struct *work)
>>   {
>> -    struct btrfs_raid_bio *rbio = bio->bi_private;
>> -
>> -    if (bio->bi_status)
>> -        fail_bio_stripe(rbio, bio);
>> -    else
>> -        set_bio_pages_uptodate(rbio, bio);
>> -
>> -    bio_put(bio);
>> -
>> -    if (!atomic_dec_and_test(&rbio->stripes_pending))
>> -        return;
>> +    struct btrfs_raid_bio *rbio =
>> +        container_of(work, struct btrfs_raid_bio, end_io_work);
>>
>>       /*
>> -     * this will normally call finish_rmw to start our write
>> -     * but if there are any failed stripes we'll reconstruct
>> -     * from parity first
>> +     * This will normally call finish_rmw to start our write, but if
>> there
>> +     * are any failed stripes we'll reconstruct from parity first
>>        */
>>       validate_rbio_for_parity_scrub(rbio);
>>   }
>> @@ -2758,11 +2737,9 @@ static void raid56_parity_scrub_stripe(struct
>> btrfs_raid_bio *rbio)
>>        * touch it after that.
>>        */
>>       atomic_set(&rbio->stripes_pending, bios_to_read);
>> +    INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
>>       while ((bio = bio_list_pop(&bio_list))) {
>> -        bio->bi_end_io = raid56_parity_scrub_end_io;
>> -
>> -        btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio,
>> BTRFS_WQ_ENDIO_RAID56);
>> -
>> +        bio->bi_end_io = raid56_bio_end_io;
>>           submit_bio(bio);
>>       }
>>       /* the actual write will happen once the reads are done */

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-05-01  4:53     ` Qu Wenruo
@ 2022-05-02 16:44       ` Christoph Hellwig
  2022-06-03 16:44       ` David Sterba
  1 sibling, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-05-02 16:44 UTC (permalink / raw)
  To: Qu Wenruo
  Cc: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo, linux-btrfs

On Sun, May 01, 2022 at 12:53:00PM +0800, Qu Wenruo wrote:
> > It looks like this patch is causing test failure in btrfs/027, at least
> > for subapge (64K page size, 4K sectorsize) cases.
> 
> Also confirmed the same hang on the same commit on x86_64 (4K page size
> 4K sectorsize).
> 
> Also 100% reproducible.

I'll take a look.  It turns out I never actually run that test as it
doesn't run without at least 5 devices in the scratch pool, and my btrfs
config uses 4.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-05-01  4:53     ` Qu Wenruo
  2022-05-02 16:44       ` Christoph Hellwig
@ 2022-06-03 16:44       ` David Sterba
  2022-06-03 16:45         ` David Sterba
  1 sibling, 1 reply; 21+ messages in thread
From: David Sterba @ 2022-06-03 16:44 UTC (permalink / raw)
  To: Qu Wenruo
  Cc: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo, linux-btrfs

On Sun, May 01, 2022 at 12:53:00PM +0800, Qu Wenruo wrote:
> On 2022/5/1 12:40, Qu Wenruo wrote:
> > On 2022/4/29 22:30, Christoph Hellwig wrote:
> >> Instead of attaching a an extra allocation an indirect call to each
> >> low-level bio issued by the RAID code, add a work_struct to struct
> >> btrfs_raid_bio and only defer the per-rbio completion action.  The
> >> per-bio action for all the I/Os are trivial and can be safely done
> >> from interrupt context.
> >>
> >> As a nice side effect this also allows sharing the boilerplate code
> >> for the per-bio completions
> >>
> >> Signed-off-by: Christoph Hellwig <hch@lst.de>
> >
> > It looks like this patch is causing test failure in btrfs/027, at least
> > for subapge (64K page size, 4K sectorsize) cases.
> 
> Also confirmed the same hang on the same commit on x86_64 (4K page size
> 4K sectorsize).
> 
> Also 100% reproducible.

The test passes on my VM setup, the whole suite was run on the patchset
at least twice.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
  2022-06-03 16:44       ` David Sterba
@ 2022-06-03 16:45         ` David Sterba
  0 siblings, 0 replies; 21+ messages in thread
From: David Sterba @ 2022-06-03 16:45 UTC (permalink / raw)
  To: David Sterba
  Cc: Qu Wenruo, Christoph Hellwig, David Sterba, Josef Bacik,
	Qu Wenruo, linux-btrfs

On Fri, Jun 03, 2022 at 06:44:56PM +0200, David Sterba wrote:
> The test passes on my VM setup, the whole suite was run on the patchset
> at least twice.

Oh never mind, this is some old version.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio
  2022-05-26  7:36 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
@ 2022-06-01 19:32   ` David Sterba
  0 siblings, 0 replies; 21+ messages in thread
From: David Sterba @ 2022-06-01 19:32 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: David Sterba, Josef Bacik, Qu Wenruo, linux-btrfs, Johannes Thumshirn

On Thu, May 26, 2022 at 09:36:34AM +0200, Christoph Hellwig wrote:
> Remove the pointless goto just to return err and clean up the code flow
> to be a little more straight forward.

Please use more descriptive subjects, "cleaup function" is too generic
and makes navigation in patches harder. The changelog text usually gives
hint what exactly is being cleaned up, in this case it's code flow so
I've used subject "simplify code flow in btrfs_submit_dio_bio".

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio
  2022-05-26  7:36 cleanup btrfs bio handling, part 2 v4 Christoph Hellwig
@ 2022-05-26  7:36 ` Christoph Hellwig
  2022-06-01 19:32   ` David Sterba
  0 siblings, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2022-05-26  7:36 UTC (permalink / raw)
  To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs, Johannes Thumshirn

Remove the pointless goto just to return err and clean up the code flow
to be a little more straight forward.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/inode.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 67bb7876fdfa8..03335ce83a949 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7966,39 +7966,33 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 	bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
 	blk_status_t ret;
 
-	/* Check btrfs_submit_bio_hook() for rules about async submit. */
-	if (async_submit)
-		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
 	if (!write) {
 		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 		if (ret)
-			goto err;
+			return ret;
 	}
 
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		goto map;
 
-	if (write && async_submit) {
-		ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset,
-					  btrfs_submit_bio_start_direct_io);
-		goto err;
-	} else if (write) {
+	if (write) {
+		/* check btrfs_submit_data_bio() for async submit rules */
+		if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
+			return btrfs_wq_submit_bio(inode, bio, 0, file_offset,
+					btrfs_submit_bio_start_direct_io);
 		/*
 		 * If we aren't doing async submit, calculate the csum of the
 		 * bio now.
 		 */
 		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
 		if (ret)
-			goto err;
+			return ret;
 	} else {
 		btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums,
 						      file_offset - dip->file_offset);
 	}
 map:
-	ret = btrfs_map_bio(fs_info, bio, 0);
-err:
-	return ret;
+	return btrfs_map_bio(fs_info, bio, 0);
 }
 
 static void btrfs_submit_direct(const struct iomap_iter *iter,
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio
  2022-04-25  7:54 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
  2022-04-25  8:45   ` Qu Wenruo
@ 2022-04-26  7:21   ` Johannes Thumshirn
  1 sibling, 0 replies; 21+ messages in thread
From: Johannes Thumshirn @ 2022-04-26  7:21 UTC (permalink / raw)
  To: Christoph Hellwig, Josef Bacik, David Sterba, Qu Wenruo
  Cc: Naohiro Aota, linux-btrfs

Looks good,
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio
  2022-04-25  7:54 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
@ 2022-04-25  8:45   ` Qu Wenruo
  2022-04-26  7:21   ` Johannes Thumshirn
  1 sibling, 0 replies; 21+ messages in thread
From: Qu Wenruo @ 2022-04-25  8:45 UTC (permalink / raw)
  To: Christoph Hellwig, Josef Bacik, David Sterba; +Cc: Naohiro Aota, linux-btrfs



On 2022/4/25 15:54, Christoph Hellwig wrote:
> Remove the pointless goto just to return err and clean up the code flow
> to be a little more straight forward.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Qu Wenruo <wqu@suse.com>

Thanks,
Qu

> ---
>   fs/btrfs/inode.c | 23 +++++++++--------------
>   1 file changed, 9 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index ef3bee1cbc6db..b188f724eff2d 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -7899,31 +7899,28 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
>   	bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
>   	blk_status_t ret;
>   
> -	/* Check btrfs_submit_bio_hook() for rules about async submit. */
> -	if (async_submit)
> -		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
> -
>   	if (!write) {
>   		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
>   		if (ret)
> -			goto err;
> +			return ret;
>   	}
>   
>   	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
>   		goto map;
>   
> -	if (write && async_submit) {
> -		ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
> -					  btrfs_submit_bio_start_direct_io);
> -		goto err;
> -	} else if (write) {
> +	if (write) {
> +		/* check btrfs_submit_data_bio() for async submit rules */
> +		if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
> +			return btrfs_wq_submit_bio(inode, bio, 0, 0,
> +					file_offset,
> +					btrfs_submit_bio_start_direct_io);
>   		/*
>   		 * If we aren't doing async submit, calculate the csum of the
>   		 * bio now.
>   		 */
>   		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
>   		if (ret)
> -			goto err;
> +			return ret;
>   	} else {
>   		u64 csum_offset;
>   
> @@ -7933,9 +7930,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
>   		btrfs_bio(bio)->csum = dip->csums + csum_offset;
>   	}
>   map:
> -	ret = btrfs_map_bio(fs_info, bio, 0);
> -err:
> -	return ret;
> +	return btrfs_map_bio(fs_info, bio, 0);
>   }
>   
>   /*


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio
  2022-04-25  7:54 cleanup btrfs bio handling, part 2 Christoph Hellwig
@ 2022-04-25  7:54 ` Christoph Hellwig
  2022-04-25  8:45   ` Qu Wenruo
  2022-04-26  7:21   ` Johannes Thumshirn
  0 siblings, 2 replies; 21+ messages in thread
From: Christoph Hellwig @ 2022-04-25  7:54 UTC (permalink / raw)
  To: Josef Bacik, David Sterba, Qu Wenruo; +Cc: Naohiro Aota, linux-btrfs

Remove the pointless goto just to return err and clean up the code flow
to be a little more straight forward.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/inode.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ef3bee1cbc6db..b188f724eff2d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7899,31 +7899,28 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 	bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
 	blk_status_t ret;
 
-	/* Check btrfs_submit_bio_hook() for rules about async submit. */
-	if (async_submit)
-		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
 	if (!write) {
 		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 		if (ret)
-			goto err;
+			return ret;
 	}
 
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		goto map;
 
-	if (write && async_submit) {
-		ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
-					  btrfs_submit_bio_start_direct_io);
-		goto err;
-	} else if (write) {
+	if (write) {
+		/* check btrfs_submit_data_bio() for async submit rules */
+		if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
+			return btrfs_wq_submit_bio(inode, bio, 0, 0,
+					file_offset,
+					btrfs_submit_bio_start_direct_io);
 		/*
 		 * If we aren't doing async submit, calculate the csum of the
 		 * bio now.
 		 */
 		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
 		if (ret)
-			goto err;
+			return ret;
 	} else {
 		u64 csum_offset;
 
@@ -7933,9 +7930,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
 		btrfs_bio(bio)->csum = dip->csums + csum_offset;
 	}
 map:
-	ret = btrfs_map_bio(fs_info, bio, 0);
-err:
-	return ret;
+	return btrfs_map_bio(fs_info, bio, 0);
 }
 
 /*
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2022-06-03 16:50 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
2022-04-29 14:30 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
2022-04-29 14:30 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
2022-04-29 14:30 ` [PATCH 03/10] btrfs: split btrfs_submit_data_bio Christoph Hellwig
2022-04-29 14:30 ` [PATCH 04/10] btrfs: don't double-defer bio completions for compressed reads Christoph Hellwig
2022-04-29 14:30 ` [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio Christoph Hellwig
2022-05-01  4:40   ` Qu Wenruo
2022-05-01  4:53     ` Qu Wenruo
2022-05-02 16:44       ` Christoph Hellwig
2022-06-03 16:44       ` David Sterba
2022-06-03 16:45         ` David Sterba
2022-04-29 14:30 ` [PATCH 06/10] btrfs: don't use btrfs_bio_wq_end_io for compressed writes Christoph Hellwig
2022-04-29 14:30 ` [PATCH 07/10] btrfs: centralize setting REQ_META Christoph Hellwig
2022-04-29 14:30 ` [PATCH 08/10] btrfs: remove btrfs_end_io_wq Christoph Hellwig
2022-04-29 14:30 ` [PATCH 09/10] btrfs: refactor btrfs_map_bio Christoph Hellwig
2022-04-29 14:30 ` [PATCH 10/10] btrfs: do not allocate a btrfs_bio for low-level bios Christoph Hellwig
  -- strict thread matches above, loose matches on Subject: below --
2022-05-26  7:36 cleanup btrfs bio handling, part 2 v4 Christoph Hellwig
2022-05-26  7:36 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
2022-06-01 19:32   ` David Sterba
2022-04-25  7:54 cleanup btrfs bio handling, part 2 Christoph Hellwig
2022-04-25  7:54 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
2022-04-25  8:45   ` Qu Wenruo
2022-04-26  7:21   ` Johannes Thumshirn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.