* [PATCH 01/10] btrfs: move more work into btrfs_end_bioc
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
2022-04-29 14:30 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
` (8 subsequent siblings)
9 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs, Johannes Thumshirn
Assign ->mirror_num and ->bi_status in btrfs_end_bioc instead of
duplicating the logic in the callers. Also remove the bio argument as
it always must be bioc->orig_bio and the now pointless bioc_error that
did nothing but assign bi_sector to the same value just sampled in the
caller.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/volumes.c | 72 ++++++++++++++--------------------------------
1 file changed, 22 insertions(+), 50 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 748614b00ffa2..aeacb87457687 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6621,19 +6621,29 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
}
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
+static inline void btrfs_end_bioc(struct btrfs_io_context *bioc)
{
- bio->bi_private = bioc->private;
- bio->bi_end_io = bioc->end_io;
- bio_endio(bio);
+ struct bio *orig_bio = bioc->orig_bio;
+ btrfs_bio(orig_bio)->mirror_num = bioc->mirror_num;
+ orig_bio->bi_private = bioc->private;
+ orig_bio->bi_end_io = bioc->end_io;
+
+ /*
+ * Only send an error to the higher layers if it is beyond the tolerance
+ * threshold.
+ */
+ if (atomic_read(&bioc->error) > bioc->max_errors)
+ orig_bio->bi_status = BLK_STS_IOERR;
+ else
+ orig_bio->bi_status = BLK_STS_OK;
+ bio_endio(orig_bio);
btrfs_put_bioc(bioc);
}
static void btrfs_end_bio(struct bio *bio)
{
struct btrfs_io_context *bioc = bio->bi_private;
- int is_orig_bio = 0;
if (bio->bi_status) {
atomic_inc(&bioc->error);
@@ -6654,35 +6664,12 @@ static void btrfs_end_bio(struct bio *bio)
}
}
- if (bio == bioc->orig_bio)
- is_orig_bio = 1;
+ if (bio != bioc->orig_bio)
+ bio_put(bio);
btrfs_bio_counter_dec(bioc->fs_info);
-
- if (atomic_dec_and_test(&bioc->stripes_pending)) {
- if (!is_orig_bio) {
- bio_put(bio);
- bio = bioc->orig_bio;
- }
-
- btrfs_bio(bio)->mirror_num = bioc->mirror_num;
- /* only send an error to the higher layers if it is
- * beyond the tolerance of the btrfs bio
- */
- if (atomic_read(&bioc->error) > bioc->max_errors) {
- bio->bi_status = BLK_STS_IOERR;
- } else {
- /*
- * this bio is actually up to date, we didn't
- * go over the max number of errors
- */
- bio->bi_status = BLK_STS_OK;
- }
-
- btrfs_end_bioc(bioc, bio);
- } else if (!is_orig_bio) {
- bio_put(bio);
- }
+ if (atomic_dec_and_test(&bioc->stripes_pending))
+ btrfs_end_bioc(bioc);
}
static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
@@ -6720,23 +6707,6 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
submit_bio(bio);
}
-static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
-{
- atomic_inc(&bioc->error);
- if (atomic_dec_and_test(&bioc->stripes_pending)) {
- /* Should be the original bio. */
- WARN_ON(bio != bioc->orig_bio);
-
- btrfs_bio(bio)->mirror_num = bioc->mirror_num;
- bio->bi_iter.bi_sector = logical >> 9;
- if (atomic_read(&bioc->error) > bioc->max_errors)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_status = BLK_STS_OK;
- btrfs_end_bioc(bioc, bio);
- }
-}
-
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num)
{
@@ -6795,7 +6765,9 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
&dev->dev_state) ||
(btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bioc_error(bioc, first_bio, logical);
+ atomic_inc(&bioc->error);
+ if (atomic_dec_and_test(&bioc->stripes_pending))
+ btrfs_end_bioc(bioc);
continue;
}
--
2.30.2
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
2022-04-29 14:30 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
2022-04-29 14:30 ` [PATCH 03/10] btrfs: split btrfs_submit_data_bio Christoph Hellwig
` (7 subsequent siblings)
9 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs, Johannes Thumshirn
Remove the pointless goto just to return err and clean up the code flow
to be a little more straight forward.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/inode.c | 23 +++++++++--------------
1 file changed, 9 insertions(+), 14 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b7df1c0ee5ee..3be52ed6ed196 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7899,31 +7899,28 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
blk_status_t ret;
- /* Check btrfs_submit_bio_hook() for rules about async submit. */
- if (async_submit)
- async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
if (!write) {
ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
if (ret)
- goto err;
+ return ret;
}
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
goto map;
- if (write && async_submit) {
- ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
- btrfs_submit_bio_start_direct_io);
- goto err;
- } else if (write) {
+ if (write) {
+ /* check btrfs_submit_data_bio() for async submit rules */
+ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
+ return btrfs_wq_submit_bio(inode, bio, 0, 0,
+ file_offset,
+ btrfs_submit_bio_start_direct_io);
/*
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
if (ret)
- goto err;
+ return ret;
} else {
u64 csum_offset;
@@ -7933,9 +7930,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
btrfs_bio(bio)->csum = dip->csums + csum_offset;
}
map:
- ret = btrfs_map_bio(fs_info, bio, 0);
-err:
- return ret;
+ return btrfs_map_bio(fs_info, bio, 0);
}
/*
--
2.30.2
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 03/10] btrfs: split btrfs_submit_data_bio
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
2022-04-29 14:30 ` [PATCH 01/10] btrfs: move more work into btrfs_end_bioc Christoph Hellwig
2022-04-29 14:30 ` [PATCH 02/10] btrfs: cleanup btrfs_submit_dio_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
2022-04-29 14:30 ` [PATCH 04/10] btrfs: don't double-defer bio completions for compressed reads Christoph Hellwig
` (6 subsequent siblings)
9 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs
Split btrfs_submit_data_bio into one helper for reads and one for writes.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/btrfs/ctree.h | 6 +-
fs/btrfs/extent_io.c | 12 ++--
fs/btrfs/inode.c | 130 ++++++++++++++++++++-----------------------
3 files changed, 73 insertions(+), 75 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 371f3f6c4ea47..40a6f61559348 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3252,8 +3252,10 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f9d6dd310c42b..80b4482c477c6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -186,11 +186,15 @@ static void submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_fl
/* Caller should ensure the bio has at least some range added */
ASSERT(bio->bi_iter.bi_size);
- if (is_data_inode(tree->private_data))
- btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+ if (!is_data_inode(tree->private_data))
+ btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num);
+ else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_submit_data_write_bio(tree->private_data, bio, mirror_num,
bio_flags);
else
- btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num);
+ btrfs_submit_data_read_bio(tree->private_data, bio, mirror_num,
+ bio_flags);
+
/*
* Above submission hooks will handle the error by ending the bio,
* which will do the cleanup properly. So here we should not return
@@ -2773,7 +2777,7 @@ static blk_status_t submit_data_read_repair(struct inode *inode,
ret = btrfs_repair_one_sector(inode, failed_bio,
bio_offset + offset,
page, pgoff + offset, start + offset,
- failed_mirror, btrfs_submit_data_bio);
+ failed_mirror, btrfs_submit_data_read_bio);
if (!ret) {
/*
* We have submitted the read repair, the page release
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3be52ed6ed196..6476a5d9c09e9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2552,90 +2552,82 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
return errno_to_blk_status(ret);
}
-/*
- * extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read.
- *
- * Rules about async/sync submit,
- * a) read: sync submit
- *
- * b) write without checksum: sync submit
- *
- * c) write with checksum:
- * c-1) if bio is issued by fsync: sync submit
- * (sync_writers != 0)
- *
- * c-2) if root is reloc root: sync submit
- * (only in case of buffered IO)
- *
- * c-3) otherwise: async submit
- */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
- blk_status_t ret = 0;
- int skip_sum;
- int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
- skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
-
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
- metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ blk_status_t ret;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct page *page = bio_first_bvec_all(bio)->bv_page;
- loff_t file_offset = page_offset(page);
-
- ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
+ ret = extract_ordered_extent(bi, bio,
+ page_offset(bio_first_bvec_all(bio)->bv_page));
if (ret)
goto out;
}
- if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
- ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
- if (ret)
- goto out;
-
- if (bio_flags & EXTENT_BIO_COMPRESSED) {
- /*
- * btrfs_submit_compressed_read will handle completing
- * the bio if there were any errors, so just return
- * here.
- */
- btrfs_submit_compressed_read(inode, bio, mirror_num);
- return;
- } else {
- /*
- * Lookup bio sums does extra checks around whether we
- * need to csum or not, which is why we ignore skip_sum
- * here.
- */
- ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+ /*
+ * Rules for async/sync submit:
+ * a) write without checksum: sync submit
+ * b) write with checksum:
+ * b-1) if bio is issued by fsync: sync submit
+ * (sync_writers != 0)
+ * b-2) if root is reloc root: sync submit
+ * (only in case of buffered IO)
+ * b-3) otherwise: async submit
+ */
+ if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
+ if (atomic_read(&bi->sync_writers)) {
+ ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
if (ret)
goto out;
+ } else if (btrfs_is_data_reloc_root(bi->root)) {
+ ; /* csum items have already been cloned */
+ } else {
+ ret = btrfs_wq_submit_bio(inode, bio,
+ mirror_num, bio_flags, 0,
+ btrfs_submit_bio_start);
+ goto out;
}
- goto mapit;
- } else if (async && !skip_sum) {
- /* csum items have already been cloned */
- if (btrfs_is_data_reloc_root(root))
- goto mapit;
- /* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
- 0, btrfs_submit_bio_start);
+ }
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
+out:
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ }
+}
+
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ blk_status_t ret;
+
+ ret = btrfs_bio_wq_end_io(fs_info, bio,
+ btrfs_is_free_space_inode(BTRFS_I(inode)) ?
+ BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
+ if (ret)
goto out;
- } else if (!skip_sum) {
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
- if (ret)
- goto out;
+
+ if (bio_flags & EXTENT_BIO_COMPRESSED) {
+ /*
+ * btrfs_submit_compressed_read will handle completing the bio
+ * if there were any errors, so just return here.
+ */
+ btrfs_submit_compressed_read(inode, bio, mirror_num);
+ return;
}
-mapit:
+ /*
+ * Lookup bio sums does extra checks around whether we need to csum or
+ * not, which is why we ignore skip_sum here.
+ */
+ ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+ if (ret)
+ goto out;
ret = btrfs_map_bio(fs_info, bio, mirror_num);
-
out:
if (ret) {
bio->bi_status = ret;
@@ -7909,7 +7901,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
goto map;
if (write) {
- /* check btrfs_submit_data_bio() for async submit rules */
+ /* check btrfs_submit_data_write_bio() for async submit rules */
if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
return btrfs_wq_submit_bio(inode, bio, 0, 0,
file_offset,
--
2.30.2
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 04/10] btrfs: don't double-defer bio completions for compressed reads
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
` (2 preceding siblings ...)
2022-04-29 14:30 ` [PATCH 03/10] btrfs: split btrfs_submit_data_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
2022-04-29 14:30 ` [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio Christoph Hellwig
` (5 subsequent siblings)
9 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs
The bio completion handler of the bio used for the compressed data is
already run in a workqueue using btrfs_bio_wq_end_io, so don't schedule
the completion of the original bio to the same workqueue again but just
execute it directly.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/btrfs/inode.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6476a5d9c09e9..02c4cbd557dcd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2605,12 +2605,6 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
blk_status_t ret;
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- btrfs_is_free_space_inode(BTRFS_I(inode)) ?
- BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
- if (ret)
- goto out;
-
if (bio_flags & EXTENT_BIO_COMPRESSED) {
/*
* btrfs_submit_compressed_read will handle completing the bio
@@ -2620,6 +2614,12 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
return;
}
+ ret = btrfs_bio_wq_end_io(fs_info, bio,
+ btrfs_is_free_space_inode(BTRFS_I(inode)) ?
+ BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
+ if (ret)
+ goto out;
+
/*
* Lookup bio sums does extra checks around whether we need to csum or
* not, which is why we ignore skip_sum here.
--
2.30.2
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
` (3 preceding siblings ...)
2022-04-29 14:30 ` [PATCH 04/10] btrfs: don't double-defer bio completions for compressed reads Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
2022-05-01 4:40 ` Qu Wenruo
2022-04-29 14:30 ` [PATCH 06/10] btrfs: don't use btrfs_bio_wq_end_io for compressed writes Christoph Hellwig
` (4 subsequent siblings)
9 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs
Instead of attaching an extra allocation and an indirect call to each
low-level bio issued by the RAID code, add a work_struct to struct
btrfs_raid_bio and only defer the per-rbio completion action. The
per-bio action for all the I/Os are trivial and can be safely done
from interrupt context.
As a nice side effect this also allows sharing the boilerplate code
for the per-bio completions.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/btrfs/ctree.h | 2 +-
fs/btrfs/disk-io.c | 12 ++---
fs/btrfs/disk-io.h | 1 -
fs/btrfs/raid56.c | 111 ++++++++++++++++++---------------------------
4 files changed, 49 insertions(+), 77 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 40a6f61559348..4dd0d4a2e7757 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -853,7 +853,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *flush_workers;
struct btrfs_workqueue *endio_workers;
struct btrfs_workqueue *endio_meta_workers;
- struct btrfs_workqueue *endio_raid56_workers;
+ struct workqueue_struct *endio_raid56_workers;
struct workqueue_struct *rmw_workers;
struct btrfs_workqueue *endio_meta_write_workers;
struct btrfs_workqueue *endio_write_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 73e12ecc81be1..3c6137734d28c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -753,14 +753,10 @@ static void end_workqueue_bio(struct bio *bio)
wq = fs_info->endio_meta_write_workers;
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
wq = fs_info->endio_freespace_worker;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
else
wq = fs_info->endio_write_workers;
} else {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
- else if (end_io_wq->metadata)
+ if (end_io_wq->metadata)
wq = fs_info->endio_meta_workers;
else
wq = fs_info->endio_workers;
@@ -2274,7 +2270,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
btrfs_destroy_workqueue(fs_info->endio_workers);
- btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ if (fs_info->endio_raid56_workers)
+ destroy_workqueue(fs_info->endio_raid56_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2477,8 +2474,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
max_active, 2);
fs_info->endio_raid56_workers =
- btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
- max_active, 4);
+ alloc_workqueue("btrfs-endio-raid56", flags, max_active);
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 9340e3266e0ac..97255e3d7e524 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -21,7 +21,6 @@ enum btrfs_wq_endio_type {
BTRFS_WQ_ENDIO_DATA,
BTRFS_WQ_ENDIO_METADATA,
BTRFS_WQ_ENDIO_FREE_SPACE,
- BTRFS_WQ_ENDIO_RAID56,
};
static inline u64 btrfs_sb_offset(int mirror)
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a5b623ee6facd..1a3c1a9b10d0b 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -164,6 +164,9 @@ struct btrfs_raid_bio {
atomic_t stripes_pending;
atomic_t error;
+
+ struct work_struct end_io_work;
+
/*
* these are two arrays of pointers. We allocate the
* rbio big enough to hold them both and setup their
@@ -1552,15 +1555,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
}
}
-/*
- * end io for the read phase of the rmw cycle. All the bios here are physical
- * stripe bios we've read from the disk so we can recalculate the parity of the
- * stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid_rmw_end_io(struct bio *bio)
+static void raid56_bio_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
@@ -1571,23 +1566,34 @@ static void raid_rmw_end_io(struct bio *bio)
bio_put(bio);
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ queue_work(rbio->bioc->fs_info->endio_raid56_workers,
+ &rbio->end_io_work);
+}
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- goto cleanup;
+/*
+ * End io handler for the read phase of the rmw cycle. All the bios here are
+ * physical stripe bios we've read from the disk so we can recalculate the
+ * parity of the stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_rmw_end_io_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
+
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+ return;
+ }
/*
- * this will normally call finish_rmw to start our write
- * but if there are any failed stripes we'll reconstruct
- * from parity first
+ * This will normally call finish_rmw to start our write but if there
+ * are any failed stripes we'll reconstruct from parity first.
*/
validate_rbio_for_rmw(rbio);
- return;
-
-cleanup:
-
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
/*
@@ -1662,11 +1668,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_rmw_end_io;
-
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
-
+ bio->bi_end_io = raid56_bio_end_io;
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
@@ -2108,25 +2112,13 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/*
- * This is called only for stripes we've read from disk to
- * reconstruct the parity.
+ * This is called only for stripes we've read from disk to reconstruct the
+ * parity.
*/
-static void raid_recover_end_io(struct bio *bio)
+static void raid_recover_end_io_work(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- /*
- * we only read stripe pages off the disk, set them
- * up to date if there were no errors
- */
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(rbio, bio);
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
@@ -2209,11 +2201,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_recover_end_io;
-
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
-
+ bio->bi_end_io = raid56_bio_end_io;
submit_bio(bio);
}
@@ -2582,8 +2572,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
atomic_set(&rbio->stripes_pending, nr_data);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_write_end_io;
-
+ bio->bi_end_io = raid56_bio_end_io;
submit_bio(bio);
}
return;
@@ -2671,24 +2660,14 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
-static void raid56_parity_scrub_end_io(struct bio *bio)
+static void raid56_parity_scrub_end_io_work(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(rbio, bio);
-
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
/*
- * this will normally call finish_rmw to start our write
- * but if there are any failed stripes we'll reconstruct
- * from parity first
+ * This will normally call finish_rmw to start our write, but if there
+ * are any failed stripes we'll reconstruct from parity first
*/
validate_rbio_for_parity_scrub(rbio);
}
@@ -2758,11 +2737,9 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_parity_scrub_end_io;
-
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
-
+ bio->bi_end_io = raid56_bio_end_io;
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
--
2.30.2
^ permalink raw reply related [flat|nested] 30+ messages in thread
* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
2022-04-29 14:30 ` [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio Christoph Hellwig
@ 2022-05-01 4:40 ` Qu Wenruo
2022-05-01 4:53 ` Qu Wenruo
0 siblings, 1 reply; 30+ messages in thread
From: Qu Wenruo @ 2022-05-01 4:40 UTC (permalink / raw)
To: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs
On 2022/4/29 22:30, Christoph Hellwig wrote:
> Instead of attaching a an extra allocation an indirect call to each
> low-level bio issued by the RAID code, add a work_struct to struct
> btrfs_raid_bio and only defer the per-rbio completion action. The
> per-bio action for all the I/Os are trivial and can be safely done
> from interrupt context.
>
> As a nice side effect this also allows sharing the boilerplate code
> for the per-bio completions
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
It looks like this patch is causing test failure in btrfs/027, at least
for subpage (64K page size, 4K sectorsize) cases.
Reproducibility is 100% (4/4 tried).
The hanging sub-test case is the replacing of a missing device in raid5.
The involved dmesg (including the hanging thread dump) is:
[ 276.672541] BTRFS warning (device dm-1): read-write for sector size
4096 with page size 65536 is experimental
[ 276.744316] BTRFS info (device dm-1): checking UUID tree
[ 277.387701] BTRFS info (device dm-1): allowing degraded mounts
[ 277.390314] BTRFS info (device dm-1): using free space tree
[ 277.392108] BTRFS info (device dm-1): has skinny extents
[ 277.393890] BTRFS warning (device dm-1): read-write for sector size
4096 with page size 65536 is experimental
[ 277.420922] BTRFS warning (device dm-1): devid 2 uuid
4b67464d-e851-4a88-8765-67b043d4680f is missing
[ 277.432694] BTRFS warning (device dm-1): devid 2 uuid
4b67464d-e851-4a88-8765-67b043d4680f is missing
[ 277.648326] BTRFS info (device dm-1): dev_replace from <missing disk>
(devid 2) to /dev/mapper/test-scratch5 started
[ 297.264371] task:btrfs state:D stack: 0 pid: 7158 ppid:
6493 flags:0x0000000c
[ 297.280744] Call trace:
[ 297.282351] __switch_to+0xfc/0x160
[ 297.284525] __schedule+0x260/0x61c
[ 297.286959] schedule+0x54/0xc4
[ 297.288980] scrub_enumerate_chunks+0x610/0x760 [btrfs]
[ 297.292504] btrfs_scrub_dev+0x1a0/0x530 [btrfs]
[ 297.306738] btrfs_dev_replace_start+0x2a4/0x2d0 [btrfs]
[ 297.310418] btrfs_dev_replace_by_ioctl+0x48/0x84 [btrfs]
[ 297.314026] btrfs_ioctl_dev_replace+0x1b8/0x210 [btrfs]
[ 297.328014] btrfs_ioctl+0xa48/0x1a70 [btrfs]
[ 297.330705] __arm64_sys_ioctl+0xb4/0x100
[ 297.333037] invoke_syscall+0x50/0x120
[ 297.343237] el0_svc_common.constprop.0+0x4c/0x100
[ 297.345716] do_el0_svc+0x34/0xa0
[ 297.347242] el0_svc+0x34/0xb0
[ 297.348763] el0t_64_sync_handler+0xa8/0x130
[ 297.350870] el0t_64_sync+0x18c/0x190
Mind to take a look on that hang?
Thanks,
Qu
> ---
> fs/btrfs/ctree.h | 2 +-
> fs/btrfs/disk-io.c | 12 ++---
> fs/btrfs/disk-io.h | 1 -
> fs/btrfs/raid56.c | 111 ++++++++++++++++++---------------------------
> 4 files changed, 49 insertions(+), 77 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 40a6f61559348..4dd0d4a2e7757 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -853,7 +853,7 @@ struct btrfs_fs_info {
> struct btrfs_workqueue *flush_workers;
> struct btrfs_workqueue *endio_workers;
> struct btrfs_workqueue *endio_meta_workers;
> - struct btrfs_workqueue *endio_raid56_workers;
> + struct workqueue_struct *endio_raid56_workers;
> struct workqueue_struct *rmw_workers;
> struct btrfs_workqueue *endio_meta_write_workers;
> struct btrfs_workqueue *endio_write_workers;
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 73e12ecc81be1..3c6137734d28c 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -753,14 +753,10 @@ static void end_workqueue_bio(struct bio *bio)
> wq = fs_info->endio_meta_write_workers;
> else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
> wq = fs_info->endio_freespace_worker;
> - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
> - wq = fs_info->endio_raid56_workers;
> else
> wq = fs_info->endio_write_workers;
> } else {
> - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
> - wq = fs_info->endio_raid56_workers;
> - else if (end_io_wq->metadata)
> + if (end_io_wq->metadata)
> wq = fs_info->endio_meta_workers;
> else
> wq = fs_info->endio_workers;
> @@ -2274,7 +2270,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
> btrfs_destroy_workqueue(fs_info->hipri_workers);
> btrfs_destroy_workqueue(fs_info->workers);
> btrfs_destroy_workqueue(fs_info->endio_workers);
> - btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
> + if (fs_info->endio_raid56_workers)
> + destroy_workqueue(fs_info->endio_raid56_workers);
> if (fs_info->rmw_workers)
> destroy_workqueue(fs_info->rmw_workers);
> btrfs_destroy_workqueue(fs_info->endio_write_workers);
> @@ -2477,8 +2474,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
> btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
> max_active, 2);
> fs_info->endio_raid56_workers =
> - btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
> - max_active, 4);
> + alloc_workqueue("btrfs-endio-raid56", flags, max_active);
> fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
> fs_info->endio_write_workers =
> btrfs_alloc_workqueue(fs_info, "endio-write", flags,
> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
> index 9340e3266e0ac..97255e3d7e524 100644
> --- a/fs/btrfs/disk-io.h
> +++ b/fs/btrfs/disk-io.h
> @@ -21,7 +21,6 @@ enum btrfs_wq_endio_type {
> BTRFS_WQ_ENDIO_DATA,
> BTRFS_WQ_ENDIO_METADATA,
> BTRFS_WQ_ENDIO_FREE_SPACE,
> - BTRFS_WQ_ENDIO_RAID56,
> };
>
> static inline u64 btrfs_sb_offset(int mirror)
> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> index a5b623ee6facd..1a3c1a9b10d0b 100644
> --- a/fs/btrfs/raid56.c
> +++ b/fs/btrfs/raid56.c
> @@ -164,6 +164,9 @@ struct btrfs_raid_bio {
> atomic_t stripes_pending;
>
> atomic_t error;
> +
> + struct work_struct end_io_work;
> +
> /*
> * these are two arrays of pointers. We allocate the
> * rbio big enough to hold them both and setup their
> @@ -1552,15 +1555,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
> }
> }
>
> -/*
> - * end io for the read phase of the rmw cycle. All the bios here are physical
> - * stripe bios we've read from the disk so we can recalculate the parity of the
> - * stripe.
> - *
> - * This will usually kick off finish_rmw once all the bios are read in, but it
> - * may trigger parity reconstruction if we had any errors along the way
> - */
> -static void raid_rmw_end_io(struct bio *bio)
> +static void raid56_bio_end_io(struct bio *bio)
> {
> struct btrfs_raid_bio *rbio = bio->bi_private;
>
> @@ -1571,23 +1566,34 @@ static void raid_rmw_end_io(struct bio *bio)
>
> bio_put(bio);
>
> - if (!atomic_dec_and_test(&rbio->stripes_pending))
> - return;
> + if (atomic_dec_and_test(&rbio->stripes_pending))
> + queue_work(rbio->bioc->fs_info->endio_raid56_workers,
> + &rbio->end_io_work);
> +}
>
> - if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
> - goto cleanup;
> +/*
> + * End io handler for the read phase of the rmw cycle. All the bios here are
> + * physical stripe bios we've read from the disk so we can recalculate the
> + * parity of the stripe.
> + *
> + * This will usually kick off finish_rmw once all the bios are read in, but it
> + * may trigger parity reconstruction if we had any errors along the way
> + */
> +static void raid56_rmw_end_io_work(struct work_struct *work)
> +{
> + struct btrfs_raid_bio *rbio =
> + container_of(work, struct btrfs_raid_bio, end_io_work);
> +
> + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
> + rbio_orig_end_io(rbio, BLK_STS_IOERR);
> + return;
> + }
>
> /*
> - * this will normally call finish_rmw to start our write
> - * but if there are any failed stripes we'll reconstruct
> - * from parity first
> + * This will normally call finish_rmw to start our write but if there
> + * are any failed stripes we'll reconstruct from parity first.
> */
> validate_rbio_for_rmw(rbio);
> - return;
> -
> -cleanup:
> -
> - rbio_orig_end_io(rbio, BLK_STS_IOERR);
> }
>
> /*
> @@ -1662,11 +1668,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
> * touch it after that.
> */
> atomic_set(&rbio->stripes_pending, bios_to_read);
> + INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
> while ((bio = bio_list_pop(&bio_list))) {
> - bio->bi_end_io = raid_rmw_end_io;
> -
> - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
> -
> + bio->bi_end_io = raid56_bio_end_io;
> submit_bio(bio);
> }
> /* the actual write will happen once the reads are done */
> @@ -2108,25 +2112,13 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
> }
>
> /*
> - * This is called only for stripes we've read from disk to
> - * reconstruct the parity.
> + * This is called only for stripes we've read from disk to reconstruct the
> + * parity.
> */
> -static void raid_recover_end_io(struct bio *bio)
> +static void raid_recover_end_io_work(struct work_struct *work)
> {
> - struct btrfs_raid_bio *rbio = bio->bi_private;
> -
> - /*
> - * we only read stripe pages off the disk, set them
> - * up to date if there were no errors
> - */
> - if (bio->bi_status)
> - fail_bio_stripe(rbio, bio);
> - else
> - set_bio_pages_uptodate(rbio, bio);
> - bio_put(bio);
> -
> - if (!atomic_dec_and_test(&rbio->stripes_pending))
> - return;
> + struct btrfs_raid_bio *rbio =
> + container_of(work, struct btrfs_raid_bio, end_io_work);
>
> if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
> rbio_orig_end_io(rbio, BLK_STS_IOERR);
> @@ -2209,11 +2201,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
> * touch it after that.
> */
> atomic_set(&rbio->stripes_pending, bios_to_read);
> + INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
> while ((bio = bio_list_pop(&bio_list))) {
> - bio->bi_end_io = raid_recover_end_io;
> -
> - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
> -
> + bio->bi_end_io = raid56_bio_end_io;
> submit_bio(bio);
> }
>
> @@ -2582,8 +2572,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
> atomic_set(&rbio->stripes_pending, nr_data);
>
> while ((bio = bio_list_pop(&bio_list))) {
> - bio->bi_end_io = raid_write_end_io;
> -
> + bio->bi_end_io = raid56_bio_end_io;
> submit_bio(bio);
> }
> return;
> @@ -2671,24 +2660,14 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
> * This will usually kick off finish_rmw once all the bios are read in, but it
> * may trigger parity reconstruction if we had any errors along the way
> */
> -static void raid56_parity_scrub_end_io(struct bio *bio)
> +static void raid56_parity_scrub_end_io_work(struct work_struct *work)
> {
> - struct btrfs_raid_bio *rbio = bio->bi_private;
> -
> - if (bio->bi_status)
> - fail_bio_stripe(rbio, bio);
> - else
> - set_bio_pages_uptodate(rbio, bio);
> -
> - bio_put(bio);
> -
> - if (!atomic_dec_and_test(&rbio->stripes_pending))
> - return;
> + struct btrfs_raid_bio *rbio =
> + container_of(work, struct btrfs_raid_bio, end_io_work);
>
> /*
> - * this will normally call finish_rmw to start our write
> - * but if there are any failed stripes we'll reconstruct
> - * from parity first
> + * This will normally call finish_rmw to start our write, but if there
> + * are any failed stripes we'll reconstruct from parity first
> */
> validate_rbio_for_parity_scrub(rbio);
> }
> @@ -2758,11 +2737,9 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
> * touch it after that.
> */
> atomic_set(&rbio->stripes_pending, bios_to_read);
> + INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
> while ((bio = bio_list_pop(&bio_list))) {
> - bio->bi_end_io = raid56_parity_scrub_end_io;
> -
> - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
> -
> + bio->bi_end_io = raid56_bio_end_io;
> submit_bio(bio);
> }
> /* the actual write will happen once the reads are done */
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
2022-05-01 4:40 ` Qu Wenruo
@ 2022-05-01 4:53 ` Qu Wenruo
2022-05-02 16:44 ` Christoph Hellwig
2022-06-03 16:44 ` David Sterba
0 siblings, 2 replies; 30+ messages in thread
From: Qu Wenruo @ 2022-05-01 4:53 UTC (permalink / raw)
To: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs
On 2022/5/1 12:40, Qu Wenruo wrote:
>
>
> On 2022/4/29 22:30, Christoph Hellwig wrote:
>> Instead of attaching a an extra allocation an indirect call to each
>> low-level bio issued by the RAID code, add a work_struct to struct
>> btrfs_raid_bio and only defer the per-rbio completion action. The
>> per-bio action for all the I/Os are trivial and can be safely done
>> from interrupt context.
>>
>> As a nice side effect this also allows sharing the boilerplate code
>> for the per-bio completions
>>
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> It looks like this patch is causing test failure in btrfs/027, at least
> for subapge (64K page size, 4K sectorsize) cases.
Also confirmed the same hang on the same commit on x86_64 (4K page size
4K sectorsize).
Also 100% reproducible.
Thanks,
Qu
>
> Reproducibility is 100% (4/4 tried).
>
> The hanging sub-test case is the repalcing of a missing device in raid5.
>
> The involved dmesg (including the hanging thread dump) is:
>
> [ 276.672541] BTRFS warning (device dm-1): read-write for sector size
> 4096 with page size 65536 is experimental
> [ 276.744316] BTRFS info (device dm-1): checking UUID tree
> [ 277.387701] BTRFS info (device dm-1): allowing degraded mounts
> [ 277.390314] BTRFS info (device dm-1): using free space tree
> [ 277.392108] BTRFS info (device dm-1): has skinny extents
> [ 277.393890] BTRFS warning (device dm-1): read-write for sector size
> 4096 with page size 65536 is experimental
> [ 277.420922] BTRFS warning (device dm-1): devid 2 uuid
> 4b67464d-e851-4a88-8765-67b043d4680f is missing
> [ 277.432694] BTRFS warning (device dm-1): devid 2 uuid
> 4b67464d-e851-4a88-8765-67b043d4680f is missing
> [ 277.648326] BTRFS info (device dm-1): dev_replace from <missing disk>
> (devid 2) to /dev/mapper/test-scratch5 started
> [ 297.264371] task:btrfs state:D stack: 0 pid: 7158 ppid:
> 6493 flags:0x0000000c
> [ 297.280744] Call trace:
> [ 297.282351] __switch_to+0xfc/0x160
> [ 297.284525] __schedule+0x260/0x61c
> [ 297.286959] schedule+0x54/0xc4
> [ 297.288980] scrub_enumerate_chunks+0x610/0x760 [btrfs]
> [ 297.292504] btrfs_scrub_dev+0x1a0/0x530 [btrfs]
> [ 297.306738] btrfs_dev_replace_start+0x2a4/0x2d0 [btrfs]
> [ 297.310418] btrfs_dev_replace_by_ioctl+0x48/0x84 [btrfs]
> [ 297.314026] btrfs_ioctl_dev_replace+0x1b8/0x210 [btrfs]
> [ 297.328014] btrfs_ioctl+0xa48/0x1a70 [btrfs]
> [ 297.330705] __arm64_sys_ioctl+0xb4/0x100
> [ 297.333037] invoke_syscall+0x50/0x120
> [ 297.343237] el0_svc_common.constprop.0+0x4c/0x100
> [ 297.345716] do_el0_svc+0x34/0xa0
> [ 297.347242] el0_svc+0x34/0xb0
> [ 297.348763] el0t_64_sync_handler+0xa8/0x130
> [ 297.350870] el0t_64_sync+0x18c/0x190
>
> Mind to take a look on that hang?
>
> Thanks,
> Qu
>
>> ---
>> fs/btrfs/ctree.h | 2 +-
>> fs/btrfs/disk-io.c | 12 ++---
>> fs/btrfs/disk-io.h | 1 -
>> fs/btrfs/raid56.c | 111 ++++++++++++++++++---------------------------
>> 4 files changed, 49 insertions(+), 77 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 40a6f61559348..4dd0d4a2e7757 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -853,7 +853,7 @@ struct btrfs_fs_info {
>> struct btrfs_workqueue *flush_workers;
>> struct btrfs_workqueue *endio_workers;
>> struct btrfs_workqueue *endio_meta_workers;
>> - struct btrfs_workqueue *endio_raid56_workers;
>> + struct workqueue_struct *endio_raid56_workers;
>> struct workqueue_struct *rmw_workers;
>> struct btrfs_workqueue *endio_meta_write_workers;
>> struct btrfs_workqueue *endio_write_workers;
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 73e12ecc81be1..3c6137734d28c 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -753,14 +753,10 @@ static void end_workqueue_bio(struct bio *bio)
>> wq = fs_info->endio_meta_write_workers;
>> else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
>> wq = fs_info->endio_freespace_worker;
>> - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
>> - wq = fs_info->endio_raid56_workers;
>> else
>> wq = fs_info->endio_write_workers;
>> } else {
>> - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
>> - wq = fs_info->endio_raid56_workers;
>> - else if (end_io_wq->metadata)
>> + if (end_io_wq->metadata)
>> wq = fs_info->endio_meta_workers;
>> else
>> wq = fs_info->endio_workers;
>> @@ -2274,7 +2270,8 @@ static void btrfs_stop_all_workers(struct
>> btrfs_fs_info *fs_info)
>> btrfs_destroy_workqueue(fs_info->hipri_workers);
>> btrfs_destroy_workqueue(fs_info->workers);
>> btrfs_destroy_workqueue(fs_info->endio_workers);
>> - btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
>> + if (fs_info->endio_raid56_workers)
>> + destroy_workqueue(fs_info->endio_raid56_workers);
>> if (fs_info->rmw_workers)
>> destroy_workqueue(fs_info->rmw_workers);
>> btrfs_destroy_workqueue(fs_info->endio_write_workers);
>> @@ -2477,8 +2474,7 @@ static int btrfs_init_workqueues(struct
>> btrfs_fs_info *fs_info)
>> btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
>> max_active, 2);
>> fs_info->endio_raid56_workers =
>> - btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
>> - max_active, 4);
>> + alloc_workqueue("btrfs-endio-raid56", flags, max_active);
>> fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags,
>> max_active);
>> fs_info->endio_write_workers =
>> btrfs_alloc_workqueue(fs_info, "endio-write", flags,
>> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
>> index 9340e3266e0ac..97255e3d7e524 100644
>> --- a/fs/btrfs/disk-io.h
>> +++ b/fs/btrfs/disk-io.h
>> @@ -21,7 +21,6 @@ enum btrfs_wq_endio_type {
>> BTRFS_WQ_ENDIO_DATA,
>> BTRFS_WQ_ENDIO_METADATA,
>> BTRFS_WQ_ENDIO_FREE_SPACE,
>> - BTRFS_WQ_ENDIO_RAID56,
>> };
>>
>> static inline u64 btrfs_sb_offset(int mirror)
>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>> index a5b623ee6facd..1a3c1a9b10d0b 100644
>> --- a/fs/btrfs/raid56.c
>> +++ b/fs/btrfs/raid56.c
>> @@ -164,6 +164,9 @@ struct btrfs_raid_bio {
>> atomic_t stripes_pending;
>>
>> atomic_t error;
>> +
>> + struct work_struct end_io_work;
>> +
>> /*
>> * these are two arrays of pointers. We allocate the
>> * rbio big enough to hold them both and setup their
>> @@ -1552,15 +1555,7 @@ static void set_bio_pages_uptodate(struct
>> btrfs_raid_bio *rbio, struct bio *bio)
>> }
>> }
>>
>> -/*
>> - * end io for the read phase of the rmw cycle. All the bios here are
>> physical
>> - * stripe bios we've read from the disk so we can recalculate the
>> parity of the
>> - * stripe.
>> - *
>> - * This will usually kick off finish_rmw once all the bios are read
>> in, but it
>> - * may trigger parity reconstruction if we had any errors along the way
>> - */
>> -static void raid_rmw_end_io(struct bio *bio)
>> +static void raid56_bio_end_io(struct bio *bio)
>> {
>> struct btrfs_raid_bio *rbio = bio->bi_private;
>>
>> @@ -1571,23 +1566,34 @@ static void raid_rmw_end_io(struct bio *bio)
>>
>> bio_put(bio);
>>
>> - if (!atomic_dec_and_test(&rbio->stripes_pending))
>> - return;
>> + if (atomic_dec_and_test(&rbio->stripes_pending))
>> + queue_work(rbio->bioc->fs_info->endio_raid56_workers,
>> + &rbio->end_io_work);
>> +}
>>
>> - if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
>> - goto cleanup;
>> +/*
>> + * End io handler for the read phase of the rmw cycle. All the bios
>> here are
>> + * physical stripe bios we've read from the disk so we can
>> recalculate the
>> + * parity of the stripe.
>> + *
>> + * This will usually kick off finish_rmw once all the bios are read
>> in, but it
>> + * may trigger parity reconstruction if we had any errors along the way
>> + */
>> +static void raid56_rmw_end_io_work(struct work_struct *work)
>> +{
>> + struct btrfs_raid_bio *rbio =
>> + container_of(work, struct btrfs_raid_bio, end_io_work);
>> +
>> + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
>> + rbio_orig_end_io(rbio, BLK_STS_IOERR);
>> + return;
>> + }
>>
>> /*
>> - * this will normally call finish_rmw to start our write
>> - * but if there are any failed stripes we'll reconstruct
>> - * from parity first
>> + * This will normally call finish_rmw to start our write but if
>> there
>> + * are any failed stripes we'll reconstruct from parity first.
>> */
>> validate_rbio_for_rmw(rbio);
>> - return;
>> -
>> -cleanup:
>> -
>> - rbio_orig_end_io(rbio, BLK_STS_IOERR);
>> }
>>
>> /*
>> @@ -1662,11 +1668,9 @@ static int raid56_rmw_stripe(struct
>> btrfs_raid_bio *rbio)
>> * touch it after that.
>> */
>> atomic_set(&rbio->stripes_pending, bios_to_read);
>> + INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
>> while ((bio = bio_list_pop(&bio_list))) {
>> - bio->bi_end_io = raid_rmw_end_io;
>> -
>> - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio,
>> BTRFS_WQ_ENDIO_RAID56);
>> -
>> + bio->bi_end_io = raid56_bio_end_io;
>> submit_bio(bio);
>> }
>> /* the actual write will happen once the reads are done */
>> @@ -2108,25 +2112,13 @@ static void __raid_recover_end_io(struct
>> btrfs_raid_bio *rbio)
>> }
>>
>> /*
>> - * This is called only for stripes we've read from disk to
>> - * reconstruct the parity.
>> + * This is called only for stripes we've read from disk to
>> reconstruct the
>> + * parity.
>> */
>> -static void raid_recover_end_io(struct bio *bio)
>> +static void raid_recover_end_io_work(struct work_struct *work)
>> {
>> - struct btrfs_raid_bio *rbio = bio->bi_private;
>> -
>> - /*
>> - * we only read stripe pages off the disk, set them
>> - * up to date if there were no errors
>> - */
>> - if (bio->bi_status)
>> - fail_bio_stripe(rbio, bio);
>> - else
>> - set_bio_pages_uptodate(rbio, bio);
>> - bio_put(bio);
>> -
>> - if (!atomic_dec_and_test(&rbio->stripes_pending))
>> - return;
>> + struct btrfs_raid_bio *rbio =
>> + container_of(work, struct btrfs_raid_bio, end_io_work);
>>
>> if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
>> rbio_orig_end_io(rbio, BLK_STS_IOERR);
>> @@ -2209,11 +2201,9 @@ static int __raid56_parity_recover(struct
>> btrfs_raid_bio *rbio)
>> * touch it after that.
>> */
>> atomic_set(&rbio->stripes_pending, bios_to_read);
>> + INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
>> while ((bio = bio_list_pop(&bio_list))) {
>> - bio->bi_end_io = raid_recover_end_io;
>> -
>> - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio,
>> BTRFS_WQ_ENDIO_RAID56);
>> -
>> + bio->bi_end_io = raid56_bio_end_io;
>> submit_bio(bio);
>> }
>>
>> @@ -2582,8 +2572,7 @@ static noinline void finish_parity_scrub(struct
>> btrfs_raid_bio *rbio,
>> atomic_set(&rbio->stripes_pending, nr_data);
>>
>> while ((bio = bio_list_pop(&bio_list))) {
>> - bio->bi_end_io = raid_write_end_io;
>> -
>> + bio->bi_end_io = raid56_bio_end_io;
>> submit_bio(bio);
>> }
>> return;
>> @@ -2671,24 +2660,14 @@ static void
>> validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
>> * This will usually kick off finish_rmw once all the bios are read
>> in, but it
>> * may trigger parity reconstruction if we had any errors along the way
>> */
>> -static void raid56_parity_scrub_end_io(struct bio *bio)
>> +static void raid56_parity_scrub_end_io_work(struct work_struct *work)
>> {
>> - struct btrfs_raid_bio *rbio = bio->bi_private;
>> -
>> - if (bio->bi_status)
>> - fail_bio_stripe(rbio, bio);
>> - else
>> - set_bio_pages_uptodate(rbio, bio);
>> -
>> - bio_put(bio);
>> -
>> - if (!atomic_dec_and_test(&rbio->stripes_pending))
>> - return;
>> + struct btrfs_raid_bio *rbio =
>> + container_of(work, struct btrfs_raid_bio, end_io_work);
>>
>> /*
>> - * this will normally call finish_rmw to start our write
>> - * but if there are any failed stripes we'll reconstruct
>> - * from parity first
>> + * This will normally call finish_rmw to start our write, but if
>> there
>> + * are any failed stripes we'll reconstruct from parity first
>> */
>> validate_rbio_for_parity_scrub(rbio);
>> }
>> @@ -2758,11 +2737,9 @@ static void raid56_parity_scrub_stripe(struct
>> btrfs_raid_bio *rbio)
>> * touch it after that.
>> */
>> atomic_set(&rbio->stripes_pending, bios_to_read);
>> + INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
>> while ((bio = bio_list_pop(&bio_list))) {
>> - bio->bi_end_io = raid56_parity_scrub_end_io;
>> -
>> - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio,
>> BTRFS_WQ_ENDIO_RAID56);
>> -
>> + bio->bi_end_io = raid56_bio_end_io;
>> submit_bio(bio);
>> }
>> /* the actual write will happen once the reads are done */
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
2022-05-01 4:53 ` Qu Wenruo
@ 2022-05-02 16:44 ` Christoph Hellwig
2022-06-03 16:44 ` David Sterba
1 sibling, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2022-05-02 16:44 UTC (permalink / raw)
To: Qu Wenruo
Cc: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo, linux-btrfs
On Sun, May 01, 2022 at 12:53:00PM +0800, Qu Wenruo wrote:
> > It looks like this patch is causing test failure in btrfs/027, at least
> > for subapge (64K page size, 4K sectorsize) cases.
>
> Also confirmed the same hang on the same commit on x86_64 (4K page size
> 4K sectorsize).
>
> Also 100% reproducible.
I'll take a look. It turns out I never actually ran that test as it
doesn't run without at least 5 devices in the scratch pool, and my btrfs
config uses 4.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
2022-05-01 4:53 ` Qu Wenruo
2022-05-02 16:44 ` Christoph Hellwig
@ 2022-06-03 16:44 ` David Sterba
2022-06-03 16:45 ` David Sterba
1 sibling, 1 reply; 30+ messages in thread
From: David Sterba @ 2022-06-03 16:44 UTC (permalink / raw)
To: Qu Wenruo
Cc: Christoph Hellwig, David Sterba, Josef Bacik, Qu Wenruo, linux-btrfs
On Sun, May 01, 2022 at 12:53:00PM +0800, Qu Wenruo wrote:
> On 2022/5/1 12:40, Qu Wenruo wrote:
> > On 2022/4/29 22:30, Christoph Hellwig wrote:
> >> Instead of attaching a an extra allocation an indirect call to each
> >> low-level bio issued by the RAID code, add a work_struct to struct
> >> btrfs_raid_bio and only defer the per-rbio completion action. The
> >> per-bio action for all the I/Os are trivial and can be safely done
> >> from interrupt context.
> >>
> >> As a nice side effect this also allows sharing the boilerplate code
> >> for the per-bio completions
> >>
> >> Signed-off-by: Christoph Hellwig <hch@lst.de>
> >
> > It looks like this patch is causing test failure in btrfs/027, at least
> > for subapge (64K page size, 4K sectorsize) cases.
>
> Also confirmed the same hang on the same commit on x86_64 (4K page size
> 4K sectorsize).
>
> Also 100% reproducible.
The test passes on my VM setup, the whole suite was run on the patchset
at least twice.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio
2022-06-03 16:44 ` David Sterba
@ 2022-06-03 16:45 ` David Sterba
0 siblings, 0 replies; 30+ messages in thread
From: David Sterba @ 2022-06-03 16:45 UTC (permalink / raw)
To: David Sterba
Cc: Qu Wenruo, Christoph Hellwig, David Sterba, Josef Bacik,
Qu Wenruo, linux-btrfs
On Fri, Jun 03, 2022 at 06:44:56PM +0200, David Sterba wrote:
> The test passes on my VM setup, the whole suite was run on the patchset
> at least twice.
Oh never mind, this is some old version.
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH 06/10] btrfs: don't use btrfs_bio_wq_end_io for compressed writes
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
` (4 preceding siblings ...)
2022-04-29 14:30 ` [PATCH 05/10] btrfs: defer I/O completion based on the btrfs_raid_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
2022-04-29 14:30 ` [PATCH 07/10] btrfs: centralize setting REQ_META Christoph Hellwig
` (3 subsequent siblings)
9 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs
Compressed write bio completion is the only user of btrfs_bio_wq_end_io
for writes, and the use of btrfs_bio_wq_end_io is a little suboptimal
here as we only really need user context for the final completion of a
compressed_bio structure, and not every single bio completion.
Add a work_struct to struct compressed_bio instead and use that to call
finish_compressed_bio_write. This allows us to remove all handling of
write bios in the btrfs_bio_wq_end_io infrastructure.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/btrfs/compression.c | 45 +++++++++++++++++++++---------------------
fs/btrfs/compression.h | 7 +++++--
fs/btrfs/ctree.h | 2 +-
fs/btrfs/disk-io.c | 30 +++++++++++-----------------
fs/btrfs/super.c | 2 --
5 files changed, 41 insertions(+), 45 deletions(-)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f4564f32f6d93..9d5986a30a4a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -403,6 +403,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
kfree(cb);
}
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
+{
+ struct compressed_bio *cb =
+ container_of(work, struct compressed_bio, write_end_work);
+
+ finish_compressed_bio_write(cb);
+}
+
/*
* Do the cleanup once all the compressed pages hit the disk. This will clear
* writeback on the file pages and free the compressed pages.
@@ -414,29 +422,16 @@ static void end_compressed_bio_write(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
- if (!dec_and_test_compressed_bio(cb, bio))
- goto out;
-
- btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ if (dec_and_test_compressed_bio(cb, bio)) {
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- finish_compressed_bio_write(cb);
-out:
+ btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ queue_work(fs_info->compressed_write_workers,
+ &cb->write_end_work);
+ }
bio_put(bio);
}
-static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
- struct bio *bio, int mirror_num)
-{
- blk_status_t ret;
-
- ASSERT(bio->bi_iter.bi_size);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- return ret;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- return ret;
-}
-
/*
* Allocate a compressed_bio, which will be used to read/write on-disk
* (aka, compressed) * data.
@@ -533,7 +528,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
cb->writeback = writeback;
- cb->orig_bio = NULL;
+ INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
cb->nr_pages = nr_pages;
if (blkcg_css)
@@ -603,7 +598,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
goto finish_cb;
}
- ret = submit_compressed_bio(fs_info, bio, 0);
+ ASSERT(bio->bi_iter.bi_size);
+ ret = btrfs_map_bio(fs_info, bio, 0);
if (ret)
goto finish_cb;
bio = NULL;
@@ -941,7 +937,12 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
- ret = submit_compressed_bio(fs_info, comp_bio, mirror_num);
+ ASSERT(comp_bio->bi_iter.bi_size);
+ ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
+ BTRFS_WQ_ENDIO_DATA);
+ if (ret)
+ goto finish_cb;
+ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret)
goto finish_cb;
comp_bio = NULL;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 2707404389a5d..4a40725cbf1db 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -61,8 +61,11 @@ struct compressed_bio {
blk_status_t status;
int mirror_num;
- /* for reads, this is the bio we are copying the data into */
- struct bio *orig_bio;
+ union {
+ /* for reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+ struct work_struct write_end_work;
+ };
/*
* the start of a variable length array of checksums only
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4dd0d4a2e7757..5ae482632327f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -855,7 +855,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *endio_meta_workers;
struct workqueue_struct *endio_raid56_workers;
struct workqueue_struct *rmw_workers;
- struct btrfs_workqueue *endio_meta_write_workers;
+ struct workqueue_struct *compressed_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3c6137734d28c..744a74203776e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -748,19 +748,10 @@ static void end_workqueue_bio(struct bio *bio)
fs_info = end_io_wq->info;
end_io_wq->status = bio->bi_status;
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- wq = fs_info->endio_meta_write_workers;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
- } else {
- if (end_io_wq->metadata)
- wq = fs_info->endio_meta_workers;
- else
- wq = fs_info->endio_workers;
- }
+ if (end_io_wq->metadata)
+ wq = fs_info->endio_meta_workers;
+ else
+ wq = fs_info->endio_workers;
btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
btrfs_queue_work(wq, &end_io_wq->work);
@@ -771,6 +762,9 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
{
struct btrfs_end_io_wq *end_io_wq;
+ if (WARN_ON_ONCE(btrfs_op(bio) != BTRFS_MAP_WRITE))
+ return BLK_STS_IOERR;
+
end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
if (!end_io_wq)
return BLK_STS_RESOURCE;
@@ -2274,6 +2268,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
destroy_workqueue(fs_info->endio_raid56_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->compressed_write_workers)
+ destroy_workqueue(fs_info->compressed_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
@@ -2288,7 +2284,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
* queues can do metadata I/O operations.
*/
btrfs_destroy_workqueue(fs_info->endio_meta_workers);
- btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2470,15 +2465,14 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->endio_meta_workers =
btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
max_active, 4);
- fs_info->endio_meta_write_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
- max_active, 2);
fs_info->endio_raid56_workers =
alloc_workqueue("btrfs-endio-raid56", flags, max_active);
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
+ fs_info->compressed_write_workers =
+ alloc_workqueue("btrfs-compressed-write", flags, max_active);
fs_info->endio_freespace_worker =
btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
max_active, 0);
@@ -2493,7 +2487,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
if (!(fs_info->workers && fs_info->hipri_workers &&
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
- fs_info->endio_meta_write_workers &&
+ fs_info->compressed_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->fixup_workers &&
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b1fdc6a26c76e..9c683c466d585 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1908,8 +1908,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
- new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
--
2.30.2
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 07/10] btrfs: centralize setting REQ_META
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
` (5 preceding siblings ...)
2022-04-29 14:30 ` [PATCH 06/10] btrfs: don't use btrfs_bio_wq_end_io for compressed writes Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
2022-04-29 14:30 ` [PATCH 08/10] btrfs: remove btrfs_end_io_wq Christoph Hellwig
` (2 subsequent siblings)
9 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs
Set REQ_META in btrfs_submit_metadata_bio instead of the various callers.
We'll start relying on this flag inside of btrfs in a bit, and this
ensures it is always set correctly.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/disk-io.c | 2 ++
fs/btrfs/extent_io.c | 8 ++++----
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 744a74203776e..b8837f66e0a43 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -914,6 +914,8 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
blk_status_t ret;
+ bio->bi_opf |= REQ_META;
+
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
/*
* called for a read, do the setup so that checksum validation
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 80b4482c477c6..a14ed9b9dc2d0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4589,7 +4589,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
- unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+ unsigned int write_flags = wbc_to_write_flags(wbc);
bool no_dirty_ebs = false;
int ret;
@@ -4634,7 +4634,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
{
u64 disk_bytenr = eb->start;
int i, num_pages;
- unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+ unsigned int write_flags = wbc_to_write_flags(wbc);
int ret = 0;
prepare_eb_write(eb);
@@ -6645,7 +6645,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
- ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
+ ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
page, eb->start, eb->len,
eb->start - page_offset(page),
end_bio_extent_readpage, mirror_num, 0,
@@ -6752,7 +6752,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+ err = submit_extent_page(REQ_OP_READ, NULL,
&bio_ctrl, page, page_offset(page),
PAGE_SIZE, 0, end_bio_extent_readpage,
mirror_num, 0, false);
--
2.30.2
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 08/10] btrfs: remove btrfs_end_io_wq
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
` (6 preceding siblings ...)
2022-04-29 14:30 ` [PATCH 07/10] btrfs: centralize setting REQ_META Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
2022-04-29 14:30 ` [PATCH 09/10] btrfs: refactor btrfs_map_bio Christoph Hellwig
2022-04-29 14:30 ` [PATCH 10/10] btrfs: do not allocate a btrfs_bio for low-level bios Christoph Hellwig
9 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs
All read bios that go through btrfs_map_bio need to be completed in
user context. And read I/Os are the most common and timing-critical
in almost any file system workload.
Embed a work_struct into struct btrfs_bio and use it to complete all
read bios submitted through btrfs_map_bio, using the REQ_META flag to
decide which workqueue they are placed on.
This removes the need for a separate 128 byte allocation (typically
rounded up to 192 bytes by slab) for all reads with a size increase
of 24 bytes for struct btrfs_bio. Future patches will reorganize
struct btrfs_bio to make use of this extra space for writes as well.
(all sizes are based on a typical 64-bit non-debug build)
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/btrfs/compression.c | 4 --
fs/btrfs/ctree.h | 4 +-
fs/btrfs/disk-io.c | 120 +++--------------------------------------
fs/btrfs/disk-io.h | 10 ----
fs/btrfs/inode.c | 24 +--------
fs/btrfs/super.c | 11 +---
fs/btrfs/volumes.c | 33 ++++++++++--
fs/btrfs/volumes.h | 3 ++
8 files changed, 42 insertions(+), 167 deletions(-)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9d5986a30a4a2..c73ee8ae070d7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -938,10 +938,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
sums += fs_info->csum_size * nr_sectors;
ASSERT(comp_bio->bi_iter.bi_size);
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
- BTRFS_WQ_ENDIO_DATA);
- if (ret)
- goto finish_cb;
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret)
goto finish_cb;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5ae482632327f..c2c8027067c48 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -851,8 +851,8 @@ struct btrfs_fs_info {
struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
- struct btrfs_workqueue *endio_workers;
- struct btrfs_workqueue *endio_meta_workers;
+ struct workqueue_struct *endio_workers;
+ struct workqueue_struct *endio_meta_workers;
struct workqueue_struct *endio_raid56_workers;
struct workqueue_struct *rmw_workers;
struct workqueue_struct *compressed_write_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b8837f66e0a43..b55761e21bde3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,7 +50,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
@@ -63,40 +62,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
-/*
- * btrfs_end_io_wq structs are used to do processing in task context when an IO
- * is complete. This is used during reads to verify checksums, and it is used
- * by writes to insert metadata for new file extents after IO is complete.
- */
-struct btrfs_end_io_wq {
- struct bio *bio;
- bio_end_io_t *end_io;
- void *private;
- struct btrfs_fs_info *info;
- blk_status_t status;
- enum btrfs_wq_endio_type metadata;
- struct btrfs_work work;
-};
-
-static struct kmem_cache *btrfs_end_io_wq_cache;
-
-int __init btrfs_end_io_wq_init(void)
-{
- btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
- sizeof(struct btrfs_end_io_wq),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_end_io_wq_cache)
- return -ENOMEM;
- return 0;
-}
-
-void __cold btrfs_end_io_wq_exit(void)
-{
- kmem_cache_destroy(btrfs_end_io_wq_cache);
-}
-
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
if (fs_info->csum_shash)
@@ -739,48 +704,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
return ret;
}
-static void end_workqueue_bio(struct bio *bio)
-{
- struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
- struct btrfs_fs_info *fs_info;
- struct btrfs_workqueue *wq;
-
- fs_info = end_io_wq->info;
- end_io_wq->status = bio->bi_status;
-
- if (end_io_wq->metadata)
- wq = fs_info->endio_meta_workers;
- else
- wq = fs_info->endio_workers;
-
- btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
- btrfs_queue_work(wq, &end_io_wq->work);
-}
-
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata)
-{
- struct btrfs_end_io_wq *end_io_wq;
-
- if (WARN_ON_ONCE(btrfs_op(bio) != BTRFS_MAP_WRITE))
- return BLK_STS_IOERR;
-
- end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
- if (!end_io_wq)
- return BLK_STS_RESOURCE;
-
- end_io_wq->private = bio->bi_private;
- end_io_wq->end_io = bio->bi_end_io;
- end_io_wq->info = info;
- end_io_wq->status = 0;
- end_io_wq->bio = bio;
- end_io_wq->metadata = metadata;
-
- bio->bi_private = end_io_wq;
- bio->bi_end_io = end_workqueue_bio;
- return 0;
-}
-
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
@@ -917,14 +840,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
bio->bi_opf |= REQ_META;
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
- /*
- * called for a read, do the setup so that checksum validation
- * can happen in the async kernel threads
- */
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_METADATA);
- if (!ret)
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else if (!should_async_write(fs_info, BTRFS_I(inode))) {
ret = btree_csum_one_bio(bio);
if (!ret)
@@ -1940,25 +1856,6 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
return root;
}
-/*
- * called by the kthread helper functions to finally call the bio end_io
- * functions. This is where read checksum verification actually happens
- */
-static void end_workqueue_fn(struct btrfs_work *work)
-{
- struct bio *bio;
- struct btrfs_end_io_wq *end_io_wq;
-
- end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
- bio = end_io_wq->bio;
-
- bio->bi_status = end_io_wq->status;
- bio->bi_private = end_io_wq->private;
- bio->bi_end_io = end_io_wq->end_io;
- bio_endio(bio);
- kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-}
-
static int cleaner_kthread(void *arg)
{
struct btrfs_fs_info *fs_info = arg;
@@ -2265,7 +2162,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->delalloc_workers);
btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
- btrfs_destroy_workqueue(fs_info->endio_workers);
+ if (fs_info->endio_workers)
+ destroy_workqueue(fs_info->endio_workers);
if (fs_info->endio_raid56_workers)
destroy_workqueue(fs_info->endio_raid56_workers);
if (fs_info->rmw_workers)
@@ -2285,7 +2183,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
* the queues used for metadata I/O, since tasks from those other work
* queues can do metadata I/O operations.
*/
- btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+ if (fs_info->endio_meta_workers)
+ destroy_workqueue(fs_info->endio_meta_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2458,15 +2357,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->fixup_workers =
btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
- /*
- * endios are largely parallel and should have a very
- * low idle thresh
- */
fs_info->endio_workers =
- btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
+ alloc_workqueue("btrfs-endio", flags, max_active);
fs_info->endio_meta_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
- max_active, 4);
+ alloc_workqueue("btrfs-endio-meta", flags, max_active);
fs_info->endio_raid56_workers =
alloc_workqueue("btrfs-endio-raid56", flags, max_active);
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 97255e3d7e524..424dbfc5fd784 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -17,12 +17,6 @@
*/
#define BTRFS_BDEV_BLOCKSIZE (4096)
-enum btrfs_wq_endio_type {
- BTRFS_WQ_ENDIO_DATA,
- BTRFS_WQ_ENDIO_METADATA,
- BTRFS_WQ_ENDIO_FREE_SPACE,
-};
-
static inline u64 btrfs_sb_offset(int mirror)
{
u64 start = SZ_16K;
@@ -120,8 +114,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
int level, struct btrfs_key *first_key);
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata);
blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 dio_file_offset,
@@ -145,8 +137,6 @@ int btree_lock_page_hook(struct page *page, void *data,
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
int btrfs_init_root_free_objectid(struct btrfs_root *root);
-int __init btrfs_end_io_wq_init(void);
-void __cold btrfs_end_io_wq_exit(void);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_set_buffer_lockdep_class(u64 objectid,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 02c4cbd557dcd..7155f3c7d9f42 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2614,12 +2614,6 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
return;
}
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- btrfs_is_free_space_inode(BTRFS_I(inode)) ?
- BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA);
- if (ret)
- goto out;
-
/*
* Lookup bio sums does extra checks around whether we need to csum or
* not, which is why we ignore skip_sum here.
@@ -7785,9 +7779,6 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
- if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA))
- return;
-
refcount_inc(&dip->refs);
if (btrfs_map_bio(fs_info, bio, mirror_num))
refcount_dec(&dip->refs);
@@ -7888,19 +7879,12 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
- bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
blk_status_t ret;
- if (!write) {
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- return ret;
- }
-
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
goto map;
- if (write) {
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
/* check btrfs_submit_data_write_bio() for async submit rules */
if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers))
return btrfs_wq_submit_bio(inode, bio, 0, 0,
@@ -10250,12 +10234,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
return ret;
}
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret) {
- btrfs_bio_free_csum(bbio);
- return ret;
- }
-
atomic_inc(&priv->pending);
ret = btrfs_map_bio(fs_info, bio, mirror_num);
if (ret) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9c683c466d585..64eb8aeed156f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1906,8 +1906,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
@@ -2668,13 +2666,9 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_delayed_ref;
- err = btrfs_end_io_wq_init();
- if (err)
- goto free_prelim_ref;
-
err = btrfs_interface_init();
if (err)
- goto free_end_io_wq;
+ goto free_prelim_ref;
btrfs_print_mod_info();
@@ -2690,8 +2684,6 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
-free_end_io_wq:
- btrfs_end_io_wq_exit();
free_prelim_ref:
btrfs_prelim_ref_exit();
free_delayed_ref:
@@ -2729,7 +2721,6 @@ static void __exit exit_btrfs_fs(void)
extent_state_cache_exit();
extent_io_exit();
btrfs_interface_exit();
- btrfs_end_io_wq_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index aeacb87457687..5f18e9105fe08 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6621,11 +6621,27 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
}
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc)
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc)
+{
+ if (bioc->orig_bio->bi_opf & REQ_META)
+ return bioc->fs_info->endio_meta_workers;
+ return bioc->fs_info->endio_workers;
+}
+
+static void btrfs_end_bio_work(struct work_struct *work)
+{
+ struct btrfs_bio *bbio =
+ container_of(work, struct btrfs_bio, end_io_work);
+
+ bio_endio(&bbio->bio);
+}
+
+static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
{
struct bio *orig_bio = bioc->orig_bio;
+ struct btrfs_bio *bbio = btrfs_bio(orig_bio);
- btrfs_bio(orig_bio)->mirror_num = bioc->mirror_num;
+ bbio->mirror_num = bioc->mirror_num;
orig_bio->bi_private = bioc->private;
orig_bio->bi_end_io = bioc->end_io;
@@ -6637,7 +6653,14 @@ static inline void btrfs_end_bioc(struct btrfs_io_context *bioc)
orig_bio->bi_status = BLK_STS_IOERR;
else
orig_bio->bi_status = BLK_STS_OK;
- bio_endio(orig_bio);
+
+ if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) {
+ INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
+ queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
+ } else {
+ bio_endio(orig_bio);
+ }
+
btrfs_put_bioc(bioc);
}
@@ -6669,7 +6692,7 @@ static void btrfs_end_bio(struct bio *bio)
btrfs_bio_counter_dec(bioc->fs_info);
if (atomic_dec_and_test(&bioc->stripes_pending))
- btrfs_end_bioc(bioc);
+ btrfs_end_bioc(bioc, true);
}
static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
@@ -6767,7 +6790,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
atomic_inc(&bioc->error);
if (atomic_dec_and_test(&bioc->stripes_pending))
- btrfs_end_bioc(bioc);
+ btrfs_end_bioc(bioc, false);
continue;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 12b2af9260e92..28e28b7c48649 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -371,6 +371,9 @@ struct btrfs_bio {
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
struct bvec_iter iter;
+ /* for read end I/O handling */
+ struct work_struct end_io_work;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
--
2.30.2
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 09/10] btrfs: refactor btrfs_map_bio
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
` (7 preceding siblings ...)
2022-04-29 14:30 ` [PATCH 08/10] btrfs: remove btrfs_end_io_wq Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
2022-04-29 14:30 ` [PATCH 10/10] btrfs: do not allocate a btrfs_bio for low-level bios Christoph Hellwig
9 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs
Move all per-stripe handling into submit_stripe_bio and use a label to
cleanup instead of duplicating the logic.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/btrfs/volumes.c | 77 +++++++++++++++++++++-------------------------
1 file changed, 35 insertions(+), 42 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5f18e9105fe08..28ac2bfb5d296 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6695,10 +6695,30 @@ static void btrfs_end_bio(struct bio *bio)
btrfs_end_bioc(bioc, true);
}
-static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
- u64 physical, struct btrfs_device *dev)
+static void submit_stripe_bio(struct btrfs_io_context *bioc,
+ struct bio *orig_bio, int dev_nr, bool clone)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
+ struct btrfs_device *dev = bioc->stripes[dev_nr].dev;
+ u64 physical = bioc->stripes[dev_nr].physical;
+ struct bio *bio;
+
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+ (btrfs_op(orig_bio) == BTRFS_MAP_WRITE &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+ atomic_inc(&bioc->error);
+ if (atomic_dec_and_test(&bioc->stripes_pending))
+ btrfs_end_bioc(bioc, false);
+ return;
+ }
+
+ if (clone) {
+ bio = btrfs_bio_clone(dev->bdev, orig_bio);
+ } else {
+ bio = orig_bio;
+ bio_set_dev(bio, dev->bdev);
+ }
bio->bi_private = bioc;
btrfs_bio(bio)->device = dev;
@@ -6733,32 +6753,25 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num)
{
- struct btrfs_device *dev;
- struct bio *first_bio = bio;
u64 logical = bio->bi_iter.bi_sector << 9;
- u64 length = 0;
- u64 map_length;
+ u64 length = bio->bi_iter.bi_size;
+ u64 map_length = length;
int ret;
int dev_nr;
int total_devs;
struct btrfs_io_context *bioc = NULL;
- length = bio->bi_iter.bi_size;
- map_length = length;
-
btrfs_bio_counter_inc_blocked(fs_info);
ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
&map_length, &bioc, mirror_num, 1);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
- }
+ if (ret)
+ goto out_dec;
total_devs = bioc->num_stripes;
- bioc->orig_bio = first_bio;
- bioc->private = first_bio->bi_private;
- bioc->end_io = first_bio->bi_end_io;
- atomic_set(&bioc->stripes_pending, bioc->num_stripes);
+ bioc->orig_bio = bio;
+ bioc->private = bio->bi_private;
+ bioc->end_io = bio->bi_end_io;
+ atomic_set(&bioc->stripes_pending, total_devs);
if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
@@ -6770,9 +6783,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
ret = raid56_parity_recover(bio, bioc, map_length,
mirror_num, 1);
}
-
- btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
+ goto out_dec;
}
if (map_length < length) {
@@ -6782,29 +6793,11 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
BUG();
}
- for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- dev = bioc->stripes[dev_nr].dev;
- if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
- &dev->dev_state) ||
- (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- atomic_inc(&bioc->error);
- if (atomic_dec_and_test(&bioc->stripes_pending))
- btrfs_end_bioc(bioc, false);
- continue;
- }
-
- if (dev_nr < total_devs - 1) {
- bio = btrfs_bio_clone(dev->bdev, first_bio);
- } else {
- bio = first_bio;
- bio_set_dev(bio, dev->bdev);
- }
-
- submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
- }
+ for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+ submit_stripe_bio(bioc, bio, dev_nr, dev_nr < total_devs - 1);
+out_dec:
btrfs_bio_counter_dec(fs_info);
- return BLK_STS_OK;
+ return errno_to_blk_status(ret);
}
static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
--
2.30.2
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 10/10] btrfs: do not allocate a btrfs_bio for low-level bios
2022-04-29 14:30 cleanup btrfs bio handling, part 2 v2 Christoph Hellwig
` (8 preceding siblings ...)
2022-04-29 14:30 ` [PATCH 09/10] btrfs: refactor btrfs_map_bio Christoph Hellwig
@ 2022-04-29 14:30 ` Christoph Hellwig
9 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2022-04-29 14:30 UTC (permalink / raw)
To: David Sterba, Josef Bacik, Qu Wenruo; +Cc: linux-btrfs
The bios submitted from btrfs_map_bio don't really interact with the
rest of btrfs and the only btrfs_bio member actually used in the
low-level bios is the pointer to the btrfs_io_context used for endio
handler.
Use a union in struct btrfs_io_stripe that allows the endio handler to
find the btrfs_io_context and remove the spurious ->device assignment
so that a plain fs_bio_set bio can be used for the low-level bios
allocated inside btrfs_map_bio.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/btrfs/extent_io.c | 13 -------------
fs/btrfs/extent_io.h | 1 -
fs/btrfs/volumes.c | 20 ++++++++++----------
fs/btrfs/volumes.h | 5 ++++-
4 files changed, 14 insertions(+), 25 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a14ed9b9dc2d0..37f4eee418219 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3209,19 +3209,6 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
return bio;
}
-struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio)
-{
- struct btrfs_bio *bbio;
- struct bio *new;
-
- /* Bio allocation backed by a bioset does not fail */
- new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset);
- bbio = btrfs_bio(new);
- btrfs_bio_init(bbio);
- bbio->iter = bio->bi_iter;
- return new;
-}
-
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b390ec79f9a86..3078e90be3a99 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -265,7 +265,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 28ac2bfb5d296..8577893ccb4b9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6666,23 +6666,21 @@ static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
static void btrfs_end_bio(struct bio *bio)
{
- struct btrfs_io_context *bioc = bio->bi_private;
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+ struct btrfs_io_context *bioc = stripe->bioc;
if (bio->bi_status) {
atomic_inc(&bioc->error);
if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
- struct btrfs_device *dev = btrfs_bio(bio)->device;
-
- ASSERT(dev->bdev);
if (btrfs_op(bio) == BTRFS_MAP_WRITE)
- btrfs_dev_stat_inc_and_print(dev,
+ btrfs_dev_stat_inc_and_print(stripe->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(dev,
+ btrfs_dev_stat_inc_and_print(stripe->dev,
BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(dev,
+ btrfs_dev_stat_inc_and_print(stripe->dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
}
}
@@ -6714,14 +6712,16 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc,
}
if (clone) {
- bio = btrfs_bio_clone(dev->bdev, orig_bio);
+ bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS,
+ &fs_bio_set);
} else {
bio = orig_bio;
bio_set_dev(bio, dev->bdev);
+ btrfs_bio(bio)->device = dev;
}
- bio->bi_private = bioc;
- btrfs_bio(bio)->device = dev;
+ bioc->stripes[dev_nr].bioc = bioc;
+ bio->bi_private = &bioc->stripes[dev_nr];
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
/*
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 28e28b7c48649..825e44c82f2b0 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -396,7 +396,10 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
struct btrfs_io_stripe {
struct btrfs_device *dev;
- u64 physical;
+ union {
+ u64 physical; /* block mapping */
+ struct btrfs_io_context *bioc; /* for the endio handler */
+ };
u64 length; /* only used for discard mappings */
};
--
2.30.2
^ permalink raw reply related [flat|nested] 30+ messages in thread