* fix read repair on compressed extents v2
@ 2022-06-30 16:01 Christoph Hellwig
  2022-06-30 16:01 ` [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio Christoph Hellwig
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Christoph Hellwig @ 2022-06-30 16:01 UTC (permalink / raw)
  To: Chris Mason, Josef Bacik, David Sterba; +Cc: linux-btrfs

Hi all,

while looking into the repair code I found that read repair of compressed
extents is currently fundamentally broken, in that repair tries to write
the uncompressed data into a corrupted extent during a repair.  This is
demonstrated by the "btrfs: test read repair on a corrupted compressed
extent" test submitted to xfstests.

This series fixes that, but is a bit invasive as it requires both
refactoring of the compression code and changes to the repair code to
not look up the logical address on every repair attempt.  On the plus
side it removes a whole lot of code.

It is based on the for-next branch plus my "btrfs: repair all known bad
mirrors" patch.

Changes since v1:
 - describe the partial revert that happens in patch 1 better in the
   commit log
 - drop a now incorrect comment
 - do not add a prototype for a non-existent function

Diffstat:
 compression.c |  287 ++++++++++++++++------------------------------------------
 compression.h |   11 --
 ctree.h       |    2 
 extent_io.c   |   93 +++++++-----------
 extent_io.h   |    9 -
 inode.c       |   34 +++---
 volumes.h     |    2 
 7 files changed, 146 insertions(+), 292 deletions(-)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio
  2022-06-30 16:01 fix read repair on compressed extents v2 Christoph Hellwig
@ 2022-06-30 16:01 ` Christoph Hellwig
  2022-07-05 14:40   ` Nikolay Borisov
  2022-06-30 16:01 ` [PATCH 2/4] btrfs: pass a btrfs_bio to btrfs_repair_one_sector Christoph Hellwig
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2022-06-30 16:01 UTC (permalink / raw)
  To: Chris Mason, Josef Bacik, David Sterba; +Cc: linux-btrfs, Boris Burkov

Instead of counting the bytes just count the bios, with an extra
reference held during submission.  This significantly simplifies the
submission side error handling.

This reverts commit 6ec9765d746d ("btrfs: introduce
compressed_bio::pending_sectors to trace compressed bio") that moved to
counting sectors, but unlike the state before that commit the extra
reference held during the submission actually keeps the refcounting
sane.
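
For reference, the resulting lifecycle of the new counter looks roughly
like this (condensed pseudocode distilled from the diff below, not
compilable on its own):

  /* btrfs_submit_compressed_write(), submission side */
  refcount_set(&cb->pending_ios, 1);    /* reference held for the submission loop */
  ...
  refcount_inc(&cb->pending_ios);       /* in alloc_compressed_bio(), once per bio */
  ...
  if (refcount_dec_and_test(&cb->pending_ios))  /* drop the submission reference */
          finish_compressed_bio_write(cb);

  /* end_compressed_bio_write(), once per completing bio */
  if (refcount_dec_and_test(&cb->pending_ios))
          finish_compressed_bio_write(cb);

So the compressed_bio can only be finished once the last outstanding bio
has completed and the submitter has dropped its own reference, no matter
in which order that happens.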

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Boris Burkov <boris@bur.io>
---
 fs/btrfs/compression.c | 126 ++++++++++-------------------------------
 fs/btrfs/compression.h |   4 +-
 2 files changed, 33 insertions(+), 97 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 907fc8a4c092c..e756da640fd7b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -191,44 +191,6 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
 	return 0;
 }
 
-/*
- * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
- *
- * Return true if there is no pending bio nor io.
- * Return false otherwise.
- */
-static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
-{
-	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
-	unsigned int bi_size = 0;
-	bool last_io = false;
-	struct bio_vec *bvec;
-	struct bvec_iter_all iter_all;
-
-	/*
-	 * At endio time, bi_iter.bi_size doesn't represent the real bio size.
-	 * Thus here we have to iterate through all segments to grab correct
-	 * bio size.
-	 */
-	bio_for_each_segment_all(bvec, bio, iter_all)
-		bi_size += bvec->bv_len;
-
-	if (bio->bi_status)
-		cb->status = bio->bi_status;
-
-	ASSERT(bi_size && bi_size <= cb->compressed_len);
-	last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
-					&cb->pending_sectors);
-	/*
-	 * Here we must wake up the possible error handler after all other
-	 * operations on @cb finished, or we can race with
-	 * finish_compressed_bio_*() which may free @cb.
-	 */
-	wake_up_var(cb);
-
-	return last_io;
-}
-
 static void finish_compressed_bio_read(struct compressed_bio *cb)
 {
 	unsigned int index;
@@ -288,7 +250,10 @@ static void end_compressed_bio_read(struct bio *bio)
 	unsigned int mirror = btrfs_bio(bio)->mirror_num;
 	int ret = 0;
 
-	if (!dec_and_test_compressed_bio(cb, bio))
+	if (bio->bi_status)
+		cb->status = bio->bi_status;
+
+	if (!refcount_dec_and_test(&cb->pending_ios))
 		goto out;
 
 	/*
@@ -417,7 +382,10 @@ static void end_compressed_bio_write(struct bio *bio)
 {
 	struct compressed_bio *cb = bio->bi_private;
 
-	if (dec_and_test_compressed_bio(cb, bio)) {
+	if (bio->bi_status)
+		cb->status = bio->bi_status;
+
+	if (refcount_dec_and_test(&cb->pending_ios)) {
 		struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
 
 		btrfs_record_physical_zoned(cb->inode, cb->start, bio);
@@ -476,7 +444,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
 		return ERR_PTR(ret);
 	}
 	*next_stripe_start = disk_bytenr + geom.len;
-
+	refcount_inc(&cb->pending_ios);
 	return bio;
 }
 
@@ -503,17 +471,17 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 	struct compressed_bio *cb;
 	u64 cur_disk_bytenr = disk_start;
 	u64 next_stripe_start;
-	blk_status_t ret;
 	int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
 	const bool use_append = btrfs_use_zone_append(inode, disk_start);
 	const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
+	blk_status_t ret = BLK_STS_OK;
 
 	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
 	       IS_ALIGNED(len, fs_info->sectorsize));
 	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
 	if (!cb)
 		return BLK_STS_RESOURCE;
-	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+	refcount_set(&cb->pending_ios, 1);
 	cb->status = BLK_STS_OK;
 	cb->inode = &inode->vfs_inode;
 	cb->start = start;
@@ -543,8 +511,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 				&next_stripe_start);
 			if (IS_ERR(bio)) {
 				ret = errno_to_blk_status(PTR_ERR(bio));
-				bio = NULL;
-				goto finish_cb;
+				break;
 			}
 			if (blkcg_css)
 				bio->bi_opf |= REQ_CGROUP_PUNT;
@@ -588,8 +555,11 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 		if (submit) {
 			if (!skip_sum) {
 				ret = btrfs_csum_one_bio(inode, bio, start, true);
-				if (ret)
-					goto finish_cb;
+				if (ret) {
+					bio->bi_status = ret;
+					bio_endio(bio);
+					break;
+				}
 			}
 
 			ASSERT(bio->bi_iter.bi_size);
@@ -598,33 +568,12 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 		}
 		cond_resched();
 	}
-	if (blkcg_css)
-		kthread_associate_blkcg(NULL);
 
-	return 0;
-
-finish_cb:
 	if (blkcg_css)
 		kthread_associate_blkcg(NULL);
 
-	if (bio) {
-		bio->bi_status = ret;
-		bio_endio(bio);
-	}
-	/* Last byte of @cb is submitted, endio will free @cb */
-	if (cur_disk_bytenr == disk_start + compressed_len)
-		return ret;
-
-	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
-			   (disk_start + compressed_len - cur_disk_bytenr) >>
-			   fs_info->sectorsize_bits);
-	/*
-	 * Even with previous bio ended, we should still have io not yet
-	 * submitted, thus need to finish manually.
-	 */
-	ASSERT(refcount_read(&cb->pending_sectors));
-	/* Now we are the only one referring @cb, can finish it safely. */
-	finish_compressed_bio_write(cb);
+	if (refcount_dec_and_test(&cb->pending_ios))
+		finish_compressed_bio_write(cb);
 	return ret;
 }
 
@@ -830,7 +779,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		goto out;
 	}
 
-	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+	refcount_set(&cb->pending_ios, 1);
 	cb->status = BLK_STS_OK;
 	cb->inode = inode;
 	cb->mirror_num = mirror_num;
@@ -880,9 +829,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 					REQ_OP_READ, end_compressed_bio_read,
 					&next_stripe_start);
 			if (IS_ERR(comp_bio)) {
-				ret = errno_to_blk_status(PTR_ERR(comp_bio));
-				comp_bio = NULL;
-				goto finish_cb;
+				cb->status =
+					errno_to_blk_status(PTR_ERR(comp_bio));
+				break;
 			}
 		}
 		/*
@@ -921,8 +870,11 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			unsigned int nr_sectors;
 
 			ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
-			if (ret)
-				goto finish_cb;
+			if (ret) {
+				comp_bio->bi_status = ret;
+				bio_endio(comp_bio);
+				break;
+			}
 
 			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
 						  fs_info->sectorsize);
@@ -933,6 +885,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			comp_bio = NULL;
 		}
 	}
+
+	if (refcount_dec_and_test(&cb->pending_ios))
+		finish_compressed_bio_read(cb);
 	return;
 
 fail:
@@ -950,25 +905,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	bio->bi_status = ret;
 	bio_endio(bio);
 	return;
-finish_cb:
-	if (comp_bio) {
-		comp_bio->bi_status = ret;
-		bio_endio(comp_bio);
-	}
-	/* All bytes of @cb is submitted, endio will free @cb */
-	if (cur_disk_byte == disk_bytenr + compressed_len)
-		return;
-
-	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
-			   (disk_bytenr + compressed_len - cur_disk_byte) >>
-			   fs_info->sectorsize_bits);
-	/*
-	 * Even with previous bio ended, we should still have io not yet
-	 * submitted, thus need to finish @cb manually.
-	 */
-	ASSERT(refcount_read(&cb->pending_sectors));
-	/* Now we are the only one referring @cb, can finish it safely. */
-	finish_compressed_bio_read(cb);
 }
 
 /*
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 5fca7603e928a..0e4cbf04fd866 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -30,8 +30,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
 #define	BTRFS_ZLIB_DEFAULT_LEVEL		3
 
 struct compressed_bio {
-	/* Number of sectors with unfinished IO (unsubmitted or unfinished) */
-	refcount_t pending_sectors;
+	/* Number of outstanding bios */
+	refcount_t pending_ios;
 
 	/* Number of compressed pages in the array */
 	unsigned int nr_pages;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/4] btrfs: pass a btrfs_bio to btrfs_repair_one_sector
  2022-06-30 16:01 fix read repair on compressed extents v2 Christoph Hellwig
  2022-06-30 16:01 ` [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio Christoph Hellwig
@ 2022-06-30 16:01 ` Christoph Hellwig
  2022-06-30 16:01 ` [PATCH 3/4] btrfs: remove the start argument to check_data_csum Christoph Hellwig
  2022-06-30 16:01 ` [PATCH 4/4] btrfs: fix repair of compressed extents Christoph Hellwig
  3 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2022-06-30 16:01 UTC (permalink / raw)
  To: Chris Mason, Josef Bacik, David Sterba; +Cc: linux-btrfs, Boris Burkov

Pass the btrfs_bio instead of the plain bio to btrfs_repair_one_sector,
and remove the start and failed_mirror arguments in favor of deriving
them from the btrfs_bio.  For this to work ensure that the file_offset
field is also initialized for buffered I/O.
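
Condensed from the hunks below, the two ends of the change look roughly
like this (sketch only, not compilable on its own):

  /* submit_one_bio(): record where the bio starts in the file */
  bv = bio_first_bvec_all(bio);
  btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;

  /* repair path: derive what used to be passed in explicitly */
  u64 start = failed_bbio->file_offset + bio_offset;
  failrec->failed_mirror = failrec->this_mirror = failed_bbio->mirror_num;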

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Boris Burkov <boris@bur.io>
---
 fs/btrfs/extent_io.c | 47 ++++++++++++++++++++++++--------------------
 fs/btrfs/extent_io.h |  8 ++++----
 fs/btrfs/inode.c     |  5 ++---
 fs/btrfs/volumes.h   |  2 --
 4 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3778d58092dea..ec7bdb3fa0921 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -182,6 +182,7 @@ static int add_extent_changeset(struct extent_state *state, u32 bits,
 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
 {
 	struct bio *bio;
+	struct bio_vec *bv;
 	struct inode *inode;
 	int mirror_num;
 
@@ -189,12 +190,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
 		return;
 
 	bio = bio_ctrl->bio;
-	inode = bio_first_page_all(bio)->mapping->host;
+	bv = bio_first_bvec_all(bio);
+	inode = bv->bv_page->mapping->host;
 	mirror_num = bio_ctrl->mirror_num;
 
 	/* Caller should ensure the bio has at least some range added */
 	ASSERT(bio->bi_iter.bi_size);
 
+	btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
+
 	if (!is_data_inode(inode))
 		btrfs_submit_metadata_bio(inode, bio, mirror_num);
 	else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
@@ -2533,10 +2537,11 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
 }
 
 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
-							     u64 start,
-							     int failed_mirror)
+							     struct btrfs_bio *bbio,
+							     unsigned int bio_offset)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	u64 start = bbio->file_offset + bio_offset;
 	struct io_failure_record *failrec;
 	struct extent_map *em;
 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -2556,7 +2561,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
 		 * (e.g. with a list for failed_mirror) to make
 		 * clean_io_failure() clean all those errors at once.
 		 */
-		ASSERT(failrec->this_mirror == failed_mirror);
+		ASSERT(failrec->this_mirror == bbio->mirror_num);
 		ASSERT(failrec->len == fs_info->sectorsize);
 		return failrec;
 	}
@@ -2567,7 +2572,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
 
 	failrec->start = start;
 	failrec->len = sectorsize;
-	failrec->failed_mirror = failrec->this_mirror = failed_mirror;
+	failrec->failed_mirror = failrec->this_mirror = bbio->mirror_num;
 	failrec->compress_type = BTRFS_COMPRESS_NONE;
 
 	read_lock(&em_tree->lock);
@@ -2632,17 +2637,17 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
 	return failrec;
 }
 
-int btrfs_repair_one_sector(struct inode *inode,
-			    struct bio *failed_bio, u32 bio_offset,
-			    struct page *page, unsigned int pgoff,
-			    u64 start, int failed_mirror,
+int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+			    u32 bio_offset, struct page *page,
+			    unsigned int pgoff,
 			    submit_bio_hook_t *submit_bio_hook)
 {
+	u64 start = failed_bbio->file_offset + bio_offset;
 	struct io_failure_record *failrec;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
-	struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio);
+	struct bio *failed_bio = &failed_bbio->bio;
 	const int icsum = bio_offset >> fs_info->sectorsize_bits;
 	struct bio *repair_bio;
 	struct btrfs_bio *repair_bbio;
@@ -2652,7 +2657,7 @@ int btrfs_repair_one_sector(struct inode *inode,
 
 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
 
-	failrec = btrfs_get_io_failure_record(inode, start, failed_mirror);
+	failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
 	if (IS_ERR(failrec))
 		return PTR_ERR(failrec);
 
@@ -2750,9 +2755,10 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate)
 				    offset + sectorsize - 1, &cached);
 }
 
-static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio,
+static void submit_data_read_repair(struct inode *inode,
+				    struct btrfs_bio *failed_bbio,
 				    u32 bio_offset, const struct bio_vec *bvec,
-				    int failed_mirror, unsigned int error_bitmap)
+				    unsigned int error_bitmap)
 {
 	const unsigned int pgoff = bvec->bv_offset;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2763,7 +2769,7 @@ static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio,
 	const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
 	int i;
 
-	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
+	BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
 
 	/* This repair is only for data */
 	ASSERT(is_data_inode(inode));
@@ -2775,7 +2781,7 @@ static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio,
 	 * We only get called on buffered IO, thus page must be mapped and bio
 	 * must not be cloned.
 	 */
-	ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED));
+	ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
 
 	/* Iterate through all the sectors in the range */
 	for (i = 0; i < nr_bits; i++) {
@@ -2792,10 +2798,9 @@ static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio,
 			goto next;
 		}
 
-		ret = btrfs_repair_one_sector(inode, failed_bio,
-				bio_offset + offset,
-				page, pgoff + offset, start + offset,
-				failed_mirror, btrfs_submit_data_read_bio);
+		ret = btrfs_repair_one_sector(inode, failed_bbio,
+				bio_offset + offset, page, pgoff + offset,
+				btrfs_submit_data_read_bio);
 		if (!ret) {
 			/*
 			 * We have submitted the read repair, the page release
@@ -3127,8 +3132,8 @@ static void end_bio_extent_readpage(struct bio *bio)
 			 * submit_data_read_repair() will handle all the good
 			 * and bad sectors, we just continue to the next bvec.
 			 */
-			submit_data_read_repair(inode, bio, bio_offset, bvec,
-						mirror, error_bitmap);
+			submit_data_read_repair(inode, bbio, bio_offset, bvec,
+						error_bitmap);
 		} else {
 			/* Update page status and unlock */
 			end_page_read(page, uptodate, start, len);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 280af70c04953..a78051c7627c4 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -57,6 +57,7 @@ enum {
 #define BITMAP_LAST_BYTE_MASK(nbits) \
 	(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
 
+struct btrfs_bio;
 struct btrfs_root;
 struct btrfs_inode;
 struct btrfs_io_bio;
@@ -266,10 +267,9 @@ struct io_failure_record {
 	int num_copies;
 };
 
-int btrfs_repair_one_sector(struct inode *inode,
-			    struct bio *failed_bio, u32 bio_offset,
-			    struct page *page, unsigned int pgoff,
-			    u64 start, int failed_mirror,
+int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+			    u32 bio_offset, struct page *page,
+			    unsigned int pgoff,
 			    submit_bio_hook_t *submit_bio_hook);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 784c1ad4a9634..a627b2af9e243 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7953,9 +7953,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
 		} else {
 			int ret;
 
-			ret = btrfs_repair_one_sector(inode, &bbio->bio, offset,
-					bv.bv_page, bv.bv_offset, start,
-					bbio->mirror_num,
+			ret = btrfs_repair_one_sector(inode, bbio, offset,
+					bv.bv_page, bv.bv_offset,
 					submit_dio_repair_bio);
 			if (ret)
 				err = errno_to_blk_status(ret);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4324c4d409096..9cce711cc938c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -368,8 +368,6 @@ struct btrfs_fs_devices {
  */
 struct btrfs_bio {
 	unsigned int mirror_num;
-
-	/* for direct I/O */
 	u64 file_offset;
 
 	/* @device is for stripe IO submission. */
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/4] btrfs: remove the start argument to check_data_csum
  2022-06-30 16:01 fix read repair on compressed extents v2 Christoph Hellwig
  2022-06-30 16:01 ` [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio Christoph Hellwig
  2022-06-30 16:01 ` [PATCH 2/4] btrfs: pass a btrfs_bio to btrfs_repair_one_sector Christoph Hellwig
@ 2022-06-30 16:01 ` Christoph Hellwig
  2022-07-05 15:35   ` Nikolay Borisov
  2022-06-30 16:01 ` [PATCH 4/4] btrfs: fix repair of compressed extents Christoph Hellwig
  3 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2022-06-30 16:01 UTC (permalink / raw)
  To: Chris Mason, Josef Bacik, David Sterba; +Cc: linux-btrfs, Boris Burkov

Just derive it from the btrfs_bio now that ->file_offset is always valid.
Also make the function available outside of inode.c as we'll need that
soon.
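
The resulting call shape, condensed from the diff below:

  /* now exported via ctree.h, with no start argument */
  int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
                      u32 bio_offset, struct page *page, u32 pgoff);

  /* on a mismatch, the file offset is reconstructed from the bbio */
  btrfs_print_data_csum_error(BTRFS_I(inode),
                              bbio->file_offset + bio_offset,
                              csum, csum_expected, bbio->mirror_num);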

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Boris Burkov <boris@bur.io>
---
 fs/btrfs/ctree.h |  2 ++
 fs/btrfs/inode.c | 22 +++++++++-------------
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4e2569f84aabc..164f54e6aa447 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3293,6 +3293,8 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
 unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
 				    u32 bio_offset, struct page *page,
 				    u64 start, u64 end);
+int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, u32 bio_offset,
+		    struct page *page, u32 pgoff);
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
 					   u64 start, u64 len);
 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a627b2af9e243..429428fde4a88 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3396,20 +3396,18 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
 /*
  * check_data_csum - verify checksum of one sector of uncompressed data
  * @inode:	inode
- * @io_bio:	btrfs_io_bio which contains the csum
+ * @bbio:	btrfs_io_bio which contains the csum
  * @bio_offset:	offset to the beginning of the bio (in bytes)
  * @page:	page where is the data to be verified
  * @pgoff:	offset inside the page
- * @start:	logical offset in the file
  *
  * The length of such check is always one sector size.
  *
  * When csum mismatch is detected, we will also report the error and fill the
  * corrupted range with zero. (Thus it needs the extra parameters)
  */
-static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
-			   u32 bio_offset, struct page *page, u32 pgoff,
-			   u64 start)
+int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, u32 bio_offset,
+		    struct page *page, u32 pgoff)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	u32 len = fs_info->sectorsize;
@@ -3425,8 +3423,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
 	return 0;
 
 zeroit:
-	btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
-				    bbio->mirror_num);
+	btrfs_print_data_csum_error(BTRFS_I(inode),
+				    bbio->file_offset + bio_offset,
+				    csum, csum_expected, bbio->mirror_num);
 	if (bbio->device)
 		btrfs_dev_stat_inc_and_print(bbio->device,
 					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -3495,8 +3494,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
 					  EXTENT_NODATASUM);
 			continue;
 		}
-		ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
-				      page_offset(page) + pg_off);
+		ret = check_data_csum(inode, bbio, bio_offset, page, pg_off);
 		if (ret < 0) {
 			const int nr_bit = (pg_off - offset_in_page(start)) >>
 				     root->fs_info->sectorsize_bits;
@@ -7946,7 +7944,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
 
 		if (uptodate &&
 		    (!csum || !check_data_csum(inode, bbio, offset, bv.bv_page,
-					       bv.bv_offset, start))) {
+					       bv.bv_offset))) {
 			clean_io_failure(fs_info, failure_tree, io_tree, start,
 					 bv.bv_page, btrfs_ino(BTRFS_I(inode)),
 					 bv.bv_offset);
@@ -10324,7 +10322,6 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
 	u32 sectorsize = fs_info->sectorsize;
 	struct bio_vec *bvec;
 	struct bvec_iter_all iter_all;
-	u64 start = priv->file_offset;
 	u32 bio_offset = 0;
 
 	if (priv->skip_csum || !uptodate)
@@ -10338,9 +10335,8 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
 		for (i = 0; i < nr_sectors; i++) {
 			ASSERT(pgoff < PAGE_SIZE);
 			if (check_data_csum(&inode->vfs_inode, bbio, bio_offset,
-					    bvec->bv_page, pgoff, start))
+					    bvec->bv_page, pgoff))
 				return BLK_STS_IOERR;
-			start += sectorsize;
 			bio_offset += sectorsize;
 			pgoff += sectorsize;
 		}
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 4/4] btrfs: fix repair of compressed extents
  2022-06-30 16:01 fix read repair on compressed extents v2 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2022-06-30 16:01 ` [PATCH 3/4] btrfs: remove the start argument to check_data_csum Christoph Hellwig
@ 2022-06-30 16:01 ` Christoph Hellwig
  2022-07-07 12:50   ` Nikolay Borisov
  3 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2022-06-30 16:01 UTC (permalink / raw)
  To: Chris Mason, Josef Bacik, David Sterba; +Cc: linux-btrfs

Currently the checksum of compressed extents is verified based on the
compressed data and the lower btrfs_bio, but the actual repair process
is driven by end_bio_extent_readpage on the upper btrfs_bio for the
decompressed data.

This has a bunch of issues, including not being able to properly
communicate the failed mirror up in case the I/O submission got
preempted, a general loss of information about whether an error was an
I/O error or a checksum verification failure, and most importantly that
this design causes btrfs_clean_io_failure to eventually write back the
uncompressed good data onto the disk sectors that are supposed to
contain compressed data.

Fix this by moving the repair to the lower btrfs_bio.  To do so, a fair
amount of code has to be reshuffled:

 a) the lower btrfs_bio now needs a valid csum pointer.  The easiest way
    to achieve that is to pass NULL to btrfs_lookup_bio_sums and just use
    the btrfs_bio management of csums.  For a compressed_bio that is
    split into multiple btrfs_bios this means additional memory
    allocations, but the code becomes a lot more regular.
 b) checksum verification now runs directly on the lower btrfs_bio instead
    of the compressed_bio.  This actually nicely simplifies the end I/O
    processing.
 c) btrfs_repair_one_sector can't just look up the logical address for
    the file offset any more, as there are no corresponding relative
    offsets that apply to both the file offset and the logical address
    for compressed extents.  Instead require that the saved bvec_iter in
    the btrfs_bio is filled out for all read bios and use that (sketched
    below), which again removes a fair amount of code.
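
Condensed from the diff below, the read completion path on the lower
(compressed) btrfs_bio then looks roughly like this (pseudocode sketch,
not compilable on its own):

  /* end_compressed_bio_read(): verify, and if needed repair, each sector
   * of the compressed data before decompression */
  btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
          u64 start = bbio->file_offset + offset;

          if (!status &&
              (!csum || !check_data_csum(inode, bbio, offset, bv.bv_page,
                                         bv.bv_offset))) {
                  clean_io_failure(fs_info, &bi->io_failure_tree,
                                   &bi->io_tree, start, bv.bv_page,
                                   btrfs_ino(bi), bv.bv_offset);
          } else {
                  refcount_inc(&cb->pending_ios);
                  if (btrfs_repair_one_sector(inode, bbio, offset,
                                  bv.bv_page, bv.bv_offset,
                                  btrfs_submit_data_read_bio))
                          refcount_dec(&cb->pending_ios);
          }
  }

  /* btrfs_repair_one_sector(): the logical address now comes from the
   * saved iter instead of an extent map lookup */
  failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;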

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/compression.c | 171 ++++++++++++++---------------------------
 fs/btrfs/compression.h |   7 --
 fs/btrfs/extent_io.c   |  46 +++--------
 fs/btrfs/extent_io.h   |   1 -
 fs/btrfs/inode.c       |   7 ++
 5 files changed, 75 insertions(+), 157 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e756da640fd7b..c8b14a5bd89be 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -136,66 +136,14 @@ static int compression_decompress(int type, struct list_head *ws,
 
 static int btrfs_decompress_bio(struct compressed_bio *cb);
 
-static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
-				      unsigned long disk_size)
-{
-	return sizeof(struct compressed_bio) +
-		(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
-}
-
-static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
-				 u64 disk_start)
-{
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	const u32 csum_size = fs_info->csum_size;
-	const u32 sectorsize = fs_info->sectorsize;
-	struct page *page;
-	unsigned int i;
-	u8 csum[BTRFS_CSUM_SIZE];
-	struct compressed_bio *cb = bio->bi_private;
-	u8 *cb_sum = cb->sums;
-
-	if ((inode->flags & BTRFS_INODE_NODATASUM) ||
-	    test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
-		return 0;
-
-	for (i = 0; i < cb->nr_pages; i++) {
-		u32 pg_offset;
-		u32 bytes_left = PAGE_SIZE;
-		page = cb->compressed_pages[i];
-
-		/* Determine the remaining bytes inside the page first */
-		if (i == cb->nr_pages - 1)
-			bytes_left = cb->compressed_len - i * PAGE_SIZE;
-
-		/* Hash through the page sector by sector */
-		for (pg_offset = 0; pg_offset < bytes_left;
-		     pg_offset += sectorsize) {
-			int ret;
-
-			ret = btrfs_check_sector_csum(fs_info, page, pg_offset,
-						      csum, cb_sum);
-			if (ret) {
-				btrfs_print_data_csum_error(inode, disk_start,
-						csum, cb_sum, cb->mirror_num);
-				if (btrfs_bio(bio)->device)
-					btrfs_dev_stat_inc_and_print(
-						btrfs_bio(bio)->device,
-						BTRFS_DEV_STAT_CORRUPTION_ERRS);
-				return -EIO;
-			}
-			cb_sum += csum_size;
-			disk_start += sectorsize;
-		}
-	}
-	return 0;
-}
-
 static void finish_compressed_bio_read(struct compressed_bio *cb)
 {
 	unsigned int index;
 	struct page *page;
 
+	if (cb->status == BLK_STS_OK)
+		cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
+
 	/* Release the compressed pages */
 	for (index = 0; index < cb->nr_pages; index++) {
 		page = cb->compressed_pages[index];
@@ -233,59 +181,54 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
 	kfree(cb);
 }
 
-/* when we finish reading compressed pages from the disk, we
- * decompress them and then run the bio end_io routines on the
- * decompressed pages (in the inode address space).
- *
- * This allows the checksumming and other IO error handling routines
- * to work normally
- *
- * The compressed pages are freed here, and it must be run
- * in process context
+/*
+ * Verify the checksums and kick off repair if needed on the uncompressed data
+ * before decompressing it into the original bio and freeing the uncompressed
+ * pages.
  */
 static void end_compressed_bio_read(struct bio *bio)
 {
 	struct compressed_bio *cb = bio->bi_private;
-	struct inode *inode;
-	unsigned int mirror = btrfs_bio(bio)->mirror_num;
-	int ret = 0;
-
-	if (bio->bi_status)
-		cb->status = bio->bi_status;
-
-	if (!refcount_dec_and_test(&cb->pending_ios))
-		goto out;
-
-	/*
-	 * Record the correct mirror_num in cb->orig_bio so that
-	 * read-repair can work properly.
-	 */
-	btrfs_bio(cb->orig_bio)->mirror_num = mirror;
-	cb->mirror_num = mirror;
-
-	/*
-	 * Some IO in this cb have failed, just skip checksum as there
-	 * is no way it could be correct.
-	 */
-	if (cb->status != BLK_STS_OK)
-		goto csum_failed;
+	struct inode *inode = cb->inode;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_inode *bi = BTRFS_I(inode);
+	bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
+		    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+	blk_status_t status = bio->bi_status;
+	struct btrfs_bio *bbio = btrfs_bio(bio);
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	u32 offset;
+
+	btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+		u64 start = bbio->file_offset + offset;
+
+		if (!status &&
+		    (!csum || !check_data_csum(inode, bbio, offset, bv.bv_page,
+					       bv.bv_offset))) {
+			clean_io_failure(fs_info, &bi->io_failure_tree,
+					 &bi->io_tree, start, bv.bv_page,
+					 btrfs_ino(bi), bv.bv_offset);
+		} else {
+			int ret;
 
-	inode = cb->inode;
-	ret = check_compressed_csum(BTRFS_I(inode), bio,
-				    bio->bi_iter.bi_sector << 9);
-	if (ret)
-		goto csum_failed;
+			refcount_inc(&cb->pending_ios);
+			ret = btrfs_repair_one_sector(inode, bbio, offset,
+					bv.bv_page, bv.bv_offset,
+					btrfs_submit_data_read_bio);
+			if (ret) {
+				refcount_dec(&cb->pending_ios);
+				status = errno_to_blk_status(ret);
+			}
+		}
+	}
 
-	/* ok, we're the last bio for this extent, lets start
-	 * the decompression.
-	 */
-	ret = btrfs_decompress_bio(cb);
+	if (status)
+		cb->status = status;
 
-csum_failed:
-	if (ret)
-		cb->status = errno_to_blk_status(ret);
-	finish_compressed_bio_read(cb);
-out:
+	if (refcount_dec_and_test(&cb->pending_ios))
+		finish_compressed_bio_read(cb);
+	btrfs_bio_free_csum(bbio);
 	bio_put(bio);
 }
 
@@ -478,7 +421,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 
 	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
 	       IS_ALIGNED(len, fs_info->sectorsize));
-	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
+	cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
 	if (!cb)
 		return BLK_STS_RESOURCE;
 	refcount_set(&cb->pending_ios, 1);
@@ -486,7 +429,6 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 	cb->inode = &inode->vfs_inode;
 	cb->start = start;
 	cb->len = len;
-	cb->mirror_num = 0;
 	cb->compressed_pages = compressed_pages;
 	cb->compressed_len = compressed_len;
 	cb->writeback = writeback;
@@ -755,7 +697,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	blk_status_t ret;
 	int ret2;
 	int i;
-	u8 *sums;
 
 	em_tree = &BTRFS_I(inode)->extent_tree;
 
@@ -773,7 +714,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
 	compressed_len = em->block_len;
-	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
+	cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
 	if (!cb) {
 		ret = BLK_STS_RESOURCE;
 		goto out;
@@ -782,8 +723,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	refcount_set(&cb->pending_ios, 1);
 	cb->status = BLK_STS_OK;
 	cb->inode = inode;
-	cb->mirror_num = mirror_num;
-	sums = cb->sums;
 
 	cb->start = em->orig_start;
 	em_len = em->len;
@@ -867,19 +806,25 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			submit = true;
 
 		if (submit) {
-			unsigned int nr_sectors;
+			/* Save the original iter for read repair */
+			if (bio_op(comp_bio) == REQ_OP_READ)
+				btrfs_bio(comp_bio)->iter = comp_bio->bi_iter;
+
+			/*
+			 * Just stash the initial offset of this chunk, as there
+			 * is no direct correlation between compressed pages and
+			 * the original file offset.  The field is only used for
+			 * priting error messages anyway.
+			 */
+			btrfs_bio(comp_bio)->file_offset = file_offset;
 
-			ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
+			ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
 			if (ret) {
 				comp_bio->bi_status = ret;
 				bio_endio(comp_bio);
 				break;
 			}
 
-			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
-						  fs_info->sectorsize);
-			sums += fs_info->csum_size * nr_sectors;
-
 			ASSERT(comp_bio->bi_iter.bi_size);
 			btrfs_submit_bio(fs_info, comp_bio, mirror_num);
 			comp_bio = NULL;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 0e4cbf04fd866..e9ef24034cad0 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -59,19 +59,12 @@ struct compressed_bio {
 
 	/* IO errors */
 	blk_status_t status;
-	int mirror_num;
 
 	union {
 		/* For reads, this is the bio we are copying the data into */
 		struct bio *orig_bio;
 		struct work_struct write_end_work;
 	};
-
-	/*
-	 * the start of a variable length array of checksums only
-	 * used by reads
-	 */
-	u8 sums[];
 };
 
 static inline unsigned int btrfs_compress_type(unsigned int type_level)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ec7bdb3fa0921..587d2ba20b53b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2543,13 +2543,10 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	u64 start = bbio->file_offset + bio_offset;
 	struct io_failure_record *failrec;
-	struct extent_map *em;
 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	const u32 sectorsize = fs_info->sectorsize;
 	int ret;
-	u64 logical;
 
 	failrec = get_state_failrec(failure_tree, start);
 	if (!IS_ERR(failrec)) {
@@ -2573,41 +2570,14 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
 	failrec->start = start;
 	failrec->len = sectorsize;
 	failrec->failed_mirror = failrec->this_mirror = bbio->mirror_num;
-	failrec->compress_type = BTRFS_COMPRESS_NONE;
-
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, start, failrec->len);
-	if (!em) {
-		read_unlock(&em_tree->lock);
-		kfree(failrec);
-		return ERR_PTR(-EIO);
-	}
-
-	if (em->start > start || em->start + em->len <= start) {
-		free_extent_map(em);
-		em = NULL;
-	}
-	read_unlock(&em_tree->lock);
-	if (!em) {
-		kfree(failrec);
-		return ERR_PTR(-EIO);
-	}
-
-	logical = start - em->start;
-	logical = em->block_start + logical;
-	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
-		logical = em->block_start;
-		failrec->compress_type = em->compress_type;
-	}
+	failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
 
 	btrfs_debug(fs_info,
-		    "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
-		    logical, start, failrec->len);
-
-	failrec->logical = logical;
-	free_extent_map(em);
+		    "Get IO Failure Record: (new) logical=%llu, start=%llu",
+		    failrec->logical, start);
 
-	failrec->num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
+	failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical,
+					       sectorsize);
 	if (failrec->num_copies == 1) {
 		/*
 		 * we only have a single copy of the data, so don't bother with
@@ -2709,7 +2679,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
 	 * will be handled by the endio on the repair_bio, so we can't return an
 	 * error here.
 	 */
-	submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type);
+	submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
 	return BLK_STS_OK;
 }
 
@@ -3115,6 +3085,10 @@ static void end_bio_extent_readpage(struct bio *bio)
 			 * Only try to repair bios that actually made it to a
 			 * device.  If the bio failed to be submitted mirror
 			 * is 0 and we need to fail it without retrying.
+			 *
+			 * This also includes the high level bios for compressed
+			 * extents - these never make it to a device and repair
+			 * is already handled on the lower compressed bio.
 			 */
 			if (mirror > 0)
 				repair = true;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a78051c7627c4..9dec34c009e91 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -261,7 +261,6 @@ struct io_failure_record {
 	u64 start;
 	u64 len;
 	u64 logical;
-	enum btrfs_compression_type compress_type;
 	int this_mirror;
 	int failed_mirror;
 	int num_copies;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 429428fde4a88..eea351216db33 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2707,6 +2707,9 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
 		return;
 	}
 
+	/* Save the original iter for read repair */
+	btrfs_bio(bio)->iter = bio->bi_iter;
+
 	/*
 	 * Lookup bio sums does extra checks around whether we need to csum or
 	 * not, which is why we ignore skip_sum here.
@@ -8000,6 +8003,10 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_dio_private *dip = bio->bi_private;
 	blk_status_t ret;
+		
+	/* Save the original iter for read repair */
+	if (btrfs_op(bio) == BTRFS_MAP_READ)
+		btrfs_bio(bio)->iter = bio->bi_iter;
 
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		goto map;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio
  2022-06-30 16:01 ` [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio Christoph Hellwig
@ 2022-07-05 14:40   ` Nikolay Borisov
  2022-07-05 17:17     ` Christoph Hellwig
  0 siblings, 1 reply; 14+ messages in thread
From: Nikolay Borisov @ 2022-07-05 14:40 UTC (permalink / raw)
  To: Christoph Hellwig, Chris Mason, Josef Bacik, David Sterba
  Cc: linux-btrfs, Boris Burkov



On 30.06.22 г. 19:01 ч., Christoph Hellwig wrote:
> Instead of counting the bytes just count the bios, with an extra
> reference held during submission.  This significantly simplifies the
> submission side error handling.
> 
> This reverts commit 6ec9765d746d ("btrfs: introduce
> compressed_bio::pending_sectors to trace compressed bio") that moved to
> counting sectors, but unlike the state before that commit the extra
> reference held during the submission actually keeps the refcounting
> sane.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Boris Burkov <boris@bur.io>
> ---
>   fs/btrfs/compression.c | 126 ++++++++++-------------------------------
>   fs/btrfs/compression.h |   4 +-
>   2 files changed, 33 insertions(+), 97 deletions(-)
> 
> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index 907fc8a4c092c..e756da640fd7b 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -191,44 +191,6 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
>   	return 0;
>   }
>   

<snip>


>   		btrfs_record_physical_zoned(cb->inode, cb->start, bio);
> @@ -476,7 +444,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
>   		return ERR_PTR(ret);
>   	}
>   	*next_stripe_start = disk_bytenr + geom.len;
> -
> +	refcount_inc(&cb->pending_ios);
>   	return bio;
>   }
>   
> @@ -503,17 +471,17 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
>   	struct compressed_bio *cb;
>   	u64 cur_disk_bytenr = disk_start;
>   	u64 next_stripe_start;
> -	blk_status_t ret;
>   	int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
>   	const bool use_append = btrfs_use_zone_append(inode, disk_start);
>   	const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
> +	blk_status_t ret = BLK_STS_OK;
>   
>   	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
>   	       IS_ALIGNED(len, fs_info->sectorsize));
>   	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
>   	if (!cb)
>   		return BLK_STS_RESOURCE;
> -	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
> +	refcount_set(&cb->pending_ios, 1);
>   	cb->status = BLK_STS_OK;
>   	cb->inode = &inode->vfs_inode;
>   	cb->start = start;
> @@ -543,8 +511,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
>   				&next_stripe_start);
>   			if (IS_ERR(bio)) {
>   				ret = errno_to_blk_status(PTR_ERR(bio));
> -				bio = NULL;
> -				goto finish_cb;
> +				break;
>   			}
>   			if (blkcg_css)
>   				bio->bi_opf |= REQ_CGROUP_PUNT;
> @@ -588,8 +555,11 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
>   		if (submit) {
>   			if (!skip_sum) {
>   				ret = btrfs_csum_one_bio(inode, bio, start, true);
> -				if (ret)
> -					goto finish_cb;
> +				if (ret) {
> +					bio->bi_status = ret;
> +					bio_endio(bio);
> +					break;
> +				}
>   			}
>   
>   			ASSERT(bio->bi_iter.bi_size);
> @@ -598,33 +568,12 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
>   		}
>   		cond_resched();
>   	}
> -	if (blkcg_css)
> -		kthread_associate_blkcg(NULL);
>   
> -	return 0;
> -
> -finish_cb:
>   	if (blkcg_css)
>   		kthread_associate_blkcg(NULL);
>   
> -	if (bio) {
> -		bio->bi_status = ret;
> -		bio_endio(bio);
> -	}
> -	/* Last byte of @cb is submitted, endio will free @cb */
> -	if (cur_disk_bytenr == disk_start + compressed_len)
> -		return ret;
> -
> -	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
> -			   (disk_start + compressed_len - cur_disk_bytenr) >>
> -			   fs_info->sectorsize_bits);
> -	/*
> -	 * Even with previous bio ended, we should still have io not yet
> -	 * submitted, thus need to finish manually.
> -	 */
> -	ASSERT(refcount_read(&cb->pending_sectors));
> -	/* Now we are the only one referring @cb, can finish it safely. */
> -	finish_compressed_bio_write(cb);
> +	if (refcount_dec_and_test(&cb->pending_ios))
> +		finish_compressed_bio_write(cb);

nit: This slightly changes the semantics of the function because with 
the old code the bio could have been completed in 
submit_compressed_write iff there was an error during submission for one 
of the sub-bios. Whilst with this new code there is a chance even in the 
success case this happens (if the sub bios complete by the time we 
arrive at this code). Generally that'd be very unlikely due to io 
latency and indeed this code becomes effective iff there is an error. 
Personally I'd like such changes to be called out explicitly in the 
change log or at least with a comment. I guess this ties into the "keeps 
ref counting sane" in the changelog but what exactly do you mean by 
sane? I guess it ties into what Qu mentions in his changelog about 
ensuring the compressed bio is not freed/finished by some completing 
sub-bio _before_ the submitter had a chance to submit all sub-bios and 
that the old code was doing another subtle thing - setting the counter 
of the pending bios to 1, and not incrementing it when doing the final 
submission, thus ensuring everything works out.

I agree your code is much better, however I'd like to have the above 
details put (perhaps slightly reworded) in the changelog so that those 
subtle aspects are more visible to someone reading it some months down 
the line :).

<snip>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/4] btrfs: remove the start argument to check_data_csum
  2022-06-30 16:01 ` [PATCH 3/4] btrfs: remove the start argument to check_data_csum Christoph Hellwig
@ 2022-07-05 15:35   ` Nikolay Borisov
  2022-07-05 17:18     ` Christoph Hellwig
  0 siblings, 1 reply; 14+ messages in thread
From: Nikolay Borisov @ 2022-07-05 15:35 UTC (permalink / raw)
  To: Christoph Hellwig, Chris Mason, Josef Bacik, David Sterba
  Cc: linux-btrfs, Boris Burkov



On 30.06.22 г. 19:01 ч., Christoph Hellwig wrote:
> Just derive it from the btrfs_bio now that ->file_offset is always valid.
> Also make the function available outside of inode.c as we'll need that
> soon.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Boris Burkov <boris@bur.io>
> ---
>   fs/btrfs/ctree.h |  2 ++
>   fs/btrfs/inode.c | 22 +++++++++-------------
>   2 files changed, 11 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 4e2569f84aabc..164f54e6aa447 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3293,6 +3293,8 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
>   unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
>   				    u32 bio_offset, struct page *page,
>   				    u64 start, u64 end);
> +int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, u32 bio_offset,
> +		    struct page *page, u32 pgoff);
>   struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
>   					   u64 start, u64 len);
>   noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index a627b2af9e243..429428fde4a88 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -3396,20 +3396,18 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
>   /*
>    * check_data_csum - verify checksum of one sector of uncompressed data
>    * @inode:	inode
> - * @io_bio:	btrfs_io_bio which contains the csum
> + * @bbio:	btrfs_io_bio which contains the csum
>    * @bio_offset:	offset to the beginning of the bio (in bytes)
>    * @page:	page where is the data to be verified
>    * @pgoff:	offset inside the page
> - * @start:	logical offset in the file
>    *
>    * The length of such check is always one sector size.
>    *
>    * When csum mismatch is detected, we will also report the error and fill the
>    * corrupted range with zero. (Thus it needs the extra parameters)
>    */
> -static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
> -			   u32 bio_offset, struct page *page, u32 pgoff,
> -			   u64 start)
> +int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, u32 bio_offset,
> +		    struct page *page, u32 pgoff)

nit: The removal of the static could be tucked into the next patch as 
that's where it's being used for the first time.

<snip>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio
  2022-07-05 14:40   ` Nikolay Borisov
@ 2022-07-05 17:17     ` Christoph Hellwig
  0 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2022-07-05 17:17 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: Christoph Hellwig, Chris Mason, Josef Bacik, David Sterba,
	linux-btrfs, Boris Burkov

On Tue, Jul 05, 2022 at 05:40:22PM +0300, Nikolay Borisov wrote:
> nit: This slightly changes the semantics of the function because with the 
> old code the bio could have been completed in submit_compressed_write iff 
> there was an error during submission for one of the sub-bios. Whilst with 
> this new code there is a chance even in the success case this happens (if 
> the sub bios complete by the time we arrive at this code).

Yes.

> Generally that'd 
> be very unlikely due to io latency and indeed this code becomes effective 
> iff there is an error. Personally I'd like such changes to be called out 
> explicitly in the change log or at least with a comment.

Ok, I can spell this out a little more explicitly.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/4] btrfs: remove the start argument to check_data_csum
  2022-07-05 15:35   ` Nikolay Borisov
@ 2022-07-05 17:18     ` Christoph Hellwig
  0 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2022-07-05 17:18 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: Christoph Hellwig, Chris Mason, Josef Bacik, David Sterba,
	linux-btrfs, Boris Burkov

On Tue, Jul 05, 2022 at 06:35:59PM +0300, Nikolay Borisov wrote:
>> +int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, u32 bio_offset,
>> +		    struct page *page, u32 pgoff)
>
> nit: The removal of the static could be tucked into the next patch as 
> that's where it's being used for the first time.

I can do that if it is generally preferred, but this version causes less
churn, especially with the annoying btrfs style of the variable prototype
continuation indentation.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/4] btrfs: fix repair of compressed extents
  2022-06-30 16:01 ` [PATCH 4/4] btrfs: fix repair of compressed extents Christoph Hellwig
@ 2022-07-07 12:50   ` Nikolay Borisov
  2022-07-07 13:30     ` Christoph Hellwig
  0 siblings, 1 reply; 14+ messages in thread
From: Nikolay Borisov @ 2022-07-07 12:50 UTC (permalink / raw)
  To: Christoph Hellwig, Chris Mason, Josef Bacik, David Sterba; +Cc: linux-btrfs



On 30.06.22 г. 19:01 ч., Christoph Hellwig wrote:
> Currently the checksum of compressed extents is verified based on the
> compressed data and the lower btrfs_bio, but the actual repair process
> is driven by end_bio_extent_readpage on the upper btrfs_bio for the
> decompressed data.
> 
> This has a bunch of issues, including not being able to properly
> communicate the failed mirror up in case the I/O submission got
> preempted, a general loss of information about whether an error was an
> I/O error or a checksum verification failure, and most importantly that
> this design causes btrfs_clean_io_failure to eventually write back the
> uncompressed good data onto the disk sectors that are supposed to
> contain compressed data.
> 
> Fix this by moving the repair to the lower btrfs_bio.  To do so, a fair
> amount of code has to be reshuffled:
> 
>   a) the lower btrfs_bio now needs a valid csum pointer.  The easiest way
>      to achieve that is to pass NULL to btrfs_lookup_bio_sums and just use
>      the btrfs_bio management of csums.  For a compressed_bio that is
>      split into multiple btrfs_bios this means additional memory
>      allocations, but the code becomes a lot more regular.
>   b) checksum verification now runs directly on the lower btrfs_bio instead
>      of the compressed_bio.  This actually nicely simplifies the end I/O
>      processing.
>   c) btrfs_repair_one_sector can't just look up the logical address for
>      the file offset any more, as there are no corresponding relative
>      offsets that apply to both the file offset and the logical address
>      for compressed extents.  Instead require that the saved bvec_iter in
>      the btrfs_bio is filled out for all read bios and use that, which
>      again removes a fair amount of code.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Overall it looks good but there are a couple of minor nits.

> ---
>   fs/btrfs/compression.c | 171 ++++++++++++++---------------------------
>   fs/btrfs/compression.h |   7 --
>   fs/btrfs/extent_io.c   |  46 +++--------
>   fs/btrfs/extent_io.h   |   1 -
>   fs/btrfs/inode.c       |   7 ++
>   5 files changed, 75 insertions(+), 157 deletions(-)
> 
> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index e756da640fd7b..c8b14a5bd89be 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -136,66 +136,14 @@ static int compression_decompress(int type, struct list_head *ws,
>   
>   static int btrfs_decompress_bio(struct compressed_bio *cb);
>   
> -static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
> -				      unsigned long disk_size)
> -{
> -	return sizeof(struct compressed_bio) +
> -		(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
> -}
> -
> -static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
> -				 u64 disk_start)
> -{
> -	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> -	const u32 csum_size = fs_info->csum_size;
> -	const u32 sectorsize = fs_info->sectorsize;
> -	struct page *page;
> -	unsigned int i;
> -	u8 csum[BTRFS_CSUM_SIZE];
> -	struct compressed_bio *cb = bio->bi_private;
> -	u8 *cb_sum = cb->sums;
> -
> -	if ((inode->flags & BTRFS_INODE_NODATASUM) ||
> -	    test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
> -		return 0;
> -
> -	for (i = 0; i < cb->nr_pages; i++) {
> -		u32 pg_offset;
> -		u32 bytes_left = PAGE_SIZE;
> -		page = cb->compressed_pages[i];
> -
> -		/* Determine the remaining bytes inside the page first */
> -		if (i == cb->nr_pages - 1)
> -			bytes_left = cb->compressed_len - i * PAGE_SIZE;
> -
> -		/* Hash through the page sector by sector */
> -		for (pg_offset = 0; pg_offset < bytes_left;
> -		     pg_offset += sectorsize) {
> -			int ret;
> -
> -			ret = btrfs_check_sector_csum(fs_info, page, pg_offset,
> -						      csum, cb_sum);
> -			if (ret) {
> -				btrfs_print_data_csum_error(inode, disk_start,
> -						csum, cb_sum, cb->mirror_num);
> -				if (btrfs_bio(bio)->device)
> -					btrfs_dev_stat_inc_and_print(
> -						btrfs_bio(bio)->device,
> -						BTRFS_DEV_STAT_CORRUPTION_ERRS);
> -				return -EIO;
> -			}
> -			cb_sum += csum_size;
> -			disk_start += sectorsize;
> -		}
> -	}
> -	return 0;
> -}
> -
>   static void finish_compressed_bio_read(struct compressed_bio *cb)
>   {
>   	unsigned int index;
>   	struct page *page;
>   
> +	if (cb->status == BLK_STS_OK)
> +		cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));

nit: That's a sneaky line; initially I was wondering "huh, where did the 
btrfs_decompress_bio() call go". If David thinks the same, I think it's 
best if the function call is on a separate line.
> +
>   	/* Release the compressed pages */
>   	for (index = 0; index < cb->nr_pages; index++) {
>   		page = cb->compressed_pages[index];
> @@ -233,59 +181,54 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
>   	kfree(cb);
>   }
>   
> -/* when we finish reading compressed pages from the disk, we
> - * decompress them and then run the bio end_io routines on the
> - * decompressed pages (in the inode address space).
> - *
> - * This allows the checksumming and other IO error handling routines
> - * to work normally
> - *
> - * The compressed pages are freed here, and it must be run
> - * in process context
> +/*
> + * Verify the checksums and kick off repair if needed on the uncompressed data
> + * before decompressing it into the original bio and freeing the uncompressed
> + * pages.
>    */
>   static void end_compressed_bio_read(struct bio *bio)
>   {
>   	struct compressed_bio *cb = bio->bi_private;
> -	struct inode *inode;
> -	unsigned int mirror = btrfs_bio(bio)->mirror_num;
> -	int ret = 0;
> -
> -	if (bio->bi_status)
> -		cb->status = bio->bi_status;
> -
> -	if (!refcount_dec_and_test(&cb->pending_ios))
> -		goto out;
> -
> -	/*
> -	 * Record the correct mirror_num in cb->orig_bio so that
> -	 * read-repair can work properly.
> -	 */
> -	btrfs_bio(cb->orig_bio)->mirror_num = mirror;
> -	cb->mirror_num = mirror;
> -
> -	/*
> -	 * Some IO in this cb have failed, just skip checksum as there
> -	 * is no way it could be correct.
> -	 */
> -	if (cb->status != BLK_STS_OK)
> -		goto csum_failed;
> +	struct inode *inode = cb->inode;
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	struct btrfs_inode *bi = BTRFS_I(inode);
> +	bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
> +		    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
> +	blk_status_t status = bio->bi_status;
> +	struct btrfs_bio *bbio = btrfs_bio(bio);
> +	struct bvec_iter iter;
> +	struct bio_vec bv;
> +	u32 offset;
> +
> +	btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
> +		u64 start = bbio->file_offset + offset;
> +
> +		if (!status &&
> +		    (!csum || !check_data_csum(inode, bbio, offset, bv.bv_page,
> +					       bv.bv_offset))) {

In the !csum case you'd be executing a lot of code for no gain, i.e.
clean_io_failure(). Instead, factor the !csum case out as a break from
btrfs_bio_for_each_sector, since there is no point in running
clean_io_failure() for every sector in this case.
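
Roughly what I have in mind, as an untested sketch (same names as in the
patch, the inner comment stands in for the existing per-sector code):

	/* Only walk the sectors when there is something to verify or repair. */
	if (csum || status != BLK_STS_OK) {
		btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
			/* csum check, clean_io_failure() and repair as above */
		}
	}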

> +			clean_io_failure(fs_info, &bi->io_failure_tree,
> +					 &bi->io_tree, start, bv.bv_page,
> +					 btrfs_ino(bi), bv.bv_offset);
> +		} else {
> +			int ret;
>   
> -	inode = cb->inode;
> -	ret = check_compressed_csum(BTRFS_I(inode), bio,
> -				    bio->bi_iter.bi_sector << 9);
> -	if (ret)
> -		goto csum_failed;
> +			refcount_inc(&cb->pending_ios);
> +			ret = btrfs_repair_one_sector(inode, bbio, offset,
> +					bv.bv_page, bv.bv_offset,
> +					btrfs_submit_data_read_bio);
> +			if (ret) {
> +				refcount_dec(&cb->pending_ios);
> +				status = errno_to_blk_status(ret);
> +			}
> +		}
> +	}
>   
> -	/* ok, we're the last bio for this extent, lets start
> -	 * the decompression.
> -	 */
> -	ret = btrfs_decompress_bio(cb);
> +	if (status)
> +		cb->status = status;
>   
> -csum_failed:
> -	if (ret)
> -		cb->status = errno_to_blk_status(ret);
> -	finish_compressed_bio_read(cb);
> -out:
> +	if (refcount_dec_and_test(&cb->pending_ios))
> +		finish_compressed_bio_read(cb);
> +	btrfs_bio_free_csum(bbio);
>   	bio_put(bio);
>   }
>   
> @@ -478,7 +421,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
>   
>   	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
>   	       IS_ALIGNED(len, fs_info->sectorsize));
> -	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
> +	cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);

nit: This change doesn't belong in this patch - indeed we don't need
to allocate the flex array at the end of compressed_bio for writes, as
the csums are stored in the ordered extents for the bio, but it's
still independent of this patch.
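
For comparison, the two allocation sizes look roughly like this (purely
illustrative, based on the helper removed above):

	/* read path: struct plus one csum per sector of compressed data */
	cb = kmalloc(sizeof(struct compressed_bio) +
		     DIV_ROUND_UP(compressed_len, fs_info->sectorsize) *
		     fs_info->csum_size, GFP_NOFS);

	/* write path: csums go into the ordered extent, the bare struct is enough */
	cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);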

>   	if (!cb)
>   		return BLK_STS_RESOURCE;
>   	refcount_set(&cb->pending_ios, 1);

<snip>

> @@ -2573,41 +2570,14 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
>   	failrec->start = start;
>   	failrec->len = sectorsize;
>   	failrec->failed_mirror = failrec->this_mirror = bbio->mirror_num;
> -	failrec->compress_type = BTRFS_COMPRESS_NONE;
> -
> -	read_lock(&em_tree->lock);
> -	em = lookup_extent_mapping(em_tree, start, failrec->len);
> -	if (!em) {
> -		read_unlock(&em_tree->lock);
> -		kfree(failrec);
> -		return ERR_PTR(-EIO);
> -	}
> -
> -	if (em->start > start || em->start + em->len <= start) {
> -		free_extent_map(em);
> -		em = NULL;
> -	}
> -	read_unlock(&em_tree->lock);
> -	if (!em) {
> -		kfree(failrec);
> -		return ERR_PTR(-EIO);
> -	}
> -
> -	logical = start - em->start;
> -	logical = em->block_start + logical;
> -	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
> -		logical = em->block_start;
> -		failrec->compress_type = em->compress_type;
> -	}
> +	failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
>   
>   	btrfs_debug(fs_info,
> -		    "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
> -		    logical, start, failrec->len);
> -
> -	failrec->logical = logical;
> -	free_extent_map(em);
> +		    "Get IO Failure Record: (new) logical=%llu, start=%llu",
> +		    failrec->logical, start);

nit: While at it, the '(new)' could be removed as I don't think it's
particularly informative; I guess David can also do this fixup. Also,
let's not remove the len: sure, it will always (at least for the time
being) be a sectorsize, but at least it's explicit in the error message.
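
I.e. keep something along these lines (untested, just to show what I
mean):

	btrfs_debug(fs_info,
		    "get IO failure record: logical=%llu, start=%llu, len=%llu",
		    failrec->logical, start, failrec->len);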

>   
> -	failrec->num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
> +	failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical,
> +					       sectorsize);
>   	if (failrec->num_copies == 1) {
>   		/*
>   		 * we only have a single copy of the data, so don't bother with

<snip>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/4] btrfs: fix repair of compressed extents
  2022-07-07 12:50   ` Nikolay Borisov
@ 2022-07-07 13:30     ` Christoph Hellwig
  0 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2022-07-07 13:30 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: Christoph Hellwig, Chris Mason, Josef Bacik, David Sterba, linux-btrfs

On Thu, Jul 07, 2022 at 03:50:34PM +0300, Nikolay Borisov wrote:
>> +	btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
>> +		u64 start = bbio->file_offset + offset;
>> +
>> +		if (!status &&
>> +		    (!csum || !check_data_csum(inode, bbio, offset, bv.bv_page,
>> +					       bv.bv_offset))) {
>
> In the !csum case you'd be executing a lot of code for no gain, i.e.
> clean_io_failure(). Instead, factor the !csum case out as a break from
> btrfs_bio_for_each_sector, since there is no point in running
> clean_io_failure() for every sector in this case.

We still need to call clean_io_failure in that case, as repair can
also happen when the I/O failed, even without checksums.  Note that this
code is also just a copy and paste from the direct I/O completion
handler, and an equivalent but more obfuscated version of the same
logic also exists in the buffered I/O completion path.
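
To spell out the resulting per-sector decision (simplified sketch, some
argument lists are elided):

	if (status != BLK_STS_OK ||
	    (csum && check_data_csum(inode, bbio, offset, bv.bv_page,
				     bv.bv_offset)))
		/* bad csum or failed I/O: try the other mirrors */
		btrfs_repair_one_sector(...);
	else
		/* good sector: clear any earlier failure record */
		clean_io_failure(...);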

(and before anyone asks, I do have a WIP patchset to consolidate the
logic, and remove the calls to clean_io_failure for non-repair bio
completions, but it will take a while to get there).

>> -	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
>> +	cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
>
> nit: This change doesn't belong in this patch - indeed we don't need
> to allocate the flex array at the end of compressed_bio for writes, as
> the csums are stored in the ordered extents for the bio, but it's
> still independent of this patch.

True.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio
  2022-06-29 23:42   ` Boris Burkov
@ 2022-06-30  4:22     ` Christoph Hellwig
  0 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2022-06-30  4:22 UTC (permalink / raw)
  To: Boris Burkov
  Cc: Christoph Hellwig, Chris Mason, Josef Bacik, David Sterba, linux-btrfs

On Wed, Jun 29, 2022 at 04:42:14PM -0700, Boris Burkov wrote:
> On Thu, Jun 23, 2022 at 07:53:35AM +0200, Christoph Hellwig wrote:
> > Instead of counting the bytes just count the bios, with an extra
> > reference held during submission.  This significantly simplifies the
> > submission side error handling.
> 
> Interestingly, this more or less exactly undoes the patch:
> 
> btrfs: introduce compressed_bio::pending_sectors to trace compressed bio
> 
> which introduced the sector counting, asserting that counting bios was
> awkward. FWIW, in my opinion, counting from 1 feels worth it to avoid
> having to add up the size and to simplify the error handling.

Looking at the commit history: yes, it kind of does, but this new
version actually has several advantages over the version before that
commit as well.  One is the extra bias on the refcount; others come
from APIs that were fixed in the meantime, like actually propagating
the error code.
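
For reference, the submission-side scheme now looks roughly like this
(illustrative sketch only; the remaining arguments, flags, page handling
and error handling are elided):

	refcount_set(&cb->pending_ios, 1);	/* bias held by the submitter */

	while (cur_disk_bytenr < disk_start + compressed_len) {
		/*
		 * alloc_compressed_bio() takes an extra reference on
		 * cb->pending_ios for each bio it hands out.
		 */
		bio = alloc_compressed_bio(cb, cur_disk_bytenr, ...);

		/* add pages, csum and submit; the endio drops that reference */
	}

	/* Drop the bias; if all bios have completed this finishes the cb. */
	if (refcount_dec_and_test(&cb->pending_ios))
		finish_compressed_bio_write(cb);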

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio
  2022-06-23  5:53 ` [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio Christoph Hellwig
@ 2022-06-29 23:42   ` Boris Burkov
  2022-06-30  4:22     ` Christoph Hellwig
  0 siblings, 1 reply; 14+ messages in thread
From: Boris Burkov @ 2022-06-29 23:42 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Chris Mason, Josef Bacik, David Sterba, linux-btrfs

On Thu, Jun 23, 2022 at 07:53:35AM +0200, Christoph Hellwig wrote:
> Instead of counting the bytes just count the bios, with an extra
> reference held during submission.  This significantly simplifies the
> submission side error handling.

Interestingly, this more or less exactly undoes the patch:

btrfs: introduce compressed_bio::pending_sectors to trace compressed bio

which introduced the sector counting, asserting that counting bios was
awkward. FWIW, in my opinion, counting from 1 feels worth it to avoid
having to add up the size and to simplify the error handling.

> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Boris Burkov <boris@bur.io>
> ---
>  fs/btrfs/compression.c | 126 ++++++++++-------------------------------
>  fs/btrfs/compression.h |   4 +-
>  2 files changed, 33 insertions(+), 97 deletions(-)
> 
> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index 907fc8a4c092c..e756da640fd7b 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -191,44 +191,6 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
>  	return 0;
>  }
>  
> -/*
> - * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
> - *
> - * Return true if there is no pending bio nor io.
> - * Return false otherwise.
> - */
> -static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
> -{
> -	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
> -	unsigned int bi_size = 0;
> -	bool last_io = false;
> -	struct bio_vec *bvec;
> -	struct bvec_iter_all iter_all;
> -
> -	/*
> -	 * At endio time, bi_iter.bi_size doesn't represent the real bio size.
> -	 * Thus here we have to iterate through all segments to grab correct
> -	 * bio size.
> -	 */
> -	bio_for_each_segment_all(bvec, bio, iter_all)
> -		bi_size += bvec->bv_len;
> -
> -	if (bio->bi_status)
> -		cb->status = bio->bi_status;
> -
> -	ASSERT(bi_size && bi_size <= cb->compressed_len);
> -	last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
> -					&cb->pending_sectors);
> -	/*
> -	 * Here we must wake up the possible error handler after all other
> -	 * operations on @cb finished, or we can race with
> -	 * finish_compressed_bio_*() which may free @cb.
> -	 */
> -	wake_up_var(cb);
> -
> -	return last_io;
> -}
> -
>  static void finish_compressed_bio_read(struct compressed_bio *cb)
>  {
>  	unsigned int index;
> @@ -288,7 +250,10 @@ static void end_compressed_bio_read(struct bio *bio)
>  	unsigned int mirror = btrfs_bio(bio)->mirror_num;
>  	int ret = 0;
>  
> -	if (!dec_and_test_compressed_bio(cb, bio))
> +	if (bio->bi_status)
> +		cb->status = bio->bi_status;
> +
> +	if (!refcount_dec_and_test(&cb->pending_ios))
>  		goto out;
>  
>  	/*
> @@ -417,7 +382,10 @@ static void end_compressed_bio_write(struct bio *bio)
>  {
>  	struct compressed_bio *cb = bio->bi_private;
>  
> -	if (dec_and_test_compressed_bio(cb, bio)) {
> +	if (bio->bi_status)
> +		cb->status = bio->bi_status;
> +
> +	if (refcount_dec_and_test(&cb->pending_ios)) {
>  		struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
>  
>  		btrfs_record_physical_zoned(cb->inode, cb->start, bio);
> @@ -476,7 +444,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
>  		return ERR_PTR(ret);
>  	}
>  	*next_stripe_start = disk_bytenr + geom.len;
> -
> +	refcount_inc(&cb->pending_ios);
>  	return bio;
>  }
>  
> @@ -503,17 +471,17 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
>  	struct compressed_bio *cb;
>  	u64 cur_disk_bytenr = disk_start;
>  	u64 next_stripe_start;
> -	blk_status_t ret;
>  	int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
>  	const bool use_append = btrfs_use_zone_append(inode, disk_start);
>  	const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
> +	blk_status_t ret = BLK_STS_OK;
>  
>  	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
>  	       IS_ALIGNED(len, fs_info->sectorsize));
>  	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
>  	if (!cb)
>  		return BLK_STS_RESOURCE;
> -	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
> +	refcount_set(&cb->pending_ios, 1);
>  	cb->status = BLK_STS_OK;
>  	cb->inode = &inode->vfs_inode;
>  	cb->start = start;
> @@ -543,8 +511,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
>  				&next_stripe_start);
>  			if (IS_ERR(bio)) {
>  				ret = errno_to_blk_status(PTR_ERR(bio));
> -				bio = NULL;
> -				goto finish_cb;
> +				break;
>  			}
>  			if (blkcg_css)
>  				bio->bi_opf |= REQ_CGROUP_PUNT;
> @@ -588,8 +555,11 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
>  		if (submit) {
>  			if (!skip_sum) {
>  				ret = btrfs_csum_one_bio(inode, bio, start, true);
> -				if (ret)
> -					goto finish_cb;
> +				if (ret) {
> +					bio->bi_status = ret;
> +					bio_endio(bio);
> +					break;
> +				}
>  			}
>  
>  			ASSERT(bio->bi_iter.bi_size);
> @@ -598,33 +568,12 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
>  		}
>  		cond_resched();
>  	}
> -	if (blkcg_css)
> -		kthread_associate_blkcg(NULL);
>  
> -	return 0;
> -
> -finish_cb:
>  	if (blkcg_css)
>  		kthread_associate_blkcg(NULL);
>  
> -	if (bio) {
> -		bio->bi_status = ret;
> -		bio_endio(bio);
> -	}
> -	/* Last byte of @cb is submitted, endio will free @cb */
> -	if (cur_disk_bytenr == disk_start + compressed_len)
> -		return ret;
> -
> -	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
> -			   (disk_start + compressed_len - cur_disk_bytenr) >>
> -			   fs_info->sectorsize_bits);
> -	/*
> -	 * Even with previous bio ended, we should still have io not yet
> -	 * submitted, thus need to finish manually.
> -	 */
> -	ASSERT(refcount_read(&cb->pending_sectors));
> -	/* Now we are the only one referring @cb, can finish it safely. */
> -	finish_compressed_bio_write(cb);
> +	if (refcount_dec_and_test(&cb->pending_ios))
> +		finish_compressed_bio_write(cb);
>  	return ret;
>  }
>  
> @@ -830,7 +779,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
>  		goto out;
>  	}
>  
> -	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
> +	refcount_set(&cb->pending_ios, 1);
>  	cb->status = BLK_STS_OK;
>  	cb->inode = inode;
>  	cb->mirror_num = mirror_num;
> @@ -880,9 +829,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
>  					REQ_OP_READ, end_compressed_bio_read,
>  					&next_stripe_start);
>  			if (IS_ERR(comp_bio)) {
> -				ret = errno_to_blk_status(PTR_ERR(comp_bio));
> -				comp_bio = NULL;
> -				goto finish_cb;
> +				cb->status =
> +					errno_to_blk_status(PTR_ERR(comp_bio));
> +				break;
>  			}
>  		}
>  		/*
> @@ -921,8 +870,11 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
>  			unsigned int nr_sectors;
>  
>  			ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
> -			if (ret)
> -				goto finish_cb;
> +			if (ret) {
> +				comp_bio->bi_status = ret;
> +				bio_endio(comp_bio);
> +				break;
> +			}
>  
>  			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
>  						  fs_info->sectorsize);
> @@ -933,6 +885,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
>  			comp_bio = NULL;
>  		}
>  	}
> +
> +	if (refcount_dec_and_test(&cb->pending_ios))
> +		finish_compressed_bio_read(cb);
>  	return;
>  
>  fail:
> @@ -950,25 +905,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
>  	bio->bi_status = ret;
>  	bio_endio(bio);
>  	return;
> -finish_cb:
> -	if (comp_bio) {
> -		comp_bio->bi_status = ret;
> -		bio_endio(comp_bio);
> -	}
> -	/* All bytes of @cb is submitted, endio will free @cb */
> -	if (cur_disk_byte == disk_bytenr + compressed_len)
> -		return;
> -
> -	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
> -			   (disk_bytenr + compressed_len - cur_disk_byte) >>
> -			   fs_info->sectorsize_bits);
> -	/*
> -	 * Even with previous bio ended, we should still have io not yet
> -	 * submitted, thus need to finish @cb manually.
> -	 */
> -	ASSERT(refcount_read(&cb->pending_sectors));
> -	/* Now we are the only one referring @cb, can finish it safely. */
> -	finish_compressed_bio_read(cb);
>  }
>  
>  /*
> diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
> index 5fca7603e928a..0e4cbf04fd866 100644
> --- a/fs/btrfs/compression.h
> +++ b/fs/btrfs/compression.h
> @@ -30,8 +30,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
>  #define	BTRFS_ZLIB_DEFAULT_LEVEL		3
>  
>  struct compressed_bio {
> -	/* Number of sectors with unfinished IO (unsubmitted or unfinished) */
> -	refcount_t pending_sectors;
> +	/* Number of outstanding bios */
> +	refcount_t pending_ios;
>  
>  	/* Number of compressed pages in the array */
>  	unsigned int nr_pages;
> -- 
> 2.30.2
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio
  2022-06-23  5:53 fix read repair on " Christoph Hellwig
@ 2022-06-23  5:53 ` Christoph Hellwig
  2022-06-29 23:42   ` Boris Burkov
  0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2022-06-23  5:53 UTC (permalink / raw)
  To: Chris Mason, Josef Bacik, David Sterba; +Cc: linux-btrfs

Instead of counting the bytes just count the bios, with an extra
reference held during submission.  This significantly simplifies the
submission side error handling.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/compression.c | 126 ++++++++++-------------------------------
 fs/btrfs/compression.h |   4 +-
 2 files changed, 33 insertions(+), 97 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 907fc8a4c092c..e756da640fd7b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -191,44 +191,6 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
 	return 0;
 }
 
-/*
- * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
- *
- * Return true if there is no pending bio nor io.
- * Return false otherwise.
- */
-static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
-{
-	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
-	unsigned int bi_size = 0;
-	bool last_io = false;
-	struct bio_vec *bvec;
-	struct bvec_iter_all iter_all;
-
-	/*
-	 * At endio time, bi_iter.bi_size doesn't represent the real bio size.
-	 * Thus here we have to iterate through all segments to grab correct
-	 * bio size.
-	 */
-	bio_for_each_segment_all(bvec, bio, iter_all)
-		bi_size += bvec->bv_len;
-
-	if (bio->bi_status)
-		cb->status = bio->bi_status;
-
-	ASSERT(bi_size && bi_size <= cb->compressed_len);
-	last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
-					&cb->pending_sectors);
-	/*
-	 * Here we must wake up the possible error handler after all other
-	 * operations on @cb finished, or we can race with
-	 * finish_compressed_bio_*() which may free @cb.
-	 */
-	wake_up_var(cb);
-
-	return last_io;
-}
-
 static void finish_compressed_bio_read(struct compressed_bio *cb)
 {
 	unsigned int index;
@@ -288,7 +250,10 @@ static void end_compressed_bio_read(struct bio *bio)
 	unsigned int mirror = btrfs_bio(bio)->mirror_num;
 	int ret = 0;
 
-	if (!dec_and_test_compressed_bio(cb, bio))
+	if (bio->bi_status)
+		cb->status = bio->bi_status;
+
+	if (!refcount_dec_and_test(&cb->pending_ios))
 		goto out;
 
 	/*
@@ -417,7 +382,10 @@ static void end_compressed_bio_write(struct bio *bio)
 {
 	struct compressed_bio *cb = bio->bi_private;
 
-	if (dec_and_test_compressed_bio(cb, bio)) {
+	if (bio->bi_status)
+		cb->status = bio->bi_status;
+
+	if (refcount_dec_and_test(&cb->pending_ios)) {
 		struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
 
 		btrfs_record_physical_zoned(cb->inode, cb->start, bio);
@@ -476,7 +444,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
 		return ERR_PTR(ret);
 	}
 	*next_stripe_start = disk_bytenr + geom.len;
-
+	refcount_inc(&cb->pending_ios);
 	return bio;
 }
 
@@ -503,17 +471,17 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 	struct compressed_bio *cb;
 	u64 cur_disk_bytenr = disk_start;
 	u64 next_stripe_start;
-	blk_status_t ret;
 	int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
 	const bool use_append = btrfs_use_zone_append(inode, disk_start);
 	const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
+	blk_status_t ret = BLK_STS_OK;
 
 	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
 	       IS_ALIGNED(len, fs_info->sectorsize));
 	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
 	if (!cb)
 		return BLK_STS_RESOURCE;
-	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+	refcount_set(&cb->pending_ios, 1);
 	cb->status = BLK_STS_OK;
 	cb->inode = &inode->vfs_inode;
 	cb->start = start;
@@ -543,8 +511,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 				&next_stripe_start);
 			if (IS_ERR(bio)) {
 				ret = errno_to_blk_status(PTR_ERR(bio));
-				bio = NULL;
-				goto finish_cb;
+				break;
 			}
 			if (blkcg_css)
 				bio->bi_opf |= REQ_CGROUP_PUNT;
@@ -588,8 +555,11 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 		if (submit) {
 			if (!skip_sum) {
 				ret = btrfs_csum_one_bio(inode, bio, start, true);
-				if (ret)
-					goto finish_cb;
+				if (ret) {
+					bio->bi_status = ret;
+					bio_endio(bio);
+					break;
+				}
 			}
 
 			ASSERT(bio->bi_iter.bi_size);
@@ -598,33 +568,12 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 		}
 		cond_resched();
 	}
-	if (blkcg_css)
-		kthread_associate_blkcg(NULL);
 
-	return 0;
-
-finish_cb:
 	if (blkcg_css)
 		kthread_associate_blkcg(NULL);
 
-	if (bio) {
-		bio->bi_status = ret;
-		bio_endio(bio);
-	}
-	/* Last byte of @cb is submitted, endio will free @cb */
-	if (cur_disk_bytenr == disk_start + compressed_len)
-		return ret;
-
-	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
-			   (disk_start + compressed_len - cur_disk_bytenr) >>
-			   fs_info->sectorsize_bits);
-	/*
-	 * Even with previous bio ended, we should still have io not yet
-	 * submitted, thus need to finish manually.
-	 */
-	ASSERT(refcount_read(&cb->pending_sectors));
-	/* Now we are the only one referring @cb, can finish it safely. */
-	finish_compressed_bio_write(cb);
+	if (refcount_dec_and_test(&cb->pending_ios))
+		finish_compressed_bio_write(cb);
 	return ret;
 }
 
@@ -830,7 +779,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		goto out;
 	}
 
-	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+	refcount_set(&cb->pending_ios, 1);
 	cb->status = BLK_STS_OK;
 	cb->inode = inode;
 	cb->mirror_num = mirror_num;
@@ -880,9 +829,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 					REQ_OP_READ, end_compressed_bio_read,
 					&next_stripe_start);
 			if (IS_ERR(comp_bio)) {
-				ret = errno_to_blk_status(PTR_ERR(comp_bio));
-				comp_bio = NULL;
-				goto finish_cb;
+				cb->status =
+					errno_to_blk_status(PTR_ERR(comp_bio));
+				break;
 			}
 		}
 		/*
@@ -921,8 +870,11 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			unsigned int nr_sectors;
 
 			ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
-			if (ret)
-				goto finish_cb;
+			if (ret) {
+				comp_bio->bi_status = ret;
+				bio_endio(comp_bio);
+				break;
+			}
 
 			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
 						  fs_info->sectorsize);
@@ -933,6 +885,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			comp_bio = NULL;
 		}
 	}
+
+	if (refcount_dec_and_test(&cb->pending_ios))
+		finish_compressed_bio_read(cb);
 	return;
 
 fail:
@@ -950,25 +905,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	bio->bi_status = ret;
 	bio_endio(bio);
 	return;
-finish_cb:
-	if (comp_bio) {
-		comp_bio->bi_status = ret;
-		bio_endio(comp_bio);
-	}
-	/* All bytes of @cb is submitted, endio will free @cb */
-	if (cur_disk_byte == disk_bytenr + compressed_len)
-		return;
-
-	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
-			   (disk_bytenr + compressed_len - cur_disk_byte) >>
-			   fs_info->sectorsize_bits);
-	/*
-	 * Even with previous bio ended, we should still have io not yet
-	 * submitted, thus need to finish @cb manually.
-	 */
-	ASSERT(refcount_read(&cb->pending_sectors));
-	/* Now we are the only one referring @cb, can finish it safely. */
-	finish_compressed_bio_read(cb);
 }
 
 /*
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 5fca7603e928a..0e4cbf04fd866 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -30,8 +30,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
 #define	BTRFS_ZLIB_DEFAULT_LEVEL		3
 
 struct compressed_bio {
-	/* Number of sectors with unfinished IO (unsubmitted or unfinished) */
-	refcount_t pending_sectors;
+	/* Number of outstanding bios */
+	refcount_t pending_ios;
 
 	/* Number of compressed pages in the array */
 	unsigned int nr_pages;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2022-07-07 13:30 UTC | newest]

Thread overview: 14+ messages
2022-06-30 16:01 fix read repair on compressed extents v2 Christoph Hellwig
2022-06-30 16:01 ` [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio Christoph Hellwig
2022-07-05 14:40   ` Nikolay Borisov
2022-07-05 17:17     ` Christoph Hellwig
2022-06-30 16:01 ` [PATCH 2/4] btrfs: pass a btrfs_bio to btrfs_repair_one_sector Christoph Hellwig
2022-06-30 16:01 ` [PATCH 3/4] btrfs: remove the start argument to check_data_csum Christoph Hellwig
2022-07-05 15:35   ` Nikolay Borisov
2022-07-05 17:18     ` Christoph Hellwig
2022-06-30 16:01 ` [PATCH 4/4] btrfs: fix repair of compressed extents Christoph Hellwig
2022-07-07 12:50   ` Nikolay Borisov
2022-07-07 13:30     ` Christoph Hellwig
  -- strict thread matches above, loose matches on Subject: below --
2022-06-23  5:53 fix read repair on " Christoph Hellwig
2022-06-23  5:53 ` [PATCH 1/4] btrfs: simplify the pending I/O counting in struct compressed_bio Christoph Hellwig
2022-06-29 23:42   ` Boris Burkov
2022-06-30  4:22     ` Christoph Hellwig
