Re: [PATCH v2 4/7] btrfs: introduce new read-repair infrastructure

From: Qu Wenruo <wqu@suse.com>
To: linux-btrfs@vger.kernel.org, Christoph Hellwig <hch@lst.de>
Subject: Re: [PATCH v2 4/7] btrfs: introduce new read-repair infrastructure
Date: Thu, 26 May 2022 11:06:31 +0800	[thread overview]
Message-ID: <531d3865-eb5b-d114-9ff2-c1b209902262@suse.com> (raw)
In-Reply-To: <b014412ee0713e01f52269e553c0cff3487ca495.1653476251.git.wqu@suse.com>

On 2022/5/25 18:59, Qu Wenruo wrote:
> The new read repair infrastructure is consist of the following 3 parts:
> 
[...]
> +static void io_add_or_submit(struct btrfs_read_repair_ctrl *ctrl, int mirror,
> +			   u64 logical, struct page *page, unsigned int pgoff,
> +			   int opf)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
> +	struct bio *io_bio = ctrl->io_bio;
> +
> +	/* Uninitialized. */
> +	if (io_bio->bi_iter.bi_sector == 0) {
> +		ASSERT(io_bio->bi_iter.bi_size == 0);
> +		io_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
> +		io_bio->bi_opf = opf;
> +		bio_add_page(io_bio, page, fs_info->sectorsize, pgoff);
> +		return;
> +	}
> +
> +	/* Continuous, add the page */
> +	if ((io_bio->bi_iter.bi_sector << SECTOR_SHIFT) +
> +	     io_bio->bi_iter.bi_size == logical) {
> +		bio_add_page(io_bio, page, fs_info->sectorsize, pgoff);
> +		return;
> +	}
> +
> +	/* Not continuous, submit first. */

Hi Christoph, I'm pretty sure the non-continuous bio problem is here for 
all of our attempts to rework read-repair.

I'm wondering if there is some "dummy" page provided from block layer 
that we can utilize?

E.g. We have the following checker pattern:

mirror 1	|X|X|X|X|
mirror 2	|X| |X| |
mirror 3	| |X| |X|

After reading all the 4 sectors from mirror 2, we know the 2nd and 4th 
are good and should not need to be-read.

Then reading mirror 3 needs us to submit two bios.

But if we have some "dummy" pages, and added into the bio for sector 2 
and 4, we only need one bio submission.

Is there such convenient page for us to utilize? Or we have to assign it 
globally?

Thanks,
Qu

> +	io_bio_submit(ctrl, mirror, opf);
> +	io_bio = ctrl->io_bio;
> +	io_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
> +	bio_add_page(io_bio, page, fs_info->sectorsize, pgoff);
> +}
> +
> +static void writeback_good_mirror(struct btrfs_read_repair_ctrl *ctrl,
> +				  int mirror, u64 logical,
> +				  struct page *page, unsigned int pgoff)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
> +	struct bio *io_bio = ctrl->io_bio;
> +
> +
> +	if (btrfs_repair_one_zone(fs_info, ctrl->logical))
> +		return;
> +
> +	/*
> +	 * For RAID56, we can not just write the bad data back, as
> +	 * any write will trigger RMW and read back the corrrupted
> +	 * on-disk stripe, causing further damage.
> +	 * So here we do special repair for raid56.
> +	 *
> +	 * And unfortunately, this repair is very low level and not
> +	 * compatible with the rest of the mirror based repair.
> +	 * So it's still done in synchronous mode using
> +	 * btrfs_repair_io_failure().
> +	 */
> +	if (ctrl->is_raid56) {
> +		const u64 file_offset = logical - ctrl->logical +
> +					ctrl->file_offset;
> +		btrfs_repair_io_failure(fs_info,
> +				btrfs_ino(BTRFS_I(ctrl->inode)), file_offset,
> +				fs_info->sectorsize, logical, page, pgoff,
> +				mirror);
> +		return;
> +	}
> +
> +	ASSERT(io_bio);
> +	io_add_or_submit(ctrl, mirror, logical, page, pgoff, REQ_OP_WRITE);
> +}
> +
> +static void repair_from_mirror(struct btrfs_read_repair_ctrl *ctrl, int mirror)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(ctrl->inode->i_sb);
> +	struct bvec_iter iter;
> +	struct bio_vec bv;
> +	unsigned long old_bitmap = ctrl->bad_bitmap;
> +	const int prev_mirror = get_prev_mirror(mirror, ctrl->num_copies);
> +	int nr_sector;
> +	u32 offset;
> +	int ret;
> +
> +	/*
> +	 * Reset the io_bio logial bytenr so later io_add_or_submit() can do
> +	 * correct check on the logical bytenr.
> +	 */
> +	ctrl->io_bio->bi_iter.bi_sector = 0;
> +
> +	/* Add all bad sectors into io_bio. */
> +	bio_for_each_sector(fs_info, bv, ctrl->bad_sectors, iter, offset) {
> +		u64 logical = ctrl->logical + offset;
> +
> +		nr_sector = offset >> fs_info->sectorsize_bits;
> +
> +		/* Good sectors, no need to handle. */
> +		if (!test_bit(nr_sector, &ctrl->bad_bitmap))
> +			continue;
> +
> +		io_add_or_submit(ctrl, mirror, logical, bv.bv_page,
> +				 bv.bv_offset, REQ_OP_READ | REQ_SYNC);
> +	}
> +	io_bio_submit(ctrl, mirror, REQ_OP_READ | REQ_SYNC);
> +
> +	/* Check the newly read data. */
> +	bio_for_each_sector(fs_info, bv, ctrl->bad_sectors, iter, offset) {
> +		u8 *csum_expected;
> +		u8 csum[BTRFS_CSUM_SIZE];
> +
> +		nr_sector = offset >> fs_info->sectorsize_bits;
> +
> +		/* Originally good sector or read failed, skip. */
> +		if (!test_bit(nr_sector, &old_bitmap) ||
> +		    test_bit(nr_sector, &ctrl->bad_bitmap))
> +			continue;
> +
> +		/* No data csum, only need to repair. */
> +		if (!ctrl->csum)
> +			goto repair;
> +
> +		/*
> +		 * The remaining case is successful read with csum, need
> +		 * recheck the csum.
> +		 */
> +		csum_expected = btrfs_csum_ptr(fs_info, ctrl->csum, offset);
> +		ret = btrfs_check_sector_csum(fs_info, bv.bv_page,
> +				bv.bv_offset, csum, csum_expected);
> +		if (ret) {
> +			set_bit(nr_sector, &ctrl->bad_bitmap);
> +			continue;
> +		}
> +repair:
> +		/*
> +		 * This sector is properly fixed, write it back to previous
> +		 * bad mirror.
> +		 */
> +		writeback_good_mirror(ctrl, prev_mirror, ctrl->logical + offset,
> +				bv.bv_page, bv.bv_offset);
> +	}
> +	/* Submit the last write bio. */
> +	io_bio_submit(ctrl, mirror, REQ_OP_WRITE);
> +}
> +
> +int btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl)
> +{
> +	struct btrfs_fs_info *fs_info;
> +	struct bvec_iter iter;
> +	struct bio_vec bv;
> +	u32 offset;
> +	int nr_sectors;
> +	int mirror;
> +	int ret = -EIO;
> +
> +	if (!ctrl->inode)
> +		return 0;
> +
> +	fs_info = btrfs_sb(ctrl->inode->i_sb);
> +	nr_sectors = ctrl->len >> fs_info->sectorsize_bits;
> +	ASSERT(ctrl->len);
> +	/* All sectors should be bad initially. */
> +	ASSERT(find_first_zero_bit(&ctrl->bad_bitmap, nr_sectors) == nr_sectors);
> +
> +	for (mirror = get_next_mirror(ctrl->failed_mirror, ctrl->num_copies);
> +	     mirror != ctrl->failed_mirror;
> +	     mirror = get_next_mirror(mirror, ctrl->num_copies)) {
> +		repair_from_mirror(ctrl, mirror);
> +
> +		/* All repaired*/
> +		if (find_first_bit(&ctrl->bad_bitmap, nr_sectors) == nr_sectors) {
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	/* DIO doesn't need any page status/extent update.*/
> +	if (!ctrl->is_dio) {
> +		/* Unlock all the pages and unlock the extent range. */
> +		bio_for_each_sector(fs_info, bv, ctrl->bad_sectors, iter,
> +				    offset) {
> +			bool uptodate = !test_bit(offset >>
> +						  fs_info->sectorsize_bits,
> +						  &ctrl->bad_bitmap);
> +
> +			end_sector_io(bv.bv_page, ctrl->file_offset + offset,
> +				      uptodate);
> +		}
> +	}
> +	bio_put(ctrl->bad_sectors);
> +	if (ctrl->io_bio)
> +		bio_put(ctrl->io_bio);
> +	memset(ctrl, 0, sizeof(*ctrl));
> +	return ret;
> +}
> diff --git a/fs/btrfs/read-repair.h b/fs/btrfs/read-repair.h
> new file mode 100644
> index 000000000000..87219c786109
> --- /dev/null
> +++ b/fs/btrfs/read-repair.h
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef BTRFS_READ_REPAIR_H
> +#define BTRFS_READ_REPAIR_H
> +
> +#include <linux/blk_types.h>
> +#include <linux/fs.h>
> +
> +struct btrfs_read_repair_ctrl {
> +	struct inode *inode;
> +
> +	/* The logical bytenr of the firts corrupted sector. */
> +	u64 logical;
> +
> +	/* The file offset of the first corrupted sector. */
> +	u64 file_offset;
> +
> +	/* The checksum for the corrupted sectors. */
> +	u8 *csum;
> +
> +	/* Current length of the corrupted range. */
> +	u32 len;
> +
> +	int failed_mirror;
> +	int num_copies;
> +	unsigned long bad_bitmap;
> +	bool is_raid56;
> +	bool is_dio;
> +
> +	/* This is only to hold all the initial bad continuous sectors. */
> +	struct bio *bad_sectors;
> +
> +	/*
> +	 * The bio we use to do the real IO.
> +	 * This bio has to be btrfs_bio, as btrfs_map_bio() will utilize
> +	 * btrfs_bio()->device.
> +	 */
> +	struct bio *io_bio;
> +};
> +
> +int btrfs_read_repair_add_sector(struct inode *inode,
> +				 struct btrfs_read_repair_ctrl *ctrl,
> +				 struct page *page, unsigned int pgoff,
> +				 u64 logical, u64 file_offset, u8 *csum,
> +				 int failed_mirror, bool is_dio);
> +int btrfs_read_repair_finish(struct btrfs_read_repair_ctrl *ctrl);
> +
> +#endif