All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Btrfs: scrub: batch rebuild for raid56
@ 2018-03-02 23:10 Liu Bo
  2018-03-06 10:47 ` David Sterba
  2018-03-07 19:08 ` [PATCH v2] " Liu Bo
  0 siblings, 2 replies; 8+ messages in thread
From: Liu Bo @ 2018-03-02 23:10 UTC (permalink / raw)
  To: linux-btrfs

In case of raid56, writes and rebuilds always take BTRFS_STRIPE_LEN(64K)
as the unit, however, scrub_extent() sets blocksize as the unit, so the
rebuild process may be triggered on every block of the same stripe.

A typical example would be that when we're replacing a disappeared disk,
all reads on the disks get -EIO, so every block (size is 4K if blocksize is
4K) would go through the following,

scrub_handle_errored_block
  scrub_recheck_block # re-read pages one by one
  scrub_recheck_block # rebuild by calling raid56_parity_recover()
                        page by page

Although with the raid56 stripe cache most of the reads during rebuild can be
avoided, the parity recovery calculation (XOR or raid6 algorithms) needs to
be done $(BTRFS_STRIPE_LEN / blocksize) times.

This makes it less stupid by doing raid56 scrub/replace on stripe length.
---
 fs/btrfs/scrub.c | 78 +++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 9882513..e3203a1 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1718,6 +1718,44 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
 	return blk_status_to_errno(bio->bi_status);
 }
 
+static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
+					  struct scrub_block *sblock)
+{
+	struct scrub_page *first_page = sblock->pagev[0];
+	struct bio *bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
+	int page_num;
+
+	/* All pages in sblock belongs to the same stripe on the same device. */
+	ASSERT(first_page->dev);
+	if (first_page->dev->bdev == NULL)
+		goto out;
+
+	bio_set_dev(bio, first_page->dev->bdev);
+
+	for (page_num = 0; page_num < sblock->page_count; page_num++) {
+		struct scrub_page *page = sblock->pagev[page_num];
+
+		WARN_ON(!page->page);
+		bio_add_page(bio, page->page, PAGE_SIZE, 0);
+	}
+
+	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
+		bio_put(bio);
+		goto out;
+	}
+
+	bio_put(bio);
+
+	scrub_recheck_block_checksum(sblock);
+
+	return;
+out:
+	for (page_num = 0; page_num < sblock->page_count; page_num++)
+		sblock->pagev[page_num]->io_error = 1;
+
+	sblock->no_io_error_seen = 0;
+}
+
 /*
  * this function will check the on disk data for checksum errors, header
  * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1733,6 +1771,10 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
 	sblock->no_io_error_seen = 1;
 
+	/* short cut for raid56 */
+	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
+		return scrub_recheck_block_on_raid56(fs_info, sblock);
+
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		struct bio *bio;
 		struct scrub_page *page = sblock->pagev[page_num];
@@ -1748,19 +1790,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 		bio_set_dev(bio, page->dev->bdev);
 
 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
-		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
-			if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
-				page->io_error = 1;
-				sblock->no_io_error_seen = 0;
-			}
-		} else {
-			bio->bi_iter.bi_sector = page->physical >> 9;
-			bio_set_op_attrs(bio, REQ_OP_READ, 0);
+		bio->bi_iter.bi_sector = page->physical >> 9;
+		bio_set_op_attrs(bio, REQ_OP_READ, 0);
 
-			if (btrfsic_submit_bio_wait(bio)) {
-				page->io_error = 1;
-				sblock->no_io_error_seen = 0;
-			}
+		if (btrfsic_submit_bio_wait(bio)) {
+			page->io_error = 1;
+			sblock->no_io_error_seen = 0;
 		}
 
 		bio_put(bio);
@@ -2728,7 +2763,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
 }
 
 /* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
+			u64 logical, u64 len,
 			u64 physical, struct btrfs_device *dev, u64 flags,
 			u64 gen, int mirror_num, u64 physical_for_dev_replace)
 {
@@ -2737,13 +2773,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 	u32 blocksize;
 
 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
-		blocksize = sctx->fs_info->sectorsize;
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+			blocksize = map->stripe_len;
+		else
+			blocksize = sctx->fs_info->sectorsize;
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.data_extents_scrubbed++;
 		sctx->stat.data_bytes_scrubbed += len;
 		spin_unlock(&sctx->stat_lock);
 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		blocksize = sctx->fs_info->nodesize;
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+			blocksize = map->stripe_len;
+		else
+			blocksize = sctx->fs_info->nodesize;
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.tree_extents_scrubbed++;
 		sctx->stat.tree_bytes_scrubbed += len;
@@ -2883,9 +2925,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
 	}
 
 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
-		blocksize = sctx->fs_info->sectorsize;
+		blocksize = sparity->stripe_len;
 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		blocksize = sctx->fs_info->nodesize;
+		blocksize = sparity->stripe_len;
 	} else {
 		blocksize = sctx->fs_info->sectorsize;
 		WARN_ON(1);
@@ -3595,7 +3637,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			if (ret)
 				goto out;
 
-			ret = scrub_extent(sctx, extent_logical, extent_len,
+			ret = scrub_extent(sctx, map, extent_logical, extent_len,
 					   extent_physical, extent_dev, flags,
 					   generation, extent_mirror_num,
 					   extent_logical - logical + physical);
-- 
2.9.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: scrub: batch rebuild for raid56
  2018-03-02 23:10 [PATCH] Btrfs: scrub: batch rebuild for raid56 Liu Bo
@ 2018-03-06 10:47 ` David Sterba
  2018-03-06 18:22   ` Liu Bo
  2018-03-07 19:08 ` [PATCH v2] " Liu Bo
  1 sibling, 1 reply; 8+ messages in thread
From: David Sterba @ 2018-03-06 10:47 UTC (permalink / raw)
  To: Liu Bo; +Cc: linux-btrfs

On Fri, Mar 02, 2018 at 04:10:37PM -0700, Liu Bo wrote:
> In case of raid56, writes and rebuilds always take BTRFS_STRIPE_LEN(64K)
> as unit, however, scrub_extent() sets blocksize as unit, so rebuild
> process may be triggered on every block on a same stripe.
> 
> A typical example would be that when we're replacing a disappeared disk,
> all reads on the disks get -EIO, every block (size is 4K if blocksize is
> 4K) would go thru these,
> 
> scrub_handle_errored_block
>   scrub_recheck_block # re-read pages one by one
>   scrub_recheck_block # rebuild by calling raid56_parity_recover()
>                         page by page
> 
> Although with raid56 stripe cache most of reads during rebuild can be
> avoided, the parity recover calculation(xor or raid6 algorithms) needs to
> be done $(BTRFS_STRIPE_LEN / blocksize) times.
> 
> This makes it less stupid by doing raid56 scrub/replace on stripe length.

missing s-o-b

> ---
>  fs/btrfs/scrub.c | 78 +++++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 60 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index 9882513..e3203a1 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -1718,6 +1718,44 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
>  	return blk_status_to_errno(bio->bi_status);
>  }
>  
> +static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
> +					  struct scrub_block *sblock)
> +{
> +	struct scrub_page *first_page = sblock->pagev[0];
> +	struct bio *bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);

nontrivial initializations (variable to variable) are better put into
the statement section.

> +	int page_num;
> +
> +	/* All pages in sblock belongs to the same stripe on the same device. */
> +	ASSERT(first_page->dev);
> +	if (first_page->dev->bdev == NULL)
> +		goto out;
> +
> +	bio_set_dev(bio, first_page->dev->bdev);
> +
> +	for (page_num = 0; page_num < sblock->page_count; page_num++) {
> +		struct scrub_page *page = sblock->pagev[page_num];
> +
> +		WARN_ON(!page->page);
> +		bio_add_page(bio, page->page, PAGE_SIZE, 0);
> +	}
> +
> +	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
> +		bio_put(bio);
> +		goto out;
> +	}
> +
> +	bio_put(bio);
> +
> +	scrub_recheck_block_checksum(sblock);
> +
> +	return;
> +out:
> +	for (page_num = 0; page_num < sblock->page_count; page_num++)
> +		sblock->pagev[page_num]->io_error = 1;
> +
> +	sblock->no_io_error_seen = 0;
> +}
> +
>  /*
>   * this function will check the on disk data for checksum errors, header
>   * errors and read I/O errors. If any I/O errors happen, the exact pages
> @@ -1733,6 +1771,10 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
>  
>  	sblock->no_io_error_seen = 1;
>  
> +	/* short cut for raid56 */
> +	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
> +		return scrub_recheck_block_on_raid56(fs_info, sblock);
> +
>  	for (page_num = 0; page_num < sblock->page_count; page_num++) {
>  		struct bio *bio;
>  		struct scrub_page *page = sblock->pagev[page_num];
> @@ -1748,19 +1790,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
>  		bio_set_dev(bio, page->dev->bdev);
>  
>  		bio_add_page(bio, page->page, PAGE_SIZE, 0);
> -		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
> -			if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
> -				page->io_error = 1;
> -				sblock->no_io_error_seen = 0;
> -			}
> -		} else {
> -			bio->bi_iter.bi_sector = page->physical >> 9;
> -			bio_set_op_attrs(bio, REQ_OP_READ, 0);
> +		bio->bi_iter.bi_sector = page->physical >> 9;
> +		bio_set_op_attrs(bio, REQ_OP_READ, 0);

https://elixir.bootlin.com/linux/latest/source/include/linux/blk_types.h#L270

bio_set_op_attrs should not be used

>  
> -			if (btrfsic_submit_bio_wait(bio)) {
> -				page->io_error = 1;
> -				sblock->no_io_error_seen = 0;
> -			}
> +		if (btrfsic_submit_bio_wait(bio)) {
> +			page->io_error = 1;
> +			sblock->no_io_error_seen = 0;
>  		}
>  
>  		bio_put(bio);
> @@ -2728,7 +2763,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
>  }
>  
>  /* scrub extent tries to collect up to 64 kB for each bio */
> -static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
> +static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
> +			u64 logical, u64 len,
>  			u64 physical, struct btrfs_device *dev, u64 flags,
>  			u64 gen, int mirror_num, u64 physical_for_dev_replace)
>  {
> @@ -2737,13 +2773,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
>  	u32 blocksize;
>  
>  	if (flags & BTRFS_EXTENT_FLAG_DATA) {
> -		blocksize = sctx->fs_info->sectorsize;
> +		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
> +			blocksize = map->stripe_len;
> +		else
> +			blocksize = sctx->fs_info->sectorsize;
>  		spin_lock(&sctx->stat_lock);
>  		sctx->stat.data_extents_scrubbed++;
>  		sctx->stat.data_bytes_scrubbed += len;
>  		spin_unlock(&sctx->stat_lock);
>  	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
> -		blocksize = sctx->fs_info->nodesize;
> +		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
> +			blocksize = map->stripe_len;
> +		else
> +			blocksize = sctx->fs_info->nodesize;
>  		spin_lock(&sctx->stat_lock);
>  		sctx->stat.tree_extents_scrubbed++;
>  		sctx->stat.tree_bytes_scrubbed += len;
> @@ -2883,9 +2925,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
>  	}
>  
>  	if (flags & BTRFS_EXTENT_FLAG_DATA) {
> -		blocksize = sctx->fs_info->sectorsize;
> +		blocksize = sparity->stripe_len;
>  	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
> -		blocksize = sctx->fs_info->nodesize;
> +		blocksize = sparity->stripe_len;
>  	} else {
>  		blocksize = sctx->fs_info->sectorsize;
>  		WARN_ON(1);
> @@ -3595,7 +3637,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
>  			if (ret)
>  				goto out;
>  
> -			ret = scrub_extent(sctx, extent_logical, extent_len,
> +			ret = scrub_extent(sctx, map, extent_logical, extent_len,
>  					   extent_physical, extent_dev, flags,
>  					   generation, extent_mirror_num,
>  					   extent_logical - logical + physical);
> -- 
> 2.9.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: scrub: batch rebuild for raid56
  2018-03-06 10:47 ` David Sterba
@ 2018-03-06 18:22   ` Liu Bo
  2018-03-07 14:43     ` David Sterba
  0 siblings, 1 reply; 8+ messages in thread
From: Liu Bo @ 2018-03-06 18:22 UTC (permalink / raw)
  To: dsterba, linux-btrfs

On Tue, Mar 06, 2018 at 11:47:47AM +0100, David Sterba wrote:
> On Fri, Mar 02, 2018 at 04:10:37PM -0700, Liu Bo wrote:
> > In case of raid56, writes and rebuilds always take BTRFS_STRIPE_LEN(64K)
> > as unit, however, scrub_extent() sets blocksize as unit, so rebuild
> > process may be triggered on every block on a same stripe.
> > 
> > A typical example would be that when we're replacing a disappeared disk,
> > all reads on the disks get -EIO, every block (size is 4K if blocksize is
> > 4K) would go thru these,
> > 
> > scrub_handle_errored_block
> >   scrub_recheck_block # re-read pages one by one
> >   scrub_recheck_block # rebuild by calling raid56_parity_recover()
> >                         page by page
> > 
> > Although with raid56 stripe cache most of reads during rebuild can be
> > avoided, the parity recover calculation(xor or raid6 algorithms) needs to
> > be done $(BTRFS_STRIPE_LEN / blocksize) times.
> > 
> > This makes it less stupid by doing raid56 scrub/replace on stripe length.
> 
> missing s-o-b
>

I'm surprised that checkpatch.pl didn't complain.

> > ---
> >  fs/btrfs/scrub.c | 78 +++++++++++++++++++++++++++++++++++++++++++-------------
> >  1 file changed, 60 insertions(+), 18 deletions(-)
> > 
> > diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> > index 9882513..e3203a1 100644
> > --- a/fs/btrfs/scrub.c
> > +++ b/fs/btrfs/scrub.c
> > @@ -1718,6 +1718,44 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
> >  	return blk_status_to_errno(bio->bi_status);
> >  }
> >  
> > +static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
> > +					  struct scrub_block *sblock)
> > +{
> > +	struct scrub_page *first_page = sblock->pagev[0];
> > +	struct bio *bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
> 
> nontrivial initializations (variable to variable) are better put into
> the statement section.
>

OK.

> > +	int page_num;
> > +
> > +	/* All pages in sblock belongs to the same stripe on the same device. */
> > +	ASSERT(first_page->dev);
> > +	if (first_page->dev->bdev == NULL)
> > +		goto out;
> > +
> > +	bio_set_dev(bio, first_page->dev->bdev);
> > +
> > +	for (page_num = 0; page_num < sblock->page_count; page_num++) {
> > +		struct scrub_page *page = sblock->pagev[page_num];
> > +
> > +		WARN_ON(!page->page);
> > +		bio_add_page(bio, page->page, PAGE_SIZE, 0);
> > +	}
> > +
> > +	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
> > +		bio_put(bio);
> > +		goto out;
> > +	}
> > +
> > +	bio_put(bio);
> > +
> > +	scrub_recheck_block_checksum(sblock);
> > +
> > +	return;
> > +out:
> > +	for (page_num = 0; page_num < sblock->page_count; page_num++)
> > +		sblock->pagev[page_num]->io_error = 1;
> > +
> > +	sblock->no_io_error_seen = 0;
> > +}
> > +
> >  /*
> >   * this function will check the on disk data for checksum errors, header
> >   * errors and read I/O errors. If any I/O errors happen, the exact pages
> > @@ -1733,6 +1771,10 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
> >  
> >  	sblock->no_io_error_seen = 1;
> >  
> > +	/* short cut for raid56 */
> > +	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
> > +		return scrub_recheck_block_on_raid56(fs_info, sblock);
> > +
> >  	for (page_num = 0; page_num < sblock->page_count; page_num++) {
> >  		struct bio *bio;
> >  		struct scrub_page *page = sblock->pagev[page_num];
> > @@ -1748,19 +1790,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
> >  		bio_set_dev(bio, page->dev->bdev);
> >  
> >  		bio_add_page(bio, page->page, PAGE_SIZE, 0);
> > -		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
> > -			if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
> > -				page->io_error = 1;
> > -				sblock->no_io_error_seen = 0;
> > -			}
> > -		} else {
> > -			bio->bi_iter.bi_sector = page->physical >> 9;
> > -			bio_set_op_attrs(bio, REQ_OP_READ, 0);
> > +		bio->bi_iter.bi_sector = page->physical >> 9;
> > +		bio_set_op_attrs(bio, REQ_OP_READ, 0);
> 
> https://elixir.bootlin.com/linux/latest/source/include/linux/blk_types.h#L270
> 
> bio_set_op_attrs should not be used

OK.

Thanks,

-liubo
> 
> >  
> > -			if (btrfsic_submit_bio_wait(bio)) {
> > -				page->io_error = 1;
> > -				sblock->no_io_error_seen = 0;
> > -			}
> > +		if (btrfsic_submit_bio_wait(bio)) {
> > +			page->io_error = 1;
> > +			sblock->no_io_error_seen = 0;
> >  		}
> >  
> >  		bio_put(bio);
> > @@ -2728,7 +2763,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
> >  }
> >  
> >  /* scrub extent tries to collect up to 64 kB for each bio */
> > -static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
> > +static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
> > +			u64 logical, u64 len,
> >  			u64 physical, struct btrfs_device *dev, u64 flags,
> >  			u64 gen, int mirror_num, u64 physical_for_dev_replace)
> >  {
> > @@ -2737,13 +2773,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
> >  	u32 blocksize;
> >  
> >  	if (flags & BTRFS_EXTENT_FLAG_DATA) {
> > -		blocksize = sctx->fs_info->sectorsize;
> > +		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
> > +			blocksize = map->stripe_len;
> > +		else
> > +			blocksize = sctx->fs_info->sectorsize;
> >  		spin_lock(&sctx->stat_lock);
> >  		sctx->stat.data_extents_scrubbed++;
> >  		sctx->stat.data_bytes_scrubbed += len;
> >  		spin_unlock(&sctx->stat_lock);
> >  	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
> > -		blocksize = sctx->fs_info->nodesize;
> > +		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
> > +			blocksize = map->stripe_len;
> > +		else
> > +			blocksize = sctx->fs_info->nodesize;
> >  		spin_lock(&sctx->stat_lock);
> >  		sctx->stat.tree_extents_scrubbed++;
> >  		sctx->stat.tree_bytes_scrubbed += len;
> > @@ -2883,9 +2925,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
> >  	}
> >  
> >  	if (flags & BTRFS_EXTENT_FLAG_DATA) {
> > -		blocksize = sctx->fs_info->sectorsize;
> > +		blocksize = sparity->stripe_len;
> >  	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
> > -		blocksize = sctx->fs_info->nodesize;
> > +		blocksize = sparity->stripe_len;
> >  	} else {
> >  		blocksize = sctx->fs_info->sectorsize;
> >  		WARN_ON(1);
> > @@ -3595,7 +3637,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
> >  			if (ret)
> >  				goto out;
> >  
> > -			ret = scrub_extent(sctx, extent_logical, extent_len,
> > +			ret = scrub_extent(sctx, map, extent_logical, extent_len,
> >  					   extent_physical, extent_dev, flags,
> >  					   generation, extent_mirror_num,
> >  					   extent_logical - logical + physical);
> > -- 
> > 2.9.4
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: scrub: batch rebuild for raid56
  2018-03-06 18:22   ` Liu Bo
@ 2018-03-07 14:43     ` David Sterba
  2018-03-07 15:22       ` Nikolay Borisov
  0 siblings, 1 reply; 8+ messages in thread
From: David Sterba @ 2018-03-07 14:43 UTC (permalink / raw)
  To: Liu Bo; +Cc: dsterba, linux-btrfs

On Tue, Mar 06, 2018 at 11:22:21AM -0700, Liu Bo wrote:
> On Tue, Mar 06, 2018 at 11:47:47AM +0100, David Sterba wrote:
> > On Fri, Mar 02, 2018 at 04:10:37PM -0700, Liu Bo wrote:
> > > In case of raid56, writes and rebuilds always take BTRFS_STRIPE_LEN(64K)
> > > as unit, however, scrub_extent() sets blocksize as unit, so rebuild
> > > process may be triggered on every block on a same stripe.
> > > 
> > > A typical example would be that when we're replacing a disappeared disk,
> > > all reads on the disks get -EIO, every block (size is 4K if blocksize is
> > > 4K) would go thru these,
> > > 
> > > scrub_handle_errored_block
> > >   scrub_recheck_block # re-read pages one by one
> > >   scrub_recheck_block # rebuild by calling raid56_parity_recover()
> > >                         page by page
> > > 
> > > Although with raid56 stripe cache most of reads during rebuild can be
> > > avoided, the parity recover calculation(xor or raid6 algorithms) needs to
> > > be done $(BTRFS_STRIPE_LEN / blocksize) times.
> > > 
> > > This makes it less stupid by doing raid56 scrub/replace on stripe length.
> > 
> > missing s-o-b
> 
> I'm surprised that checkpatch.pl didn't complain.

Never mind, I'm your true checkpatch.cz

(http://checkpatch.pl actually exists)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: scrub: batch rebuild for raid56
  2018-03-07 14:43     ` David Sterba
@ 2018-03-07 15:22       ` Nikolay Borisov
  2018-03-09 14:32         ` David Sterba
  0 siblings, 1 reply; 8+ messages in thread
From: Nikolay Borisov @ 2018-03-07 15:22 UTC (permalink / raw)
  To: dsterba, Liu Bo, linux-btrfs



On  7.03.2018 16:43, David Sterba wrote:
> On Tue, Mar 06, 2018 at 11:22:21AM -0700, Liu Bo wrote:
>> On Tue, Mar 06, 2018 at 11:47:47AM +0100, David Sterba wrote:
>>> On Fri, Mar 02, 2018 at 04:10:37PM -0700, Liu Bo wrote:
>>>> In case of raid56, writes and rebuilds always take BTRFS_STRIPE_LEN(64K)
>>>> as unit, however, scrub_extent() sets blocksize as unit, so rebuild
>>>> process may be triggered on every block on a same stripe.
>>>>
>>>> A typical example would be that when we're replacing a disappeared disk,
>>>> all reads on the disks get -EIO, every block (size is 4K if blocksize is
>>>> 4K) would go thru these,
>>>>
>>>> scrub_handle_errored_block
>>>>   scrub_recheck_block # re-read pages one by one
>>>>   scrub_recheck_block # rebuild by calling raid56_parity_recover()
>>>>                         page by page
>>>>
>>>> Although with raid56 stripe cache most of reads during rebuild can be
>>>> avoided, the parity recover calculation(xor or raid6 algorithms) needs to
>>>> be done $(BTRFS_STRIPE_LEN / blocksize) times.
>>>>
>>>> This makes it less stupid by doing raid56 scrub/replace on stripe length.
>>>
>>> missing s-o-b
>>
>> I'm surprised that checkpatch.pl didn't complain.

I have written a python script that can scrape the mailing list and run
checkpatch (and any other software deemed appropriate) on posted patches
and reply back with results. However, I haven't really activated it, I
guess if people think there is merit in it I could hook it up to the
mailing list :)

> 
> Never mind, I'm your true checkpatch.cz
> 
> (http://checkpatch.pl actually exists)
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2] Btrfs: scrub: batch rebuild for raid56
  2018-03-02 23:10 [PATCH] Btrfs: scrub: batch rebuild for raid56 Liu Bo
  2018-03-06 10:47 ` David Sterba
@ 2018-03-07 19:08 ` Liu Bo
  2018-03-09 15:37   ` David Sterba
  1 sibling, 1 reply; 8+ messages in thread
From: Liu Bo @ 2018-03-07 19:08 UTC (permalink / raw)
  To: linux-btrfs

In case of raid56, writes and rebuilds always take BTRFS_STRIPE_LEN(64K)
as unit, however, scrub_extent() sets blocksize as unit, so rebuild
process may be triggered on every block on a same stripe.

A typical example would be that when we're replacing a disappeared disk,
all reads on the disks get -EIO, every block (size is 4K if blocksize is
4K) would go thru these,

scrub_handle_errored_block
  scrub_recheck_block # re-read pages one by one
  scrub_recheck_block # rebuild by calling raid56_parity_recover()
                        page by page

Although with raid56 stripe cache most of reads during rebuild can be
avoided, the parity recover calculation(xor or raid6 algorithms) needs to
be done $(BTRFS_STRIPE_LEN / blocksize) times.

This makes it smarter by doing raid56 scrub/replace on stripe length.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
v2: - Place bio allocation in code statement.
    - Get rid of bio_set_op_attrs.
    - Add SOB.

 fs/btrfs/scrub.c | 79 +++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 61 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ec56f33..3ccabad 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1718,6 +1718,45 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
 	return blk_status_to_errno(bio->bi_status);
 }
 
+static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
+					  struct scrub_block *sblock)
+{
+	struct scrub_page *first_page = sblock->pagev[0];
+	struct bio *bio;
+	int page_num;
+
+	/* All pages in sblock belong to the same stripe on the same device. */
+	ASSERT(first_page->dev);
+	if (!first_page->dev->bdev)
+		goto out;
+
+	bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
+	bio_set_dev(bio, first_page->dev->bdev);
+
+	for (page_num = 0; page_num < sblock->page_count; page_num++) {
+		struct scrub_page *page = sblock->pagev[page_num];
+
+		WARN_ON(!page->page);
+		bio_add_page(bio, page->page, PAGE_SIZE, 0);
+	}
+
+	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
+		bio_put(bio);
+		goto out;
+	}
+
+	bio_put(bio);
+
+	scrub_recheck_block_checksum(sblock);
+
+	return;
+out:
+	for (page_num = 0; page_num < sblock->page_count; page_num++)
+		sblock->pagev[page_num]->io_error = 1;
+
+	sblock->no_io_error_seen = 0;
+}
+
 /*
  * this function will check the on disk data for checksum errors, header
  * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1733,6 +1772,10 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
 	sblock->no_io_error_seen = 1;
 
+	/* short cut for raid56 */
+	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
+		return scrub_recheck_block_on_raid56(fs_info, sblock);
+
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		struct bio *bio;
 		struct scrub_page *page = sblock->pagev[page_num];
@@ -1748,19 +1791,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 		bio_set_dev(bio, page->dev->bdev);
 
 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
-		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
-			if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
-				page->io_error = 1;
-				sblock->no_io_error_seen = 0;
-			}
-		} else {
-			bio->bi_iter.bi_sector = page->physical >> 9;
-			bio_set_op_attrs(bio, REQ_OP_READ, 0);
+		bio->bi_iter.bi_sector = page->physical >> 9;
+		bio->bi_opf = REQ_OP_READ;
 
-			if (btrfsic_submit_bio_wait(bio)) {
-				page->io_error = 1;
-				sblock->no_io_error_seen = 0;
-			}
+		if (btrfsic_submit_bio_wait(bio)) {
+			page->io_error = 1;
+			sblock->no_io_error_seen = 0;
 		}
 
 		bio_put(bio);
@@ -2728,7 +2764,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
 }
 
 /* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
+			u64 logical, u64 len,
 			u64 physical, struct btrfs_device *dev, u64 flags,
 			u64 gen, int mirror_num, u64 physical_for_dev_replace)
 {
@@ -2737,13 +2774,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 	u32 blocksize;
 
 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
-		blocksize = sctx->fs_info->sectorsize;
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+			blocksize = map->stripe_len;
+		else
+			blocksize = sctx->fs_info->sectorsize;
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.data_extents_scrubbed++;
 		sctx->stat.data_bytes_scrubbed += len;
 		spin_unlock(&sctx->stat_lock);
 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		blocksize = sctx->fs_info->nodesize;
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+			blocksize = map->stripe_len;
+		else
+			blocksize = sctx->fs_info->nodesize;
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.tree_extents_scrubbed++;
 		sctx->stat.tree_bytes_scrubbed += len;
@@ -2883,9 +2926,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
 	}
 
 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
-		blocksize = sctx->fs_info->sectorsize;
+		blocksize = sparity->stripe_len;
 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		blocksize = sctx->fs_info->nodesize;
+		blocksize = sparity->stripe_len;
 	} else {
 		blocksize = sctx->fs_info->sectorsize;
 		WARN_ON(1);
@@ -3595,7 +3638,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			if (ret)
 				goto out;
 
-			ret = scrub_extent(sctx, extent_logical, extent_len,
+			ret = scrub_extent(sctx, map, extent_logical, extent_len,
 					   extent_physical, extent_dev, flags,
 					   generation, extent_mirror_num,
 					   extent_logical - logical + physical);
-- 
2.9.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: scrub: batch rebuild for raid56
  2018-03-07 15:22       ` Nikolay Borisov
@ 2018-03-09 14:32         ` David Sterba
  0 siblings, 0 replies; 8+ messages in thread
From: David Sterba @ 2018-03-09 14:32 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: dsterba, Liu Bo, linux-btrfs

On Wed, Mar 07, 2018 at 05:22:08PM +0200, Nikolay Borisov wrote:
> 
> 
> On  7.03.2018 16:43, David Sterba wrote:
> > On Tue, Mar 06, 2018 at 11:22:21AM -0700, Liu Bo wrote:
> >> On Tue, Mar 06, 2018 at 11:47:47AM +0100, David Sterba wrote:
> >>> On Fri, Mar 02, 2018 at 04:10:37PM -0700, Liu Bo wrote:
> >>>> In case of raid56, writes and rebuilds always take BTRFS_STRIPE_LEN(64K)
> >>>> as unit, however, scrub_extent() sets blocksize as unit, so rebuild
> >>>> process may be triggered on every block on a same stripe.
> >>>>
> >>>> A typical example would be that when we're replacing a disappeared disk,
> >>>> all reads on the disks get -EIO, every block (size is 4K if blocksize is
> >>>> 4K) would go thru these,
> >>>>
> >>>> scrub_handle_errored_block
> >>>>   scrub_recheck_block # re-read pages one by one
> >>>>   scrub_recheck_block # rebuild by calling raid56_parity_recover()
> >>>>                         page by page
> >>>>
> >>>> Although with raid56 stripe cache most of reads during rebuild can be
> >>>> avoided, the parity recover calculation(xor or raid6 algorithms) needs to
> >>>> be done $(BTRFS_STRIPE_LEN / blocksize) times.
> >>>>
> >>>> This makes it less stupid by doing raid56 scrub/replace on stripe length.
> >>>
> >>> missing s-o-b
> >>
> >> I'm surprised that checkpatch.pl didn't complain.
> 
> I have written a python script that can scrape the mailing list and run
> checkpatch (and any other software deemed appropriate) on posted patches
> and reply back with results. However, I haven't really activated it, I
> guess if people think there is merit in it I could hook it up to the
> mailing list :)

If we and checkpatch agree on the issues to report then yes, but I think
it would be confusing when we'd have to tell people to ignore some of
the warnings. I think we'd need to tune checkpatch for our needs, I
would not mind testing it locally. I'd start with catching typos in the
changelogs and comments, that's something that happens all the time and
can be automated. Next, implement AI that will try to understand the
changelog and tell people what's missing.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] Btrfs: scrub: batch rebuild for raid56
  2018-03-07 19:08 ` [PATCH v2] " Liu Bo
@ 2018-03-09 15:37   ` David Sterba
  0 siblings, 0 replies; 8+ messages in thread
From: David Sterba @ 2018-03-09 15:37 UTC (permalink / raw)
  To: Liu Bo; +Cc: linux-btrfs

On Wed, Mar 07, 2018 at 12:08:09PM -0700, Liu Bo wrote:
> In case of raid56, writes and rebuilds always take BTRFS_STRIPE_LEN(64K)
> as unit, however, scrub_extent() sets blocksize as unit, so rebuild
> process may be triggered on every block on a same stripe.
> 
> A typical example would be that when we're replacing a disappeared disk,
> all reads on the disks get -EIO, every block (size is 4K if blocksize is
> 4K) would go thru these,
> 
> scrub_handle_errored_block
>   scrub_recheck_block # re-read pages one by one
>   scrub_recheck_block # rebuild by calling raid56_parity_recover()
>                         page by page
> 
> Although with raid56 stripe cache most of reads during rebuild can be
> avoided, the parity recover calculation(xor or raid6 algorithms) needs to
> be done $(BTRFS_STRIPE_LEN / blocksize) times.
> 
> This makes it smarter by doing raid56 scrub/replace on stripe length.
> 
> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> ---
> v2: - Place bio allocation in code statement.
>     - Get rid of bio_set_op_attrs.
>     - Add SOB.

Added to next, thanks.

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-03-09 15:39 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-02 23:10 [PATCH] Btrfs: scrub: batch rebuild for raid56 Liu Bo
2018-03-06 10:47 ` David Sterba
2018-03-06 18:22   ` Liu Bo
2018-03-07 14:43     ` David Sterba
2018-03-07 15:22       ` Nikolay Borisov
2018-03-09 14:32         ` David Sterba
2018-03-07 19:08 ` [PATCH v2] " Liu Bo
2018-03-09 15:37   ` David Sterba

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.