All of lore.kernel.org
 help / color / mirror / Atom feed
* Removing PG_error use from btrfs
@ 2024-04-18 17:41 Matthew Wilcox
  2024-04-18 18:00 ` David Sterba
  0 siblings, 1 reply; 3+ messages in thread
From: Matthew Wilcox @ 2024-04-18 17:41 UTC (permalink / raw)
  To: Chris Mason, Josef Bacik, David Sterba
  Cc: linux-btrfs, linux-fsdevel, Jan Kara

We're down to just JFS and btrfs using the PG_error flag.  I sent a
patch earlier to remove PG_error from JFS, so now it's your turn ...

btrfs currently uses it to indicate superblock writeback errors.
This proposal moves that information to a counter in the btrfs_device.
Maybe this isn't the best approach.  What do you think?

I'm currently running fstests against it and it hasn't blown up yet.

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3d512b041977..5f6f8472ecec 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3627,28 +3627,24 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
 static void btrfs_end_super_write(struct bio *bio)
 {
 	struct btrfs_device *device = bio->bi_private;
-	struct bio_vec *bvec;
-	struct bvec_iter_all iter_all;
-	struct page *page;
-
-	bio_for_each_segment_all(bvec, bio, iter_all) {
-		page = bvec->bv_page;
+	struct folio_iter fi;
 
+	bio_for_each_folio_all(fi, bio) {
 		if (bio->bi_status) {
 			btrfs_warn_rl_in_rcu(device->fs_info,
-				"lost page write due to IO error on %s (%d)",
+				"lost sb write due to IO error on %s (%d)",
 				btrfs_dev_name(device),
 				blk_status_to_errno(bio->bi_status));
-			ClearPageUptodate(page);
-			SetPageError(page);
 			btrfs_dev_stat_inc_and_print(device,
 						     BTRFS_DEV_STAT_WRITE_ERRS);
-		} else {
-			SetPageUptodate(page);
+			/* Ensure failure if a primary sb fails */
+			if (bio->bi_opf & REQ_FUA)
+				atomic_set(&device->sb_wb_errors, INT_MAX / 2);
+			else
+				atomic_inc(&device->sb_wb_errors);
 		}
-
-		put_page(page);
-		unlock_page(page);
+		folio_unlock(fi.folio);
+		folio_put(fi.folio);
 	}
 
 	bio_put(bio);
@@ -3750,19 +3746,21 @@ static int write_dev_supers(struct btrfs_device *device,
 	struct address_space *mapping = device->bdev->bd_mapping;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	int i;
-	int errors = 0;
 	int ret;
 	u64 bytenr, bytenr_orig;
 
+	atomic_set(&device->sb_wb_errors, 0);
+
 	if (max_mirrors == 0)
 		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
 
 	shash->tfm = fs_info->csum_shash;
 
 	for (i = 0; i < max_mirrors; i++) {
-		struct page *page;
+		struct folio *folio;
 		struct bio *bio;
 		struct btrfs_super_block *disk_super;
+		size_t offset;
 
 		bytenr_orig = btrfs_sb_offset(i);
 		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
@@ -3772,7 +3770,7 @@ static int write_dev_supers(struct btrfs_device *device,
 			btrfs_err(device->fs_info,
 				"couldn't get super block location for mirror %d",
 				i);
-			errors++;
+			atomic_inc(&device->sb_wb_errors);
 			continue;
 		}
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
@@ -3785,20 +3783,18 @@ static int write_dev_supers(struct btrfs_device *device,
 				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
 				    sb->csum);
 
-		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
-					   GFP_NOFS);
-		if (!page) {
+		folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT,
+				FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS);
+		if (IS_ERR(folio)) {
 			btrfs_err(device->fs_info,
 			    "couldn't get super block page for bytenr %llu",
 			    bytenr);
-			errors++;
+			atomic_inc(&device->sb_wb_errors);
 			continue;
 		}
 
-		/* Bump the refcount for wait_dev_supers() */
-		get_page(page);
-
-		disk_super = page_address(page);
+		offset = offset_in_folio(folio, bytenr);
+		disk_super = folio_address(folio) + offset;
 		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
 
 		/*
@@ -3812,8 +3808,7 @@ static int write_dev_supers(struct btrfs_device *device,
 		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
 		bio->bi_private = device;
 		bio->bi_end_io = btrfs_end_super_write;
-		__bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
-			       offset_in_page(bytenr));
+		bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset);
 
 		/*
 		 * We FUA only the first super block.  The others we allow to
@@ -3825,9 +3820,9 @@ static int write_dev_supers(struct btrfs_device *device,
 		submit_bio(bio);
 
 		if (btrfs_advance_sb_log(device, i))
-			errors++;
+			atomic_inc(&device->sb_wb_errors);
 	}
-	return errors < i ? 0 : -1;
+	return atomic_read(&device->sb_wb_errors) < i ? 0 : -1;
 }
 
 /*
@@ -3849,7 +3844,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
 
 	for (i = 0; i < max_mirrors; i++) {
-		struct page *page;
+		struct folio *folio;
 
 		ret = btrfs_sb_log_location(device, i, READ, &bytenr);
 		if (ret == -ENOENT) {
@@ -3864,29 +3859,19 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 		    device->commit_total_bytes)
 			break;
 
-		page = find_get_page(device->bdev->bd_mapping,
+		folio = filemap_get_folio(device->bdev->bd_mapping,
 				     bytenr >> PAGE_SHIFT);
-		if (!page) {
-			errors++;
-			if (i == 0)
-				primary_failed = true;
+		/* If the folio has been removed, then we know it completed */
+		if (IS_ERR(folio))
 			continue;
-		}
-		/* Page is submitted locked and unlocked once the IO completes */
-		wait_on_page_locked(page);
-		if (PageError(page)) {
-			errors++;
-			if (i == 0)
-				primary_failed = true;
-		}
-
-		/* Drop our reference */
-		put_page(page);
-
-		/* Drop the reference from the writing run */
-		put_page(page);
+		/* Folio is unlocked once the IO completes */
+		folio_wait_locked(folio);
+		folio_put(folio);
 	}
 
+	errors += atomic_read(&device->sb_wb_errors);
+	if (errors >= INT_MAX / 2)
+		primary_failed = true;
 	/* log error, force error return */
 	if (primary_failed) {
 		btrfs_err(device->fs_info, "error writing primary super block to device %llu",
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index cf555f5b47ce..44c639720426 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -142,6 +142,8 @@ struct btrfs_device {
 	/* type and info about this device */
 	u64 type;
 
+	atomic_t sb_wb_errors;
+
 	/* minimal io size for this device */
 	u32 sector_size;
 


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: Removing PG_error use from btrfs
  2024-04-18 17:41 Removing PG_error use from btrfs Matthew Wilcox
@ 2024-04-18 18:00 ` David Sterba
  2024-04-18 19:07   ` Matthew Wilcox
  0 siblings, 1 reply; 3+ messages in thread
From: David Sterba @ 2024-04-18 18:00 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Chris Mason, Josef Bacik, David Sterba, linux-btrfs,
	linux-fsdevel, Jan Kara

On Thu, Apr 18, 2024 at 06:41:47PM +0100, Matthew Wilcox wrote:
> We're down to just JFS and btrfs using the PG_error flag.  I sent a
> patch earlier to remove PG_error from JFS, so now it's your turn ...
> 
> btrfs currently uses it to indicate superblock writeback errors.
> This proposal moves that information to a counter in the btrfs_device.
> Maybe this isn't the best approach.  What do you think?

Tracking the number of errors in the device is a good approach.  The
superblock write is asynchronous but it's not necessary to track the
error in the page, we have the device structure in the end io callback.
Also it's guaranteed that this is running only from one place so not
even the atomics are needed.

> I'm currently running fstests against it and it hasn't blown up yet.
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 3d512b041977..5f6f8472ecec 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3627,28 +3627,24 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
>  static void btrfs_end_super_write(struct bio *bio)
>  {
>  	struct btrfs_device *device = bio->bi_private;
> -	struct bio_vec *bvec;
> -	struct bvec_iter_all iter_all;
> -	struct page *page;
> -
> -	bio_for_each_segment_all(bvec, bio, iter_all) {
> -		page = bvec->bv_page;
> +	struct folio_iter fi;

I'd rather make the conversion from pages to folios a separate patch
from the error counting change. I haven't seen anything obviously wrong
but the superblock write is a critical action so it's a matter of
precaution.

> +	bio_for_each_folio_all(fi, bio) {
>  		if (bio->bi_status) {
>  			btrfs_warn_rl_in_rcu(device->fs_info,
> -				"lost page write due to IO error on %s (%d)",
> +				"lost sb write due to IO error on %s (%d)",
>  				btrfs_dev_name(device),
>  				blk_status_to_errno(bio->bi_status));
> -			ClearPageUptodate(page);
> -			SetPageError(page);
>  			btrfs_dev_stat_inc_and_print(device,
>  						     BTRFS_DEV_STAT_WRITE_ERRS);
> -		} else {
> -			SetPageUptodate(page);
> +			/* Ensure failure if a primary sb fails */
> +			if (bio->bi_opf & REQ_FUA)
> +				atomic_set(&device->sb_wb_errors, INT_MAX / 2);

This is using some magic constant so it would be better defined
separately and documented what it means.

> +			else
> +				atomic_inc(&device->sb_wb_errors);
>  		}
> -
> -		put_page(page);
> -		unlock_page(page);
> +		folio_unlock(fi.folio);
> +		folio_put(fi.folio);
>  	}
>  
>  	bio_put(bio);
> @@ -3750,19 +3746,21 @@ static int write_dev_supers(struct btrfs_device *device,
>  	struct address_space *mapping = device->bdev->bd_mapping;
>  	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
>  	int i;
> -	int errors = 0;
>  	int ret;
>  	u64 bytenr, bytenr_orig;
>  
> +	atomic_set(&device->sb_wb_errors, 0);
> +
>  	if (max_mirrors == 0)
>  		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
>  
>  	shash->tfm = fs_info->csum_shash;
>  
>  	for (i = 0; i < max_mirrors; i++) {
> -		struct page *page;
> +		struct folio *folio;
>  		struct bio *bio;
>  		struct btrfs_super_block *disk_super;
> +		size_t offset;
>  
>  		bytenr_orig = btrfs_sb_offset(i);
>  		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
> @@ -3772,7 +3770,7 @@ static int write_dev_supers(struct btrfs_device *device,
>  			btrfs_err(device->fs_info,
>  				"couldn't get super block location for mirror %d",
>  				i);
> -			errors++;
> +			atomic_inc(&device->sb_wb_errors);
>  			continue;
>  		}
>  		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
> @@ -3785,20 +3783,18 @@ static int write_dev_supers(struct btrfs_device *device,
>  				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
>  				    sb->csum);
>  
> -		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
> -					   GFP_NOFS);
> -		if (!page) {
> +		folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT,
> +				FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS);
> +		if (IS_ERR(folio)) {
>  			btrfs_err(device->fs_info,
>  			    "couldn't get super block page for bytenr %llu",
>  			    bytenr);
> -			errors++;
> +			atomic_inc(&device->sb_wb_errors);
>  			continue;
>  		}
>  
> -		/* Bump the refcount for wait_dev_supers() */
> -		get_page(page);
> -
> -		disk_super = page_address(page);
> +		offset = offset_in_folio(folio, bytenr);
> +		disk_super = folio_address(folio) + offset;
>  		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
>  
>  		/*
> @@ -3812,8 +3808,7 @@ static int write_dev_supers(struct btrfs_device *device,
>  		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
>  		bio->bi_private = device;
>  		bio->bi_end_io = btrfs_end_super_write;
> -		__bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
> -			       offset_in_page(bytenr));
> +		bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset);
>  
>  		/*
>  		 * We FUA only the first super block.  The others we allow to
> @@ -3825,9 +3820,9 @@ static int write_dev_supers(struct btrfs_device *device,
>  		submit_bio(bio);
>  
>  		if (btrfs_advance_sb_log(device, i))
> -			errors++;
> +			atomic_inc(&device->sb_wb_errors);
>  	}
> -	return errors < i ? 0 : -1;
> +	return atomic_read(&device->sb_wb_errors) < i ? 0 : -1;
>  }
>  
>  /*
> @@ -3849,7 +3844,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>  		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
>  
>  	for (i = 0; i < max_mirrors; i++) {
> -		struct page *page;
> +		struct folio *folio;
>  
>  		ret = btrfs_sb_log_location(device, i, READ, &bytenr);
>  		if (ret == -ENOENT) {
> @@ -3864,29 +3859,19 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>  		    device->commit_total_bytes)
>  			break;
>  
> -		page = find_get_page(device->bdev->bd_mapping,
> +		folio = filemap_get_folio(device->bdev->bd_mapping,
>  				     bytenr >> PAGE_SHIFT);
> -		if (!page) {
> -			errors++;
> -			if (i == 0)
> -				primary_failed = true;
> +		/* If the folio has been removed, then we know it completed */
> +		if (IS_ERR(folio))
>  			continue;
> -		}
> -		/* Page is submitted locked and unlocked once the IO completes */
> -		wait_on_page_locked(page);
> -		if (PageError(page)) {
> -			errors++;
> -			if (i == 0)
> -				primary_failed = true;
> -		}
> -
> -		/* Drop our reference */
> -		put_page(page);
> -
> -		/* Drop the reference from the writing run */
> -		put_page(page);
> +		/* Folio is unlocked once the IO completes */
> +		folio_wait_locked(folio);
> +		folio_put(folio);
>  	}
>  
> +	errors += atomic_read(&device->sb_wb_errors);
> +	if (errors >= INT_MAX / 2)
> +		primary_failed = true;

Alternatively a flag can be set in the device if the primary superblock
write fails but I think encoding that in the error count also works, as
long as it's a named constant.

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: Removing PG_error use from btrfs
  2024-04-18 18:00 ` David Sterba
@ 2024-04-18 19:07   ` Matthew Wilcox
  0 siblings, 0 replies; 3+ messages in thread
From: Matthew Wilcox @ 2024-04-18 19:07 UTC (permalink / raw)
  To: David Sterba
  Cc: Chris Mason, Josef Bacik, David Sterba, linux-btrfs,
	linux-fsdevel, Jan Kara

On Thu, Apr 18, 2024 at 08:00:51PM +0200, David Sterba wrote:
> On Thu, Apr 18, 2024 at 06:41:47PM +0100, Matthew Wilcox wrote:
> > btrfs currently uses it to indicate superblock writeback errors.
> > This proposal moves that information to a counter in the btrfs_device.
> > Maybe this isn't the best approach.  What do you think?
> 
> Tracking the number of errors in the device is a good approach.  The
> superblock write is asynchronous but it's not necessary to track the
> error in the page, we have the device structure in the end io callback.
> Also it's guaranteed that this is running only from one place so not
> even the atomics are needed.

Ah, but the completions might happen on different CPUs at the same time.
And the completion of the first write might happen while the submission
for a subsequent write are still running, and that might also adjust
the sb_wb_error value, so I think it is needed.

> I'd rather make the conversion from pages to folios a separate patch
> from the error counting change. I haven't seen anything obviously wrong
> but the superblock write is a critical action so it's a matter of
> precaution.

Understood.  I've split off three patches for folio conversion; one for
each function.

> > -		} else {
> > -			SetPageUptodate(page);
> > +			/* Ensure failure if a primary sb fails */
> > +			if (bio->bi_opf & REQ_FUA)
> > +				atomic_set(&device->sb_wb_errors, INT_MAX / 2);
> 
> This is using some magic constant so it would be better defined
> separately and documented what it means.

I was hoping that comment would be enough.  Name for the constant?
BTRFS_PRIMARY_SB_FAILED?

> Alternatively a flag can be set in the device if the primary superblock
> write fails but I think encoding that in the error count also works, as
> long as it's a named constant.

Yeah, I thought about that option too.  This seemed like the easiest way
to go.

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2024-04-18 19:07 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-18 17:41 Removing PG_error use from btrfs Matthew Wilcox
2024-04-18 18:00 ` David Sterba
2024-04-18 19:07   ` Matthew Wilcox

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.