All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] xfs: check for sparse inode clusters that cross new EOAG when shrinking
@ 2021-07-03  3:01 Darrick J. Wong
  2021-07-03  3:04 ` [RFC PATCH] xfs: test regression in shrink when EOFS splits a sparse inode cluster Darrick J. Wong
  2021-07-05  8:11 ` [PATCH] xfs: check for sparse inode clusters that cross new EOAG when shrinking Gao Xiang
  0 siblings, 2 replies; 3+ messages in thread
From: Darrick J. Wong @ 2021-07-03  3:01 UTC (permalink / raw)
  To: xfs

From: Darrick J. Wong <djwong@kernel.org>

While running xfs/168, I noticed occasional write verifier shutdowns
involving inodes at the very end of the filesystem.  Existing inode
btree validation code checks that all inode clusters are fully contained
within the filesystem.

However, due to inadequate checking in the fs shrink code, it's possible
that there could be a sparse inode cluster at the end of the filesystem
where the upper inodes of the cluster are marked as holes and the
corresponding blocks are free.  In this case, the last blocks in the AG
are listed in the bnobt.  This enables the shrink to proceed but results
in a filesystem that trips the inode verifiers.  Fix this by disallowing
the shrink.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_ag.c     |    8 +++++++
 fs/xfs/libxfs/xfs_ialloc.c |   52 ++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_ialloc.h |    3 +++
 3 files changed, 63 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index dcd8c8f7cd4a..376e4bb0e4ec 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -815,6 +815,14 @@ xfs_ag_shrink_space(
 
 	args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta);
 
+	/*
+	 * Make sure that the last inode cluster cannot overlap with the new
+	 * end of the AG, even if it's sparse.
+	 */
+	error = xfs_ialloc_check_shrink(*tpp, agno, agibp, aglen - delta);
+	if (error)
+		return error;
+
 	/*
 	 * Disable perag reservations so it doesn't cause the allocation request
 	 * to fail. We'll reestablish reservation before we return.
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 57d9cb632983..90c4afecf7ba 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2928,3 +2928,55 @@ xfs_ialloc_calc_rootino(
 
 	return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
 }
+
+/*
+ * Ensure there are not sparse inode clusters that cross the new EOAG.
+ *
+ * This is a no-op for non-spinode filesystems since clusters are always fully
+ * allocated and checking the bnobt suffices.  However, a spinode filesystem
+ * could have a record where the upper inodes are free blocks.  If those blocks
+ * were removed from the filesystem, the inode record would extend beyond EOAG,
+ * which will be flagged as corruption.
+ */
+int
+xfs_ialloc_check_shrink(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	struct xfs_buf		*agibp,
+	xfs_agblock_t		new_length)
+{
+	struct xfs_inobt_rec_incore rec;
+	struct xfs_btree_cur	*cur;
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_agino_t		agino = XFS_AGB_TO_AGINO(mp, new_length);
+	int			has;
+	int			error;
+
+	if (!xfs_sb_version_hassparseinodes(&mp->m_sb))
+		return 0;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agibp, agno, XFS_BTNUM_INO);
+
+	/* Look up the inobt record that would correspond to the new EOFS. */
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
+	if (error || !has)
+		goto out;
+
+	error = xfs_inobt_get_rec(cur, &rec, &has);
+	if (error)
+		goto out;
+
+	if (!has) {
+		error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	/* If the record covers inodes that would be beyond EOFS, bail out. */
+	if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
+		error = -ENOSPC;
+		goto out;
+	}
+out:
+	xfs_btree_del_cursor(cur, error);
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 9df7c80408ff..9a2112b4ad5e 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -122,4 +122,7 @@ int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
 void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
 xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit);
 
+int xfs_ialloc_check_shrink(struct xfs_trans *tp, xfs_agnumber_t agno,
+		struct xfs_buf *agibp, xfs_agblock_t new_length);
+
 #endif	/* __XFS_IALLOC_H__ */

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [RFC PATCH] xfs: test regression in shrink when EOFS splits a sparse inode cluster
  2021-07-03  3:01 [PATCH] xfs: check for sparse inode clusters that cross new EOAG when shrinking Darrick J. Wong
@ 2021-07-03  3:04 ` Darrick J. Wong
  2021-07-05  8:11 ` [PATCH] xfs: check for sparse inode clusters that cross new EOAG when shrinking Gao Xiang
  1 sibling, 0 replies; 3+ messages in thread
From: Darrick J. Wong @ 2021-07-03  3:04 UTC (permalink / raw)
  To: xfs

From: Darrick J. Wong <djwong@kernel.org>

This is a targeted regression test for the patch "xfs: check for sparse
inode clusters that cross new EOAG when shrinking", which was found by
running the random-loopy shrink stresser xfs/168.

The original shrink implementation assumed that if we could allocate the
last free extent in the filesystem, it was ok to proceed with the fs
shrink.  Unfortunately, this isn't quite the case -- if there's a sparse
inode cluster such that the blocks at the end of the cluster are free,
it is not ok to shrink the fs to the point that part of the cluster
hangs off the end of the filesystem.  Doing so results in repair and
scrub marking the filesystem corrupt, so we must not.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/778     |  190 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/xfs/778.out |    2 +
 2 files changed, 192 insertions(+)
 create mode 100755 tests/xfs/778
 create mode 100644 tests/xfs/778.out

diff --git a/tests/xfs/778 b/tests/xfs/778
new file mode 100755
index 00000000..73cebaf1
--- /dev/null
+++ b/tests/xfs/778
@@ -0,0 +1,190 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2021 Oracle.  All Rights Reserved.
+#
+# FS QA Test 778
+#
+# Ensure that online shrink does not let us shrink the fs such that the end
+# of the filesystem is now in the middle of a sparse inode cluster.
+#
+. ./common/preamble
+_begin_fstest auto quick shrink
+
+# Import common functions.
+. ./common/filter
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs generic
+_require_scratch
+_require_xfs_sparse_inodes
+_require_scratch_xfs_shrink
+_require_xfs_io_command "falloc"
+_require_xfs_io_command "fpunch"
+
+_scratch_mkfs "-d size=50m -m crc=1 -i sparse" |
+	_filter_mkfs > /dev/null 2> $tmp.mkfs
+. $tmp.mkfs	# for isize
+cat $tmp.mkfs >> $seqres.full
+
+daddr_to_fsblocks=$((dbsize / 512))
+
+convert_units() {
+	_scratch_xfs_db -f -c "$@" | sed -e 's/^.*(\([0-9]*\)).*$/\1/g'
+}
+
+# Figure out the next possible inode number after the log, since we can't
+# shrink or relocate the log
+logstart=$(_scratch_xfs_get_metadata_field 'logstart' 'sb')
+if [ $logstart -gt 0 ]; then
+	logblocks=$(_scratch_xfs_get_metadata_field 'logblocks' 'sb')
+	logend=$((logstart + logblocks))
+	logend_agno=$(convert_units "convert fsb $logend agno")
+	logend_agino=$(convert_units "convert fsb $logend agino")
+else
+	logend_agno=0
+	logend_agino=0
+fi
+
+_scratch_mount
+_xfs_force_bdev data $SCRATCH_MNT
+old_dblocks=$($XFS_IO_PROG -c 'statfs' $SCRATCH_MNT | grep geom.datablocks)
+
+mkdir $SCRATCH_MNT/save/
+sino=$(stat -c '%i' $SCRATCH_MNT/save)
+
+_consume_freesp()
+{
+	file=$1
+
+	# consume nearly all available space (leave ~1MB)
+	avail=`_get_available_space $SCRATCH_MNT`
+	filesizemb=$((avail / 1024 / 1024 - 1))
+	$XFS_IO_PROG -fc "falloc 0 ${filesizemb}m" $file
+}
+
+# Allocate inodes in a directory until failure.
+_alloc_inodes()
+{
+	dir=$1
+
+	i=0
+	while [ true ]; do
+		touch $dir/$i 2>> $seqres.full || break
+		i=$((i + 1))
+	done
+}
+
+# Find a sparse inode cluster after logend_agno/logend_agino.
+find_sparse_clusters()
+{
+	for ((agno = agcount - 1; agno >= logend_agno; agno--)); do
+		_scratch_xfs_db -c "agi $agno" -c "addr root" -c "btdump" | \
+			tr ':[,]' '    ' | \
+			awk -v "agno=$agno" \
+			    -v "agino=$logend_agino" \
+'{if ($2 >= agino && and(strtonum($3), 0x8000)) {printf("%s %s %s\n", agno, $2, $3);}}' | \
+			tac
+	done
+}
+
+# Calculate the fs inode chunk size based on the inode size and fixed 64-inode
+# record. This value is used as the target level of free space fragmentation
+# induced by the test (i.e., max size of free extents). We don't need to go
+# smaller than a full chunk because the XFS block allocator tacks on alignment
+# requirements to the size of the requested allocation. In other words, a chunk
+# sized free chunk is not enough to guarantee a successful chunk sized
+# allocation.
+XFS_INODES_PER_CHUNK=64
+CHUNK_SIZE=$((isize * XFS_INODES_PER_CHUNK))
+
+_consume_freesp $SCRATCH_MNT/spc
+
+# Now that the fs is nearly full, punch holes in every other $CHUNK_SIZE range
+# of the space consumer file.  The goal here is to end up with a sparse cluster
+# at the end of the fs (and past any internal log), where the chunks at the end
+# of the cluster are sparse.
+
+offset=`_get_filesize $SCRATCH_MNT/spc`
+offset=$((offset - $CHUNK_SIZE * 2))
+nr=0
+while [ $offset -ge 0 ]; do
+	$XFS_IO_PROG -c "fpunch $offset $CHUNK_SIZE" $SCRATCH_MNT/spc \
+		2>> $seqres.full || _fail "fpunch failed"
+
+	# allocate as many inodes as possible
+	mkdir -p $SCRATCH_MNT/urk/offset.$offset > /dev/null 2>&1
+	_alloc_inodes $SCRATCH_MNT/urk/offset.$offset
+
+	offset=$((offset - $CHUNK_SIZE * 2))
+
+	# Every five times through the loop, see if we got a sparse cluster
+	nr=$((nr + 1))
+	if [ $((nr % 5)) -eq 4 ]; then
+		_scratch_unmount
+		find_sparse_clusters > $tmp.clusters
+		if [ -s $tmp.clusters ]; then
+			break;
+		fi
+		_scratch_mount
+	fi
+done
+
+test -s $tmp.clusters || _notrun "Could not create a sparse inode cluster"
+
+echo clusters >> $seqres.full
+cat $tmp.clusters >> $seqres.full
+
+# Figure out which inode numbers are in that last cluster.  We need to preserve
+# that cluster but delete everything else ahead of shrinking.
+icluster_agno=$(head -n 1 $tmp.clusters | cut -d ' ' -f 1)
+icluster_agino=$(head -n 1 $tmp.clusters | cut -d ' ' -f 2)
+icluster_ino=$(convert_units "convert agno $icluster_agno agino $icluster_agino ino")
+
+# Check that the save directory isn't going to prevent us from shrinking
+test $sino -lt $icluster_ino || \
+	echo "/save inode comes after target cluster, test may fail"
+
+# Save the inodes in the last cluster and delete everything else
+_scratch_mount
+rm -r $SCRATCH_MNT/spc
+for ((ino = icluster_ino; ino < icluster_ino + XFS_INODES_PER_CHUNK; ino++)); do
+	find $SCRATCH_MNT/urk/ -inum "$ino" -print0 | xargs -r -0 mv -t $SCRATCH_MNT/save/
+done
+rm -rf $SCRATCH_MNT/urk/ $SCRATCH_MNT/save/*/*
+sync
+$XFS_IO_PROG -c 'fsmap -vvvvv' $SCRATCH_MNT &>> $seqres.full
+
+# Propose shrinking the filesystem such that the end of the fs ends up in the
+# sparse part of our sparse cluster.  Remember, the last block of that cluster
+# ought to be free.
+target_ino=$((icluster_ino + XFS_INODES_PER_CHUNK - 1))
+for ((ino = target_ino; ino >= icluster_ino; ino--)); do
+	found=$(find $SCRATCH_MNT/save/ -inum "$ino" | wc -l)
+	test $found -gt 0 && break
+
+	ino_daddr=$(convert_units "convert ino $ino daddr")
+	new_size=$((ino_daddr / daddr_to_fsblocks))
+
+	echo "Hope to fail at shrinking to $new_size" >> $seqres.full
+	$XFS_GROWFS_PROG -D $new_size $SCRATCH_MNT &>> $seqres.full
+	res=$?
+
+	# Make sure shrink did not work
+	new_dblocks=$($XFS_IO_PROG -c 'statfs' $SCRATCH_MNT | grep geom.datablocks)
+	if [ "$new_dblocks" != "$old_dblocks" ]; then
+		echo "should not have shrank $old_dblocks -> $new_dblocks"
+		break
+	fi
+
+	if [ $res -eq 0 ]; then
+		echo "shrink to $new_size (ino $ino) should have failed"
+		break
+	fi
+done
+
+# success, all done
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/778.out b/tests/xfs/778.out
new file mode 100644
index 00000000..e80f72a3
--- /dev/null
+++ b/tests/xfs/778.out
@@ -0,0 +1,2 @@
+QA output created by 778
+Silence is golden

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] xfs: check for sparse inode clusters that cross new EOAG when shrinking
  2021-07-03  3:01 [PATCH] xfs: check for sparse inode clusters that cross new EOAG when shrinking Darrick J. Wong
  2021-07-03  3:04 ` [RFC PATCH] xfs: test regression in shrink when EOFS splits a sparse inode cluster Darrick J. Wong
@ 2021-07-05  8:11 ` Gao Xiang
  1 sibling, 0 replies; 3+ messages in thread
From: Gao Xiang @ 2021-07-05  8:11 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: xfs

On Fri, Jul 02, 2021 at 08:01:57PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> While running xfs/168, I noticed occasional write verifier shutdowns
> involving inodes at the very end of the filesystem.  Existing inode
> btree validation code checks that all inode clusters are fully contained
> within the filesystem.
> 
> However, due to inadequate checking in the fs shrink code, it's possible
> that there could be a sparse inode cluster at the end of the filesystem
> where the upper inodes of the cluster are marked as holes and the
> corresponding blocks are free.  In this case, the last blocks in the AG
> are listed in the bnobt.  This enables the shrink to proceed but results
> in a filesystem that trips the inode verifiers.  Fix this by disallowing
> the shrink.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>

Ohh, sparse inode chunks with partial free blocks...

Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>

Thanks,
Gao Xiang

> ---
>  fs/xfs/libxfs/xfs_ag.c     |    8 +++++++
>  fs/xfs/libxfs/xfs_ialloc.c |   52 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_ialloc.h |    3 +++
>  3 files changed, 63 insertions(+)
> 
> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> index dcd8c8f7cd4a..376e4bb0e4ec 100644
> --- a/fs/xfs/libxfs/xfs_ag.c
> +++ b/fs/xfs/libxfs/xfs_ag.c
> @@ -815,6 +815,14 @@ xfs_ag_shrink_space(
>  
>  	args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta);
>  
> +	/*
> +	 * Make sure that the last inode cluster cannot overlap with the new
> +	 * end of the AG, even if it's sparse.
> +	 */
> +	error = xfs_ialloc_check_shrink(*tpp, agno, agibp, aglen - delta);
> +	if (error)
> +		return error;
> +
>  	/*
>  	 * Disable perag reservations so it doesn't cause the allocation request
>  	 * to fail. We'll reestablish reservation before we return.
> diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
> index 57d9cb632983..90c4afecf7ba 100644
> --- a/fs/xfs/libxfs/xfs_ialloc.c
> +++ b/fs/xfs/libxfs/xfs_ialloc.c
> @@ -2928,3 +2928,55 @@ xfs_ialloc_calc_rootino(
>  
>  	return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
>  }
> +
> +/*
> + * Ensure there are not sparse inode clusters that cross the new EOAG.
> + *
> + * This is a no-op for non-spinode filesystems since clusters are always fully
> + * allocated and checking the bnobt suffices.  However, a spinode filesystem
> + * could have a record where the upper inodes are free blocks.  If those blocks
> + * were removed from the filesystem, the inode record would extend beyond EOAG,
> + * which will be flagged as corruption.
> + */
> +int
> +xfs_ialloc_check_shrink(
> +	struct xfs_trans	*tp,
> +	xfs_agnumber_t		agno,
> +	struct xfs_buf		*agibp,
> +	xfs_agblock_t		new_length)
> +{
> +	struct xfs_inobt_rec_incore rec;
> +	struct xfs_btree_cur	*cur;
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	xfs_agino_t		agino = XFS_AGB_TO_AGINO(mp, new_length);
> +	int			has;
> +	int			error;
> +
> +	if (!xfs_sb_version_hassparseinodes(&mp->m_sb))
> +		return 0;
> +
> +	cur = xfs_inobt_init_cursor(mp, tp, agibp, agno, XFS_BTNUM_INO);
> +
> +	/* Look up the inobt record that would correspond to the new EOFS. */
> +	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
> +	if (error || !has)
> +		goto out;
> +
> +	error = xfs_inobt_get_rec(cur, &rec, &has);
> +	if (error)
> +		goto out;
> +
> +	if (!has) {
> +		error = -EFSCORRUPTED;
> +		goto out;
> +	}
> +
> +	/* If the record covers inodes that would be beyond EOFS, bail out. */
> +	if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
> +		error = -ENOSPC;
> +		goto out;
> +	}
> +out:
> +	xfs_btree_del_cursor(cur, error);
> +	return error;
> +}
> diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
> index 9df7c80408ff..9a2112b4ad5e 100644
> --- a/fs/xfs/libxfs/xfs_ialloc.h
> +++ b/fs/xfs/libxfs/xfs_ialloc.h
> @@ -122,4 +122,7 @@ int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
>  void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
>  xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit);
>  
> +int xfs_ialloc_check_shrink(struct xfs_trans *tp, xfs_agnumber_t agno,
> +		struct xfs_buf *agibp, xfs_agblock_t new_length);
> +
>  #endif	/* __XFS_IALLOC_H__ */

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-07-05  8:11 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-03  3:01 [PATCH] xfs: check for sparse inode clusters that cross new EOAG when shrinking Darrick J. Wong
2021-07-03  3:04 ` [RFC PATCH] xfs: test regression in shrink when EOFS splits a sparse inode cluster Darrick J. Wong
2021-07-05  8:11 ` [PATCH] xfs: check for sparse inode clusters that cross new EOAG when shrinking Gao Xiang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.