linux-xfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14
@ 2021-09-01  0:11 Darrick J. Wong
  2021-09-01  0:11 ` [PATCH 1/3] generic: fsstress with cpu offlining Darrick J. Wong
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Darrick J. Wong @ 2021-09-01  0:11 UTC (permalink / raw)
  To: djwong, guaneryu; +Cc: linux-xfs, fstests, guan

Hi all,

Add new tests to exercise code that got refactored in 5.14.  The
nested shutdown test simulates the process of recovering after a VM host
filesystem goes down and the guests have to recover.

v2: fix some bugs pointed out by the maintainer, add cpu offlining stress test

If you're going to start using this mess, you probably ought to just
pull from my git trees, which are linked below.

This is an extraordinary way to destroy everything.  Enjoy!
Comments and questions are, as always, welcome.

--D

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=new-tests-for-5.14
---
 common/rc             |   24 +++++++++
 tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
 tests/generic/725.out |    2 +
 tests/generic/726     |   69 +++++++++++++++++++++++++
 tests/generic/726.out |    2 +
 tests/xfs/449         |    2 -
 6 files changed, 234 insertions(+), 1 deletion(-)
 create mode 100755 tests/generic/725
 create mode 100644 tests/generic/725.out
 create mode 100755 tests/generic/726
 create mode 100644 tests/generic/726.out


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/3] generic: fsstress with cpu offlining
  2021-09-01  0:11 [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14 Darrick J. Wong
@ 2021-09-01  0:11 ` Darrick J. Wong
  2021-09-05 14:48   ` Eryu Guan
  2021-09-01  0:11 ` [PATCH 2/3] generic: test shutdowns of a nested filesystem Darrick J. Wong
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: Darrick J. Wong @ 2021-09-01  0:11 UTC (permalink / raw)
  To: djwong, guaneryu; +Cc: linux-xfs, fstests, guan

From: Darrick J. Wong <djwong@kernel.org>

Exercise filesystem operations when we're taking CPUs online and offline
throughout the test.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/generic/726     |   69 +++++++++++++++++++++++++++++++++++++++++++++++++
 tests/generic/726.out |    2 +
 2 files changed, 71 insertions(+)
 create mode 100755 tests/generic/726
 create mode 100644 tests/generic/726.out


diff --git a/tests/generic/726 b/tests/generic/726
new file mode 100755
index 00000000..cb709795
--- /dev/null
+++ b/tests/generic/726
@@ -0,0 +1,69 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
+#
+# FS QA Test No. 726
+#
+# Run an all-writes fsstress run with multiple threads while exercising CPU
+# hotplugging to shake out bugs in the write path.
+#
+. ./common/preamble
+_begin_fstest auto rw stress
+
+# Override the default cleanup function.
+_cleanup()
+{
+	cd /
+	rm -f $tmp.*
+	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
+	wait	# for exercise_cpu_hotplug subprocess
+	for i in "$sysfs_cpu_dir/"cpu*/online; do
+		echo 1 > "$i" 2>/dev/null
+	done
+}
+
+exercise_cpu_hotplug()
+{
+	while [ -e $sentinel_file ]; do
+		local idx=$(( RANDOM % nr_hotplug_cpus ))
+		local cpu="${hotplug_cpus[idx]}"
+		local action=$(( RANDOM % 2 ))
+
+		echo "$action" > "$sysfs_cpu_dir/cpu$cpu/online" 2>/dev/null
+		sleep 0.5
+	done
+}
+
+_supported_fs generic
+
+sysfs_cpu_dir="/sys/devices/system/cpu"
+
+# Figure out which CPU(s) support hotplug.
+nrcpus=$(getconf _NPROCESSORS_CONF)
+hotplug_cpus=()
+for ((i = 0; i < nrcpus; i++ )); do
+	test -e "$sysfs_cpu_dir/cpu$i/online" && hotplug_cpus+=("$i")
+done
+nr_hotplug_cpus="${#hotplug_cpus[@]}"
+test "$nr_hotplug_cpus" -gt 0 || _notrun "CPU hotplugging not supported"
+
+_require_scratch
+_require_command "$KILLALL_PROG" "killall"
+
+echo "Silence is golden."
+
+_scratch_mkfs > $seqres.full 2>&1
+_scratch_mount >> $seqres.full 2>&1
+
+sentinel_file=$tmp.hotplug
+touch $sentinel_file
+exercise_cpu_hotplug &
+
+nr_cpus=$((LOAD_FACTOR * nr_hotplug_cpus))
+nr_ops=$((25000 * TIME_FACTOR))
+$FSSTRESS_PROG $FSSTRESS_AVOID -w -d $SCRATCH_MNT -n $nr_ops -p $nr_cpus >> $seqres.full
+rm -f $sentinel_file
+
+# success, all done
+status=0
+exit
diff --git a/tests/generic/726.out b/tests/generic/726.out
new file mode 100644
index 00000000..6839f8ce
--- /dev/null
+++ b/tests/generic/726.out
@@ -0,0 +1,2 @@
+QA output created by 726
+Silence is golden.


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/3] generic: test shutdowns of a nested filesystem
  2021-09-01  0:11 [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14 Darrick J. Wong
  2021-09-01  0:11 ` [PATCH 1/3] generic: fsstress with cpu offlining Darrick J. Wong
@ 2021-09-01  0:11 ` Darrick J. Wong
  2021-09-01  0:12 ` [PATCH 3/3] xfs/449: filter out deprecation warnings from mkfs Darrick J. Wong
  2021-09-05 15:04 ` [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14 Eryu Guan
  3 siblings, 0 replies; 17+ messages in thread
From: Darrick J. Wong @ 2021-09-01  0:11 UTC (permalink / raw)
  To: djwong, guaneryu; +Cc: linux-xfs, fstests, guan

From: Darrick J. Wong <djwong@kernel.org>

Like generic/475, but we run fsstress on a disk image inside the
scratch filesystem.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 common/rc             |   24 +++++++++
 tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
 tests/generic/725.out |    2 +
 3 files changed, 162 insertions(+)
 create mode 100755 tests/generic/725
 create mode 100644 tests/generic/725.out


diff --git a/common/rc b/common/rc
index 46b6b220..05c87332 100644
--- a/common/rc
+++ b/common/rc
@@ -631,6 +631,30 @@ _ext4_metadump()
 		$DUMP_COMPRESSOR -f "$dumpfile" &>> "$seqres.full"
 }
 
+# Capture the metadata of a filesystem in a dump file for offline analysis.
+# This is not supported by all filesystem types, so this function should only
+# be used after a test has already failed.
+_metadump_dev() {
+	local device="$1"
+	local dumpfile="$2"
+	local compressopt="$3"
+
+	test "$DUMP_CORRUPT_FS" = 1 || return 0
+
+	case "$FSTYP" in
+	ext*)
+		_ext4_metadump $device $dumpfile $compressopt
+		;;
+	xfs)
+		_xfs_metadump $dumpfile $device none $compressopt
+		;;
+	*)
+		echo "Don't know how to metadump $FSTYP"
+		return 1
+		;;
+	esac
+}
+
 _test_mkfs()
 {
     case $FSTYP in
diff --git a/tests/generic/725 b/tests/generic/725
new file mode 100755
index 00000000..ac008fdb
--- /dev/null
+++ b/tests/generic/725
@@ -0,0 +1,136 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
+#
+# FS QA Test No. 725
+#
+# Test nested log recovery with repeated (simulated) disk failures.  We kick
+# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
+# out the underlying scratch device with dm-error to see what happens when the
+# disk goes down.  Having taken down both fses in this manner, remount them and
+# repeat.  This test simulates VM hosts crashing to try to shake out CoW bugs
+# in writeback on the host that cause VM guests to fail to recover.
+#
+. ./common/preamble
+_begin_fstest shutdown auto log metadata eio recoveryloop
+
+_cleanup()
+{
+	cd /
+	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
+	wait
+	if [ -n "$loopmnt" ]; then
+		$UMOUNT_PROG $loopmnt 2>/dev/null
+		rm -r -f $loopmnt
+	fi
+	rm -f $tmp.*
+	_dmerror_unmount
+	_dmerror_cleanup
+}
+
+# Import common functions.
+. ./common/dmerror
+. ./common/reflink
+
+# Modify as appropriate.
+_supported_fs generic
+
+_require_scratch_reflink
+_require_cp_reflink
+_require_dm_target error
+_require_command "$KILLALL_PROG" "killall"
+
+echo "Silence is golden."
+
+_scratch_mkfs >> $seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_dmerror_init
+_dmerror_mount
+
+# Create a fs image consuming 1/3 of the scratch fs
+scratch_freesp_bytes=$(_get_available_space $SCRATCH_MNT)
+loopimg_bytes=$((scratch_freesp_bytes / 3))
+
+loopimg=$SCRATCH_MNT/testfs
+truncate -s $loopimg_bytes $loopimg
+_mkfs_dev $loopimg
+
+loopmnt=$tmp.mount
+mkdir -p $loopmnt
+
+scratch_aliveflag=$tmp.runsnap
+snap_aliveflag=$tmp.snapping
+
+snap_loop_fs() {
+	touch "$snap_aliveflag"
+	while [ -e "$scratch_aliveflag" ]; do
+		rm -f $loopimg.a
+		_cp_reflink $loopimg $loopimg.a
+		sleep 1
+	done
+	rm -f "$snap_aliveflag"
+}
+
+fsstress=($FSSTRESS_PROG $FSSTRESS_AVOID -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))")
+
+for i in $(seq 1 $((25 * TIME_FACTOR)) ); do
+	touch $scratch_aliveflag
+	snap_loop_fs >> $seqres.full 2>&1 &
+
+	if ! _mount $loopimg $loopmnt -o loop; then
+		rm -f $scratch_aliveflag
+		_metadump_dev $loopimg $seqres.loop.$i.md
+		_fail "iteration $i loopimg mount failed"
+		break
+	fi
+
+	("${fsstress[@]}" >> $seqres.full &) > /dev/null 2>&1
+
+	# purposely include 0 second sleeps to test shutdown immediately after
+	# recovery
+	sleep $((RANDOM % (3 * TIME_FACTOR) ))
+	rm -f $scratch_aliveflag
+
+	# This test aims to simulate sudden disk failure, which means that we
+	# do not want to quiesce the filesystem or otherwise give it a chance
+	# to flush its logs.  Therefore we want to call dmsetup with the
+	# --nolockfs parameter; to make this happen we must call the load
+	# error table helper *without* 'lockfs'.
+	_dmerror_load_error_table
+
+	ps -e | grep fsstress > /dev/null 2>&1
+	while [ $? -eq 0 ]; do
+		$KILLALL_PROG -9 fsstress > /dev/null 2>&1
+		wait > /dev/null 2>&1
+		ps -e | grep fsstress > /dev/null 2>&1
+	done
+	for ((i = 0; i < 10; i++)); do
+		test -e "$snap_aliveflag" || break
+		sleep 1
+	done
+
+	# Mount again to replay log after loading working table, so we have a
+	# consistent fs after test.
+	$UMOUNT_PROG $loopmnt
+	_dmerror_unmount || _fail "iteration $i scratch unmount failed"
+	_dmerror_load_working_table
+	if ! _dmerror_mount; then
+		_metadump_dev $DMERROR_DEV $seqres.scratch.$i.md
+		_fail "iteration $i scratch mount failed"
+	fi
+done
+
+# Make sure the fs image file is ok
+if [ -f "$loopimg" ]; then
+	if _mount $loopimg $loopmnt -o loop; then
+		$UMOUNT_PROG $loopmnt &> /dev/null
+	else
+		_metadump_dev $DMERROR_DEV $seqres.scratch.final.md
+		echo "final scratch mount failed"
+	fi
+	SCRATCH_RTDEV= SCRATCH_LOGDEV= _check_scratch_fs $loopimg
+fi
+
+# success, all done; let the test harness check the scratch fs
+status=0
+exit
diff --git a/tests/generic/725.out b/tests/generic/725.out
new file mode 100644
index 00000000..ed73a9fc
--- /dev/null
+++ b/tests/generic/725.out
@@ -0,0 +1,2 @@
+QA output created by 725
+Silence is golden.


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/3] xfs/449: filter out deprecation warnings from mkfs
  2021-09-01  0:11 [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14 Darrick J. Wong
  2021-09-01  0:11 ` [PATCH 1/3] generic: fsstress with cpu offlining Darrick J. Wong
  2021-09-01  0:11 ` [PATCH 2/3] generic: test shutdowns of a nested filesystem Darrick J. Wong
@ 2021-09-01  0:12 ` Darrick J. Wong
  2021-09-05 15:04 ` [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14 Eryu Guan
  3 siblings, 0 replies; 17+ messages in thread
From: Darrick J. Wong @ 2021-09-01  0:12 UTC (permalink / raw)
  To: djwong, guaneryu; +Cc: linux-xfs, fstests, guan

From: Darrick J. Wong <djwong@kernel.org>

To avoid regressing this test when testing XFS v4 when mkfs is new
enough to whine about creating new deprecated filesystems, filter out
the deprecation warning.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/xfs/449 |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


diff --git a/tests/xfs/449 b/tests/xfs/449
index a3fcd78e..5374bf2f 100755
--- a/tests/xfs/449
+++ b/tests/xfs/449
@@ -23,7 +23,7 @@ _require_scratch_nocheck
 _require_xfs_spaceman_command "info"
 _require_command "$XFS_GROWFS_PROG" xfs_growfs
 
-_scratch_mkfs | sed -e '/Discarding/d' > $tmp.mkfs
+_scratch_mkfs | sed -e '/Discarding/d' -e '/deprecated/d' > $tmp.mkfs
 echo MKFS >> $seqres.full
 cat $tmp.mkfs >> $seqres.full
 


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/3] generic: fsstress with cpu offlining
  2021-09-01  0:11 ` [PATCH 1/3] generic: fsstress with cpu offlining Darrick J. Wong
@ 2021-09-05 14:48   ` Eryu Guan
  2021-09-13 18:25     ` Darrick J. Wong
  0 siblings, 1 reply; 17+ messages in thread
From: Eryu Guan @ 2021-09-05 14:48 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: guaneryu, linux-xfs, fstests

On Tue, Aug 31, 2021 at 05:11:50PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Exercise filesystem operations when we're taking CPUs online and offline
> throughout the test.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  tests/generic/726     |   69 +++++++++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/726.out |    2 +
>  2 files changed, 71 insertions(+)
>  create mode 100755 tests/generic/726
>  create mode 100644 tests/generic/726.out
> 
> 
> diff --git a/tests/generic/726 b/tests/generic/726
> new file mode 100755
> index 00000000..cb709795
> --- /dev/null
> +++ b/tests/generic/726
> @@ -0,0 +1,69 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
> +#
> +# FS QA Test No. 726
> +#
> +# Run an all-writes fsstress run with multiple threads while exercising CPU
> +# hotplugging to shake out bugs in the write path.
> +#
> +. ./common/preamble
> +_begin_fstest auto rw stress
> +
> +# Override the default cleanup function.
> +_cleanup()
> +{
> +	cd /
> +	rm -f $tmp.*
> +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> +	wait	# for exercise_cpu_hotplug subprocess
> +	for i in "$sysfs_cpu_dir/"cpu*/online; do
> +		echo 1 > "$i" 2>/dev/null
> +	done
> +}
> +
> +exercise_cpu_hotplug()
> +{
> +	while [ -e $sentinel_file ]; do
> +		local idx=$(( RANDOM % nr_hotplug_cpus ))
> +		local cpu="${hotplug_cpus[idx]}"
> +		local action=$(( RANDOM % 2 ))
> +
> +		echo "$action" > "$sysfs_cpu_dir/cpu$cpu/online" 2>/dev/null
> +		sleep 0.5
> +	done
> +}
> +
> +_supported_fs generic
> +
> +sysfs_cpu_dir="/sys/devices/system/cpu"
> +
> +# Figure out which CPU(s) support hotplug.
> +nrcpus=$(getconf _NPROCESSORS_CONF)
> +hotplug_cpus=()
> +for ((i = 0; i < nrcpus; i++ )); do
> +	test -e "$sysfs_cpu_dir/cpu$i/online" && hotplug_cpus+=("$i")
> +done
> +nr_hotplug_cpus="${#hotplug_cpus[@]}"
> +test "$nr_hotplug_cpus" -gt 0 || _notrun "CPU hotplugging not supported"
> +
> +_require_scratch
> +_require_command "$KILLALL_PROG" "killall"
> +
> +echo "Silence is golden."
> +
> +_scratch_mkfs > $seqres.full 2>&1
> +_scratch_mount >> $seqres.full 2>&1

I think we could just run fsstress against a dir in $TEST_DIR?

> +
> +sentinel_file=$tmp.hotplug
> +touch $sentinel_file
> +exercise_cpu_hotplug &
> +
> +nr_cpus=$((LOAD_FACTOR * nr_hotplug_cpus))

We'd better cap nr_cpus, just in case we're testing on a system with
1024 cpus where fsstress would take a very long time to finish.

Thanks,
Eryu

> +nr_ops=$((25000 * TIME_FACTOR))
> +$FSSTRESS_PROG $FSSTRESS_AVOID -w -d $SCRATCH_MNT -n $nr_ops -p $nr_cpus >> $seqres.full
> +rm -f $sentinel_file
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/726.out b/tests/generic/726.out
> new file mode 100644
> index 00000000..6839f8ce
> --- /dev/null
> +++ b/tests/generic/726.out
> @@ -0,0 +1,2 @@
> +QA output created by 726
> +Silence is golden.
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14
  2021-09-01  0:11 [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14 Darrick J. Wong
                   ` (2 preceding siblings ...)
  2021-09-01  0:12 ` [PATCH 3/3] xfs/449: filter out deprecation warnings from mkfs Darrick J. Wong
@ 2021-09-05 15:04 ` Eryu Guan
  2021-09-13 18:25   ` Darrick J. Wong
  3 siblings, 1 reply; 17+ messages in thread
From: Eryu Guan @ 2021-09-05 15:04 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: guaneryu, linux-xfs, fstests

On Tue, Aug 31, 2021 at 05:11:44PM -0700, Darrick J. Wong wrote:
> Hi all,
> 
> Add new tests to exercise code that got refactored in 5.14.  The
> nested shutdown test simulates the process of recovering after a VM host
> filesystem goes down and the guests have to recover.
> 
> v2: fix some bugs pointed out by the maintainer, add cpu offlining stress test

Thanks for the revision! I've applied patches 2 and 3 from this update.

Thanks,
Eryu

> 
> If you're going to start using this mess, you probably ought to just
> pull from my git trees, which are linked below.
> 
> This is an extraordinary way to destroy everything.  Enjoy!
> Comments and questions are, as always, welcome.
> 
> --D
> 
> fstests git tree:
> https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=new-tests-for-5.14
> ---
>  common/rc             |   24 +++++++++
>  tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/725.out |    2 +
>  tests/generic/726     |   69 +++++++++++++++++++++++++
>  tests/generic/726.out |    2 +
>  tests/xfs/449         |    2 -
>  6 files changed, 234 insertions(+), 1 deletion(-)
>  create mode 100755 tests/generic/725
>  create mode 100644 tests/generic/725.out
>  create mode 100755 tests/generic/726
>  create mode 100644 tests/generic/726.out

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/3] generic: fsstress with cpu offlining
  2021-09-05 14:48   ` Eryu Guan
@ 2021-09-13 18:25     ` Darrick J. Wong
  0 siblings, 0 replies; 17+ messages in thread
From: Darrick J. Wong @ 2021-09-13 18:25 UTC (permalink / raw)
  To: Eryu Guan; +Cc: guaneryu, linux-xfs, fstests

On Sun, Sep 05, 2021 at 10:48:15PM +0800, Eryu Guan wrote:
> On Tue, Aug 31, 2021 at 05:11:50PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Exercise filesystem operations when we're taking CPUs online and offline
> > throughout the test.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> >  tests/generic/726     |   69 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  tests/generic/726.out |    2 +
> >  2 files changed, 71 insertions(+)
> >  create mode 100755 tests/generic/726
> >  create mode 100644 tests/generic/726.out
> > 
> > 
> > diff --git a/tests/generic/726 b/tests/generic/726
> > new file mode 100755
> > index 00000000..cb709795
> > --- /dev/null
> > +++ b/tests/generic/726
> > @@ -0,0 +1,69 @@
> > +#! /bin/bash
> > +# SPDX-License-Identifier: GPL-2.0
> > +# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
> > +#
> > +# FS QA Test No. 726
> > +#
> > +# Run an all-writes fsstress run with multiple threads while exercising CPU
> > +# hotplugging to shake out bugs in the write path.
> > +#
> > +. ./common/preamble
> > +_begin_fstest auto rw stress
> > +
> > +# Override the default cleanup function.
> > +_cleanup()
> > +{
> > +	cd /
> > +	rm -f $tmp.*
> > +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > +	wait	# for exercise_cpu_hotplug subprocess
> > +	for i in "$sysfs_cpu_dir/"cpu*/online; do
> > +		echo 1 > "$i" 2>/dev/null
> > +	done
> > +}
> > +
> > +exercise_cpu_hotplug()
> > +{
> > +	while [ -e $sentinel_file ]; do
> > +		local idx=$(( RANDOM % nr_hotplug_cpus ))
> > +		local cpu="${hotplug_cpus[idx]}"
> > +		local action=$(( RANDOM % 2 ))
> > +
> > +		echo "$action" > "$sysfs_cpu_dir/cpu$cpu/online" 2>/dev/null
> > +		sleep 0.5
> > +	done
> > +}
> > +
> > +_supported_fs generic
> > +
> > +sysfs_cpu_dir="/sys/devices/system/cpu"
> > +
> > +# Figure out which CPU(s) support hotplug.
> > +nrcpus=$(getconf _NPROCESSORS_CONF)
> > +hotplug_cpus=()
> > +for ((i = 0; i < nrcpus; i++ )); do
> > +	test -e "$sysfs_cpu_dir/cpu$i/online" && hotplug_cpus+=("$i")
> > +done
> > +nr_hotplug_cpus="${#hotplug_cpus[@]}"
> > +test "$nr_hotplug_cpus" -gt 0 || _notrun "CPU hotplugging not supported"
> > +
> > +_require_scratch
> > +_require_command "$KILLALL_PROG" "killall"
> > +
> > +echo "Silence is golden."
> > +
> > +_scratch_mkfs > $seqres.full 2>&1
> > +_scratch_mount >> $seqres.full 2>&1
> 
> I think we could just run fsstress against a dir in $TEST_DIR?

Ok.

> > +
> > +sentinel_file=$tmp.hotplug
> > +touch $sentinel_file
> > +exercise_cpu_hotplug &
> > +
> > +nr_cpus=$((LOAD_FACTOR * nr_hotplug_cpus))
> 
> We'd better cap nr_cpus, just in case we're testing on a system with
> 1024 cpus where fsstress would take a very long time to finish.

Not sure why that matters, but I'll cap the number of IO threads at one
per hotpluggable CPU if we go over 1024.

--D

> Thanks,
> Eryu
> 
> > +nr_ops=$((25000 * TIME_FACTOR))
> > +$FSSTRESS_PROG $FSSTRESS_AVOID -w -d $SCRATCH_MNT -n $nr_ops -p $nr_cpus >> $seqres.full
> > +rm -f $sentinel_file
> > +
> > +# success, all done
> > +status=0
> > +exit
> > diff --git a/tests/generic/726.out b/tests/generic/726.out
> > new file mode 100644
> > index 00000000..6839f8ce
> > --- /dev/null
> > +++ b/tests/generic/726.out
> > @@ -0,0 +1,2 @@
> > +QA output created by 726
> > +Silence is golden.
> > 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14
  2021-09-05 15:04 ` [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14 Eryu Guan
@ 2021-09-13 18:25   ` Darrick J. Wong
  0 siblings, 0 replies; 17+ messages in thread
From: Darrick J. Wong @ 2021-09-13 18:25 UTC (permalink / raw)
  To: Eryu Guan; +Cc: guaneryu, linux-xfs, fstests

On Sun, Sep 05, 2021 at 11:04:33PM +0800, Eryu Guan wrote:
> On Tue, Aug 31, 2021 at 05:11:44PM -0700, Darrick J. Wong wrote:
> > Hi all,
> > 
> > Add new tests to exercise code that got refactored in 5.14.  The
> > nested shutdown test simulates the process of recovering after a VM host
> > filesystem goes down and the guests have to recover.
> > 
> > v2: fix some bugs pointed out by the maintainer, add cpu offlining stress test
> 
> Thanks for the revision! I've applied patches 2 and 3 from this update.

Cool, thanks!

--D

> Thanks,
> Eryu
> 
> > 
> > If you're going to start using this mess, you probably ought to just
> > pull from my git trees, which are linked below.
> > 
> > This is an extraordinary way to destroy everything.  Enjoy!
> > Comments and questions are, as always, welcome.
> > 
> > --D
> > 
> > fstests git tree:
> > https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=new-tests-for-5.14
> > ---
> >  common/rc             |   24 +++++++++
> >  tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  tests/generic/725.out |    2 +
> >  tests/generic/726     |   69 +++++++++++++++++++++++++
> >  tests/generic/726.out |    2 +
> >  tests/xfs/449         |    2 -
> >  6 files changed, 234 insertions(+), 1 deletion(-)
> >  create mode 100755 tests/generic/725
> >  create mode 100644 tests/generic/725.out
> >  create mode 100755 tests/generic/726
> >  create mode 100644 tests/generic/726.out

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] generic: test shutdowns of a nested filesystem
  2021-08-17  4:16         ` Darrick J. Wong
@ 2021-08-17 15:54           ` Darrick J. Wong
  0 siblings, 0 replies; 17+ messages in thread
From: Darrick J. Wong @ 2021-08-17 15:54 UTC (permalink / raw)
  To: Eryu Guan; +Cc: Eryu Guan, guaneryu, linux-xfs, fstests

On Mon, Aug 16, 2021 at 09:16:16PM -0700, Darrick J. Wong wrote:
> On Tue, Aug 17, 2021 at 11:16:49AM +0800, Eryu Guan wrote:
> > On Mon, Aug 16, 2021 at 09:35:24AM -0700, Darrick J. Wong wrote:
> > > On Mon, Aug 16, 2021 at 12:28:20AM +0800, Eryu Guan wrote:
> > > > On Tue, Jul 27, 2021 at 05:10:30PM -0700, Darrick J. Wong wrote:
> > > > > From: Darrick J. Wong <djwong@kernel.org>
> > > > > 
> > > > > generic/475, but we're running fsstress on a disk image inside the
> > > > > scratch filesystem
> > > > > 
> > > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > > > ---
> > > > >  tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > > >  tests/generic/725.out |    2 +
> > > > >  2 files changed, 138 insertions(+)
> > > > >  create mode 100755 tests/generic/725
> > > > >  create mode 100644 tests/generic/725.out
> > > > > 
> > > > > 
> > > > > diff --git a/tests/generic/725 b/tests/generic/725
> > > > > new file mode 100755
> > > > > index 00000000..f43bcb37
> > > > > --- /dev/null
> > > > > +++ b/tests/generic/725
> > > > > @@ -0,0 +1,136 @@
> > > > > +#! /bin/bash
> > > > > +# SPDX-License-Identifier: GPL-2.0
> > > > > +# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
> > > > > +#
> > > > > +# FS QA Test No. 725
> > > > > +#
> > > > > +# Test nested log recovery with repeated (simulated) disk failures.  We kick
> > > > > +# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
> > > > > +# out the underlying scratch device with dm-error to see what happens when the
> > > > > +# disk goes down.  Having taken down both fses in this manner, remount them and
> > > > > +# repeat.  This test simulates VM hosts crashing to try to shake out CoW bugs
> > > > > +# in writeback on the host that cause VM guests to fail to recover.
> > > > 
> > > > It currently fails for me on btrfs, the loop mount failed in 2nd
> > > > iteration, seems like a bug in btrfs.
> > > 
> > > Yep.  Until recently (aka the Big Xfs Log Recovery Bughunt of 2021) it
> > > wouldn't pass xfs either. :/
> > > 
> > > > > +#
> > > > > +. ./common/preamble
> > > > > +_begin_fstest shutdown auto log metadata eio
> > > > > +
> > > > > +_cleanup()
> > > > > +{
> > > > > +	cd /
> > > > > +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > > > > +	wait
> > > > > +	if [ -n "$loopmnt" ]; then
> > > > > +		umount $loopmnt 2>/dev/null
> > > > 
> > > > $UMOUNT_PROG
> > > > 
> > > > > +		rm -r -f $loopmnt
> > > > > +	fi
> > > > > +	rm -f $tmp.*
> > > > > +	_dmerror_unmount
> > > > > +	_dmerror_cleanup
> > > > > +}
> > > > > +
> > > > > +# Import common functions.
> > > > > +. ./common/dmerror
> > > > > +. ./common/reflink
> > > > > +
> > > > > +# Modify as appropriate.
> > > > > +_supported_fs generic
> > > > > +
> > > > > +_require_scratch_reflink
> > > > > +_require_cp_reflink
> > > > > +_require_dm_target error
> > > > > +_require_command "$KILLALL_PROG" "killall"
> > > > > +
> > > > > +echo "Silence is golden."
> > > > > +
> > > > > +_scratch_mkfs >> $seqres.full 2>&1
> > > > > +_require_metadata_journaling $SCRATCH_DEV
> > > > > +_dmerror_init
> > > > > +_dmerror_mount
> > > > > +
> > > > > +# Create a fs image consuming 1/3 of the scratch fs
> > > > > +scratch_freesp_bytes=$(stat -f -c '%a * %S' $SCRATCH_MNT | bc)
> > > > 
> > > > _get_available_space $SCRATCH_MNT ?
> > > > 
> > > > > +loopimg_bytes=$((scratch_freesp_bytes / 3))
> > > > > +
> > > > > +loopimg=$SCRATCH_MNT/testfs
> > > > > +truncate -s $loopimg_bytes $loopimg
> > > > > +_mkfs_dev $loopimg
> > > > > +
> > > > > +loopmnt=$tmp.mount
> > > > > +mkdir -p $loopmnt
> > > > > +
> > > > > +scratch_aliveflag=$tmp.runsnap
> > > > > +snap_aliveflag=$tmp.snapping
> > > > > +
> > > > > +snap_loop_fs() {
> > > > > +	touch "$snap_aliveflag"
> > > > > +	while [ -e "$scratch_aliveflag" ]; do
> > > > > +		rm -f $loopimg.a
> > > > > +		_cp_reflink $loopimg $loopimg.a
> > > > > +		sleep 1
> > > > > +	done
> > > > > +	rm -f "$snap_aliveflag"
> > > > > +}
> > > > > +
> > > > > +fsstress=($FSSTRESS_PROG $FSSTRESS_AVOID -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))")
> > > > > +
> > > > > +for i in $(seq 1 $((25 * TIME_FACTOR)) ); do
> > > > > +	touch $scratch_aliveflag
> > > > > +	snap_loop_fs >> $seqres.full 2>&1 &
> > > > > +
> > > > > +	if ! _mount $loopimg $loopmnt -o loop; then
> > > > > +		rm -f $scratch_aliveflag
> > > > > +		_fail "loop mount failed"
> > > > 
> > > > I found it a bit easier to debug if we print $i here.
> > > 
> > > Ok, I'll change it to "loop $i mount failed".
> > > 
> > > > > +		break
> > > > > +	fi
> > > > > +
> > > > > +	("${fsstress[@]}" >> $seqres.full &) > /dev/null 2>&1
> > > > > +
> > > > > +	# purposely include 0 second sleeps to test shutdown immediately after
> > > > > +	# recovery
> > > > > +	sleep $((RANDOM % (3 * TIME_FACTOR) ))
> > > > > +	rm -f $scratch_aliveflag
> > > > > +
> > > > > +	# This test aims to simulate sudden disk failure, which means that we
> > > > > +	# do not want to quiesce the filesystem or otherwise give it a chance
> > > > > +	# to flush its logs.  Therefore we want to call dmsetup with the
> > > > > +	# --nolockfs parameter; to make this happen we must call the load
> > > > > +	# error table helper *without* 'lockfs'.
> > > > > +	_dmerror_load_error_table
> > > > > +
> > > > > +	ps -e | grep fsstress > /dev/null 2>&1
> > > > > +	while [ $? -eq 0 ]; do
> > > > > +		$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > > > > +		wait > /dev/null 2>&1
> > > > > +		ps -e | grep fsstress > /dev/null 2>&1
> > > > > +	done
> > > > > +	for ((i = 0; i < 10; i++)); do
> > > > > +		test -e "$snap_aliveflag" || break
> > > > > +		sleep 1
> > > > > +	done
> > > > > +
> > > > > +	# Mount again to replay log after loading working table, so we have a
> > > > > +	# consistent XFS after test.
> > > > 
> > > > This is a generic test, fix the XFS specific comments?
> > > 
> > > Oops.  "...a consistent fs after test."
> > > 
> > > > > +	$UMOUNT_PROG $loopmnt
> > > > > +	_dmerror_unmount || _fail "unmount failed"
> > > > > +	_dmerror_load_working_table
> > > > > +	if ! _dmerror_mount; then
> > > > > +		dmsetup table | tee -a /dev/ttyprintk
> > > > > +		lsblk | tee -a /dev/ttyprintk
> > > > > +		$XFS_METADUMP_PROG -a -g -o $DMERROR_DEV $seqres.dmfail.md
> > > > 
> > > > Above logs all should go to $seqres.full ?
> > > 
> > > Oops, yeah.  I'll remove them since I was only using them to check the
> > > system state.
> > > 
> > > > And $XFS_METADUMP_PROG is not suitable for a generic test.
> > > 
> > > I'll create _metadump_dev so that this at least works for the two
> > > filesystems for which we have dump creation helpers (ext* and xfs).
> > 
> > Sounds great!
> > 
> > > 
> > > > > +		_fail "mount failed"
> > > > > +	fi
> > > > > +done
> > > > > +
> > > > > +# Make sure the fs image file is ok
> > > > > +if [ -f "$loopimg" ]; then
> > > > > +	if _mount $loopimg $loopmnt -o loop; then
> > > > > +		$UMOUNT_PROG $loopmnt &> /dev/null
> > > > > +	else
> > > > > +		echo "final loop mount failed"
> > > > > +	fi
> > > > > +	_check_xfs_filesystem $loopimg none none
> > > > 
> > > > Same here, use _check_scratch_fs?
> > > 
> > > $loopimg is a file within the scratch fs.
> > 
> > _check_scratch_fs can take a device as an argument, defaulting to
> > $SCRATCH_DEV; I think that works in this case?
> 
> It could be made to work with a large enough crowbar, but that's
> seriously overkill because $loopimg is a file *within* the scratch
> filesystem.  The $loopimg fs gets formatted without the
> SCRATCH_LOGDEV/SCRATCH_RTDEV options (because it is not itself the
> scratch filesystem), which means that in order to (ab)use
> _check_scratch_fs to do the same thing as _check_xfs_filesystem, you
> have to exclude those options.  So yes, this:
> 
> 	SCRATCH_RTDEV= SCRATCH_LOGDEV= _check_scratch_fs $loopimg
> 
> is the equivalent of this:
> 
> 	_check_xfs_filesystem $loopimg none none
> 
> But the first is longer and pointless.

...and now that it's morning and I've had coffee again, I now understand
what you're actually asking, which is "Don't use _foo_xfs* functions in
a generic test!", not "rototill in this helper for stylistic reasons".

Judging from my immediate defensive reaction, I've clearly been worn
down by all the bikeshedding the past year.  Have they resumed flights
to Nobikeshed Island?

Anyway, I'll go fix that.  Thank you for catching the mistake.

--D

> --D
> 
> > Thanks,
> > Eryu

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] generic: test shutdowns of a nested filesystem
  2021-08-17  3:16       ` Eryu Guan
@ 2021-08-17  4:16         ` Darrick J. Wong
  2021-08-17 15:54           ` Darrick J. Wong
  0 siblings, 1 reply; 17+ messages in thread
From: Darrick J. Wong @ 2021-08-17  4:16 UTC (permalink / raw)
  To: Eryu Guan; +Cc: Eryu Guan, guaneryu, linux-xfs, fstests

On Tue, Aug 17, 2021 at 11:16:49AM +0800, Eryu Guan wrote:
> On Mon, Aug 16, 2021 at 09:35:24AM -0700, Darrick J. Wong wrote:
> > On Mon, Aug 16, 2021 at 12:28:20AM +0800, Eryu Guan wrote:
> > > On Tue, Jul 27, 2021 at 05:10:30PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <djwong@kernel.org>
> > > > 
> > > > generic/475, but we're running fsstress on a disk image inside the
> > > > scratch filesystem
> > > > 
> > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > > ---
> > > >  tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > >  tests/generic/725.out |    2 +
> > > >  2 files changed, 138 insertions(+)
> > > >  create mode 100755 tests/generic/725
> > > >  create mode 100644 tests/generic/725.out
> > > > 
> > > > 
> > > > diff --git a/tests/generic/725 b/tests/generic/725
> > > > new file mode 100755
> > > > index 00000000..f43bcb37
> > > > --- /dev/null
> > > > +++ b/tests/generic/725
> > > > @@ -0,0 +1,136 @@
> > > > +#! /bin/bash
> > > > +# SPDX-License-Identifier: GPL-2.0
> > > > +# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
> > > > +#
> > > > +# FS QA Test No. 725
> > > > +#
> > > > +# Test nested log recovery with repeated (simulated) disk failures.  We kick
> > > > +# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
> > > > +# out the underlying scratch device with dm-error to see what happens when the
> > > > +# disk goes down.  Having taken down both fses in this manner, remount them and
> > > > +# repeat.  This test simulates VM hosts crashing to try to shake out CoW bugs
> > > > +# in writeback on the host that cause VM guests to fail to recover.
> > > 
> > > It currently fails for me on btrfs, the loop mount failed in 2nd
> > > iteration, seems like a bug in btrfs.
> > 
> > Yep.  Until recently (aka the Big Xfs Log Recovery Bughunt of 2021) it
> > wouldn't pass xfs either. :/
> > 
> > > > +#
> > > > +. ./common/preamble
> > > > +_begin_fstest shutdown auto log metadata eio
> > > > +
> > > > +_cleanup()
> > > > +{
> > > > +	cd /
> > > > +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > > > +	wait
> > > > +	if [ -n "$loopmnt" ]; then
> > > > +		umount $loopmnt 2>/dev/null
> > > 
> > > $UMOUNT_PROG
> > > 
> > > > +		rm -r -f $loopmnt
> > > > +	fi
> > > > +	rm -f $tmp.*
> > > > +	_dmerror_unmount
> > > > +	_dmerror_cleanup
> > > > +}
> > > > +
> > > > +# Import common functions.
> > > > +. ./common/dmerror
> > > > +. ./common/reflink
> > > > +
> > > > +# Modify as appropriate.
> > > > +_supported_fs generic
> > > > +
> > > > +_require_scratch_reflink
> > > > +_require_cp_reflink
> > > > +_require_dm_target error
> > > > +_require_command "$KILLALL_PROG" "killall"
> > > > +
> > > > +echo "Silence is golden."
> > > > +
> > > > +_scratch_mkfs >> $seqres.full 2>&1
> > > > +_require_metadata_journaling $SCRATCH_DEV
> > > > +_dmerror_init
> > > > +_dmerror_mount
> > > > +
> > > > +# Create a fs image consuming 1/3 of the scratch fs
> > > > +scratch_freesp_bytes=$(stat -f -c '%a * %S' $SCRATCH_MNT | bc)
> > > 
> > > _get_available_space $SCRATCH_MNT ?
> > > 
> > > > +loopimg_bytes=$((scratch_freesp_bytes / 3))
> > > > +
> > > > +loopimg=$SCRATCH_MNT/testfs
> > > > +truncate -s $loopimg_bytes $loopimg
> > > > +_mkfs_dev $loopimg
> > > > +
> > > > +loopmnt=$tmp.mount
> > > > +mkdir -p $loopmnt
> > > > +
> > > > +scratch_aliveflag=$tmp.runsnap
> > > > +snap_aliveflag=$tmp.snapping
> > > > +
> > > > +snap_loop_fs() {
> > > > +	touch "$snap_aliveflag"
> > > > +	while [ -e "$scratch_aliveflag" ]; do
> > > > +		rm -f $loopimg.a
> > > > +		_cp_reflink $loopimg $loopimg.a
> > > > +		sleep 1
> > > > +	done
> > > > +	rm -f "$snap_aliveflag"
> > > > +}
> > > > +
> > > > +fsstress=($FSSTRESS_PROG $FSSTRESS_AVOID -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))")
> > > > +
> > > > +for i in $(seq 1 $((25 * TIME_FACTOR)) ); do
> > > > +	touch $scratch_aliveflag
> > > > +	snap_loop_fs >> $seqres.full 2>&1 &
> > > > +
> > > > +	if ! _mount $loopimg $loopmnt -o loop; then
> > > > +		rm -f $scratch_aliveflag
> > > > +		_fail "loop mount failed"
> > > 
> > > I found it a bit easier to debug if print $i here.
> > 
> > Ok, I'll change it to "loop $i mount failed".
> > 
> > > > +		break
> > > > +	fi
> > > > +
> > > > +	("${fsstress[@]}" >> $seqres.full &) > /dev/null 2>&1
> > > > +
> > > > +	# purposely include 0 second sleeps to test shutdown immediately after
> > > > +	# recovery
> > > > +	sleep $((RANDOM % (3 * TIME_FACTOR) ))
> > > > +	rm -f $scratch_aliveflag
> > > > +
> > > > +	# This test aims to simulate sudden disk failure, which means that we
> > > > +	# do not want to quiesce the filesystem or otherwise give it a chance
> > > > +	# to flush its logs.  Therefore we want to call dmsetup with the
> > > > +	# --nolockfs parameter; to make this happen we must call the load
> > > > +	# error table helper *without* 'lockfs'.
> > > > +	_dmerror_load_error_table
> > > > +
> > > > +	ps -e | grep fsstress > /dev/null 2>&1
> > > > +	while [ $? -eq 0 ]; do
> > > > +		$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > > > +		wait > /dev/null 2>&1
> > > > +		ps -e | grep fsstress > /dev/null 2>&1
> > > > +	done
> > > > +	for ((i = 0; i < 10; i++)); do
> > > > +		test -e "$snap_aliveflag" || break
> > > > +		sleep 1
> > > > +	done
> > > > +
> > > > +	# Mount again to replay log after loading working table, so we have a
> > > > +	# consistent XFS after test.
> > > 
> > > This is a generic test, fix the XFS specific comments?
> > 
> > Oops.  "...a consistent fs after test."
> > 
> > > > +	$UMOUNT_PROG $loopmnt
> > > > +	_dmerror_unmount || _fail "unmount failed"
> > > > +	_dmerror_load_working_table
> > > > +	if ! _dmerror_mount; then
> > > > +		dmsetup table | tee -a /dev/ttyprintk
> > > > +		lsblk | tee -a /dev/ttyprintk
> > > > +		$XFS_METADUMP_PROG -a -g -o $DMERROR_DEV $seqres.dmfail.md
> > > 
> > > Above logs all should go to $seqres.full ?
> > 
> > Oops, yeah.  I'll remove them since I was only using them to check the
> > system state.
> > 
> > > And $XFS_METADUMP_PROG is not suitable for a generic test.
> > 
> > I'll create _metadump_dev so that this at least works for the two
> > filesystems for which we have dump creation helpers (ext* and xfs).
> 
> Sounds great!
> 
> > 
> > > > +		_fail "mount failed"
> > > > +	fi
> > > > +done
> > > > +
> > > > +# Make sure the fs image file is ok
> > > > +if [ -f "$loopimg" ]; then
> > > > +	if _mount $loopimg $loopmnt -o loop; then
> > > > +		$UMOUNT_PROG $loopmnt &> /dev/null
> > > > +	else
> > > > +		echo "final loop mount failed"
> > > > +	fi
> > > > +	_check_xfs_filesystem $loopimg none none
> > > 
> > > Same here, use _check_scratch_fs?
> > 
> > $loopimg is a file within the scratch fs.
> 
> _check_scratch_fs can take dev as argument, and default to $SCRATCH_DEV,
> I think that works in this case?

It could be made to work with a large enough crowbar, but that's
seriously overkill because $loopimg is a file *within* the scratch
filesystem.  The $loopimg fs gets formatted without the
SCRATCH_LOGDEV/SCRATCH_RTDEV options (because it is not itself the
scratch filesystem), which means that in order to (ab)use
_check_scratch_fs to do the same thing as _check_xfs_filesystem, you
have to exclude those options.  So yes, this:

	SCRATCH_RTDEV= SCRATCH_LOGDEV= _check_scratch_fs $loopimg

is the equivalent of this:

	_check_xfs_filesystem $loopimg none none

But the first is longer and pointless.

--D

> Thanks,
> Eryu

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] generic: test shutdowns of a nested filesystem
  2021-08-16 16:35     ` Darrick J. Wong
@ 2021-08-17  3:16       ` Eryu Guan
  2021-08-17  4:16         ` Darrick J. Wong
  0 siblings, 1 reply; 17+ messages in thread
From: Eryu Guan @ 2021-08-17  3:16 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Eryu Guan, guaneryu, linux-xfs, fstests

On Mon, Aug 16, 2021 at 09:35:24AM -0700, Darrick J. Wong wrote:
> On Mon, Aug 16, 2021 at 12:28:20AM +0800, Eryu Guan wrote:
> > On Tue, Jul 27, 2021 at 05:10:30PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <djwong@kernel.org>
> > > 
> > > generic/475, but we're running fsstress on a disk image inside the
> > > scratch filesystem
> > > 
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > ---
> > >  tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
> > >  tests/generic/725.out |    2 +
> > >  2 files changed, 138 insertions(+)
> > >  create mode 100755 tests/generic/725
> > >  create mode 100644 tests/generic/725.out
> > > 
> > > 
> > > diff --git a/tests/generic/725 b/tests/generic/725
> > > new file mode 100755
> > > index 00000000..f43bcb37
> > > --- /dev/null
> > > +++ b/tests/generic/725
> > > @@ -0,0 +1,136 @@
> > > +#! /bin/bash
> > > +# SPDX-License-Identifier: GPL-2.0
> > > +# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
> > > +#
> > > +# FS QA Test No. 725
> > > +#
> > > +# Test nested log recovery with repeated (simulated) disk failures.  We kick
> > > +# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
> > > +# out the underlying scratch device with dm-error to see what happens when the
> > > +# disk goes down.  Having taken down both fses in this manner, remount them and
> > > +# repeat.  This test simulates VM hosts crashing to try to shake out CoW bugs
> > > +# in writeback on the host that cause VM guests to fail to recover.
> > 
> > It currently fails for me on btrfs, the loop mount failed in 2nd
> > iteration, seems like a bug in btrfs.
> 
> Yep.  Until recently (aka the Big Xfs Log Recovery Bughunt of 2021) it
> wouldn't pass xfs either. :/
> 
> > > +#
> > > +. ./common/preamble
> > > +_begin_fstest shutdown auto log metadata eio
> > > +
> > > +_cleanup()
> > > +{
> > > +	cd /
> > > +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > > +	wait
> > > +	if [ -n "$loopmnt" ]; then
> > > +		umount $loopmnt 2>/dev/null
> > 
> > $UMOUNT_PROG
> > 
> > > +		rm -r -f $loopmnt
> > > +	fi
> > > +	rm -f $tmp.*
> > > +	_dmerror_unmount
> > > +	_dmerror_cleanup
> > > +}
> > > +
> > > +# Import common functions.
> > > +. ./common/dmerror
> > > +. ./common/reflink
> > > +
> > > +# Modify as appropriate.
> > > +_supported_fs generic
> > > +
> > > +_require_scratch_reflink
> > > +_require_cp_reflink
> > > +_require_dm_target error
> > > +_require_command "$KILLALL_PROG" "killall"
> > > +
> > > +echo "Silence is golden."
> > > +
> > > +_scratch_mkfs >> $seqres.full 2>&1
> > > +_require_metadata_journaling $SCRATCH_DEV
> > > +_dmerror_init
> > > +_dmerror_mount
> > > +
> > > +# Create a fs image consuming 1/3 of the scratch fs
> > > +scratch_freesp_bytes=$(stat -f -c '%a * %S' $SCRATCH_MNT | bc)
> > 
> > _get_available_space $SCRATCH_MNT ?
> > 
> > > +loopimg_bytes=$((scratch_freesp_bytes / 3))
> > > +
> > > +loopimg=$SCRATCH_MNT/testfs
> > > +truncate -s $loopimg_bytes $loopimg
> > > +_mkfs_dev $loopimg
> > > +
> > > +loopmnt=$tmp.mount
> > > +mkdir -p $loopmnt
> > > +
> > > +scratch_aliveflag=$tmp.runsnap
> > > +snap_aliveflag=$tmp.snapping
> > > +
> > > +snap_loop_fs() {
> > > +	touch "$snap_aliveflag"
> > > +	while [ -e "$scratch_aliveflag" ]; do
> > > +		rm -f $loopimg.a
> > > +		_cp_reflink $loopimg $loopimg.a
> > > +		sleep 1
> > > +	done
> > > +	rm -f "$snap_aliveflag"
> > > +}
> > > +
> > > +fsstress=($FSSTRESS_PROG $FSSTRESS_AVOID -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))")
> > > +
> > > +for i in $(seq 1 $((25 * TIME_FACTOR)) ); do
> > > +	touch $scratch_aliveflag
> > > +	snap_loop_fs >> $seqres.full 2>&1 &
> > > +
> > > +	if ! _mount $loopimg $loopmnt -o loop; then
> > > +		rm -f $scratch_aliveflag
> > > +		_fail "loop mount failed"
> > 
> > I found it a bit easier to debug if print $i here.
> 
> Ok, I'll change it to "loop $i mount failed".
> 
> > > +		break
> > > +	fi
> > > +
> > > +	("${fsstress[@]}" >> $seqres.full &) > /dev/null 2>&1
> > > +
> > > +	# purposely include 0 second sleeps to test shutdown immediately after
> > > +	# recovery
> > > +	sleep $((RANDOM % (3 * TIME_FACTOR) ))
> > > +	rm -f $scratch_aliveflag
> > > +
> > > +	# This test aims to simulate sudden disk failure, which means that we
> > > +	# do not want to quiesce the filesystem or otherwise give it a chance
> > > +	# to flush its logs.  Therefore we want to call dmsetup with the
> > > +	# --nolockfs parameter; to make this happen we must call the load
> > > +	# error table helper *without* 'lockfs'.
> > > +	_dmerror_load_error_table
> > > +
> > > +	ps -e | grep fsstress > /dev/null 2>&1
> > > +	while [ $? -eq 0 ]; do
> > > +		$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > > +		wait > /dev/null 2>&1
> > > +		ps -e | grep fsstress > /dev/null 2>&1
> > > +	done
> > > +	for ((i = 0; i < 10; i++)); do
> > > +		test -e "$snap_aliveflag" || break
> > > +		sleep 1
> > > +	done
> > > +
> > > +	# Mount again to replay log after loading working table, so we have a
> > > +	# consistent XFS after test.
> > 
> > This is a generic test, fix the XFS specific comments?
> 
> Oops.  "...a consistent fs after test."
> 
> > > +	$UMOUNT_PROG $loopmnt
> > > +	_dmerror_unmount || _fail "unmount failed"
> > > +	_dmerror_load_working_table
> > > +	if ! _dmerror_mount; then
> > > +		dmsetup table | tee -a /dev/ttyprintk
> > > +		lsblk | tee -a /dev/ttyprintk
> > > +		$XFS_METADUMP_PROG -a -g -o $DMERROR_DEV $seqres.dmfail.md
> > 
> > Above logs all should go to $seqres.full ?
> 
> Oops, yeah.  I'll remove them since I was only using them to check the
> system state.
> 
> > And $XFS_METADUMP_PROG is not suitable for a generic test.
> 
> I'll create _metadump_dev so that this at least works for the two
> filesystems for which we have dump creation helpers (ext* and xfs).

Sounds great!

> 
> > > +		_fail "mount failed"
> > > +	fi
> > > +done
> > > +
> > > +# Make sure the fs image file is ok
> > > +if [ -f "$loopimg" ]; then
> > > +	if _mount $loopimg $loopmnt -o loop; then
> > > +		$UMOUNT_PROG $loopmnt &> /dev/null
> > > +	else
> > > +		echo "final loop mount failed"
> > > +	fi
> > > +	_check_xfs_filesystem $loopimg none none
> > 
> > Same here, use _check_scratch_fs?
> 
> $loopimg is a file within the scratch fs.

_check_scratch_fs can take a dev as its argument (defaulting to
$SCRATCH_DEV), so I think that works in this case?

Thanks,
Eryu

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] generic: test shutdowns of a nested filesystem
  2021-08-15 16:28   ` Eryu Guan
@ 2021-08-16 16:35     ` Darrick J. Wong
  2021-08-17  3:16       ` Eryu Guan
  0 siblings, 1 reply; 17+ messages in thread
From: Darrick J. Wong @ 2021-08-16 16:35 UTC (permalink / raw)
  To: Eryu Guan; +Cc: guaneryu, linux-xfs, fstests

On Mon, Aug 16, 2021 at 12:28:20AM +0800, Eryu Guan wrote:
> On Tue, Jul 27, 2021 at 05:10:30PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > generic/475, but we're running fsstress on a disk image inside the
> > scratch filesystem
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> >  tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  tests/generic/725.out |    2 +
> >  2 files changed, 138 insertions(+)
> >  create mode 100755 tests/generic/725
> >  create mode 100644 tests/generic/725.out
> > 
> > 
> > diff --git a/tests/generic/725 b/tests/generic/725
> > new file mode 100755
> > index 00000000..f43bcb37
> > --- /dev/null
> > +++ b/tests/generic/725
> > @@ -0,0 +1,136 @@
> > +#! /bin/bash
> > +# SPDX-License-Identifier: GPL-2.0
> > +# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
> > +#
> > +# FS QA Test No. 725
> > +#
> > +# Test nested log recovery with repeated (simulated) disk failures.  We kick
> > +# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
> > +# out the underlying scratch device with dm-error to see what happens when the
> > +# disk goes down.  Having taken down both fses in this manner, remount them and
> > +# repeat.  This test simulates VM hosts crashing to try to shake out CoW bugs
> > +# in writeback on the host that cause VM guests to fail to recover.
> 
> It currently fails for me on btrfs, the loop mount failed in 2nd
> iteration, seems like a bug in btrfs.

Yep.  Until recently (aka the Big Xfs Log Recovery Bughunt of 2021) it
wouldn't pass xfs either. :/

> > +#
> > +. ./common/preamble
> > +_begin_fstest shutdown auto log metadata eio
> > +
> > +_cleanup()
> > +{
> > +	cd /
> > +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > +	wait
> > +	if [ -n "$loopmnt" ]; then
> > +		umount $loopmnt 2>/dev/null
> 
> $UMOUNT_PROG
> 
> > +		rm -r -f $loopmnt
> > +	fi
> > +	rm -f $tmp.*
> > +	_dmerror_unmount
> > +	_dmerror_cleanup
> > +}
> > +
> > +# Import common functions.
> > +. ./common/dmerror
> > +. ./common/reflink
> > +
> > +# Modify as appropriate.
> > +_supported_fs generic
> > +
> > +_require_scratch_reflink
> > +_require_cp_reflink
> > +_require_dm_target error
> > +_require_command "$KILLALL_PROG" "killall"
> > +
> > +echo "Silence is golden."
> > +
> > +_scratch_mkfs >> $seqres.full 2>&1
> > +_require_metadata_journaling $SCRATCH_DEV
> > +_dmerror_init
> > +_dmerror_mount
> > +
> > +# Create a fs image consuming 1/3 of the scratch fs
> > +scratch_freesp_bytes=$(stat -f -c '%a * %S' $SCRATCH_MNT | bc)
> 
> _get_available_space $SCRATCH_MNT ?
> 
> > +loopimg_bytes=$((scratch_freesp_bytes / 3))
> > +
> > +loopimg=$SCRATCH_MNT/testfs
> > +truncate -s $loopimg_bytes $loopimg
> > +_mkfs_dev $loopimg
> > +
> > +loopmnt=$tmp.mount
> > +mkdir -p $loopmnt
> > +
> > +scratch_aliveflag=$tmp.runsnap
> > +snap_aliveflag=$tmp.snapping
> > +
> > +snap_loop_fs() {
> > +	touch "$snap_aliveflag"
> > +	while [ -e "$scratch_aliveflag" ]; do
> > +		rm -f $loopimg.a
> > +		_cp_reflink $loopimg $loopimg.a
> > +		sleep 1
> > +	done
> > +	rm -f "$snap_aliveflag"
> > +}
> > +
> > +fsstress=($FSSTRESS_PROG $FSSTRESS_AVOID -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))")
> > +
> > +for i in $(seq 1 $((25 * TIME_FACTOR)) ); do
> > +	touch $scratch_aliveflag
> > +	snap_loop_fs >> $seqres.full 2>&1 &
> > +
> > +	if ! _mount $loopimg $loopmnt -o loop; then
> > +		rm -f $scratch_aliveflag
> > +		_fail "loop mount failed"
> 
> I found it a bit easier to debug if print $i here.

Ok, I'll change it to "loop $i mount failed".

> > +		break
> > +	fi
> > +
> > +	("${fsstress[@]}" >> $seqres.full &) > /dev/null 2>&1
> > +
> > +	# purposely include 0 second sleeps to test shutdown immediately after
> > +	# recovery
> > +	sleep $((RANDOM % (3 * TIME_FACTOR) ))
> > +	rm -f $scratch_aliveflag
> > +
> > +	# This test aims to simulate sudden disk failure, which means that we
> > +	# do not want to quiesce the filesystem or otherwise give it a chance
> > +	# to flush its logs.  Therefore we want to call dmsetup with the
> > +	# --nolockfs parameter; to make this happen we must call the load
> > +	# error table helper *without* 'lockfs'.
> > +	_dmerror_load_error_table
> > +
> > +	ps -e | grep fsstress > /dev/null 2>&1
> > +	while [ $? -eq 0 ]; do
> > +		$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > +		wait > /dev/null 2>&1
> > +		ps -e | grep fsstress > /dev/null 2>&1
> > +	done
> > +	for ((i = 0; i < 10; i++)); do
> > +		test -e "$snap_aliveflag" || break
> > +		sleep 1
> > +	done
> > +
> > +	# Mount again to replay log after loading working table, so we have a
> > +	# consistent XFS after test.
> 
> This is a generic test, fix the XFS specific comments?

Oops.  "...a consistent fs after test."

> > +	$UMOUNT_PROG $loopmnt
> > +	_dmerror_unmount || _fail "unmount failed"
> > +	_dmerror_load_working_table
> > +	if ! _dmerror_mount; then
> > +		dmsetup table | tee -a /dev/ttyprintk
> > +		lsblk | tee -a /dev/ttyprintk
> > +		$XFS_METADUMP_PROG -a -g -o $DMERROR_DEV $seqres.dmfail.md
> 
> Above logs all should go to $seqres.full ?

Oops, yeah.  I'll remove them since I was only using them to check the
system state.

> And $XFS_METADUMP_PROG is not suitable for a generic test.

I'll create _metadump_dev so that this at least works for the two
filesystems for which we have dump creation helpers (ext* and xfs).

> > +		_fail "mount failed"
> > +	fi
> > +done
> > +
> > +# Make sure the fs image file is ok
> > +if [ -f "$loopimg" ]; then
> > +	if _mount $loopimg $loopmnt -o loop; then
> > +		$UMOUNT_PROG $loopmnt &> /dev/null
> > +	else
> > +		echo "final loop mount failed"
> > +	fi
> > +	_check_xfs_filesystem $loopimg none none
> 
> Same here, use _check_scratch_fs?

$loopimg is a file within the scratch fs.

--D

> Thanks,
> Eryu
> 
> > +fi
> > +
> > +# success, all done
> > +status=0
> > +exit
> > diff --git a/tests/generic/725.out b/tests/generic/725.out
> > new file mode 100644
> > index 00000000..ed73a9fc
> > --- /dev/null
> > +++ b/tests/generic/725.out
> > @@ -0,0 +1,2 @@
> > +QA output created by 725
> > +Silence is golden.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] generic: test shutdowns of a nested filesystem
  2021-07-28  0:10 ` [PATCH 2/3] generic: test shutdowns of a nested filesystem Darrick J. Wong
  2021-08-12  5:44   ` Zorro Lang
@ 2021-08-15 16:28   ` Eryu Guan
  2021-08-16 16:35     ` Darrick J. Wong
  1 sibling, 1 reply; 17+ messages in thread
From: Eryu Guan @ 2021-08-15 16:28 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: guaneryu, linux-xfs, fstests

On Tue, Jul 27, 2021 at 05:10:30PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> generic/475, but we're running fsstress on a disk image inside the
> scratch filesystem
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/725.out |    2 +
>  2 files changed, 138 insertions(+)
>  create mode 100755 tests/generic/725
>  create mode 100644 tests/generic/725.out
> 
> 
> diff --git a/tests/generic/725 b/tests/generic/725
> new file mode 100755
> index 00000000..f43bcb37
> --- /dev/null
> +++ b/tests/generic/725
> @@ -0,0 +1,136 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
> +#
> +# FS QA Test No. 725
> +#
> +# Test nested log recovery with repeated (simulated) disk failures.  We kick
> +# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
> +# out the underlying scratch device with dm-error to see what happens when the
> +# disk goes down.  Having taken down both fses in this manner, remount them and
> +# repeat.  This test simulates VM hosts crashing to try to shake out CoW bugs
> +# in writeback on the host that cause VM guests to fail to recover.

It currently fails for me on btrfs: the loop mount failed in the 2nd
iteration, which seems like a bug in btrfs.

> +#
> +. ./common/preamble
> +_begin_fstest shutdown auto log metadata eio
> +
> +_cleanup()
> +{
> +	cd /
> +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> +	wait
> +	if [ -n "$loopmnt" ]; then
> +		umount $loopmnt 2>/dev/null

$UMOUNT_PROG

> +		rm -r -f $loopmnt
> +	fi
> +	rm -f $tmp.*
> +	_dmerror_unmount
> +	_dmerror_cleanup
> +}
> +
> +# Import common functions.
> +. ./common/dmerror
> +. ./common/reflink
> +
> +# Modify as appropriate.
> +_supported_fs generic
> +
> +_require_scratch_reflink
> +_require_cp_reflink
> +_require_dm_target error
> +_require_command "$KILLALL_PROG" "killall"
> +
> +echo "Silence is golden."
> +
> +_scratch_mkfs >> $seqres.full 2>&1
> +_require_metadata_journaling $SCRATCH_DEV
> +_dmerror_init
> +_dmerror_mount
> +
> +# Create a fs image consuming 1/3 of the scratch fs
> +scratch_freesp_bytes=$(stat -f -c '%a * %S' $SCRATCH_MNT | bc)

_get_available_space $SCRATCH_MNT ?

> +loopimg_bytes=$((scratch_freesp_bytes / 3))
> +
> +loopimg=$SCRATCH_MNT/testfs
> +truncate -s $loopimg_bytes $loopimg
> +_mkfs_dev $loopimg
> +
> +loopmnt=$tmp.mount
> +mkdir -p $loopmnt
> +
> +scratch_aliveflag=$tmp.runsnap
> +snap_aliveflag=$tmp.snapping
> +
> +snap_loop_fs() {
> +	touch "$snap_aliveflag"
> +	while [ -e "$scratch_aliveflag" ]; do
> +		rm -f $loopimg.a
> +		_cp_reflink $loopimg $loopimg.a
> +		sleep 1
> +	done
> +	rm -f "$snap_aliveflag"
> +}
> +
> +fsstress=($FSSTRESS_PROG $FSSTRESS_AVOID -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))")
> +
> +for i in $(seq 1 $((25 * TIME_FACTOR)) ); do
> +	touch $scratch_aliveflag
> +	snap_loop_fs >> $seqres.full 2>&1 &
> +
> +	if ! _mount $loopimg $loopmnt -o loop; then
> +		rm -f $scratch_aliveflag
> +		_fail "loop mount failed"

I found it a bit easier to debug if we print $i here.

> +		break
> +	fi
> +
> +	("${fsstress[@]}" >> $seqres.full &) > /dev/null 2>&1
> +
> +	# purposely include 0 second sleeps to test shutdown immediately after
> +	# recovery
> +	sleep $((RANDOM % (3 * TIME_FACTOR) ))
> +	rm -f $scratch_aliveflag
> +
> +	# This test aims to simulate sudden disk failure, which means that we
> +	# do not want to quiesce the filesystem or otherwise give it a chance
> +	# to flush its logs.  Therefore we want to call dmsetup with the
> +	# --nolockfs parameter; to make this happen we must call the load
> +	# error table helper *without* 'lockfs'.
> +	_dmerror_load_error_table
> +
> +	ps -e | grep fsstress > /dev/null 2>&1
> +	while [ $? -eq 0 ]; do
> +		$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> +		wait > /dev/null 2>&1
> +		ps -e | grep fsstress > /dev/null 2>&1
> +	done
> +	for ((i = 0; i < 10; i++)); do
> +		test -e "$snap_aliveflag" || break
> +		sleep 1
> +	done
> +
> +	# Mount again to replay log after loading working table, so we have a
> +	# consistent XFS after test.

This is a generic test; fix the XFS-specific comments?

> +	$UMOUNT_PROG $loopmnt
> +	_dmerror_unmount || _fail "unmount failed"
> +	_dmerror_load_working_table
> +	if ! _dmerror_mount; then
> +		dmsetup table | tee -a /dev/ttyprintk
> +		lsblk | tee -a /dev/ttyprintk
> +		$XFS_METADUMP_PROG -a -g -o $DMERROR_DEV $seqres.dmfail.md

Should all of the above logs go to $seqres.full?

And $XFS_METADUMP_PROG is not suitable for a generic test.

> +		_fail "mount failed"
> +	fi
> +done
> +
> +# Make sure the fs image file is ok
> +if [ -f "$loopimg" ]; then
> +	if _mount $loopimg $loopmnt -o loop; then
> +		$UMOUNT_PROG $loopmnt &> /dev/null
> +	else
> +		echo "final loop mount failed"
> +	fi
> +	_check_xfs_filesystem $loopimg none none

Same here, use _check_scratch_fs?

Thanks,
Eryu

> +fi
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/725.out b/tests/generic/725.out
> new file mode 100644
> index 00000000..ed73a9fc
> --- /dev/null
> +++ b/tests/generic/725.out
> @@ -0,0 +1,2 @@
> +QA output created by 725
> +Silence is golden.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] generic: test shutdowns of a nested filesystem
  2021-08-12 17:07     ` Darrick J. Wong
@ 2021-08-13 14:52       ` Zorro Lang
  0 siblings, 0 replies; 17+ messages in thread
From: Zorro Lang @ 2021-08-13 14:52 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: guaneryu, linux-xfs, fstests, guan

On Thu, Aug 12, 2021 at 10:07:46AM -0700, Darrick J. Wong wrote:
> On Thu, Aug 12, 2021 at 01:44:21PM +0800, Zorro Lang wrote:
> > On Tue, Jul 27, 2021 at 05:10:30PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <djwong@kernel.org>
> > > 
> > > generic/475, but we're running fsstress on a disk image inside the
> > > scratch filesystem
> > > 
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > ---
> > >  tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
> > >  tests/generic/725.out |    2 +
> > >  2 files changed, 138 insertions(+)
> > >  create mode 100755 tests/generic/725
> > >  create mode 100644 tests/generic/725.out
> > > 
> > > 
> > > diff --git a/tests/generic/725 b/tests/generic/725
> > > new file mode 100755
> > > index 00000000..f43bcb37
> > > --- /dev/null
> > > +++ b/tests/generic/725
> > > @@ -0,0 +1,136 @@
> > > +#! /bin/bash
> > > +# SPDX-License-Identifier: GPL-2.0
> > > +# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
> > > +#
> > > +# FS QA Test No. 725
> > > +#
> > > +# Test nested log recovery with repeated (simulated) disk failures.  We kick
> > > +# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
> > > +# out the underlying scratch device with dm-error to see what happens when the
> > > +# disk goes down.  Having taken down both fses in this manner, remount them and
> > > +# repeat.  This test simulates VM hosts crashing to try to shake out CoW bugs
> > > +# in writeback on the host that cause VM guests to fail to recover.
> > > +#
> > > +. ./common/preamble
> > > +_begin_fstest shutdown auto log metadata eio
> > > +
> > > +_cleanup()
> > > +{
> > > +	cd /
> > > +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > > +	wait
> > > +	if [ -n "$loopmnt" ]; then
> > > +		umount $loopmnt 2>/dev/null
> > > +		rm -r -f $loopmnt
> > > +	fi
> > > +	rm -f $tmp.*
> > > +	_dmerror_unmount
> > > +	_dmerror_cleanup
> > > +}
> > > +
> > > +# Import common functions.
> > > +. ./common/dmerror
> > > +. ./common/reflink
> > > +
> > > +# Modify as appropriate.
> > > +_supported_fs generic
> > > +
> > > +_require_scratch_reflink
> > > +_require_cp_reflink
> > > +_require_dm_target error
> > > +_require_command "$KILLALL_PROG" "killall"
> > > +
> > > +echo "Silence is golden."
> > > +
> > > +_scratch_mkfs >> $seqres.full 2>&1
> > > +_require_metadata_journaling $SCRATCH_DEV
> > > +_dmerror_init
> > > +_dmerror_mount
> > > +
> > > +# Create a fs image consuming 1/3 of the scratch fs
> > > +scratch_freesp_bytes=$(stat -f -c '%a * %S' $SCRATCH_MNT | bc)
> > > +loopimg_bytes=$((scratch_freesp_bytes / 3))
> > > +
> > > +loopimg=$SCRATCH_MNT/testfs
> > > +truncate -s $loopimg_bytes $loopimg
> > > +_mkfs_dev $loopimg
> > 
> > I must say this's a nice test as generic/475, I'd like to have it ASAP :)
> > Just one question: if the FSTYP is nfs, cifs or virtiofs and so on ... [see below]
> > 
> > > +
> > > +loopmnt=$tmp.mount
> > > +mkdir -p $loopmnt
> > > +
> > > +scratch_aliveflag=$tmp.runsnap
> > > +snap_aliveflag=$tmp.snapping
> > > +
> > > +snap_loop_fs() {
> > > +	touch "$snap_aliveflag"
> > > +	while [ -e "$scratch_aliveflag" ]; do
> > > +		rm -f $loopimg.a
> > > +		_cp_reflink $loopimg $loopimg.a
> > > +		sleep 1
> > > +	done
> > > +	rm -f "$snap_aliveflag"
> > > +}
> > > +
> > > +fsstress=($FSSTRESS_PROG $FSSTRESS_AVOID -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))")
> > > +
> > > +for i in $(seq 1 $((25 * TIME_FACTOR)) ); do
> > > +	touch $scratch_aliveflag
> > > +	snap_loop_fs >> $seqres.full 2>&1 &
> > > +
> > > +	if ! _mount $loopimg $loopmnt -o loop; then
> > 
> > ... This test will fail directly at here
> 
> It won't, because this test doesn't run if SCRATCH_DEV isn't a block
> device.  _require_dm_target calls _require_block_device, which should
> prevent that, right?

Oh, you're right[1], I forgot that. If so, this case looks good to me.
Hope it gets merged soon :)
Reviewed-by: Zorro Lang <zlang@redhat.com>

Thanks,
Zorro

[1]
# ./check generic/725
FSTYP         -- nfs
PLATFORM      -- Linux/x86_64 xx-xxx-xx 4.18.0-xxx.el8.x86_64+debug #1 SMP Wed Jul 14 12:35:49 EDT 2021
MKFS_OPTIONS  -- xxx-xxx-xxx-xxxxxxx:/mnt/scratch/nfs-server
MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 xx-xxxx-xxxx.xxxxx.xx:/mnt/scratch/nfs-server /mnt/nfs-scratch

generic/725     [not run] require xx-xxxx-xxxx.xxxxx.xx:/mnt/scratch/nfs-server to be valid block disk
Ran: generic/725
Not run: generic/725
Passed all 1 tests

# ./check generic/725
FSTYP         -- glusterfs
PLATFORM      -- Linux/x86_64 xx-xxx-xx 4.18.0-xxx.el8.x86_64+debug #1 SMP Wed Jul 14 12:35:49 EDT 2021
MKFS_OPTIONS  -- xxx-xxx-xxx-xxxxxxx:/SCRATCH_VOL
MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 xx-xxxx-xxxx.xxxxx.xx:/SCRATCH_VOL /mnt/gluster-scratch

generic/725     [not run] Reflink not supported by scratch filesystem type: glusterfs
Ran: generic/725
Not run: generic/725
Passed all 1 tests

> 
> --D
> 
> > 
> > Thanks,
> > Zorro
> > 
> > > +		rm -f $scratch_aliveflag
> > > +		_fail "loop mount failed"
> > > +		break
> > > +	fi
> > > +
> > > +	("${fsstress[@]}" >> $seqres.full &) > /dev/null 2>&1
> > > +
> > > +	# purposely include 0 second sleeps to test shutdown immediately after
> > > +	# recovery
> > > +	sleep $((RANDOM % (3 * TIME_FACTOR) ))
> > > +	rm -f $scratch_aliveflag
> > > +
> > > +	# This test aims to simulate sudden disk failure, which means that we
> > > +	# do not want to quiesce the filesystem or otherwise give it a chance
> > > +	# to flush its logs.  Therefore we want to call dmsetup with the
> > > +	# --nolockfs parameter; to make this happen we must call the load
> > > +	# error table helper *without* 'lockfs'.
> > > +	_dmerror_load_error_table
> > > +
> > > +	ps -e | grep fsstress > /dev/null 2>&1
> > > +	while [ $? -eq 0 ]; do
> > > +		$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > > +		wait > /dev/null 2>&1
> > > +		ps -e | grep fsstress > /dev/null 2>&1
> > > +	done
> > > +	for ((i = 0; i < 10; i++)); do
> > > +		test -e "$snap_aliveflag" || break
> > > +		sleep 1
> > > +	done
> > > +
> > > +	# Mount again to replay log after loading working table, so we have a
> > > +	# consistent XFS after test.
> > > +	$UMOUNT_PROG $loopmnt
> > > +	_dmerror_unmount || _fail "unmount failed"
> > > +	_dmerror_load_working_table
> > > +	if ! _dmerror_mount; then
> > > +		dmsetup table | tee -a /dev/ttyprintk
> > > +		lsblk | tee -a /dev/ttyprintk
> > > +		$XFS_METADUMP_PROG -a -g -o $DMERROR_DEV $seqres.dmfail.md
> > > +		_fail "mount failed"
> > > +	fi
> > > +done
> > > +
> > > +# Make sure the fs image file is ok
> > > +if [ -f "$loopimg" ]; then
> > > +	if _mount $loopimg $loopmnt -o loop; then
> > > +		$UMOUNT_PROG $loopmnt &> /dev/null
> > > +	else
> > > +		echo "final loop mount failed"
> > > +	fi
> > > +	_check_xfs_filesystem $loopimg none none
> > > +fi
> > > +
> > > +# success, all done
> > > +status=0
> > > +exit
> > > diff --git a/tests/generic/725.out b/tests/generic/725.out
> > > new file mode 100644
> > > index 00000000..ed73a9fc
> > > --- /dev/null
> > > +++ b/tests/generic/725.out
> > > @@ -0,0 +1,2 @@
> > > +QA output created by 725
> > > +Silence is golden.
> > > 
> > 
> 


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] generic: test shutdowns of a nested filesystem
  2021-08-12  5:44   ` Zorro Lang
@ 2021-08-12 17:07     ` Darrick J. Wong
  2021-08-13 14:52       ` Zorro Lang
  0 siblings, 1 reply; 17+ messages in thread
From: Darrick J. Wong @ 2021-08-12 17:07 UTC (permalink / raw)
  To: guaneryu, linux-xfs, fstests, guan

On Thu, Aug 12, 2021 at 01:44:21PM +0800, Zorro Lang wrote:
> On Tue, Jul 27, 2021 at 05:10:30PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > generic/475, but we're running fsstress on a disk image inside the
> > scratch filesystem
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> >  tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  tests/generic/725.out |    2 +
> >  2 files changed, 138 insertions(+)
> >  create mode 100755 tests/generic/725
> >  create mode 100644 tests/generic/725.out
> > 
> > 
> > diff --git a/tests/generic/725 b/tests/generic/725
> > new file mode 100755
> > index 00000000..f43bcb37
> > --- /dev/null
> > +++ b/tests/generic/725
> > @@ -0,0 +1,136 @@
> > +#! /bin/bash
> > +# SPDX-License-Identifier: GPL-2.0
> > +# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
> > +#
> > +# FS QA Test No. 725
> > +#
> > +# Test nested log recovery with repeated (simulated) disk failures.  We kick
> > +# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
> > +# out the underlying scratch device with dm-error to see what happens when the
> > +# disk goes down.  Having taken down both fses in this manner, remount them and
> > +# repeat.  This test simulates VM hosts crashing to try to shake out CoW bugs
> > +# in writeback on the host that cause VM guests to fail to recover.
> > +#
> > +. ./common/preamble
> > +_begin_fstest shutdown auto log metadata eio
> > +
> > +_cleanup()
> > +{
> > +	cd /
> > +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > +	wait
> > +	if [ -n "$loopmnt" ]; then
> > +		umount $loopmnt 2>/dev/null
> > +		rm -r -f $loopmnt
> > +	fi
> > +	rm -f $tmp.*
> > +	_dmerror_unmount
> > +	_dmerror_cleanup
> > +}
> > +
> > +# Import common functions.
> > +. ./common/dmerror
> > +. ./common/reflink
> > +
> > +# Modify as appropriate.
> > +_supported_fs generic
> > +
> > +_require_scratch_reflink
> > +_require_cp_reflink
> > +_require_dm_target error
> > +_require_command "$KILLALL_PROG" "killall"
> > +
> > +echo "Silence is golden."
> > +
> > +_scratch_mkfs >> $seqres.full 2>&1
> > +_require_metadata_journaling $SCRATCH_DEV
> > +_dmerror_init
> > +_dmerror_mount
> > +
> > +# Create a fs image consuming 1/3 of the scratch fs
> > +scratch_freesp_bytes=$(stat -f -c '%a * %S' $SCRATCH_MNT | bc)
> > +loopimg_bytes=$((scratch_freesp_bytes / 3))
> > +
> > +loopimg=$SCRATCH_MNT/testfs
> > +truncate -s $loopimg_bytes $loopimg
> > +_mkfs_dev $loopimg
> 
> I must say this is a nice test, like generic/475; I'd like to have it ASAP :)
> Just one question: what if the FSTYP is nfs, cifs, virtiofs and so on ... [see below]
> 
> > +
> > +loopmnt=$tmp.mount
> > +mkdir -p $loopmnt
> > +
> > +scratch_aliveflag=$tmp.runsnap
> > +snap_aliveflag=$tmp.snapping
> > +
> > +snap_loop_fs() {
> > +	touch "$snap_aliveflag"
> > +	while [ -e "$scratch_aliveflag" ]; do
> > +		rm -f $loopimg.a
> > +		_cp_reflink $loopimg $loopimg.a
> > +		sleep 1
> > +	done
> > +	rm -f "$snap_aliveflag"
> > +}
> > +
> > +fsstress=($FSSTRESS_PROG $FSSTRESS_AVOID -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))")
> > +
> > +for i in $(seq 1 $((25 * TIME_FACTOR)) ); do
> > +	touch $scratch_aliveflag
> > +	snap_loop_fs >> $seqres.full 2>&1 &
> > +
> > +	if ! _mount $loopimg $loopmnt -o loop; then
> 
> ... This test will fail right here

It won't, because this test doesn't run if SCRATCH_DEV isn't a block
device.  _require_dm_target calls _require_block_device, which should
prevent that, right?

--D

> 
> Thanks,
> Zorro
> 
> > +		rm -f $scratch_aliveflag
> > +		_fail "loop mount failed"
> > +		break
> > +	fi
> > +
> > +	("${fsstress[@]}" >> $seqres.full &) > /dev/null 2>&1
> > +
> > +	# purposely include 0 second sleeps to test shutdown immediately after
> > +	# recovery
> > +	sleep $((RANDOM % (3 * TIME_FACTOR) ))
> > +	rm -f $scratch_aliveflag
> > +
> > +	# This test aims to simulate sudden disk failure, which means that we
> > +	# do not want to quiesce the filesystem or otherwise give it a chance
> > +	# to flush its logs.  Therefore we want to call dmsetup with the
> > +	# --nolockfs parameter; to make this happen we must call the load
> > +	# error table helper *without* 'lockfs'.
> > +	_dmerror_load_error_table
> > +
> > +	ps -e | grep fsstress > /dev/null 2>&1
> > +	while [ $? -eq 0 ]; do
> > +		$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> > +		wait > /dev/null 2>&1
> > +		ps -e | grep fsstress > /dev/null 2>&1
> > +	done
> > +	for ((i = 0; i < 10; i++)); do
> > +		test -e "$snap_aliveflag" || break
> > +		sleep 1
> > +	done
> > +
> > +	# Mount again to replay log after loading working table, so we have a
> > +	# consistent XFS after test.
> > +	$UMOUNT_PROG $loopmnt
> > +	_dmerror_unmount || _fail "unmount failed"
> > +	_dmerror_load_working_table
> > +	if ! _dmerror_mount; then
> > +		dmsetup table | tee -a /dev/ttyprintk
> > +		lsblk | tee -a /dev/ttyprintk
> > +		$XFS_METADUMP_PROG -a -g -o $DMERROR_DEV $seqres.dmfail.md
> > +		_fail "mount failed"
> > +	fi
> > +done
> > +
> > +# Make sure the fs image file is ok
> > +if [ -f "$loopimg" ]; then
> > +	if _mount $loopimg $loopmnt -o loop; then
> > +		$UMOUNT_PROG $loopmnt &> /dev/null
> > +	else
> > +		echo "final loop mount failed"
> > +	fi
> > +	_check_xfs_filesystem $loopimg none none
> > +fi
> > +
> > +# success, all done
> > +status=0
> > +exit
> > diff --git a/tests/generic/725.out b/tests/generic/725.out
> > new file mode 100644
> > index 00000000..ed73a9fc
> > --- /dev/null
> > +++ b/tests/generic/725.out
> > @@ -0,0 +1,2 @@
> > +QA output created by 725
> > +Silence is golden.
> > 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] generic: test shutdowns of a nested filesystem
  2021-07-28  0:10 ` [PATCH 2/3] generic: test shutdowns of a nested filesystem Darrick J. Wong
@ 2021-08-12  5:44   ` Zorro Lang
  2021-08-12 17:07     ` Darrick J. Wong
  2021-08-15 16:28   ` Eryu Guan
  1 sibling, 1 reply; 17+ messages in thread
From: Zorro Lang @ 2021-08-12  5:44 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: guaneryu, linux-xfs, fstests, guan

On Tue, Jul 27, 2021 at 05:10:30PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> generic/475, but we're running fsstress on a disk image inside the
> scratch filesystem
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/725.out |    2 +
>  2 files changed, 138 insertions(+)
>  create mode 100755 tests/generic/725
>  create mode 100644 tests/generic/725.out
> 
> 
> diff --git a/tests/generic/725 b/tests/generic/725
> new file mode 100755
> index 00000000..f43bcb37
> --- /dev/null
> +++ b/tests/generic/725
> @@ -0,0 +1,136 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
> +#
> +# FS QA Test No. 725
> +#
> +# Test nested log recovery with repeated (simulated) disk failures.  We kick
> +# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
> +# out the underlying scratch device with dm-error to see what happens when the
> +# disk goes down.  Having taken down both fses in this manner, remount them and
> +# repeat.  This test simulates VM hosts crashing to try to shake out CoW bugs
> +# in writeback on the host that cause VM guests to fail to recover.
> +#
> +. ./common/preamble
> +_begin_fstest shutdown auto log metadata eio
> +
> +_cleanup()
> +{
> +	cd /
> +	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> +	wait
> +	if [ -n "$loopmnt" ]; then
> +		umount $loopmnt 2>/dev/null
> +		rm -r -f $loopmnt
> +	fi
> +	rm -f $tmp.*
> +	_dmerror_unmount
> +	_dmerror_cleanup
> +}
> +
> +# Import common functions.
> +. ./common/dmerror
> +. ./common/reflink
> +
> +# Modify as appropriate.
> +_supported_fs generic
> +
> +_require_scratch_reflink
> +_require_cp_reflink
> +_require_dm_target error
> +_require_command "$KILLALL_PROG" "killall"
> +
> +echo "Silence is golden."
> +
> +_scratch_mkfs >> $seqres.full 2>&1
> +_require_metadata_journaling $SCRATCH_DEV
> +_dmerror_init
> +_dmerror_mount
> +
> +# Create a fs image consuming 1/3 of the scratch fs
> +scratch_freesp_bytes=$(stat -f -c '%a * %S' $SCRATCH_MNT | bc)
> +loopimg_bytes=$((scratch_freesp_bytes / 3))
> +
> +loopimg=$SCRATCH_MNT/testfs
> +truncate -s $loopimg_bytes $loopimg
> +_mkfs_dev $loopimg

I must say this is a nice test, like generic/475; I'd like to have it ASAP :)
Just one question: what if the FSTYP is nfs, cifs, virtiofs and so on ... [see below]

> +
> +loopmnt=$tmp.mount
> +mkdir -p $loopmnt
> +
> +scratch_aliveflag=$tmp.runsnap
> +snap_aliveflag=$tmp.snapping
> +
> +snap_loop_fs() {
> +	touch "$snap_aliveflag"
> +	while [ -e "$scratch_aliveflag" ]; do
> +		rm -f $loopimg.a
> +		_cp_reflink $loopimg $loopimg.a
> +		sleep 1
> +	done
> +	rm -f "$snap_aliveflag"
> +}
> +
> +fsstress=($FSSTRESS_PROG $FSSTRESS_AVOID -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))")
> +
> +for i in $(seq 1 $((25 * TIME_FACTOR)) ); do
> +	touch $scratch_aliveflag
> +	snap_loop_fs >> $seqres.full 2>&1 &
> +
> +	if ! _mount $loopimg $loopmnt -o loop; then

... This test will fail right here

Thanks,
Zorro

> +		rm -f $scratch_aliveflag
> +		_fail "loop mount failed"
> +		break
> +	fi
> +
> +	("${fsstress[@]}" >> $seqres.full &) > /dev/null 2>&1
> +
> +	# purposely include 0 second sleeps to test shutdown immediately after
> +	# recovery
> +	sleep $((RANDOM % (3 * TIME_FACTOR) ))
> +	rm -f $scratch_aliveflag
> +
> +	# This test aims to simulate sudden disk failure, which means that we
> +	# do not want to quiesce the filesystem or otherwise give it a chance
> +	# to flush its logs.  Therefore we want to call dmsetup with the
> +	# --nolockfs parameter; to make this happen we must call the load
> +	# error table helper *without* 'lockfs'.
> +	_dmerror_load_error_table
> +
> +	ps -e | grep fsstress > /dev/null 2>&1
> +	while [ $? -eq 0 ]; do
> +		$KILLALL_PROG -9 fsstress > /dev/null 2>&1
> +		wait > /dev/null 2>&1
> +		ps -e | grep fsstress > /dev/null 2>&1
> +	done
> +	for ((i = 0; i < 10; i++)); do
> +		test -e "$snap_aliveflag" || break
> +		sleep 1
> +	done
> +
> +	# Mount again to replay log after loading working table, so we have a
> +	# consistent XFS after test.
> +	$UMOUNT_PROG $loopmnt
> +	_dmerror_unmount || _fail "unmount failed"
> +	_dmerror_load_working_table
> +	if ! _dmerror_mount; then
> +		dmsetup table | tee -a /dev/ttyprintk
> +		lsblk | tee -a /dev/ttyprintk
> +		$XFS_METADUMP_PROG -a -g -o $DMERROR_DEV $seqres.dmfail.md
> +		_fail "mount failed"
> +	fi
> +done
> +
> +# Make sure the fs image file is ok
> +if [ -f "$loopimg" ]; then
> +	if _mount $loopimg $loopmnt -o loop; then
> +		$UMOUNT_PROG $loopmnt &> /dev/null
> +	else
> +		echo "final loop mount failed"
> +	fi
> +	_check_xfs_filesystem $loopimg none none
> +fi
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/725.out b/tests/generic/725.out
> new file mode 100644
> index 00000000..ed73a9fc
> --- /dev/null
> +++ b/tests/generic/725.out
> @@ -0,0 +1,2 @@
> +QA output created by 725
> +Silence is golden.
> 


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 2/3] generic: test shutdowns of a nested filesystem
  2021-07-28  0:10 [PATCHSET " Darrick J. Wong
@ 2021-07-28  0:10 ` Darrick J. Wong
  2021-08-12  5:44   ` Zorro Lang
  2021-08-15 16:28   ` Eryu Guan
  0 siblings, 2 replies; 17+ messages in thread
From: Darrick J. Wong @ 2021-07-28  0:10 UTC (permalink / raw)
  To: djwong, guaneryu; +Cc: linux-xfs, fstests, guan

From: Darrick J. Wong <djwong@kernel.org>

generic/475, but we're running fsstress on a disk image inside the
scratch filesystem

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 tests/generic/725     |  136 +++++++++++++++++++++++++++++++++++++++++++++++++
 tests/generic/725.out |    2 +
 2 files changed, 138 insertions(+)
 create mode 100755 tests/generic/725
 create mode 100644 tests/generic/725.out


diff --git a/tests/generic/725 b/tests/generic/725
new file mode 100755
index 00000000..f43bcb37
--- /dev/null
+++ b/tests/generic/725
@@ -0,0 +1,136 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
+#
+# FS QA Test No. 725
+#
+# Test nested log recovery with repeated (simulated) disk failures.  We kick
+# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
+# out the underlying scratch device with dm-error to see what happens when the
+# disk goes down.  Having taken down both fses in this manner, remount them and
+# repeat.  This test simulates VM hosts crashing to try to shake out CoW bugs
+# in writeback on the host that cause VM guests to fail to recover.
+#
+. ./common/preamble
+_begin_fstest shutdown auto log metadata eio
+
+_cleanup()
+{
+	cd /
+	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
+	wait
+	if [ -n "$loopmnt" ]; then
+		umount $loopmnt 2>/dev/null
+		rm -r -f $loopmnt
+	fi
+	rm -f $tmp.*
+	_dmerror_unmount
+	_dmerror_cleanup
+}
+
+# Import common functions.
+. ./common/dmerror
+. ./common/reflink
+
+# Modify as appropriate.
+_supported_fs generic
+
+_require_scratch_reflink
+_require_cp_reflink
+_require_dm_target error
+_require_command "$KILLALL_PROG" "killall"
+
+echo "Silence is golden."
+
+_scratch_mkfs >> $seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_dmerror_init
+_dmerror_mount
+
+# Create a fs image consuming 1/3 of the scratch fs
+scratch_freesp_bytes=$(stat -f -c '%a * %S' $SCRATCH_MNT | bc)
+loopimg_bytes=$((scratch_freesp_bytes / 3))
+
+loopimg=$SCRATCH_MNT/testfs
+truncate -s $loopimg_bytes $loopimg
+_mkfs_dev $loopimg
+
+loopmnt=$tmp.mount
+mkdir -p $loopmnt
+
+scratch_aliveflag=$tmp.runsnap
+snap_aliveflag=$tmp.snapping
+
+snap_loop_fs() {
+	touch "$snap_aliveflag"
+	while [ -e "$scratch_aliveflag" ]; do
+		rm -f $loopimg.a
+		_cp_reflink $loopimg $loopimg.a
+		sleep 1
+	done
+	rm -f "$snap_aliveflag"
+}
+
+fsstress=($FSSTRESS_PROG $FSSTRESS_AVOID -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))")
+
+for i in $(seq 1 $((25 * TIME_FACTOR)) ); do
+	touch $scratch_aliveflag
+	snap_loop_fs >> $seqres.full 2>&1 &
+
+	if ! _mount $loopimg $loopmnt -o loop; then
+		rm -f $scratch_aliveflag
+		_fail "loop mount failed"
+		break
+	fi
+
+	("${fsstress[@]}" >> $seqres.full &) > /dev/null 2>&1
+
+	# purposely include 0 second sleeps to test shutdown immediately after
+	# recovery
+	sleep $((RANDOM % (3 * TIME_FACTOR) ))
+	rm -f $scratch_aliveflag
+
+	# This test aims to simulate sudden disk failure, which means that we
+	# do not want to quiesce the filesystem or otherwise give it a chance
+	# to flush its logs.  Therefore we want to call dmsetup with the
+	# --nolockfs parameter; to make this happen we must call the load
+	# error table helper *without* 'lockfs'.
+	_dmerror_load_error_table
+
+	ps -e | grep fsstress > /dev/null 2>&1
+	while [ $? -eq 0 ]; do
+		$KILLALL_PROG -9 fsstress > /dev/null 2>&1
+		wait > /dev/null 2>&1
+		ps -e | grep fsstress > /dev/null 2>&1
+	done
+	for ((i = 0; i < 10; i++)); do
+		test -e "$snap_aliveflag" || break
+		sleep 1
+	done
+
+	# Mount again to replay log after loading working table, so we have a
+	# consistent XFS after test.
+	$UMOUNT_PROG $loopmnt
+	_dmerror_unmount || _fail "unmount failed"
+	_dmerror_load_working_table
+	if ! _dmerror_mount; then
+		dmsetup table | tee -a /dev/ttyprintk
+		lsblk | tee -a /dev/ttyprintk
+		$XFS_METADUMP_PROG -a -g -o $DMERROR_DEV $seqres.dmfail.md
+		_fail "mount failed"
+	fi
+done
+
+# Make sure the fs image file is ok
+if [ -f "$loopimg" ]; then
+	if _mount $loopimg $loopmnt -o loop; then
+		$UMOUNT_PROG $loopmnt &> /dev/null
+	else
+		echo "final loop mount failed"
+	fi
+	_check_xfs_filesystem $loopimg none none
+fi
+
+# success, all done
+status=0
+exit
diff --git a/tests/generic/725.out b/tests/generic/725.out
new file mode 100644
index 00000000..ed73a9fc
--- /dev/null
+++ b/tests/generic/725.out
@@ -0,0 +1,2 @@
+QA output created by 725
+Silence is golden.


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2021-09-13 18:25 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-01  0:11 [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14 Darrick J. Wong
2021-09-01  0:11 ` [PATCH 1/3] generic: fsstress with cpu offlining Darrick J. Wong
2021-09-05 14:48   ` Eryu Guan
2021-09-13 18:25     ` Darrick J. Wong
2021-09-01  0:11 ` [PATCH 2/3] generic: test shutdowns of a nested filesystem Darrick J. Wong
2021-09-01  0:12 ` [PATCH 3/3] xfs/449: filter out deprecation warnings from mkfs Darrick J. Wong
2021-09-05 15:04 ` [PATCHSET v2 0/3] fstests: exercise code refactored in 5.14 Eryu Guan
2021-09-13 18:25   ` Darrick J. Wong
  -- strict thread matches above, loose matches on Subject: below --
2021-07-28  0:10 [PATCHSET " Darrick J. Wong
2021-07-28  0:10 ` [PATCH 2/3] generic: test shutdowns of a nested filesystem Darrick J. Wong
2021-08-12  5:44   ` Zorro Lang
2021-08-12 17:07     ` Darrick J. Wong
2021-08-13 14:52       ` Zorro Lang
2021-08-15 16:28   ` Eryu Guan
2021-08-16 16:35     ` Darrick J. Wong
2021-08-17  3:16       ` Eryu Guan
2021-08-17  4:16         ` Darrick J. Wong
2021-08-17 15:54           ` Darrick J. Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).