All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] fstests: log-writes: Add support to output human readable flags
@ 2018-03-06  8:15 Qu Wenruo
  2018-03-06  8:15 ` [PATCH 2/3] fstests: log-writes: Add support for METADATA flag Qu Wenruo
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Qu Wenruo @ 2018-03-06  8:15 UTC (permalink / raw)
  To: linux-btrfs, fstests; +Cc: dsterba, amir73il

Also change the flag numeric output to hex.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 src/log-writes/log-writes.c | 70 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 63 insertions(+), 7 deletions(-)

diff --git a/src/log-writes/log-writes.c b/src/log-writes/log-writes.c
index 09391574..66fad350 100644
--- a/src/log-writes/log-writes.c
+++ b/src/log-writes/log-writes.c
@@ -120,6 +120,58 @@ int log_discard(struct log *log, struct log_write_entry *entry)
 	return 0;
 }
 
+#define DEFINE_LOG_FLAGS_STR_ENTRY(x)	\
+	{LOG_##x##_FLAG, #x}
+
+struct flags_to_str_entry {
+	u64 flags;
+	const char *str;
+} log_flags_table[] = {
+	DEFINE_LOG_FLAGS_STR_ENTRY(FLUSH),
+	DEFINE_LOG_FLAGS_STR_ENTRY(FUA),
+	DEFINE_LOG_FLAGS_STR_ENTRY(DISCARD),
+	DEFINE_LOG_FLAGS_STR_ENTRY(MARK)
+};
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#define LOG_FLAGS_BUF_SIZE	128
+/*
+ * Convert numeric flags to human readable flags.
+ * @flags:	numeric flags
+ * @buf:	output buffer for human readable string.
+ * 		must have enough space (LOG_FLAGS_BUF_SIZE) to contain all
+ * 		the string
+ */
+static void entry_flags_to_str(u64 flags, char *buf)
+{
+	int empty = 1;
+	int left_len;
+	int i;
+
+	buf[0] = '\0';
+	for (i = 0; i < ARRAY_SIZE(log_flags_table); i++) {
+		if (flags & log_flags_table[i].flags) {
+			if (!empty)
+				strncat(buf, "|", LOG_FLAGS_BUF_SIZE);
+			empty = 0;
+			strncat(buf, log_flags_table[i].str, LOG_FLAGS_BUF_SIZE);
+			flags &= ~log_flags_table[i].flags;
+		}
+	}
+	if (flags) {
+		if (!empty)
+			strncat(buf, "|", LOG_FLAGS_BUF_SIZE);
+		empty = 0;
+		left_len = LOG_FLAGS_BUF_SIZE - strnlen(buf,
+						        LOG_FLAGS_BUF_SIZE);
+		if (left_len > 0)
+			snprintf(buf + strnlen(buf, LOG_FLAGS_BUF_SIZE),
+				 left_len, "UNKNOWN.%llu", flags);
+	}
+	if (empty)
+		strncpy(buf, "NONE", LOG_FLAGS_BUF_SIZE);
+}
+
 /*
  * @log: the log we are replaying.
  * @entry: entry to be replayed.
@@ -179,6 +231,7 @@ int log_replay_next_entry(struct log *log, struct log_write_entry *entry,
 	size_t read_size = read_data ? log->sectorsize :
 		sizeof(struct log_write_entry);
 	char *buf;
+	char flags_buf[LOG_FLAGS_BUF_SIZE];
 	ssize_t ret;
 	off_t offset;
 	int skip = 0;
@@ -210,19 +263,20 @@ int log_replay_next_entry(struct log *log, struct log_write_entry *entry,
 		log->cur_pos += read_size;
 	}
 
+	flags = le64_to_cpu(entry->flags);
+	entry_flags_to_str(flags, flags_buf);
 	skip = log_should_skip(log, entry);
 	if (log_writes_verbose > 1 || (log_writes_verbose && !skip)) {
-		printf("%s %d@%llu: sector %llu, size %llu, flags %llu\n",
+		printf("%s %d@%llu: sector %llu, size %llu, flags 0x%llx(%s)\n",
 		       skip ? "skipping" : "replaying",
 		       (int)log->cur_entry - 1, log->cur_pos / log->sectorsize,
 		       (unsigned long long)le64_to_cpu(entry->sector),
 		       (unsigned long long)size,
-		       (unsigned long long)le64_to_cpu(entry->flags));
+		       (unsigned long long)flags, flags_buf);
 	}
 	if (!size)
 		return 0;
 
-	flags = le64_to_cpu(entry->flags);
 	if (flags & LOG_DISCARD_FLAG)
 		return log_discard(log, entry);
 
@@ -301,7 +355,7 @@ int log_seek_entry(struct log *log, u64 entry_num)
 			return -1;
 		}
 		if (log_writes_verbose > 1)
-			printf("seek entry %d@%llu: %llu, size %llu, flags %llu\n",
+			printf("seek entry %d@%llu: %llu, size %llu, flags 0x%llx\n",
 			       (int)i, log->cur_pos / log->sectorsize,
 			       (unsigned long long)le64_to_cpu(entry.sector),
 			       (unsigned long long)le64_to_cpu(entry.nr_sectors),
@@ -339,6 +393,7 @@ int log_seek_next_entry(struct log *log, struct log_write_entry *entry,
 	size_t read_size = read_data ? log->sectorsize :
 		sizeof(struct log_write_entry);
 	u64 flags;
+	char flags_buf[LOG_FLAGS_BUF_SIZE];
 	ssize_t ret;
 
 	if (log->cur_entry >= log->nr_entries)
@@ -366,14 +421,15 @@ int log_seek_next_entry(struct log *log, struct log_write_entry *entry,
 	} else {
 		log->cur_pos += read_size;
 	}
+	flags = le64_to_cpu(entry->flags);
+	entry_flags_to_str(flags, flags_buf);
 	if (log_writes_verbose > 1)
-		printf("seek entry %d@%llu: %llu, size %llu, flags %llu\n",
+		printf("seek entry %d@%llu: %llu, size %llu, flags 0x%llx(%s)\n",
 		       (int)log->cur_entry - 1, log->cur_pos / log->sectorsize,
 		       (unsigned long long)le64_to_cpu(entry->sector),
 		       (unsigned long long)le64_to_cpu(entry->nr_sectors),
-		       (unsigned long long)le64_to_cpu(entry->flags));
+		       (unsigned long long)flags, flags_buf);
 
-	flags = le64_to_cpu(entry->flags);
 	read_size = le64_to_cpu(entry->nr_sectors) * log->sectorsize;
 	if (!read_size || (flags & LOG_DISCARD_FLAG))
 		return 0;
-- 
2.15.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/3] fstests: log-writes: Add support for METADATA flag
  2018-03-06  8:15 [PATCH 1/3] fstests: log-writes: Add support to output human readable flags Qu Wenruo
@ 2018-03-06  8:15 ` Qu Wenruo
  2018-03-06  8:15 ` [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption Qu Wenruo
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: Qu Wenruo @ 2018-03-06  8:15 UTC (permalink / raw)
  To: linux-btrfs, fstests; +Cc: dsterba, amir73il

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 src/log-writes/log-writes.c | 3 ++-
 src/log-writes/log-writes.h | 9 +++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/log-writes/log-writes.c b/src/log-writes/log-writes.c
index 66fad350..d755194a 100644
--- a/src/log-writes/log-writes.c
+++ b/src/log-writes/log-writes.c
@@ -130,7 +130,8 @@ struct flags_to_str_entry {
 	DEFINE_LOG_FLAGS_STR_ENTRY(FLUSH),
 	DEFINE_LOG_FLAGS_STR_ENTRY(FUA),
 	DEFINE_LOG_FLAGS_STR_ENTRY(DISCARD),
-	DEFINE_LOG_FLAGS_STR_ENTRY(MARK)
+	DEFINE_LOG_FLAGS_STR_ENTRY(MARK),
+	DEFINE_LOG_FLAGS_STR_ENTRY(METADATA)
 };
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/src/log-writes/log-writes.h b/src/log-writes/log-writes.h
index 35ca3583..75fb8ac0 100644
--- a/src/log-writes/log-writes.h
+++ b/src/log-writes/log-writes.h
@@ -20,10 +20,11 @@ typedef __u32 u32;
 /*
  * Constants copied from kernel file drivers/md/dm-log-writes.c
  */
-#define LOG_FLUSH_FLAG (1 << 0)
-#define LOG_FUA_FLAG (1 << 1)
-#define LOG_DISCARD_FLAG (1 << 2)
-#define LOG_MARK_FLAG (1 << 3)
+#define LOG_FLUSH_FLAG		(1 << 0)
+#define LOG_FUA_FLAG		(1 << 1)
+#define LOG_DISCARD_FLAG	(1 << 2)
+#define LOG_MARK_FLAG		(1 << 3)
+#define LOG_METADATA_FLAG	(1 << 4)
 
 #define WRITE_LOG_VERSION 1
 #define WRITE_LOG_MAGIC 0x6a736677736872
-- 
2.15.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption
  2018-03-06  8:15 [PATCH 1/3] fstests: log-writes: Add support to output human readable flags Qu Wenruo
  2018-03-06  8:15 ` [PATCH 2/3] fstests: log-writes: Add support for METADATA flag Qu Wenruo
@ 2018-03-06  8:15 ` Qu Wenruo
  2018-03-06  9:03   ` Amir Goldstein
  2018-03-06 10:12   ` Filipe Manana
  2018-03-06  8:35 ` [PATCH 1/3] fstests: log-writes: Add support to output human readable flags Amir Goldstein
  2018-03-06  8:51 ` [PATCH v2 " Qu Wenruo
  3 siblings, 2 replies; 14+ messages in thread
From: Qu Wenruo @ 2018-03-06  8:15 UTC (permalink / raw)
  To: linux-btrfs, fstests; +Cc: dsterba, amir73il

There are some btrfs corruption report in mail list for a while,
although such corruption is pretty rare and almost impossible to
reproduce, with dm-log-writes we found it's highly related to v1 space
cache.

Unlike journal based filesystems, btrfs completely rely on metadata CoW
to protect itself from power loss.
Which needs extent allocator to avoid allocate extents on existing
extent range.
Btrfs also uses free space cache to speed up such allocation.

However there is a problem, v1 space cache is not protected by data CoW,
and can be corrupted during power loss.
So btrfs do extra check on free space cache, verifying its own in-file csum,
generation and free space recorded in cache and extent tree.

The problem is, under heavy concurrency, v1 space cache can be corrupted
even under normal operations without power loss.
And we believe corrupted space cache can break btrfs metadata CoW and
leads to the rare corruption in next power loss.

The most obvious symptom will be difference in free space:

This will be caught by kernel, but such check is quite weak, and if
the net free space change is 0 in one transaction, the corrupted
cache can be loaded by kernel.

In this case, btrfs check would report things like :
------
block group 298844160 has wrong amount of free space
failed to load free space cache for block group 298844160
------

But considering the test case are using notreelog, btrfs won't do
sub-transaction commit which doesn't increase generation, each
transaction should be consistent, and nothing should be reported at all.

Further more, we can even found corrupted file extents like:
------
root 5 inode 261 errors 100, file extent discount
Found file extent holes:
        start: 962560, len: 32768
ERROR: errors found in fs roots
------

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 common/dmlogwrites  |  72 +++++++++++++++++++++++++++
 tests/btrfs/159     | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/btrfs/159.out |   2 +
 tests/btrfs/group   |   1 +
 4 files changed, 216 insertions(+)
 create mode 100755 tests/btrfs/159
 create mode 100644 tests/btrfs/159.out

diff --git a/common/dmlogwrites b/common/dmlogwrites
index 467b872e..54e7e242 100644
--- a/common/dmlogwrites
+++ b/common/dmlogwrites
@@ -126,3 +126,75 @@ _log_writes_cleanup()
 	$UDEV_SETTLE_PROG >/dev/null 2>&1
 	_log_writes_remove
 }
+
+# Convert log writes mark to entry number
+# Result entry number is output to stdout, could be empty if not found
+_log_writes_mark_to_entry_number()
+{
+	local _mark=$1
+	local ret
+
+	[ -z "$_mark" ] && _fail \
+		"mark must be given for _log_writes_mark_to_entry_number"
+
+	ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
+		--end-mark $_mark 2> /dev/null)
+	[ -z "$ret" ] && return
+	ret=$(echo "$ret" | cut -f1 -d\@)
+	echo "mark $_mark has entry number $ret" >> $seqres.full
+	echo "$ret"
+}
+
+# Find next fua write entry number
+# Result entry number is output to stdout, could be empty if not found
+_log_writes_find_next_fua()
+{
+	local _start_entry=$1
+	local ret
+
+	if [ -z "$_start_entry" ]; then
+		ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
+			--next-fua 2> /dev/null)
+	else
+		ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
+			--next-fua --start-entry $_start_entry 2> /dev/null)
+	fi
+	[ -z "$ret" ] && return
+
+	ret=$(echo "$ret" | cut -f1 -d\@)
+	echo "next fua is entry number $ret" >> $seqres.full
+	echo "$ret"
+}
+
+# Replay log range to specified entry
+# $1:	End entry. The last entry will *NOT* be replayed
+# $2:	Start entry. If not specified, start from the first entry.
+_log_writes_replay_log_entry_range()
+{
+	local _end=$1
+	local _start=$2
+
+	[ -z "$_end" ] && _fail \
+	"end entry must be specified for _log_writes_replay_log_entry_range"
+
+	if [[ "$_start" && "$_start" -gt "$_end" ]]; then
+		_fail \
+		"wrong parameter order for _log_writes_replay_log_entry_range:start=$_start end=$_end"
+	fi
+
+	# Original replay-log won't replay the last entry. So increase entry
+	# number here to ensure the end entry to be replayed
+	if [ -z "$_start" ]; then
+		echo "=== replay to $_end ===" >> $seqres.full
+		$here/src/log-writes/replay-log --log $LOGWRITES_DEV \
+			--replay $SCRATCH_DEV --limit $_end -v \
+			>> $seqres.full 2>&1
+	else
+		echo "=== replay from $_start to $_end ===" >> $seqres.full
+		$here/src/log-writes/replay-log --log $LOGWRITES_DEV \
+			--replay $SCRATCH_DEV --start-entry $_start \
+			--limit $(($_end - $_start)) -v \
+			>> $seqres.full 2>&1
+	fi
+	[ $? -ne 0 ] && _fail "replay failed"
+}
diff --git a/tests/btrfs/159 b/tests/btrfs/159
new file mode 100755
index 00000000..5113d526
--- /dev/null
+++ b/tests/btrfs/159
@@ -0,0 +1,141 @@
+#! /bin/bash
+# FS QA Test 159
+#
+# Test case for btrfs v1 space cache corruption
+#
+# Btrfs has some unexpected corruption after power loss, during the
+# investigation, the problem leads to v1 space cache corruption, which
+# could be loaded by kernel without detection.
+#
+# Kernel has 3 detection for corrupted space cache, checksum, generation
+# and free space, however the last one is a weak one. If in the one transaction
+# no net space change it can be loaded by kernel and break up metadata CoW,
+# leading serious corruption.
+#
+#-----------------------------------------------------------------------
+# Copyright (c) SUSE.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#-----------------------------------------------------------------------
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1	# failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+	cd /
+	rm -f $tmp.*
+	_log_writes_cleanup
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/dmlogwrites
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# Small workload is enough to trigger the problem
+workload=$(( 512 * $LOAD_FACTOR ))
+nr_threads=$(($($here/src/feature -o) * $LOAD_FACTOR))
+
+# Modify as appropriate.
+_supported_fs btrfs
+_supported_os Linux
+_require_log_writes
+_require_scratch
+
+_log_writes_init
+_log_writes_mkfs >> $seqres.full 2>&1
+
+# Here we don't want to create space cache yet
+_log_writes_mount -o nospace_cache
+
+# The SINGLE profile data chunk created by mkfs is too small (8M) so btrfs
+# won't create space cache for it.
+# We need a data chunk large enough so btrfs create space cache for it and
+# store the cache in itself.
+_run_btrfs_util_prog balance start --full-balance "$SCRATCH_MNT"
+
+_log_writes_unmount
+_log_writes_mark mkfs
+
+# Disable tree log, so btrfs will always do full transaction commit
+# It's OK to use tree log, but it will cause extra super block update which
+# doesn't increase generation.
+_log_writes_mount -o space_cache=v1,notreelog
+run_check $FSSTRESS_PROG -w -n $workload -p $nr_threads -d $SCRATCH_MNT \
+	$FSSTRESS_AVOID > /dev/null 2>&1
+_log_writes_unmount
+
+_log_writes_remove
+# Here we manually iterate through the log entries, as we need extra check
+# on the output of btrfs check. And we could have detailed replay log in 
+# $seqres.full
+prev=$(_log_writes_mark_to_entry_number mkfs)
+if [ -z "$prev" ]; then
+	_fail "failed to locate log writes mark 'mkfs'"
+fi
+
+# Here we only checks the result *AFTER* FUA writes, and no tree log, so
+# everything should be valid.
+cur=$(_log_writes_find_next_fua $prev)
+[ -z "$cur" ] && _fail "failed to locate next fua write"
+
+# log-replay will not replay the last item, so manually increase it by one
+cur=$(($cur + 1))
+
+_log_writes_replay_log_entry_range $prev >> $seqres.full 2>&1
+while [ ! -z "$cur" ]; do
+	_log_writes_replay_log_entry_range $cur $prev
+	# Catch the btrfs check output into temp file, as we need to
+	# grep the output to find the cache corruption
+	$BTRFS_UTIL_PROG check --check-data-csum $SCRATCH_DEV &> $tmp.fsck
+
+	# Cache passed generation,csum and free space check but corrupted
+	# will be reported as error
+	if [ $? -ne 0 ]; then
+		cat $tmp.fsck >> $seqres.full
+		_fail "btrfs check found corruption"
+	fi
+
+	# Mount option has ruled out any possible factors affect space cache
+	# And we are at the FUA writes, no generation related problem should
+	# happen anyway
+	if grep -q -e 'failed to load free space cache' $tmp.fsck; then
+		cat $tmp.fsck >> $seqres.full
+		_fail "btrfs check found invalid space cache"
+	fi
+
+	prev=$cur
+	cur=$(_log_writes_find_next_fua $prev)
+	[ -z $cur ] && break
+
+	# Same as above
+	cur=$(($cur + 1))
+done
+
+echo "Silence is golden"
+
+# success, all done
+status=0
+exit
diff --git a/tests/btrfs/159.out b/tests/btrfs/159.out
new file mode 100644
index 00000000..e569e60c
--- /dev/null
+++ b/tests/btrfs/159.out
@@ -0,0 +1,2 @@
+QA output created by 159
+Silence is golden
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 8007e07e..bc83db94 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -161,3 +161,4 @@
 156 auto quick trim
 157 auto quick raid
 158 auto quick raid scrub
+159 auto
-- 
2.15.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] fstests: log-writes: Add support to output human readable flags
  2018-03-06  8:15 [PATCH 1/3] fstests: log-writes: Add support to output human readable flags Qu Wenruo
  2018-03-06  8:15 ` [PATCH 2/3] fstests: log-writes: Add support for METADATA flag Qu Wenruo
  2018-03-06  8:15 ` [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption Qu Wenruo
@ 2018-03-06  8:35 ` Amir Goldstein
  2018-03-06  8:51 ` [PATCH v2 " Qu Wenruo
  3 siblings, 0 replies; 14+ messages in thread
From: Amir Goldstein @ 2018-03-06  8:35 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Linux Btrfs, fstests, dsterba

On Tue, Mar 6, 2018 at 10:15 AM, Qu Wenruo <wqu@suse.com> wrote:
> Also change the flag numeric output to hex.
>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  src/log-writes/log-writes.c | 70 ++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 63 insertions(+), 7 deletions(-)
>
> diff --git a/src/log-writes/log-writes.c b/src/log-writes/log-writes.c
> index 09391574..66fad350 100644
> --- a/src/log-writes/log-writes.c
> +++ b/src/log-writes/log-writes.c
> @@ -120,6 +120,58 @@ int log_discard(struct log *log, struct log_write_entry *entry)
>         return 0;
>  }
>
> +#define DEFINE_LOG_FLAGS_STR_ENTRY(x)  \
> +       {LOG_##x##_FLAG, #x}
> +
> +struct flags_to_str_entry {
> +       u64 flags;
> +       const char *str;
> +} log_flags_table[] = {
> +       DEFINE_LOG_FLAGS_STR_ENTRY(FLUSH),
> +       DEFINE_LOG_FLAGS_STR_ENTRY(FUA),
> +       DEFINE_LOG_FLAGS_STR_ENTRY(DISCARD),
> +       DEFINE_LOG_FLAGS_STR_ENTRY(MARK)
> +};
> +
> +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
> +#define LOG_FLAGS_BUF_SIZE     128
> +/*
> + * Convert numeric flags to human readable flags.
> + * @flags:     numeric flags
> + * @buf:       output buffer for human readable string.
> + *             must have enough space (LOG_FLAGS_BUF_SIZE) to contain all
> + *             the string
> + */
> +static void entry_flags_to_str(u64 flags, char *buf)
> +{
> +       int empty = 1;
> +       int left_len;
> +       int i;
> +
> +       buf[0] = '\0';
> +       for (i = 0; i < ARRAY_SIZE(log_flags_table); i++) {
> +               if (flags & log_flags_table[i].flags) {
> +                       if (!empty)
> +                               strncat(buf, "|", LOG_FLAGS_BUF_SIZE);
> +                       empty = 0;
> +                       strncat(buf, log_flags_table[i].str, LOG_FLAGS_BUF_SIZE);
> +                       flags &= ~log_flags_table[i].flags;
> +               }
> +       }
> +       if (flags) {
> +               if (!empty)
> +                       strncat(buf, "|", LOG_FLAGS_BUF_SIZE);
> +               empty = 0;
> +               left_len = LOG_FLAGS_BUF_SIZE - strnlen(buf,
> +                                                       LOG_FLAGS_BUF_SIZE);
> +               if (left_len > 0)
> +                       snprintf(buf + strnlen(buf, LOG_FLAGS_BUF_SIZE),
> +                                left_len, "UNKNOWN.%llu", flags);

Missed a spot - %llx

> +       }
> +       if (empty)
> +               strncpy(buf, "NONE", LOG_FLAGS_BUF_SIZE);
> +}
> +
>  /*
>   * @log: the log we are replaying.
>   * @entry: entry to be replayed.
> @@ -179,6 +231,7 @@ int log_replay_next_entry(struct log *log, struct log_write_entry *entry,
>         size_t read_size = read_data ? log->sectorsize :
>                 sizeof(struct log_write_entry);
>         char *buf;
> +       char flags_buf[LOG_FLAGS_BUF_SIZE];
>         ssize_t ret;
>         off_t offset;
>         int skip = 0;
> @@ -210,19 +263,20 @@ int log_replay_next_entry(struct log *log, struct log_write_entry *entry,
>                 log->cur_pos += read_size;
>         }
>
> +       flags = le64_to_cpu(entry->flags);
> +       entry_flags_to_str(flags, flags_buf);
>         skip = log_should_skip(log, entry);
>         if (log_writes_verbose > 1 || (log_writes_verbose && !skip)) {
> -               printf("%s %d@%llu: sector %llu, size %llu, flags %llu\n",
> +               printf("%s %d@%llu: sector %llu, size %llu, flags 0x%llx(%s)\n",
>                        skip ? "skipping" : "replaying",
>                        (int)log->cur_entry - 1, log->cur_pos / log->sectorsize,
>                        (unsigned long long)le64_to_cpu(entry->sector),
>                        (unsigned long long)size,
> -                      (unsigned long long)le64_to_cpu(entry->flags));
> +                      (unsigned long long)flags, flags_buf);
>         }
>         if (!size)
>                 return 0;
>
> -       flags = le64_to_cpu(entry->flags);
>         if (flags & LOG_DISCARD_FLAG)
>                 return log_discard(log, entry);
>
> @@ -301,7 +355,7 @@ int log_seek_entry(struct log *log, u64 entry_num)
>                         return -1;
>                 }
>                 if (log_writes_verbose > 1)
> -                       printf("seek entry %d@%llu: %llu, size %llu, flags %llu\n",
> +                       printf("seek entry %d@%llu: %llu, size %llu, flags 0x%llx\n",
>                                (int)i, log->cur_pos / log->sectorsize,
>                                (unsigned long long)le64_to_cpu(entry.sector),
>                                (unsigned long long)le64_to_cpu(entry.nr_sectors),
> @@ -339,6 +393,7 @@ int log_seek_next_entry(struct log *log, struct log_write_entry *entry,
>         size_t read_size = read_data ? log->sectorsize :
>                 sizeof(struct log_write_entry);
>         u64 flags;
> +       char flags_buf[LOG_FLAGS_BUF_SIZE];
>         ssize_t ret;
>
>         if (log->cur_entry >= log->nr_entries)
> @@ -366,14 +421,15 @@ int log_seek_next_entry(struct log *log, struct log_write_entry *entry,
>         } else {
>                 log->cur_pos += read_size;
>         }
> +       flags = le64_to_cpu(entry->flags);
> +       entry_flags_to_str(flags, flags_buf);
>         if (log_writes_verbose > 1)
> -               printf("seek entry %d@%llu: %llu, size %llu, flags %llu\n",
> +               printf("seek entry %d@%llu: %llu, size %llu, flags 0x%llx(%s)\n",
>                        (int)log->cur_entry - 1, log->cur_pos / log->sectorsize,
>                        (unsigned long long)le64_to_cpu(entry->sector),
>                        (unsigned long long)le64_to_cpu(entry->nr_sectors),
> -                      (unsigned long long)le64_to_cpu(entry->flags));
> +                      (unsigned long long)flags, flags_buf);
>
> -       flags = le64_to_cpu(entry->flags);
>         read_size = le64_to_cpu(entry->nr_sectors) * log->sectorsize;
>         if (!read_size || (flags & LOG_DISCARD_FLAG))
>                 return 0;
> --
> 2.15.1
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2 1/3] fstests: log-writes: Add support to output human readable flags
  2018-03-06  8:15 [PATCH 1/3] fstests: log-writes: Add support to output human readable flags Qu Wenruo
                   ` (2 preceding siblings ...)
  2018-03-06  8:35 ` [PATCH 1/3] fstests: log-writes: Add support to output human readable flags Amir Goldstein
@ 2018-03-06  8:51 ` Qu Wenruo
  3 siblings, 0 replies; 14+ messages in thread
From: Qu Wenruo @ 2018-03-06  8:51 UTC (permalink / raw)
  To: linux-btrfs, fstests; +Cc: dsterba, amir73il

Also change the flag numeric output to hex.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
v2:
  Also output unknown flags in hex
---
 src/log-writes/log-writes.c | 70 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 63 insertions(+), 7 deletions(-)

diff --git a/src/log-writes/log-writes.c b/src/log-writes/log-writes.c
index 09391574..a872429d 100644
--- a/src/log-writes/log-writes.c
+++ b/src/log-writes/log-writes.c
@@ -120,6 +120,58 @@ int log_discard(struct log *log, struct log_write_entry *entry)
 	return 0;
 }
 
+#define DEFINE_LOG_FLAGS_STR_ENTRY(x)	\
+	{LOG_##x##_FLAG, #x}
+
+struct flags_to_str_entry {
+	u64 flags;
+	const char *str;
+} log_flags_table[] = {
+	DEFINE_LOG_FLAGS_STR_ENTRY(FLUSH),
+	DEFINE_LOG_FLAGS_STR_ENTRY(FUA),
+	DEFINE_LOG_FLAGS_STR_ENTRY(DISCARD),
+	DEFINE_LOG_FLAGS_STR_ENTRY(MARK)
+};
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#define LOG_FLAGS_BUF_SIZE	128
+/*
+ * Convert numeric flags to human readable flags.
+ * @flags:	numeric flags
+ * @buf:	output buffer for human readable string.
+ * 		must have enough space (LOG_FLAGS_BUF_SIZE) to contain all
+ * 		the string
+ */
+static void entry_flags_to_str(u64 flags, char *buf)
+{
+	int empty = 1;
+	int left_len;
+	int i;
+
+	buf[0] = '\0';
+	for (i = 0; i < ARRAY_SIZE(log_flags_table); i++) {
+		if (flags & log_flags_table[i].flags) {
+			if (!empty)
+				strncat(buf, "|", LOG_FLAGS_BUF_SIZE);
+			empty = 0;
+			strncat(buf, log_flags_table[i].str, LOG_FLAGS_BUF_SIZE);
+			flags &= ~log_flags_table[i].flags;
+		}
+	}
+	if (flags) {
+		if (!empty)
+			strncat(buf, "|", LOG_FLAGS_BUF_SIZE);
+		empty = 0;
+		left_len = LOG_FLAGS_BUF_SIZE - strnlen(buf,
+						        LOG_FLAGS_BUF_SIZE);
+		if (left_len > 0)
+			snprintf(buf + strnlen(buf, LOG_FLAGS_BUF_SIZE),
+				 left_len, "UNKNOWN.0x%llx", flags);
+	}
+	if (empty)
+		strncpy(buf, "NONE", LOG_FLAGS_BUF_SIZE);
+}
+
 /*
  * @log: the log we are replaying.
  * @entry: entry to be replayed.
@@ -179,6 +231,7 @@ int log_replay_next_entry(struct log *log, struct log_write_entry *entry,
 	size_t read_size = read_data ? log->sectorsize :
 		sizeof(struct log_write_entry);
 	char *buf;
+	char flags_buf[LOG_FLAGS_BUF_SIZE];
 	ssize_t ret;
 	off_t offset;
 	int skip = 0;
@@ -210,19 +263,20 @@ int log_replay_next_entry(struct log *log, struct log_write_entry *entry,
 		log->cur_pos += read_size;
 	}
 
+	flags = le64_to_cpu(entry->flags);
+	entry_flags_to_str(flags, flags_buf);
 	skip = log_should_skip(log, entry);
 	if (log_writes_verbose > 1 || (log_writes_verbose && !skip)) {
-		printf("%s %d@%llu: sector %llu, size %llu, flags %llu\n",
+		printf("%s %d@%llu: sector %llu, size %llu, flags 0x%llx(%s)\n",
 		       skip ? "skipping" : "replaying",
 		       (int)log->cur_entry - 1, log->cur_pos / log->sectorsize,
 		       (unsigned long long)le64_to_cpu(entry->sector),
 		       (unsigned long long)size,
-		       (unsigned long long)le64_to_cpu(entry->flags));
+		       (unsigned long long)flags, flags_buf);
 	}
 	if (!size)
 		return 0;
 
-	flags = le64_to_cpu(entry->flags);
 	if (flags & LOG_DISCARD_FLAG)
 		return log_discard(log, entry);
 
@@ -301,7 +355,7 @@ int log_seek_entry(struct log *log, u64 entry_num)
 			return -1;
 		}
 		if (log_writes_verbose > 1)
-			printf("seek entry %d@%llu: %llu, size %llu, flags %llu\n",
+			printf("seek entry %d@%llu: %llu, size %llu, flags 0x%llx\n",
 			       (int)i, log->cur_pos / log->sectorsize,
 			       (unsigned long long)le64_to_cpu(entry.sector),
 			       (unsigned long long)le64_to_cpu(entry.nr_sectors),
@@ -339,6 +393,7 @@ int log_seek_next_entry(struct log *log, struct log_write_entry *entry,
 	size_t read_size = read_data ? log->sectorsize :
 		sizeof(struct log_write_entry);
 	u64 flags;
+	char flags_buf[LOG_FLAGS_BUF_SIZE];
 	ssize_t ret;
 
 	if (log->cur_entry >= log->nr_entries)
@@ -366,14 +421,15 @@ int log_seek_next_entry(struct log *log, struct log_write_entry *entry,
 	} else {
 		log->cur_pos += read_size;
 	}
+	flags = le64_to_cpu(entry->flags);
+	entry_flags_to_str(flags, flags_buf);
 	if (log_writes_verbose > 1)
-		printf("seek entry %d@%llu: %llu, size %llu, flags %llu\n",
+		printf("seek entry %d@%llu: %llu, size %llu, flags 0x%llx(%s)\n",
 		       (int)log->cur_entry - 1, log->cur_pos / log->sectorsize,
 		       (unsigned long long)le64_to_cpu(entry->sector),
 		       (unsigned long long)le64_to_cpu(entry->nr_sectors),
-		       (unsigned long long)le64_to_cpu(entry->flags));
+		       (unsigned long long)flags, flags_buf);
 
-	flags = le64_to_cpu(entry->flags);
 	read_size = le64_to_cpu(entry->nr_sectors) * log->sectorsize;
 	if (!read_size || (flags & LOG_DISCARD_FLAG))
 		return 0;
-- 
2.15.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption
  2018-03-06  8:15 ` [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption Qu Wenruo
@ 2018-03-06  9:03   ` Amir Goldstein
  2018-03-06  9:25     ` Qu Wenruo
  2018-03-06 10:12   ` Filipe Manana
  1 sibling, 1 reply; 14+ messages in thread
From: Amir Goldstein @ 2018-03-06  9:03 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Linux Btrfs, fstests, dsterba

On Tue, Mar 6, 2018 at 10:15 AM, Qu Wenruo <wqu@suse.com> wrote:
> There are some btrfs corruption report in mail list for a while,
> although such corruption is pretty rare and almost impossible to
> reproduce, with dm-log-writes we found it's highly related to v1 space
> cache.
>
> Unlike journal based filesystems, btrfs completely rely on metadata CoW
> to protect itself from power loss.
> Which needs extent allocator to avoid allocate extents on existing
> extent range.
> Btrfs also uses free space cache to speed up such allocation.
>
> However there is a problem, v1 space cache is not protected by data CoW,
> and can be corrupted during power loss.
> So btrfs do extra check on free space cache, verifying its own in-file csum,
> generation and free space recorded in cache and extent tree.
>
> The problem is, under heavy concurrency, v1 space cache can be corrupted
> even under normal operations without power loss.
> And we believe corrupted space cache can break btrfs metadata CoW and
> leads to the rare corruption in next power loss.
>
> The most obvious symptom will be difference in free space:
>
> This will be caught by kernel, but such check is quite weak, and if
> the net free space change is 0 in one transaction, the corrupted
> cache can be loaded by kernel.
>
> In this case, btrfs check would report things like :
> ------
> block group 298844160 has wrong amount of free space
> failed to load free space cache for block group 298844160
> ------
>
> But considering the test case are using notreelog, btrfs won't do
> sub-transaction commit which doesn't increase generation, each
> transaction should be consistent, and nothing should be reported at all.
>
> Further more, we can even found corrupted file extents like:
> ------
> root 5 inode 261 errors 100, file extent discount
> Found file extent holes:
>         start: 962560, len: 32768
> ERROR: errors found in fs roots
> ------
>

So what is the expectation from this test on upstream btrfs?
Probable failure? reliable failure?
Are there random seeds to fsstress that can make the test fail reliably?
Or does failure also depend on IO timing and other uncontrolled parameters?

> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  common/dmlogwrites  |  72 +++++++++++++++++++++++++++
>  tests/btrfs/159     | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  tests/btrfs/159.out |   2 +
>  tests/btrfs/group   |   1 +
>  4 files changed, 216 insertions(+)
>  create mode 100755 tests/btrfs/159
>  create mode 100644 tests/btrfs/159.out
>
> diff --git a/common/dmlogwrites b/common/dmlogwrites
> index 467b872e..54e7e242 100644
> --- a/common/dmlogwrites
> +++ b/common/dmlogwrites
> @@ -126,3 +126,75 @@ _log_writes_cleanup()
>         $UDEV_SETTLE_PROG >/dev/null 2>&1
>         _log_writes_remove
>  }
> +
> +# Convert log writes mark to entry number
> +# Result entry number is output to stdout, could be empty if not found
> +_log_writes_mark_to_entry_number()
> +{
> +       local _mark=$1
> +       local ret
> +
> +       [ -z "$_mark" ] && _fail \
> +               "mark must be given for _log_writes_mark_to_entry_number"
> +
> +       ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
> +               --end-mark $_mark 2> /dev/null)
> +       [ -z "$ret" ] && return
> +       ret=$(echo "$ret" | cut -f1 -d\@)
> +       echo "mark $_mark has entry number $ret" >> $seqres.full
> +       echo "$ret"
> +}
> +
> +# Find next fua write entry number
> +# Result entry number is output to stdout, could be empty if not found
> +_log_writes_find_next_fua()
> +{
> +       local _start_entry=$1
> +       local ret
> +
> +       if [ -z "$_start_entry" ]; then
> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
> +                       --next-fua 2> /dev/null)
> +       else
> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
> +                       --next-fua --start-entry $_start_entry 2> /dev/null)
> +       fi
> +       [ -z "$ret" ] && return
> +
> +       ret=$(echo "$ret" | cut -f1 -d\@)
> +       echo "next fua is entry number $ret" >> $seqres.full
> +       echo "$ret"
> +}
> +
> +# Replay log range to specified entry
> +# $1:  End entry. The last entry will *NOT* be replayed
> +# $2:  Start entry. If not specified, start from the first entry.
> +_log_writes_replay_log_entry_range()
> +{
> +       local _end=$1
> +       local _start=$2
> +
> +       [ -z "$_end" ] && _fail \
> +       "end entry must be specified for _log_writes_replay_log_entry_range"
> +
> +       if [[ "$_start" && "$_start" -gt "$_end" ]]; then
> +               _fail \
> +               "wrong parameter order for _log_writes_replay_log_entry_range:start=$_start end=$_end"
> +       fi
> +
> +       # Original replay-log won't replay the last entry. So increase entry
> +       # number here to ensure the end entry to be replayed
> +       if [ -z "$_start" ]; then
> +               echo "=== replay to $_end ===" >> $seqres.full
> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
> +                       --replay $SCRATCH_DEV --limit $_end -v \
> +                       >> $seqres.full 2>&1
> +       else
> +               echo "=== replay from $_start to $_end ===" >> $seqres.full
> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
> +                       --replay $SCRATCH_DEV --start-entry $_start \
> +                       --limit $(($_end - $_start)) -v \
> +                       >> $seqres.full 2>&1
> +       fi
> +       [ $? -ne 0 ] && _fail "replay failed"
> +}
> diff --git a/tests/btrfs/159 b/tests/btrfs/159
> new file mode 100755
> index 00000000..5113d526
> --- /dev/null
> +++ b/tests/btrfs/159
> @@ -0,0 +1,141 @@
> +#! /bin/bash
> +# FS QA Test 159
> +#
> +# Test case for btrfs v1 space cache corruption
> +#
> +# Btrfs has some unexpected corruption after power loss, during the
> +# investigation, the problem leads to v1 space cache corruption, which
> +# could be loaded by kernel without detection.
> +#
> +# Kernel has 3 detection for corrupted space cache, checksum, generation
> +# and free space, however the last one is a weak one. If in the one transaction
> +# no net space change it can be loaded by kernel and break up metadata CoW,
> +# leading serious corruption.
> +#
> +#-----------------------------------------------------------------------
> +# Copyright (c) SUSE.  All Rights Reserved.

2018

> +#
> +# This program is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU General Public License as
> +# published by the Free Software Foundation.
> +#
> +# This program is distributed in the hope that it would be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write the Free Software Foundation,
> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> +#-----------------------------------------------------------------------
> +#
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1       # failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> +       cd /
> +       rm -f $tmp.*
> +       _log_writes_cleanup
> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/dmlogwrites
> +. ./common/filter
> +
> +# remove previous $seqres.full before test
> +rm -f $seqres.full
> +
> +# Small workload is enough to trigger the problem
> +workload=$(( 512 * $LOAD_FACTOR ))
> +nr_threads=$(($($here/src/feature -o) * $LOAD_FACTOR))
> +
> +# Modify as appropriate.
> +_supported_fs btrfs
> +_supported_os Linux
> +_require_log_writes
> +_require_scratch
> +
> +_log_writes_init
> +_log_writes_mkfs >> $seqres.full 2>&1
> +
> +# Here we don't want to create space cache yet
> +_log_writes_mount -o nospace_cache
> +
> +# The SINGLE profile data chunk created by mkfs is too small (8M) so btrfs
> +# won't create space cache for it.
> +# We need a data chunk large enough so btrfs create space cache for it and
> +# store the cache in itself.
> +_run_btrfs_util_prog balance start --full-balance "$SCRATCH_MNT"
> +
> +_log_writes_unmount
> +_log_writes_mark mkfs
> +
> +# Disable tree log, so btrfs will always do full transaction commit
> +# It's OK to use tree log, but it will cause extra super block update which
> +# doesn't increase generation.
> +_log_writes_mount -o space_cache=v1,notreelog
> +run_check $FSSTRESS_PROG -w -n $workload -p $nr_threads -d $SCRATCH_MNT \
> +       $FSSTRESS_AVOID > /dev/null 2>&1
> +_log_writes_unmount
> +
> +_log_writes_remove
> +# Here we manually iterate through the log entries, as we need extra check
> +# on the output of btrfs check. And we could have detailed replay log in
> +# $seqres.full
> +prev=$(_log_writes_mark_to_entry_number mkfs)
> +if [ -z "$prev" ]; then
> +       _fail "failed to locate log writes mark 'mkfs'"
> +fi
> +
> +# Here we only checks the result *AFTER* FUA writes, and no tree log, so
> +# everything should be valid.
> +cur=$(_log_writes_find_next_fua $prev)
> +[ -z "$cur" ] && _fail "failed to locate next fua write"
> +
> +# log-replay will not replay the last item, so manually increase it by one
> +cur=$(($cur + 1))
> +
> +_log_writes_replay_log_entry_range $prev >> $seqres.full 2>&1
> +while [ ! -z "$cur" ]; do
> +       _log_writes_replay_log_entry_range $cur $prev
> +       # Catch the btrfs check output into temp file, as we need to
> +       # grep the output to find the cache corruption
> +       $BTRFS_UTIL_PROG check --check-data-csum $SCRATCH_DEV &> $tmp.fsck

So by making this a btrfs specific test you avoid the need to mount/umount and
revert to $prev. Right?

Please spell out the missing pieces for making a generic variant
to this test, so if anyone wants to pick it up they have a good starting point.
Or maybe you still intend to post a generic test as well?

> +
> +       # Cache passed generation,csum and free space check but corrupted
> +       # will be reported as error
> +       if [ $? -ne 0 ]; then
> +               cat $tmp.fsck >> $seqres.full
> +               _fail "btrfs check found corruption"
> +       fi
> +
> +       # Mount option has ruled out any possible factors affect space cache
> +       # And we are at the FUA writes, no generation related problem should
> +       # happen anyway
> +       if grep -q -e 'failed to load free space cache' $tmp.fsck; then
> +               cat $tmp.fsck >> $seqres.full
> +               _fail "btrfs check found invalid space cache"
> +       fi
> +
> +       prev=$cur
> +       cur=$(_log_writes_find_next_fua $prev)
> +       [ -z $cur ] && break
> +
> +       # Same as above
> +       cur=$(($cur + 1))
> +done
> +
> +echo "Silence is golden"
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/btrfs/159.out b/tests/btrfs/159.out
> new file mode 100644
> index 00000000..e569e60c
> --- /dev/null
> +++ b/tests/btrfs/159.out
> @@ -0,0 +1,2 @@
> +QA output created by 159
> +Silence is golden
> diff --git a/tests/btrfs/group b/tests/btrfs/group
> index 8007e07e..bc83db94 100644
> --- a/tests/btrfs/group
> +++ b/tests/btrfs/group
> @@ -161,3 +161,4 @@
>  156 auto quick trim
>  157 auto quick raid
>  158 auto quick raid scrub
> +159 auto

Please add to "replay" group.

How long does the test run? Is it not quick?

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption
  2018-03-06  9:03   ` Amir Goldstein
@ 2018-03-06  9:25     ` Qu Wenruo
  0 siblings, 0 replies; 14+ messages in thread
From: Qu Wenruo @ 2018-03-06  9:25 UTC (permalink / raw)
  To: Amir Goldstein, Qu Wenruo; +Cc: Linux Btrfs, fstests, dsterba


[-- Attachment #1.1: Type: text/plain, Size: 5677 bytes --]



On 2018年03月06日 17:03, Amir Goldstein wrote:
> On Tue, Mar 6, 2018 at 10:15 AM, Qu Wenruo <wqu@suse.com> wrote:
>> There are some btrfs corruption report in mail list for a while,
>> although such corruption is pretty rare and almost impossible to
>> reproduce, with dm-log-writes we found it's highly related to v1 space
>> cache.
>>
>> Unlike journal based filesystems, btrfs completely rely on metadata CoW
>> to protect itself from power loss.
>> Which needs extent allocator to avoid allocate extents on existing
>> extent range.
>> Btrfs also uses free space cache to speed up such allocation.
>>
>> However there is a problem, v1 space cache is not protected by data CoW,
>> and can be corrupted during power loss.
>> So btrfs do extra check on free space cache, verifying its own in-file csum,
>> generation and free space recorded in cache and extent tree.
>>
>> The problem is, under heavy concurrency, v1 space cache can be corrupted
>> even under normal operations without power loss.
>> And we believe corrupted space cache can break btrfs metadata CoW and
>> leads to the rare corruption in next power loss.
>>
>> The most obvious symptom will be difference in free space:
>>
>> This will be caught by kernel, but such check is quite weak, and if
>> the net free space change is 0 in one transaction, the corrupted
>> cache can be loaded by kernel.
>>
>> In this case, btrfs check would report things like :
>> ------
>> block group 298844160 has wrong amount of free space
>> failed to load free space cache for block group 298844160
>> ------
>>
>> But considering the test case are using notreelog, btrfs won't do
>> sub-transaction commit which doesn't increase generation, each
>> transaction should be consistent, and nothing should be reported at all.
>>
>> Further more, we can even found corrupted file extents like:
>> ------
>> root 5 inode 261 errors 100, file extent discount
>> Found file extent holes:
>>         start: 962560, len: 32768
>> ERROR: errors found in fs roots
>> ------
>>
> 
> So what is the expectation from this test on upstream btrfs?
> Probable failure? reliable failure?

Reliable failure, as the root reason is not fully exposed yet.

> Are there random seeds to fsstress that can make the test fail reliably?

Since concurrency is involved, I don't think seed would help much.

> Or does failure also depend on IO timing and other uncontrolled parameters?

Currently the concurrency would be the main factor.

>> +#
>> +#-----------------------------------------------------------------------
>> +# Copyright (c) SUSE.  All Rights Reserved.
> 
> 2018>
>> +#
[snip]
>> +
>> +_log_writes_replay_log_entry_range $prev >> $seqres.full 2>&1
>> +while [ ! -z "$cur" ]; do
>> +       _log_writes_replay_log_entry_range $cur $prev
>> +       # Catch the btrfs check output into temp file, as we need to
>> +       # grep the output to find the cache corruption
>> +       $BTRFS_UTIL_PROG check --check-data-csum $SCRATCH_DEV &> $tmp.fsck
> 
> So by making this a btrfs specific test you avoid the need to mount/umount and
> revert to $prev. Right?

Yes. Especially notreelog mount option disables journal-like behavior.

> 
> Please spell out the missing pieces for making a generic variant
> to this test, so if anyone wants to pick it up they have a good starting point.
> Or maybe you still intend to post a generic test as well?

I'm still working on the generic test, but the priority is the btrfs
corruption fixing.

For the missing pieces, we need dm-snapshot to make journal based
filesystems to replay their log without polluting the original device.

Despite that, current code should illustrate the framework.

> 
>> +
>> +       # Cache passed generation,csum and free space check but corrupted
>> +       # will be reported as error
>> +       if [ $? -ne 0 ]; then
>> +               cat $tmp.fsck >> $seqres.full
>> +               _fail "btrfs check found corruption"
>> +       fi
>> +
>> +       # Mount option has ruled out any possible factors affect space cache
>> +       # And we are at the FUA writes, no generation related problem should
>> +       # happen anyway
>> +       if grep -q -e 'failed to load free space cache' $tmp.fsck; then
>> +               cat $tmp.fsck >> $seqres.full
>> +               _fail "btrfs check found invalid space cache"
>> +       fi
>> +
>> +       prev=$cur
>> +       cur=$(_log_writes_find_next_fua $prev)
>> +       [ -z $cur ] && break
>> +
>> +       # Same as above
>> +       cur=$(($cur + 1))
>> +done
>> +
>> +echo "Silence is golden"
>> +
>> +# success, all done
>> +status=0
>> +exit
>> diff --git a/tests/btrfs/159.out b/tests/btrfs/159.out
>> new file mode 100644
>> index 00000000..e569e60c
>> --- /dev/null
>> +++ b/tests/btrfs/159.out
>> @@ -0,0 +1,2 @@
>> +QA output created by 159
>> +Silence is golden
>> diff --git a/tests/btrfs/group b/tests/btrfs/group
>> index 8007e07e..bc83db94 100644
>> --- a/tests/btrfs/group
>> +++ b/tests/btrfs/group
>> @@ -161,3 +161,4 @@
>>  156 auto quick trim
>>  157 auto quick raid
>>  158 auto quick raid scrub
>> +159 auto
> 
> Please add to "replay" group.
> 
> How long does the test run? Is it not quick?

It's quick, less than one minute.

But since it's using LOAD_FACTOR, fast doesn't seem proper here.

Thanks,
Qu

> 
> Thanks,
> Amir.
> --
> To unsubscribe from this list: send the line "unsubscribe fstests" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 520 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption
  2018-03-06  8:15 ` [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption Qu Wenruo
  2018-03-06  9:03   ` Amir Goldstein
@ 2018-03-06 10:12   ` Filipe Manana
  2018-03-06 10:53     ` Qu Wenruo
  2018-03-06 12:07     ` Qu Wenruo
  1 sibling, 2 replies; 14+ messages in thread
From: Filipe Manana @ 2018-03-06 10:12 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs, fstests, dsterba, amir73il

On Tue, Mar 6, 2018 at 8:15 AM, Qu Wenruo <wqu@suse.com> wrote:
> There are some btrfs corruption report in mail list for a while,

There have been for years (well, since ever) many reports of different
types of corruptions.
Which kind of corruption are you referring to?

> although such corruption is pretty rare and almost impossible to
> reproduce, with dm-log-writes we found it's highly related to v1 space
> cache.
>
> Unlike journal based filesystems, btrfs completely rely on metadata CoW
> to protect itself from power loss.
> Which needs extent allocator to avoid allocate extents on existing
> extent range.
> Btrfs also uses free space cache to speed up such allocation.
>
> However there is a problem, v1 space cache is not protected by data CoW,
> and can be corrupted during power loss.
> So btrfs do extra check on free space cache, verifying its own in-file csum,
> generation and free space recorded in cache and extent tree.
>
> The problem is, under heavy concurrency, v1 space cache can be corrupted
> even under normal operations without power loss.

How?

> And we believe corrupted space cache can break btrfs metadata CoW and
> leads to the rare corruption in next power loss.

Which kind of corruption?

>
> The most obvious symptom will be difference in free space:
>
> This will be caught by kernel, but such check is quite weak, and if
> the net free space change is 0 in one transaction, the corrupted
> cache can be loaded by kernel.

How can that happen? The only case I'm aware of, explained below,
always leads to a difference (space cache has less free space then
what we actually have if we check the extent tree).

>
> In this case, btrfs check would report things like :
> ------
> block group 298844160 has wrong amount of free space
> failed to load free space cache for block group 298844160
> ------

This is normal, but not very common, due to tiny races that exists
between committing a transaction (writing the free space caches) and
running dellaloc for example (since reserving an extent while running
dealloc doesn't joing/start a transaction).

>
> But considering the test case are using notreelog, btrfs won't do
> sub-transaction commit which doesn't increase generation, each
> transaction should be consistent, and nothing should be reported at all.
>
> Further more, we can even found corrupted file extents like:
> ------
> root 5 inode 261 errors 100, file extent discount
> Found file extent holes:
>         start: 962560, len: 32768
> ERROR: errors found in fs roots

Why do you think that's a corruption? Does it cause data loss or any
user visible issue?

Having file extent holes not inserted happens when mixing buffered and
direct IO writes to a file (and fsstress does that), for example:

create file
buffered write at offset 0, length 64K
direct IO write at offset at offset 64K, length 4K
transaction commit
power loss
after this we got a missing 64K hole extent at offset 0 (at
btrfs_file_write_iter we only add hole extents if the start offset is
greater then the current i_size)

But this does not cause any problem for the user or the fs itself, and
it's supposed to be like that in the NO_HOLES mode which one day
(probably) will be the default mode.

> ------
>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  common/dmlogwrites  |  72 +++++++++++++++++++++++++++
>  tests/btrfs/159     | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  tests/btrfs/159.out |   2 +
>  tests/btrfs/group   |   1 +
>  4 files changed, 216 insertions(+)
>  create mode 100755 tests/btrfs/159
>  create mode 100644 tests/btrfs/159.out
>
> diff --git a/common/dmlogwrites b/common/dmlogwrites
> index 467b872e..54e7e242 100644
> --- a/common/dmlogwrites
> +++ b/common/dmlogwrites
> @@ -126,3 +126,75 @@ _log_writes_cleanup()
>         $UDEV_SETTLE_PROG >/dev/null 2>&1
>         _log_writes_remove
>  }
> +
> +# Convert log writes mark to entry number
> +# Result entry number is output to stdout, could be empty if not found
> +_log_writes_mark_to_entry_number()
> +{
> +       local _mark=$1
> +       local ret
> +
> +       [ -z "$_mark" ] && _fail \
> +               "mark must be given for _log_writes_mark_to_entry_number"
> +
> +       ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
> +               --end-mark $_mark 2> /dev/null)
> +       [ -z "$ret" ] && return
> +       ret=$(echo "$ret" | cut -f1 -d\@)
> +       echo "mark $_mark has entry number $ret" >> $seqres.full
> +       echo "$ret"
> +}
> +
> +# Find next fua write entry number
> +# Result entry number is output to stdout, could be empty if not found
> +_log_writes_find_next_fua()
> +{
> +       local _start_entry=$1
> +       local ret
> +
> +       if [ -z "$_start_entry" ]; then
> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
> +                       --next-fua 2> /dev/null)
> +       else
> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
> +                       --next-fua --start-entry $_start_entry 2> /dev/null)
> +       fi
> +       [ -z "$ret" ] && return
> +
> +       ret=$(echo "$ret" | cut -f1 -d\@)
> +       echo "next fua is entry number $ret" >> $seqres.full
> +       echo "$ret"
> +}
> +
> +# Replay log range to specified entry
> +# $1:  End entry. The last entry will *NOT* be replayed
> +# $2:  Start entry. If not specified, start from the first entry.
> +_log_writes_replay_log_entry_range()
> +{
> +       local _end=$1
> +       local _start=$2
> +
> +       [ -z "$_end" ] && _fail \
> +       "end entry must be specified for _log_writes_replay_log_entry_range"
> +
> +       if [[ "$_start" && "$_start" -gt "$_end" ]]; then
> +               _fail \
> +               "wrong parameter order for _log_writes_replay_log_entry_range:start=$_start end=$_end"
> +       fi
> +
> +       # Original replay-log won't replay the last entry. So increase entry
> +       # number here to ensure the end entry to be replayed
> +       if [ -z "$_start" ]; then
> +               echo "=== replay to $_end ===" >> $seqres.full
> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
> +                       --replay $SCRATCH_DEV --limit $_end -v \
> +                       >> $seqres.full 2>&1
> +       else
> +               echo "=== replay from $_start to $_end ===" >> $seqres.full
> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
> +                       --replay $SCRATCH_DEV --start-entry $_start \
> +                       --limit $(($_end - $_start)) -v \
> +                       >> $seqres.full 2>&1
> +       fi
> +       [ $? -ne 0 ] && _fail "replay failed"
> +}
> diff --git a/tests/btrfs/159 b/tests/btrfs/159
> new file mode 100755
> index 00000000..5113d526
> --- /dev/null
> +++ b/tests/btrfs/159
> @@ -0,0 +1,141 @@
> +#! /bin/bash
> +# FS QA Test 159
> +#
> +# Test case for btrfs v1 space cache corruption
> +#
> +# Btrfs has some unexpected corruption after power loss, during the
> +# investigation, the problem leads to v1 space cache corruption, which
> +# could be loaded by kernel without detection.
> +#
> +# Kernel has 3 detection for corrupted space cache, checksum, generation
> +# and free space, however the last one is a weak one. If in the one transaction
> +# no net space change it can be loaded by kernel and break up metadata CoW,
> +# leading serious corruption.
> +#
> +#-----------------------------------------------------------------------
> +# Copyright (c) SUSE.  All Rights Reserved.
> +#
> +# This program is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU General Public License as
> +# published by the Free Software Foundation.
> +#
> +# This program is distributed in the hope that it would be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write the Free Software Foundation,
> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> +#-----------------------------------------------------------------------
> +#
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1       # failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> +       cd /
> +       rm -f $tmp.*
> +       _log_writes_cleanup
> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/dmlogwrites
> +. ./common/filter
> +
> +# remove previous $seqres.full before test
> +rm -f $seqres.full
> +
> +# Small workload is enough to trigger the problem
> +workload=$(( 512 * $LOAD_FACTOR ))
> +nr_threads=$(($($here/src/feature -o) * $LOAD_FACTOR))
> +
> +# Modify as appropriate.
> +_supported_fs btrfs
> +_supported_os Linux
> +_require_log_writes
> +_require_scratch
> +
> +_log_writes_init
> +_log_writes_mkfs >> $seqres.full 2>&1
> +
> +# Here we don't want to create space cache yet
> +_log_writes_mount -o nospace_cache
> +
> +# The SINGLE profile data chunk created by mkfs is too small (8M) so btrfs
> +# won't create space cache for it.
> +# We need a data chunk large enough so btrfs create space cache for it and
> +# store the cache in itself.
> +_run_btrfs_util_prog balance start --full-balance "$SCRATCH_MNT"
> +
> +_log_writes_unmount
> +_log_writes_mark mkfs
> +
> +# Disable tree log, so btrfs will always do full transaction commit
> +# It's OK to use tree log, but it will cause extra super block update which
> +# doesn't increase generation.
> +_log_writes_mount -o space_cache=v1,notreelog
> +run_check $FSSTRESS_PROG -w -n $workload -p $nr_threads -d $SCRATCH_MNT \
> +       $FSSTRESS_AVOID > /dev/null 2>&1
> +_log_writes_unmount
> +
> +_log_writes_remove
> +# Here we manually iterate through the log entries, as we need extra check
> +# on the output of btrfs check. And we could have detailed replay log in
> +# $seqres.full
> +prev=$(_log_writes_mark_to_entry_number mkfs)
> +if [ -z "$prev" ]; then
> +       _fail "failed to locate log writes mark 'mkfs'"
> +fi
> +
> +# Here we only checks the result *AFTER* FUA writes, and no tree log, so
> +# everything should be valid.
> +cur=$(_log_writes_find_next_fua $prev)
> +[ -z "$cur" ] && _fail "failed to locate next fua write"
> +
> +# log-replay will not replay the last item, so manually increase it by one
> +cur=$(($cur + 1))
> +
> +_log_writes_replay_log_entry_range $prev >> $seqres.full 2>&1
> +while [ ! -z "$cur" ]; do
> +       _log_writes_replay_log_entry_range $cur $prev
> +       # Catch the btrfs check output into temp file, as we need to
> +       # grep the output to find the cache corruption
> +       $BTRFS_UTIL_PROG check --check-data-csum $SCRATCH_DEV &> $tmp.fsck
> +
> +       # Cache passed generation,csum and free space check but corrupted
> +       # will be reported as error
> +       if [ $? -ne 0 ]; then
> +               cat $tmp.fsck >> $seqres.full
> +               _fail "btrfs check found corruption"
> +       fi
> +
> +       # Mount option has ruled out any possible factors affect space cache
> +       # And we are at the FUA writes, no generation related problem should
> +       # happen anyway
> +       if grep -q -e 'failed to load free space cache' $tmp.fsck; then
> +               cat $tmp.fsck >> $seqres.full
> +               _fail "btrfs check found invalid space cache"
> +       fi
> +
> +       prev=$cur
> +       cur=$(_log_writes_find_next_fua $prev)
> +       [ -z $cur ] && break
> +
> +       # Same as above
> +       cur=$(($cur + 1))
> +done
> +
> +echo "Silence is golden"
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/btrfs/159.out b/tests/btrfs/159.out
> new file mode 100644
> index 00000000..e569e60c
> --- /dev/null
> +++ b/tests/btrfs/159.out
> @@ -0,0 +1,2 @@
> +QA output created by 159
> +Silence is golden
> diff --git a/tests/btrfs/group b/tests/btrfs/group
> index 8007e07e..bc83db94 100644
> --- a/tests/btrfs/group
> +++ b/tests/btrfs/group
> @@ -161,3 +161,4 @@
>  156 auto quick trim
>  157 auto quick raid
>  158 auto quick raid scrub
> +159 auto
> --
> 2.15.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe fstests" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption
  2018-03-06 10:12   ` Filipe Manana
@ 2018-03-06 10:53     ` Qu Wenruo
  2018-03-06 11:06       ` Nikolay Borisov
  2018-03-06 12:22       ` Filipe Manana
  2018-03-06 12:07     ` Qu Wenruo
  1 sibling, 2 replies; 14+ messages in thread
From: Qu Wenruo @ 2018-03-06 10:53 UTC (permalink / raw)
  To: fdmanana, Qu Wenruo; +Cc: linux-btrfs, fstests, dsterba, amir73il


[-- Attachment #1.1: Type: text/plain, Size: 14208 bytes --]



On 2018年03月06日 18:12, Filipe Manana wrote:
> On Tue, Mar 6, 2018 at 8:15 AM, Qu Wenruo <wqu@suse.com> wrote:
>> There are some btrfs corruption report in mail list for a while,
> 
> There have been for years (well, since ever) many reports of different
> types of corruptions.
> Which kind of corruption are you referring to?

Transid error.

> 
>> although such corruption is pretty rare and almost impossible to
>> reproduce, with dm-log-writes we found it's highly related to v1 space
>> cache.
>>
>> Unlike journal based filesystems, btrfs completely rely on metadata CoW
>> to protect itself from power loss.
>> Which needs extent allocator to avoid allocate extents on existing
>> extent range.
>> Btrfs also uses free space cache to speed up such allocation.
>>
>> However there is a problem, v1 space cache is not protected by data CoW,
>> and can be corrupted during power loss.
>> So btrfs do extra check on free space cache, verifying its own in-file csum,
>> generation and free space recorded in cache and extent tree.
>>
>> The problem is, under heavy concurrency, v1 space cache can be corrupted
>> even under normal operations without power loss.
> 
> How?

At FUA writes, we can get v1 space cache who can pass checksum and
generation check, but has difference in free space.

> 
>> And we believe corrupted space cache can break btrfs metadata CoW and
>> leads to the rare corruption in next power loss.
> 
> Which kind of corruption?

Transid related error.

> 
>>
>> The most obvious symptom will be difference in free space:
>>
>> This will be caught by kernel, but such check is quite weak, and if
>> the net free space change is 0 in one transaction, the corrupted
>> cache can be loaded by kernel.
> 
> How can that happen? The only case I'm aware of, explained below,
> always leads to a difference (space cache has less free space then
> what we actually have if we check the extent tree).
> 
>>
>> In this case, btrfs check would report things like :
>> ------
>> block group 298844160 has wrong amount of free space
>> failed to load free space cache for block group 298844160
>> ------
> 
> This is normal, but not very common, due to tiny races that exists
> between committing a transaction (writing the free space caches) and
> running dellaloc for example (since reserving an extent while running
> dealloc doesn't joing/start a transaction).

This race explains a lot.

But could that cause corrupted cache to be loaded after power loss, and
break metadata CoW?

At least for the time point when FUA happens, the free space cache can
pass both csum and generation, we only have free space difference to
prevent it to be loaded.

> 
>>
>> But considering the test case are using notreelog, btrfs won't do
>> sub-transaction commit which doesn't increase generation, each
>> transaction should be consistent, and nothing should be reported at all.
>>
>> Further more, we can even found corrupted file extents like:
>> ------
>> root 5 inode 261 errors 100, file extent discount
>> Found file extent holes:
>>         start: 962560, len: 32768
>> ERROR: errors found in fs roots
> 
> Why do you think that's a corruption? Does it cause data loss or any
> user visible issue?

It breaks the rule that we shouldn't have the hole in file extents.

IIRC Nikolay is trying to use inode_lock_shared() to solve this race.

> 
> Having file extent holes not inserted happens when mixing buffered and
> direct IO writes to a file (and fsstress does that), for example:
> 
> create file
> buffered write at offset 0, length 64K
> direct IO write at offset at offset 64K, length 4K
> transaction commit
> power loss
> after this we got a missing 64K hole extent at offset 0 (at
> btrfs_file_write_iter we only add hole extents if the start offset is
> greater then the current i_size)
> 
> But this does not cause any problem for the user or the fs itself, and
> it's supposed to be like that in the NO_HOLES mode which one day
> (probably) will be the default mode.

At least before that happens, we should follow the current schema of
file extents.

If we just ignore problems that won't cause data loss and keeps them,
there will never be a good on-disk format schema.

Thanks,
Qu

> 
>> ------
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>  common/dmlogwrites  |  72 +++++++++++++++++++++++++++
>>  tests/btrfs/159     | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  tests/btrfs/159.out |   2 +
>>  tests/btrfs/group   |   1 +
>>  4 files changed, 216 insertions(+)
>>  create mode 100755 tests/btrfs/159
>>  create mode 100644 tests/btrfs/159.out
>>
>> diff --git a/common/dmlogwrites b/common/dmlogwrites
>> index 467b872e..54e7e242 100644
>> --- a/common/dmlogwrites
>> +++ b/common/dmlogwrites
>> @@ -126,3 +126,75 @@ _log_writes_cleanup()
>>         $UDEV_SETTLE_PROG >/dev/null 2>&1
>>         _log_writes_remove
>>  }
>> +
>> +# Convert log writes mark to entry number
>> +# Result entry number is output to stdout, could be empty if not found
>> +_log_writes_mark_to_entry_number()
>> +{
>> +       local _mark=$1
>> +       local ret
>> +
>> +       [ -z "$_mark" ] && _fail \
>> +               "mark must be given for _log_writes_mark_to_entry_number"
>> +
>> +       ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>> +               --end-mark $_mark 2> /dev/null)
>> +       [ -z "$ret" ] && return
>> +       ret=$(echo "$ret" | cut -f1 -d\@)
>> +       echo "mark $_mark has entry number $ret" >> $seqres.full
>> +       echo "$ret"
>> +}
>> +
>> +# Find next fua write entry number
>> +# Result entry number is output to stdout, could be empty if not found
>> +_log_writes_find_next_fua()
>> +{
>> +       local _start_entry=$1
>> +       local ret
>> +
>> +       if [ -z "$_start_entry" ]; then
>> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>> +                       --next-fua 2> /dev/null)
>> +       else
>> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>> +                       --next-fua --start-entry $_start_entry 2> /dev/null)
>> +       fi
>> +       [ -z "$ret" ] && return
>> +
>> +       ret=$(echo "$ret" | cut -f1 -d\@)
>> +       echo "next fua is entry number $ret" >> $seqres.full
>> +       echo "$ret"
>> +}
>> +
>> +# Replay log range to specified entry
>> +# $1:  End entry. The last entry will *NOT* be replayed
>> +# $2:  Start entry. If not specified, start from the first entry.
>> +_log_writes_replay_log_entry_range()
>> +{
>> +       local _end=$1
>> +       local _start=$2
>> +
>> +       [ -z "$_end" ] && _fail \
>> +       "end entry must be specified for _log_writes_replay_log_entry_range"
>> +
>> +       if [[ "$_start" && "$_start" -gt "$_end" ]]; then
>> +               _fail \
>> +               "wrong parameter order for _log_writes_replay_log_entry_range:start=$_start end=$_end"
>> +       fi
>> +
>> +       # Original replay-log won't replay the last entry. So increase entry
>> +       # number here to ensure the end entry to be replayed
>> +       if [ -z "$_start" ]; then
>> +               echo "=== replay to $_end ===" >> $seqres.full
>> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
>> +                       --replay $SCRATCH_DEV --limit $_end -v \
>> +                       >> $seqres.full 2>&1
>> +       else
>> +               echo "=== replay from $_start to $_end ===" >> $seqres.full
>> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
>> +                       --replay $SCRATCH_DEV --start-entry $_start \
>> +                       --limit $(($_end - $_start)) -v \
>> +                       >> $seqres.full 2>&1
>> +       fi
>> +       [ $? -ne 0 ] && _fail "replay failed"
>> +}
>> diff --git a/tests/btrfs/159 b/tests/btrfs/159
>> new file mode 100755
>> index 00000000..5113d526
>> --- /dev/null
>> +++ b/tests/btrfs/159
>> @@ -0,0 +1,141 @@
>> +#! /bin/bash
>> +# FS QA Test 159
>> +#
>> +# Test case for btrfs v1 space cache corruption
>> +#
>> +# Btrfs has some unexpected corruption after power loss, during the
>> +# investigation, the problem leads to v1 space cache corruption, which
>> +# could be loaded by kernel without detection.
>> +#
>> +# Kernel has 3 detection for corrupted space cache, checksum, generation
>> +# and free space, however the last one is a weak one. If in the one transaction
>> +# no net space change it can be loaded by kernel and break up metadata CoW,
>> +# leading serious corruption.
>> +#
>> +#-----------------------------------------------------------------------
>> +# Copyright (c) SUSE.  All Rights Reserved.
>> +#
>> +# This program is free software; you can redistribute it and/or
>> +# modify it under the terms of the GNU General Public License as
>> +# published by the Free Software Foundation.
>> +#
>> +# This program is distributed in the hope that it would be useful,
>> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> +# GNU General Public License for more details.
>> +#
>> +# You should have received a copy of the GNU General Public License
>> +# along with this program; if not, write the Free Software Foundation,
>> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
>> +#-----------------------------------------------------------------------
>> +#
>> +
>> +seq=`basename $0`
>> +seqres=$RESULT_DIR/$seq
>> +echo "QA output created by $seq"
>> +
>> +here=`pwd`
>> +tmp=/tmp/$$
>> +status=1       # failure is the default!
>> +trap "_cleanup; exit \$status" 0 1 2 3 15
>> +
>> +_cleanup()
>> +{
>> +       cd /
>> +       rm -f $tmp.*
>> +       _log_writes_cleanup
>> +}
>> +
>> +# get standard environment, filters and checks
>> +. ./common/rc
>> +. ./common/dmlogwrites
>> +. ./common/filter
>> +
>> +# remove previous $seqres.full before test
>> +rm -f $seqres.full
>> +
>> +# Small workload is enough to trigger the problem
>> +workload=$(( 512 * $LOAD_FACTOR ))
>> +nr_threads=$(($($here/src/feature -o) * $LOAD_FACTOR))
>> +
>> +# Modify as appropriate.
>> +_supported_fs btrfs
>> +_supported_os Linux
>> +_require_log_writes
>> +_require_scratch
>> +
>> +_log_writes_init
>> +_log_writes_mkfs >> $seqres.full 2>&1
>> +
>> +# Here we don't want to create space cache yet
>> +_log_writes_mount -o nospace_cache
>> +
>> +# The SINGLE profile data chunk created by mkfs is too small (8M) so btrfs
>> +# won't create space cache for it.
>> +# We need a data chunk large enough so btrfs create space cache for it and
>> +# store the cache in itself.
>> +_run_btrfs_util_prog balance start --full-balance "$SCRATCH_MNT"
>> +
>> +_log_writes_unmount
>> +_log_writes_mark mkfs
>> +
>> +# Disable tree log, so btrfs will always do full transaction commit
>> +# It's OK to use tree log, but it will cause extra super block update which
>> +# doesn't increase generation.
>> +_log_writes_mount -o space_cache=v1,notreelog
>> +run_check $FSSTRESS_PROG -w -n $workload -p $nr_threads -d $SCRATCH_MNT \
>> +       $FSSTRESS_AVOID > /dev/null 2>&1
>> +_log_writes_unmount
>> +
>> +_log_writes_remove
>> +# Here we manually iterate through the log entries, as we need extra check
>> +# on the output of btrfs check. And we could have detailed replay log in
>> +# $seqres.full
>> +prev=$(_log_writes_mark_to_entry_number mkfs)
>> +if [ -z "$prev" ]; then
>> +       _fail "failed to locate log writes mark 'mkfs'"
>> +fi
>> +
>> +# Here we only checks the result *AFTER* FUA writes, and no tree log, so
>> +# everything should be valid.
>> +cur=$(_log_writes_find_next_fua $prev)
>> +[ -z "$cur" ] && _fail "failed to locate next fua write"
>> +
>> +# log-replay will not replay the last item, so manually increase it by one
>> +cur=$(($cur + 1))
>> +
>> +_log_writes_replay_log_entry_range $prev >> $seqres.full 2>&1
>> +while [ ! -z "$cur" ]; do
>> +       _log_writes_replay_log_entry_range $cur $prev
>> +       # Catch the btrfs check output into temp file, as we need to
>> +       # grep the output to find the cache corruption
>> +       $BTRFS_UTIL_PROG check --check-data-csum $SCRATCH_DEV &> $tmp.fsck
>> +
>> +       # Cache passed generation,csum and free space check but corrupted
>> +       # will be reported as error
>> +       if [ $? -ne 0 ]; then
>> +               cat $tmp.fsck >> $seqres.full
>> +               _fail "btrfs check found corruption"
>> +       fi
>> +
>> +       # Mount option has ruled out any possible factors affect space cache
>> +       # And we are at the FUA writes, no generation related problem should
>> +       # happen anyway
>> +       if grep -q -e 'failed to load free space cache' $tmp.fsck; then
>> +               cat $tmp.fsck >> $seqres.full
>> +               _fail "btrfs check found invalid space cache"
>> +       fi
>> +
>> +       prev=$cur
>> +       cur=$(_log_writes_find_next_fua $prev)
>> +       [ -z $cur ] && break
>> +
>> +       # Same as above
>> +       cur=$(($cur + 1))
>> +done
>> +
>> +echo "Silence is golden"
>> +
>> +# success, all done
>> +status=0
>> +exit
>> diff --git a/tests/btrfs/159.out b/tests/btrfs/159.out
>> new file mode 100644
>> index 00000000..e569e60c
>> --- /dev/null
>> +++ b/tests/btrfs/159.out
>> @@ -0,0 +1,2 @@
>> +QA output created by 159
>> +Silence is golden
>> diff --git a/tests/btrfs/group b/tests/btrfs/group
>> index 8007e07e..bc83db94 100644
>> --- a/tests/btrfs/group
>> +++ b/tests/btrfs/group
>> @@ -161,3 +161,4 @@
>>  156 auto quick trim
>>  157 auto quick raid
>>  158 auto quick raid scrub
>> +159 auto
>> --
>> 2.15.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe fstests" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> 


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 520 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption
  2018-03-06 10:53     ` Qu Wenruo
@ 2018-03-06 11:06       ` Nikolay Borisov
  2018-03-06 12:22       ` Filipe Manana
  1 sibling, 0 replies; 14+ messages in thread
From: Nikolay Borisov @ 2018-03-06 11:06 UTC (permalink / raw)
  To: Qu Wenruo, fdmanana, Qu Wenruo; +Cc: linux-btrfs, fstests, dsterba, amir73il



On  6.03.2018 12:53, Qu Wenruo wrote:
> 
> 
[snip]

> It breaks the rule that we shouldn't have the hole in file extents.
> 
> IIRC Nikolay is trying to use inode_lock_shared() to solve this race.
> 

Unfortunately the inode_lock_shared approach is a no go since Filipe
objected to it quite adamantly. After discussion that happened in that
thread I can say I'm almost convinced that the pair of READDIO_LOCK and
setsize *do* provide necessary consistency for the DIO case. However at
the moment there is a memory barrier missing.

But I think the DIO case is unrelated to the issue you are discussing
here, no ?


[snip]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption
  2018-03-06 10:12   ` Filipe Manana
  2018-03-06 10:53     ` Qu Wenruo
@ 2018-03-06 12:07     ` Qu Wenruo
  2018-03-06 12:25       ` Filipe Manana
  1 sibling, 1 reply; 14+ messages in thread
From: Qu Wenruo @ 2018-03-06 12:07 UTC (permalink / raw)
  To: fdmanana, Qu Wenruo; +Cc: linux-btrfs, fstests, dsterba, amir73il


[-- Attachment #1.1: Type: text/plain, Size: 13676 bytes --]



On 2018年03月06日 18:12, Filipe Manana wrote:
> On Tue, Mar 6, 2018 at 8:15 AM, Qu Wenruo <wqu@suse.com> wrote:
>> There are some btrfs corruption report in mail list for a while,
> 
> There have been for years (well, since ever) many reports of different
> types of corruptions.
> Which kind of corruption are you referring to?
> 
>> although such corruption is pretty rare and almost impossible to
>> reproduce, with dm-log-writes we found it's highly related to v1 space
>> cache.
>>
>> Unlike journal based filesystems, btrfs completely rely on metadata CoW
>> to protect itself from power loss.
>> Which needs extent allocator to avoid allocate extents on existing
>> extent range.
>> Btrfs also uses free space cache to speed up such allocation.
>>
>> However there is a problem, v1 space cache is not protected by data CoW,
>> and can be corrupted during power loss.
>> So btrfs do extra check on free space cache, verifying its own in-file csum,
>> generation and free space recorded in cache and extent tree.
>>
>> The problem is, under heavy concurrency, v1 space cache can be corrupted
>> even under normal operations without power loss.
> 
> How?
> 
>> And we believe corrupted space cache can break btrfs metadata CoW and
>> leads to the rare corruption in next power loss.
> 
> Which kind of corruption?
> 
>>
>> The most obvious symptom will be difference in free space:
>>
>> This will be caught by kernel, but such check is quite weak, and if
>> the net free space change is 0 in one transaction, the corrupted
>> cache can be loaded by kernel.
> 
> How can that happen? The only case I'm aware of, explained below,
> always leads to a difference (space cache has less free space then
> what we actually have if we check the extent tree).
> 
>>
>> In this case, btrfs check would report things like :
>> ------
>> block group 298844160 has wrong amount of free space
>> failed to load free space cache for block group 298844160
>> ------
> 
> This is normal, but not very common, due to tiny races that exists
> between committing a transaction (writing the free space caches) and
> running dellaloc for example (since reserving an extent while running
> dealloc doesn't joing/start a transaction).

Well, at least I didn't find any place to release space unprotected by
trans handler, so free space of cache can only be less or equal to real
free space.

So in that case, corrupted cache will never pass the free space check so
it will never be loaded.

Another dead end unfortunately.

Thanks,
Qu
> 
>>
>> But considering the test case are using notreelog, btrfs won't do
>> sub-transaction commit which doesn't increase generation, each
>> transaction should be consistent, and nothing should be reported at all.
>>
>> Further more, we can even found corrupted file extents like:
>> ------
>> root 5 inode 261 errors 100, file extent discount
>> Found file extent holes:
>>         start: 962560, len: 32768
>> ERROR: errors found in fs roots
> 
> Why do you think that's a corruption? Does it cause data loss or any
> user visible issue?
> 
> Having file extent holes not inserted happens when mixing buffered and
> direct IO writes to a file (and fsstress does that), for example:
> 
> create file
> buffered write at offset 0, length 64K
> direct IO write at offset at offset 64K, length 4K
> transaction commit
> power loss
> after this we got a missing 64K hole extent at offset 0 (at
> btrfs_file_write_iter we only add hole extents if the start offset is
> greater then the current i_size)
> 
> But this does not cause any problem for the user or the fs itself, and
> it's supposed to be like that in the NO_HOLES mode which one day
> (probably) will be the default mode.
> 
>> ------
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>  common/dmlogwrites  |  72 +++++++++++++++++++++++++++
>>  tests/btrfs/159     | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  tests/btrfs/159.out |   2 +
>>  tests/btrfs/group   |   1 +
>>  4 files changed, 216 insertions(+)
>>  create mode 100755 tests/btrfs/159
>>  create mode 100644 tests/btrfs/159.out
>>
>> diff --git a/common/dmlogwrites b/common/dmlogwrites
>> index 467b872e..54e7e242 100644
>> --- a/common/dmlogwrites
>> +++ b/common/dmlogwrites
>> @@ -126,3 +126,75 @@ _log_writes_cleanup()
>>         $UDEV_SETTLE_PROG >/dev/null 2>&1
>>         _log_writes_remove
>>  }
>> +
>> +# Convert log writes mark to entry number
>> +# Result entry number is output to stdout, could be empty if not found
>> +_log_writes_mark_to_entry_number()
>> +{
>> +       local _mark=$1
>> +       local ret
>> +
>> +       [ -z "$_mark" ] && _fail \
>> +               "mark must be given for _log_writes_mark_to_entry_number"
>> +
>> +       ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>> +               --end-mark $_mark 2> /dev/null)
>> +       [ -z "$ret" ] && return
>> +       ret=$(echo "$ret" | cut -f1 -d\@)
>> +       echo "mark $_mark has entry number $ret" >> $seqres.full
>> +       echo "$ret"
>> +}
>> +
>> +# Find next fua write entry number
>> +# Result entry number is output to stdout, could be empty if not found
>> +_log_writes_find_next_fua()
>> +{
>> +       local _start_entry=$1
>> +       local ret
>> +
>> +       if [ -z "$_start_entry" ]; then
>> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>> +                       --next-fua 2> /dev/null)
>> +       else
>> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>> +                       --next-fua --start-entry $_start_entry 2> /dev/null)
>> +       fi
>> +       [ -z "$ret" ] && return
>> +
>> +       ret=$(echo "$ret" | cut -f1 -d\@)
>> +       echo "next fua is entry number $ret" >> $seqres.full
>> +       echo "$ret"
>> +}
>> +
>> +# Replay log range to specified entry
>> +# $1:  End entry. The last entry will *NOT* be replayed
>> +# $2:  Start entry. If not specified, start from the first entry.
>> +_log_writes_replay_log_entry_range()
>> +{
>> +       local _end=$1
>> +       local _start=$2
>> +
>> +       [ -z "$_end" ] && _fail \
>> +       "end entry must be specified for _log_writes_replay_log_entry_range"
>> +
>> +       if [[ "$_start" && "$_start" -gt "$_end" ]]; then
>> +               _fail \
>> +               "wrong parameter order for _log_writes_replay_log_entry_range:start=$_start end=$_end"
>> +       fi
>> +
>> +       # Original replay-log won't replay the last entry. So increase entry
>> +       # number here to ensure the end entry to be replayed
>> +       if [ -z "$_start" ]; then
>> +               echo "=== replay to $_end ===" >> $seqres.full
>> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
>> +                       --replay $SCRATCH_DEV --limit $_end -v \
>> +                       >> $seqres.full 2>&1
>> +       else
>> +               echo "=== replay from $_start to $_end ===" >> $seqres.full
>> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
>> +                       --replay $SCRATCH_DEV --start-entry $_start \
>> +                       --limit $(($_end - $_start)) -v \
>> +                       >> $seqres.full 2>&1
>> +       fi
>> +       [ $? -ne 0 ] && _fail "replay failed"
>> +}
>> diff --git a/tests/btrfs/159 b/tests/btrfs/159
>> new file mode 100755
>> index 00000000..5113d526
>> --- /dev/null
>> +++ b/tests/btrfs/159
>> @@ -0,0 +1,141 @@
>> +#! /bin/bash
>> +# FS QA Test 159
>> +#
>> +# Test case for btrfs v1 space cache corruption
>> +#
>> +# Btrfs has some unexpected corruption after power loss, during the
>> +# investigation, the problem leads to v1 space cache corruption, which
>> +# could be loaded by kernel without detection.
>> +#
>> +# Kernel has 3 detection for corrupted space cache, checksum, generation
>> +# and free space, however the last one is a weak one. If in the one transaction
>> +# no net space change it can be loaded by kernel and break up metadata CoW,
>> +# leading serious corruption.
>> +#
>> +#-----------------------------------------------------------------------
>> +# Copyright (c) SUSE.  All Rights Reserved.
>> +#
>> +# This program is free software; you can redistribute it and/or
>> +# modify it under the terms of the GNU General Public License as
>> +# published by the Free Software Foundation.
>> +#
>> +# This program is distributed in the hope that it would be useful,
>> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> +# GNU General Public License for more details.
>> +#
>> +# You should have received a copy of the GNU General Public License
>> +# along with this program; if not, write the Free Software Foundation,
>> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
>> +#-----------------------------------------------------------------------
>> +#
>> +
>> +seq=`basename $0`
>> +seqres=$RESULT_DIR/$seq
>> +echo "QA output created by $seq"
>> +
>> +here=`pwd`
>> +tmp=/tmp/$$
>> +status=1       # failure is the default!
>> +trap "_cleanup; exit \$status" 0 1 2 3 15
>> +
>> +_cleanup()
>> +{
>> +       cd /
>> +       rm -f $tmp.*
>> +       _log_writes_cleanup
>> +}
>> +
>> +# get standard environment, filters and checks
>> +. ./common/rc
>> +. ./common/dmlogwrites
>> +. ./common/filter
>> +
>> +# remove previous $seqres.full before test
>> +rm -f $seqres.full
>> +
>> +# Small workload is enough to trigger the problem
>> +workload=$(( 512 * $LOAD_FACTOR ))
>> +nr_threads=$(($($here/src/feature -o) * $LOAD_FACTOR))
>> +
>> +# Modify as appropriate.
>> +_supported_fs btrfs
>> +_supported_os Linux
>> +_require_log_writes
>> +_require_scratch
>> +
>> +_log_writes_init
>> +_log_writes_mkfs >> $seqres.full 2>&1
>> +
>> +# Here we don't want to create space cache yet
>> +_log_writes_mount -o nospace_cache
>> +
>> +# The SINGLE profile data chunk created by mkfs is too small (8M) so btrfs
>> +# won't create space cache for it.
>> +# We need a data chunk large enough so btrfs create space cache for it and
>> +# store the cache in itself.
>> +_run_btrfs_util_prog balance start --full-balance "$SCRATCH_MNT"
>> +
>> +_log_writes_unmount
>> +_log_writes_mark mkfs
>> +
>> +# Disable tree log, so btrfs will always do full transaction commit
>> +# It's OK to use tree log, but it will cause extra super block update which
>> +# doesn't increase generation.
>> +_log_writes_mount -o space_cache=v1,notreelog
>> +run_check $FSSTRESS_PROG -w -n $workload -p $nr_threads -d $SCRATCH_MNT \
>> +       $FSSTRESS_AVOID > /dev/null 2>&1
>> +_log_writes_unmount
>> +
>> +_log_writes_remove
>> +# Here we manually iterate through the log entries, as we need extra check
>> +# on the output of btrfs check. And we could have detailed replay log in
>> +# $seqres.full
>> +prev=$(_log_writes_mark_to_entry_number mkfs)
>> +if [ -z "$prev" ]; then
>> +       _fail "failed to locate log writes mark 'mkfs'"
>> +fi
>> +
>> +# Here we only checks the result *AFTER* FUA writes, and no tree log, so
>> +# everything should be valid.
>> +cur=$(_log_writes_find_next_fua $prev)
>> +[ -z "$cur" ] && _fail "failed to locate next fua write"
>> +
>> +# log-replay will not replay the last item, so manually increase it by one
>> +cur=$(($cur + 1))
>> +
>> +_log_writes_replay_log_entry_range $prev >> $seqres.full 2>&1
>> +while [ ! -z "$cur" ]; do
>> +       _log_writes_replay_log_entry_range $cur $prev
>> +       # Catch the btrfs check output into temp file, as we need to
>> +       # grep the output to find the cache corruption
>> +       $BTRFS_UTIL_PROG check --check-data-csum $SCRATCH_DEV &> $tmp.fsck
>> +
>> +       # Cache passed generation,csum and free space check but corrupted
>> +       # will be reported as error
>> +       if [ $? -ne 0 ]; then
>> +               cat $tmp.fsck >> $seqres.full
>> +               _fail "btrfs check found corruption"
>> +       fi
>> +
>> +       # Mount option has ruled out any possible factors affect space cache
>> +       # And we are at the FUA writes, no generation related problem should
>> +       # happen anyway
>> +       if grep -q -e 'failed to load free space cache' $tmp.fsck; then
>> +               cat $tmp.fsck >> $seqres.full
>> +               _fail "btrfs check found invalid space cache"
>> +       fi
>> +
>> +       prev=$cur
>> +       cur=$(_log_writes_find_next_fua $prev)
>> +       [ -z $cur ] && break
>> +
>> +       # Same as above
>> +       cur=$(($cur + 1))
>> +done
>> +
>> +echo "Silence is golden"
>> +
>> +# success, all done
>> +status=0
>> +exit
>> diff --git a/tests/btrfs/159.out b/tests/btrfs/159.out
>> new file mode 100644
>> index 00000000..e569e60c
>> --- /dev/null
>> +++ b/tests/btrfs/159.out
>> @@ -0,0 +1,2 @@
>> +QA output created by 159
>> +Silence is golden
>> diff --git a/tests/btrfs/group b/tests/btrfs/group
>> index 8007e07e..bc83db94 100644
>> --- a/tests/btrfs/group
>> +++ b/tests/btrfs/group
>> @@ -161,3 +161,4 @@
>>  156 auto quick trim
>>  157 auto quick raid
>>  158 auto quick raid scrub
>> +159 auto
>> --
>> 2.15.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe fstests" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> 


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 520 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption
  2018-03-06 10:53     ` Qu Wenruo
  2018-03-06 11:06       ` Nikolay Borisov
@ 2018-03-06 12:22       ` Filipe Manana
  1 sibling, 0 replies; 14+ messages in thread
From: Filipe Manana @ 2018-03-06 12:22 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Qu Wenruo, linux-btrfs, fstests, dsterba, amir73il

On Tue, Mar 6, 2018 at 10:53 AM, Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>
>
> On 2018年03月06日 18:12, Filipe Manana wrote:
>> On Tue, Mar 6, 2018 at 8:15 AM, Qu Wenruo <wqu@suse.com> wrote:
>>> There are some btrfs corruption report in mail list for a while,
>>
>> There have been for years (well, since ever) many reports of different
>> types of corruptions.
>> Which kind of corruption are you referring to?
>
> Transid error.

You mean parent transid mismatches in tree blocks?
Please always be explicit about problems are you mentioning. There can
be many "transid" problems.


>
>>
>>> although such corruption is pretty rare and almost impossible to
>>> reproduce, with dm-log-writes we found it's highly related to v1 space
>>> cache.
>>>
>>> Unlike journal based filesystems, btrfs completely rely on metadata CoW
>>> to protect itself from power loss.
>>> Which needs extent allocator to avoid allocate extents on existing
>>> extent range.
>>> Btrfs also uses free space cache to speed up such allocation.
>>>
>>> However there is a problem, v1 space cache is not protected by data CoW,
>>> and can be corrupted during power loss.
>>> So btrfs do extra check on free space cache, verifying its own in-file csum,
>>> generation and free space recorded in cache and extent tree.
>>>
>>> The problem is, under heavy concurrency, v1 space cache can be corrupted
>>> even under normal operations without power loss.
>>
>> How?
>
> At FUA writes, we can get v1 space cache who can pass checksum and
> generation check, but has difference in free space.
>
>>
>>> And we believe corrupted space cache can break btrfs metadata CoW and
>>> leads to the rare corruption in next power loss.
>>
>> Which kind of corruption?
>
> Transid related error.
>
>>
>>>
>>> The most obvious symptom will be difference in free space:
>>>
>>> This will be caught by kernel, but such check is quite weak, and if
>>> the net free space change is 0 in one transaction, the corrupted
>>> cache can be loaded by kernel.
>>
>> How can that happen? The only case I'm aware of, explained below,
>> always leads to a difference (space cache has less free space then
>> what we actually have if we check the extent tree).
>>
>>>
>>> In this case, btrfs check would report things like :
>>> ------
>>> block group 298844160 has wrong amount of free space
>>> failed to load free space cache for block group 298844160
>>> ------
>>
>> This is normal, but not very common, due to tiny races that exists
>> between committing a transaction (writing the free space caches) and
>> running dellaloc for example (since reserving an extent while running
>> dealloc doesn't joing/start a transaction).
>
> This race explains a lot.
>
> But could that cause corrupted cache to be loaded after power loss, and
> break metadata CoW?

No, unless there's a bug in the procedure of detecting inconsistent caches.

>
> At least for the time point when FUA happens, the free space cache can
> pass both csum and generation, we only have free space difference to
> prevent it to be loaded.
>
>>
>>>
>>> But considering the test case are using notreelog, btrfs won't do
>>> sub-transaction commit which doesn't increase generation, each
>>> transaction should be consistent, and nothing should be reported at all.
>>>
>>> Further more, we can even found corrupted file extents like:
>>> ------
>>> root 5 inode 261 errors 100, file extent discount
>>> Found file extent holes:
>>>         start: 962560, len: 32768
>>> ERROR: errors found in fs roots
>>
>> Why do you think that's a corruption? Does it cause data loss or any
>> user visible issue?
>
> It breaks the rule that we shouldn't have the hole in file extents.

Right. My question/remark was that, besides the warning emitted by
fsck, this does not cause any harm to users/applications nor the
filesytem. That is, no data loss, metadata corruption or anything that
prevents the user from reading all previously written data nor
anything preventing future IO to any range of the file.
So this is just a minor annoyance and far from a serious bug.

>
> IIRC Nikolay is trying to use inode_lock_shared() to solve this race.

I don't see what's the relation, even because this is not caused by
race conditions.

>
>>
>> Having file extent holes not inserted happens when mixing buffered and
>> direct IO writes to a file (and fsstress does that), for example:
>>
>> create file
>> buffered write at offset 0, length 64K
>> direct IO write at offset at offset 64K, length 4K
>> transaction commit
>> power loss
>> after this we got a missing 64K hole extent at offset 0 (at
>> btrfs_file_write_iter we only add hole extents if the start offset is
>> greater then the current i_size)
>>
>> But this does not cause any problem for the user or the fs itself, and
>> it's supposed to be like that in the NO_HOLES mode which one day
>> (probably) will be the default mode.
>
> At least before that happens, we should follow the current schema of
> file extents.
>
> If we just ignore problems that won't cause data loss and keeps them,
> there will never be a good on-disk format schema.

Sure, but it's a minor thing as previously said. I was pointing out
that what you called a corruption is not really a corruption.

>
> Thanks,
> Qu
>
>>
>>> ------
>>>
>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>> ---
>>>  common/dmlogwrites  |  72 +++++++++++++++++++++++++++
>>>  tests/btrfs/159     | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  tests/btrfs/159.out |   2 +
>>>  tests/btrfs/group   |   1 +
>>>  4 files changed, 216 insertions(+)
>>>  create mode 100755 tests/btrfs/159
>>>  create mode 100644 tests/btrfs/159.out
>>>
>>> diff --git a/common/dmlogwrites b/common/dmlogwrites
>>> index 467b872e..54e7e242 100644
>>> --- a/common/dmlogwrites
>>> +++ b/common/dmlogwrites
>>> @@ -126,3 +126,75 @@ _log_writes_cleanup()
>>>         $UDEV_SETTLE_PROG >/dev/null 2>&1
>>>         _log_writes_remove
>>>  }
>>> +
>>> +# Convert log writes mark to entry number
>>> +# Result entry number is output to stdout, could be empty if not found
>>> +_log_writes_mark_to_entry_number()
>>> +{
>>> +       local _mark=$1
>>> +       local ret
>>> +
>>> +       [ -z "$_mark" ] && _fail \
>>> +               "mark must be given for _log_writes_mark_to_entry_number"
>>> +
>>> +       ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>>> +               --end-mark $_mark 2> /dev/null)
>>> +       [ -z "$ret" ] && return
>>> +       ret=$(echo "$ret" | cut -f1 -d\@)
>>> +       echo "mark $_mark has entry number $ret" >> $seqres.full
>>> +       echo "$ret"
>>> +}
>>> +
>>> +# Find next fua write entry number
>>> +# Result entry number is output to stdout, could be empty if not found
>>> +_log_writes_find_next_fua()
>>> +{
>>> +       local _start_entry=$1
>>> +       local ret
>>> +
>>> +       if [ -z "$_start_entry" ]; then
>>> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>>> +                       --next-fua 2> /dev/null)
>>> +       else
>>> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>>> +                       --next-fua --start-entry $_start_entry 2> /dev/null)
>>> +       fi
>>> +       [ -z "$ret" ] && return
>>> +
>>> +       ret=$(echo "$ret" | cut -f1 -d\@)
>>> +       echo "next fua is entry number $ret" >> $seqres.full
>>> +       echo "$ret"
>>> +}
>>> +
>>> +# Replay log range to specified entry
>>> +# $1:  End entry. The last entry will *NOT* be replayed
>>> +# $2:  Start entry. If not specified, start from the first entry.
>>> +_log_writes_replay_log_entry_range()
>>> +{
>>> +       local _end=$1
>>> +       local _start=$2
>>> +
>>> +       [ -z "$_end" ] && _fail \
>>> +       "end entry must be specified for _log_writes_replay_log_entry_range"
>>> +
>>> +       if [[ "$_start" && "$_start" -gt "$_end" ]]; then
>>> +               _fail \
>>> +               "wrong parameter order for _log_writes_replay_log_entry_range:start=$_start end=$_end"
>>> +       fi
>>> +
>>> +       # Original replay-log won't replay the last entry. So increase entry
>>> +       # number here to ensure the end entry to be replayed
>>> +       if [ -z "$_start" ]; then
>>> +               echo "=== replay to $_end ===" >> $seqres.full
>>> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
>>> +                       --replay $SCRATCH_DEV --limit $_end -v \
>>> +                       >> $seqres.full 2>&1
>>> +       else
>>> +               echo "=== replay from $_start to $_end ===" >> $seqres.full
>>> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
>>> +                       --replay $SCRATCH_DEV --start-entry $_start \
>>> +                       --limit $(($_end - $_start)) -v \
>>> +                       >> $seqres.full 2>&1
>>> +       fi
>>> +       [ $? -ne 0 ] && _fail "replay failed"
>>> +}
>>> diff --git a/tests/btrfs/159 b/tests/btrfs/159
>>> new file mode 100755
>>> index 00000000..5113d526
>>> --- /dev/null
>>> +++ b/tests/btrfs/159
>>> @@ -0,0 +1,141 @@
>>> +#! /bin/bash
>>> +# FS QA Test 159
>>> +#
>>> +# Test case for btrfs v1 space cache corruption
>>> +#
>>> +# Btrfs has some unexpected corruption after power loss, during the
>>> +# investigation, the problem leads to v1 space cache corruption, which
>>> +# could be loaded by kernel without detection.
>>> +#
>>> +# Kernel has 3 detection for corrupted space cache, checksum, generation
>>> +# and free space, however the last one is a weak one. If in the one transaction
>>> +# no net space change it can be loaded by kernel and break up metadata CoW,
>>> +# leading serious corruption.
>>> +#
>>> +#-----------------------------------------------------------------------
>>> +# Copyright (c) SUSE.  All Rights Reserved.
>>> +#
>>> +# This program is free software; you can redistribute it and/or
>>> +# modify it under the terms of the GNU General Public License as
>>> +# published by the Free Software Foundation.
>>> +#
>>> +# This program is distributed in the hope that it would be useful,
>>> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> +# GNU General Public License for more details.
>>> +#
>>> +# You should have received a copy of the GNU General Public License
>>> +# along with this program; if not, write the Free Software Foundation,
>>> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
>>> +#-----------------------------------------------------------------------
>>> +#
>>> +
>>> +seq=`basename $0`
>>> +seqres=$RESULT_DIR/$seq
>>> +echo "QA output created by $seq"
>>> +
>>> +here=`pwd`
>>> +tmp=/tmp/$$
>>> +status=1       # failure is the default!
>>> +trap "_cleanup; exit \$status" 0 1 2 3 15
>>> +
>>> +_cleanup()
>>> +{
>>> +       cd /
>>> +       rm -f $tmp.*
>>> +       _log_writes_cleanup
>>> +}
>>> +
>>> +# get standard environment, filters and checks
>>> +. ./common/rc
>>> +. ./common/dmlogwrites
>>> +. ./common/filter
>>> +
>>> +# remove previous $seqres.full before test
>>> +rm -f $seqres.full
>>> +
>>> +# Small workload is enough to trigger the problem
>>> +workload=$(( 512 * $LOAD_FACTOR ))
>>> +nr_threads=$(($($here/src/feature -o) * $LOAD_FACTOR))
>>> +
>>> +# Modify as appropriate.
>>> +_supported_fs btrfs
>>> +_supported_os Linux
>>> +_require_log_writes
>>> +_require_scratch
>>> +
>>> +_log_writes_init
>>> +_log_writes_mkfs >> $seqres.full 2>&1
>>> +
>>> +# Here we don't want to create space cache yet
>>> +_log_writes_mount -o nospace_cache
>>> +
>>> +# The SINGLE profile data chunk created by mkfs is too small (8M) so btrfs
>>> +# won't create space cache for it.
>>> +# We need a data chunk large enough so btrfs create space cache for it and
>>> +# store the cache in itself.
>>> +_run_btrfs_util_prog balance start --full-balance "$SCRATCH_MNT"
>>> +
>>> +_log_writes_unmount
>>> +_log_writes_mark mkfs
>>> +
>>> +# Disable tree log, so btrfs will always do full transaction commit
>>> +# It's OK to use tree log, but it will cause extra super block update which
>>> +# doesn't increase generation.
>>> +_log_writes_mount -o space_cache=v1,notreelog
>>> +run_check $FSSTRESS_PROG -w -n $workload -p $nr_threads -d $SCRATCH_MNT \
>>> +       $FSSTRESS_AVOID > /dev/null 2>&1
>>> +_log_writes_unmount
>>> +
>>> +_log_writes_remove
>>> +# Here we manually iterate through the log entries, as we need extra check
>>> +# on the output of btrfs check. And we could have detailed replay log in
>>> +# $seqres.full
>>> +prev=$(_log_writes_mark_to_entry_number mkfs)
>>> +if [ -z "$prev" ]; then
>>> +       _fail "failed to locate log writes mark 'mkfs'"
>>> +fi
>>> +
>>> +# Here we only checks the result *AFTER* FUA writes, and no tree log, so
>>> +# everything should be valid.
>>> +cur=$(_log_writes_find_next_fua $prev)
>>> +[ -z "$cur" ] && _fail "failed to locate next fua write"
>>> +
>>> +# log-replay will not replay the last item, so manually increase it by one
>>> +cur=$(($cur + 1))
>>> +
>>> +_log_writes_replay_log_entry_range $prev >> $seqres.full 2>&1
>>> +while [ ! -z "$cur" ]; do
>>> +       _log_writes_replay_log_entry_range $cur $prev
>>> +       # Catch the btrfs check output into temp file, as we need to
>>> +       # grep the output to find the cache corruption
>>> +       $BTRFS_UTIL_PROG check --check-data-csum $SCRATCH_DEV &> $tmp.fsck
>>> +
>>> +       # Cache passed generation,csum and free space check but corrupted
>>> +       # will be reported as error
>>> +       if [ $? -ne 0 ]; then
>>> +               cat $tmp.fsck >> $seqres.full
>>> +               _fail "btrfs check found corruption"
>>> +       fi
>>> +
>>> +       # Mount option has ruled out any possible factors affect space cache
>>> +       # And we are at the FUA writes, no generation related problem should
>>> +       # happen anyway
>>> +       if grep -q -e 'failed to load free space cache' $tmp.fsck; then
>>> +               cat $tmp.fsck >> $seqres.full
>>> +               _fail "btrfs check found invalid space cache"
>>> +       fi
>>> +
>>> +       prev=$cur
>>> +       cur=$(_log_writes_find_next_fua $prev)
>>> +       [ -z $cur ] && break
>>> +
>>> +       # Same as above
>>> +       cur=$(($cur + 1))
>>> +done
>>> +
>>> +echo "Silence is golden"
>>> +
>>> +# success, all done
>>> +status=0
>>> +exit
>>> diff --git a/tests/btrfs/159.out b/tests/btrfs/159.out
>>> new file mode 100644
>>> index 00000000..e569e60c
>>> --- /dev/null
>>> +++ b/tests/btrfs/159.out
>>> @@ -0,0 +1,2 @@
>>> +QA output created by 159
>>> +Silence is golden
>>> diff --git a/tests/btrfs/group b/tests/btrfs/group
>>> index 8007e07e..bc83db94 100644
>>> --- a/tests/btrfs/group
>>> +++ b/tests/btrfs/group
>>> @@ -161,3 +161,4 @@
>>>  156 auto quick trim
>>>  157 auto quick raid
>>>  158 auto quick raid scrub
>>> +159 auto
>>> --
>>> 2.15.1
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe fstests" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>>
>



-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption
  2018-03-06 12:07     ` Qu Wenruo
@ 2018-03-06 12:25       ` Filipe Manana
  0 siblings, 0 replies; 14+ messages in thread
From: Filipe Manana @ 2018-03-06 12:25 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Qu Wenruo, linux-btrfs, fstests, dsterba, Amir Goldstein

On Tue, Mar 6, 2018 at 12:07 PM, Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>
>
> On 2018年03月06日 18:12, Filipe Manana wrote:
>> On Tue, Mar 6, 2018 at 8:15 AM, Qu Wenruo <wqu@suse.com> wrote:
>>> There are some btrfs corruption report in mail list for a while,
>>
>> There have been for years (well, since ever) many reports of different
>> types of corruptions.
>> Which kind of corruption are you referring to?
>>
>>> although such corruption is pretty rare and almost impossible to
>>> reproduce, with dm-log-writes we found it's highly related to v1 space
>>> cache.
>>>
>>> Unlike journal based filesystems, btrfs completely rely on metadata CoW
>>> to protect itself from power loss.
>>> Which needs extent allocator to avoid allocate extents on existing
>>> extent range.
>>> Btrfs also uses free space cache to speed up such allocation.
>>>
>>> However there is a problem, v1 space cache is not protected by data CoW,
>>> and can be corrupted during power loss.
>>> So btrfs do extra check on free space cache, verifying its own in-file csum,
>>> generation and free space recorded in cache and extent tree.
>>>
>>> The problem is, under heavy concurrency, v1 space cache can be corrupted
>>> even under normal operations without power loss.
>>
>> How?
>>
>>> And we believe corrupted space cache can break btrfs metadata CoW and
>>> leads to the rare corruption in next power loss.
>>
>> Which kind of corruption?
>>
>>>
>>> The most obvious symptom will be difference in free space:
>>>
>>> This will be caught by kernel, but such check is quite weak, and if
>>> the net free space change is 0 in one transaction, the corrupted
>>> cache can be loaded by kernel.
>>
>> How can that happen? The only case I'm aware of, explained below,
>> always leads to a difference (space cache has less free space then
>> what we actually have if we check the extent tree).
>>
>>>
>>> In this case, btrfs check would report things like :
>>> ------
>>> block group 298844160 has wrong amount of free space
>>> failed to load free space cache for block group 298844160
>>> ------
>>
>> This is normal, but not very common, due to tiny races that exists
>> between committing a transaction (writing the free space caches) and
>> running dellaloc for example (since reserving an extent while running
>> dealloc doesn't joing/start a transaction).
>
> Well, at least I didn't find any place to release space unprotected by
> trans handler, so free space of cache can only be less or equal to real
> free space.

Well, that what I said before. For that particular race I mentioned,
which is the only one I can remember of always existing, the only
inconsistency that can happen is that the cache has less free extents
than what you can find by scanning the extent tree. If the
inconsistency was not detected at cache loading time, we would only
leak extents, but never double allocate them.

>
> So in that case, corrupted cache will never pass the free space check so
> it will never be loaded.
>
> Another dead end unfortunately.
>
> Thanks,
> Qu
>>
>>>
>>> But considering the test case are using notreelog, btrfs won't do
>>> sub-transaction commit which doesn't increase generation, each
>>> transaction should be consistent, and nothing should be reported at all.
>>>
>>> Further more, we can even found corrupted file extents like:
>>> ------
>>> root 5 inode 261 errors 100, file extent discount
>>> Found file extent holes:
>>>         start: 962560, len: 32768
>>> ERROR: errors found in fs roots
>>
>> Why do you think that's a corruption? Does it cause data loss or any
>> user visible issue?
>>
>> Having file extent holes not inserted happens when mixing buffered and
>> direct IO writes to a file (and fsstress does that), for example:
>>
>> create file
>> buffered write at offset 0, length 64K
>> direct IO write at offset at offset 64K, length 4K
>> transaction commit
>> power loss
>> after this we got a missing 64K hole extent at offset 0 (at
>> btrfs_file_write_iter we only add hole extents if the start offset is
>> greater then the current i_size)
>>
>> But this does not cause any problem for the user or the fs itself, and
>> it's supposed to be like that in the NO_HOLES mode which one day
>> (probably) will be the default mode.
>>
>>> ------
>>>
>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>> ---
>>>  common/dmlogwrites  |  72 +++++++++++++++++++++++++++
>>>  tests/btrfs/159     | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  tests/btrfs/159.out |   2 +
>>>  tests/btrfs/group   |   1 +
>>>  4 files changed, 216 insertions(+)
>>>  create mode 100755 tests/btrfs/159
>>>  create mode 100644 tests/btrfs/159.out
>>>
>>> diff --git a/common/dmlogwrites b/common/dmlogwrites
>>> index 467b872e..54e7e242 100644
>>> --- a/common/dmlogwrites
>>> +++ b/common/dmlogwrites
>>> @@ -126,3 +126,75 @@ _log_writes_cleanup()
>>>         $UDEV_SETTLE_PROG >/dev/null 2>&1
>>>         _log_writes_remove
>>>  }
>>> +
>>> +# Convert log writes mark to entry number
>>> +# Result entry number is output to stdout, could be empty if not found
>>> +_log_writes_mark_to_entry_number()
>>> +{
>>> +       local _mark=$1
>>> +       local ret
>>> +
>>> +       [ -z "$_mark" ] && _fail \
>>> +               "mark must be given for _log_writes_mark_to_entry_number"
>>> +
>>> +       ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>>> +               --end-mark $_mark 2> /dev/null)
>>> +       [ -z "$ret" ] && return
>>> +       ret=$(echo "$ret" | cut -f1 -d\@)
>>> +       echo "mark $_mark has entry number $ret" >> $seqres.full
>>> +       echo "$ret"
>>> +}
>>> +
>>> +# Find next fua write entry number
>>> +# Result entry number is output to stdout, could be empty if not found
>>> +_log_writes_find_next_fua()
>>> +{
>>> +       local _start_entry=$1
>>> +       local ret
>>> +
>>> +       if [ -z "$_start_entry" ]; then
>>> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>>> +                       --next-fua 2> /dev/null)
>>> +       else
>>> +               ret=$($here/src/log-writes/replay-log --find --log $LOGWRITES_DEV \
>>> +                       --next-fua --start-entry $_start_entry 2> /dev/null)
>>> +       fi
>>> +       [ -z "$ret" ] && return
>>> +
>>> +       ret=$(echo "$ret" | cut -f1 -d\@)
>>> +       echo "next fua is entry number $ret" >> $seqres.full
>>> +       echo "$ret"
>>> +}
>>> +
>>> +# Replay log range to specified entry
>>> +# $1:  End entry. The last entry will *NOT* be replayed
>>> +# $2:  Start entry. If not specified, start from the first entry.
>>> +_log_writes_replay_log_entry_range()
>>> +{
>>> +       local _end=$1
>>> +       local _start=$2
>>> +
>>> +       [ -z "$_end" ] && _fail \
>>> +       "end entry must be specified for _log_writes_replay_log_entry_range"
>>> +
>>> +       if [[ "$_start" && "$_start" -gt "$_end" ]]; then
>>> +               _fail \
>>> +               "wrong parameter order for _log_writes_replay_log_entry_range:start=$_start end=$_end"
>>> +       fi
>>> +
>>> +       # Original replay-log won't replay the last entry. So increase entry
>>> +       # number here to ensure the end entry to be replayed
>>> +       if [ -z "$_start" ]; then
>>> +               echo "=== replay to $_end ===" >> $seqres.full
>>> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
>>> +                       --replay $SCRATCH_DEV --limit $_end -v \
>>> +                       >> $seqres.full 2>&1
>>> +       else
>>> +               echo "=== replay from $_start to $_end ===" >> $seqres.full
>>> +               $here/src/log-writes/replay-log --log $LOGWRITES_DEV \
>>> +                       --replay $SCRATCH_DEV --start-entry $_start \
>>> +                       --limit $(($_end - $_start)) -v \
>>> +                       >> $seqres.full 2>&1
>>> +       fi
>>> +       [ $? -ne 0 ] && _fail "replay failed"
>>> +}
>>> diff --git a/tests/btrfs/159 b/tests/btrfs/159
>>> new file mode 100755
>>> index 00000000..5113d526
>>> --- /dev/null
>>> +++ b/tests/btrfs/159
>>> @@ -0,0 +1,141 @@
>>> +#! /bin/bash
>>> +# FS QA Test 159
>>> +#
>>> +# Test case for btrfs v1 space cache corruption
>>> +#
>>> +# Btrfs has some unexpected corruption after power loss, during the
>>> +# investigation, the problem leads to v1 space cache corruption, which
>>> +# could be loaded by kernel without detection.
>>> +#
>>> +# Kernel has 3 detection for corrupted space cache, checksum, generation
>>> +# and free space, however the last one is a weak one. If in the one transaction
>>> +# no net space change it can be loaded by kernel and break up metadata CoW,
>>> +# leading serious corruption.
>>> +#
>>> +#-----------------------------------------------------------------------
>>> +# Copyright (c) SUSE.  All Rights Reserved.
>>> +#
>>> +# This program is free software; you can redistribute it and/or
>>> +# modify it under the terms of the GNU General Public License as
>>> +# published by the Free Software Foundation.
>>> +#
>>> +# This program is distributed in the hope that it would be useful,
>>> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> +# GNU General Public License for more details.
>>> +#
>>> +# You should have received a copy of the GNU General Public License
>>> +# along with this program; if not, write the Free Software Foundation,
>>> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
>>> +#-----------------------------------------------------------------------
>>> +#
>>> +
>>> +seq=`basename $0`
>>> +seqres=$RESULT_DIR/$seq
>>> +echo "QA output created by $seq"
>>> +
>>> +here=`pwd`
>>> +tmp=/tmp/$$
>>> +status=1       # failure is the default!
>>> +trap "_cleanup; exit \$status" 0 1 2 3 15
>>> +
>>> +_cleanup()
>>> +{
>>> +       cd /
>>> +       rm -f $tmp.*
>>> +       _log_writes_cleanup
>>> +}
>>> +
>>> +# get standard environment, filters and checks
>>> +. ./common/rc
>>> +. ./common/dmlogwrites
>>> +. ./common/filter
>>> +
>>> +# remove previous $seqres.full before test
>>> +rm -f $seqres.full
>>> +
>>> +# Small workload is enough to trigger the problem
>>> +workload=$(( 512 * $LOAD_FACTOR ))
>>> +nr_threads=$(($($here/src/feature -o) * $LOAD_FACTOR))
>>> +
>>> +# Modify as appropriate.
>>> +_supported_fs btrfs
>>> +_supported_os Linux
>>> +_require_log_writes
>>> +_require_scratch
>>> +
>>> +_log_writes_init
>>> +_log_writes_mkfs >> $seqres.full 2>&1
>>> +
>>> +# Here we don't want to create space cache yet
>>> +_log_writes_mount -o nospace_cache
>>> +
>>> +# The SINGLE profile data chunk created by mkfs is too small (8M) so btrfs
>>> +# won't create space cache for it.
>>> +# We need a data chunk large enough so btrfs create space cache for it and
>>> +# store the cache in itself.
>>> +_run_btrfs_util_prog balance start --full-balance "$SCRATCH_MNT"
>>> +
>>> +_log_writes_unmount
>>> +_log_writes_mark mkfs
>>> +
>>> +# Disable tree log, so btrfs will always do full transaction commit
>>> +# It's OK to use tree log, but it will cause extra super block update which
>>> +# doesn't increase generation.
>>> +_log_writes_mount -o space_cache=v1,notreelog
>>> +run_check $FSSTRESS_PROG -w -n $workload -p $nr_threads -d $SCRATCH_MNT \
>>> +       $FSSTRESS_AVOID > /dev/null 2>&1
>>> +_log_writes_unmount
>>> +
>>> +_log_writes_remove
>>> +# Here we manually iterate through the log entries, as we need extra check
>>> +# on the output of btrfs check. And we could have detailed replay log in
>>> +# $seqres.full
>>> +prev=$(_log_writes_mark_to_entry_number mkfs)
>>> +if [ -z "$prev" ]; then
>>> +       _fail "failed to locate log writes mark 'mkfs'"
>>> +fi
>>> +
>>> +# Here we only checks the result *AFTER* FUA writes, and no tree log, so
>>> +# everything should be valid.
>>> +cur=$(_log_writes_find_next_fua $prev)
>>> +[ -z "$cur" ] && _fail "failed to locate next fua write"
>>> +
>>> +# log-replay will not replay the last item, so manually increase it by one
>>> +cur=$(($cur + 1))
>>> +
>>> +_log_writes_replay_log_entry_range $prev >> $seqres.full 2>&1
>>> +while [ ! -z "$cur" ]; do
>>> +       _log_writes_replay_log_entry_range $cur $prev
>>> +       # Catch the btrfs check output into temp file, as we need to
>>> +       # grep the output to find the cache corruption
>>> +       $BTRFS_UTIL_PROG check --check-data-csum $SCRATCH_DEV &> $tmp.fsck
>>> +
>>> +       # Cache passed generation,csum and free space check but corrupted
>>> +       # will be reported as error
>>> +       if [ $? -ne 0 ]; then
>>> +               cat $tmp.fsck >> $seqres.full
>>> +               _fail "btrfs check found corruption"
>>> +       fi
>>> +
>>> +       # Mount option has ruled out any possible factors affect space cache
>>> +       # And we are at the FUA writes, no generation related problem should
>>> +       # happen anyway
>>> +       if grep -q -e 'failed to load free space cache' $tmp.fsck; then
>>> +               cat $tmp.fsck >> $seqres.full
>>> +               _fail "btrfs check found invalid space cache"
>>> +       fi
>>> +
>>> +       prev=$cur
>>> +       cur=$(_log_writes_find_next_fua $prev)
>>> +       [ -z $cur ] && break
>>> +
>>> +       # Same as above
>>> +       cur=$(($cur + 1))
>>> +done
>>> +
>>> +echo "Silence is golden"
>>> +
>>> +# success, all done
>>> +status=0
>>> +exit
>>> diff --git a/tests/btrfs/159.out b/tests/btrfs/159.out
>>> new file mode 100644
>>> index 00000000..e569e60c
>>> --- /dev/null
>>> +++ b/tests/btrfs/159.out
>>> @@ -0,0 +1,2 @@
>>> +QA output created by 159
>>> +Silence is golden
>>> diff --git a/tests/btrfs/group b/tests/btrfs/group
>>> index 8007e07e..bc83db94 100644
>>> --- a/tests/btrfs/group
>>> +++ b/tests/btrfs/group
>>> @@ -161,3 +161,4 @@
>>>  156 auto quick trim
>>>  157 auto quick raid
>>>  158 auto quick raid scrub
>>> +159 auto
>>> --
>>> 2.15.1
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe fstests" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>>
>



-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2 1/3] fstests: log-writes: Add support to output human readable flags
@ 2018-03-28  4:40 Qu Wenruo
  0 siblings, 0 replies; 14+ messages in thread
From: Qu Wenruo @ 2018-03-28  4:40 UTC (permalink / raw)
  To: linux-btrfs, fstests; +Cc: amir73il, linux-xfs, linux-ext4

Also change the flag numeric output to hex.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
---
changelog:
v2:
  Add Amir's reviewed-by tag.
---
 src/log-writes/log-writes.c | 70 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 63 insertions(+), 7 deletions(-)

diff --git a/src/log-writes/log-writes.c b/src/log-writes/log-writes.c
index 09391574c4d2..a872429d3f41 100644
--- a/src/log-writes/log-writes.c
+++ b/src/log-writes/log-writes.c
@@ -120,6 +120,58 @@ int log_discard(struct log *log, struct log_write_entry *entry)
 	return 0;
 }
 
+#define DEFINE_LOG_FLAGS_STR_ENTRY(x)	\
+	{LOG_##x##_FLAG, #x}
+
+struct flags_to_str_entry {
+	u64 flags;
+	const char *str;
+} log_flags_table[] = {
+	DEFINE_LOG_FLAGS_STR_ENTRY(FLUSH),
+	DEFINE_LOG_FLAGS_STR_ENTRY(FUA),
+	DEFINE_LOG_FLAGS_STR_ENTRY(DISCARD),
+	DEFINE_LOG_FLAGS_STR_ENTRY(MARK)
+};
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#define LOG_FLAGS_BUF_SIZE	128
+/*
+ * Convert numeric flags to human readable flags.
+ * @flags:	numeric flags
+ * @buf:	output buffer for human readable string.
+ * 		must have enough space (LOG_FLAGS_BUF_SIZE) to contain all
+ * 		the string
+ */
+static void entry_flags_to_str(u64 flags, char *buf)
+{
+	int empty = 1;
+	int left_len;
+	int i;
+
+	buf[0] = '\0';
+	for (i = 0; i < ARRAY_SIZE(log_flags_table); i++) {
+		if (flags & log_flags_table[i].flags) {
+			if (!empty)
+				strncat(buf, "|", LOG_FLAGS_BUF_SIZE);
+			empty = 0;
+			strncat(buf, log_flags_table[i].str, LOG_FLAGS_BUF_SIZE);
+			flags &= ~log_flags_table[i].flags;
+		}
+	}
+	if (flags) {
+		if (!empty)
+			strncat(buf, "|", LOG_FLAGS_BUF_SIZE);
+		empty = 0;
+		left_len = LOG_FLAGS_BUF_SIZE - strnlen(buf,
+						        LOG_FLAGS_BUF_SIZE);
+		if (left_len > 0)
+			snprintf(buf + strnlen(buf, LOG_FLAGS_BUF_SIZE),
+				 left_len, "UNKNOWN.0x%llx", flags);
+	}
+	if (empty)
+		strncpy(buf, "NONE", LOG_FLAGS_BUF_SIZE);
+}
+
 /*
  * @log: the log we are replaying.
  * @entry: entry to be replayed.
@@ -179,6 +231,7 @@ int log_replay_next_entry(struct log *log, struct log_write_entry *entry,
 	size_t read_size = read_data ? log->sectorsize :
 		sizeof(struct log_write_entry);
 	char *buf;
+	char flags_buf[LOG_FLAGS_BUF_SIZE];
 	ssize_t ret;
 	off_t offset;
 	int skip = 0;
@@ -210,19 +263,20 @@ int log_replay_next_entry(struct log *log, struct log_write_entry *entry,
 		log->cur_pos += read_size;
 	}
 
+	flags = le64_to_cpu(entry->flags);
+	entry_flags_to_str(flags, flags_buf);
 	skip = log_should_skip(log, entry);
 	if (log_writes_verbose > 1 || (log_writes_verbose && !skip)) {
-		printf("%s %d@%llu: sector %llu, size %llu, flags %llu\n",
+		printf("%s %d@%llu: sector %llu, size %llu, flags 0x%llx(%s)\n",
 		       skip ? "skipping" : "replaying",
 		       (int)log->cur_entry - 1, log->cur_pos / log->sectorsize,
 		       (unsigned long long)le64_to_cpu(entry->sector),
 		       (unsigned long long)size,
-		       (unsigned long long)le64_to_cpu(entry->flags));
+		       (unsigned long long)flags, flags_buf);
 	}
 	if (!size)
 		return 0;
 
-	flags = le64_to_cpu(entry->flags);
 	if (flags & LOG_DISCARD_FLAG)
 		return log_discard(log, entry);
 
@@ -301,7 +355,7 @@ int log_seek_entry(struct log *log, u64 entry_num)
 			return -1;
 		}
 		if (log_writes_verbose > 1)
-			printf("seek entry %d@%llu: %llu, size %llu, flags %llu\n",
+			printf("seek entry %d@%llu: %llu, size %llu, flags 0x%llx\n",
 			       (int)i, log->cur_pos / log->sectorsize,
 			       (unsigned long long)le64_to_cpu(entry.sector),
 			       (unsigned long long)le64_to_cpu(entry.nr_sectors),
@@ -339,6 +393,7 @@ int log_seek_next_entry(struct log *log, struct log_write_entry *entry,
 	size_t read_size = read_data ? log->sectorsize :
 		sizeof(struct log_write_entry);
 	u64 flags;
+	char flags_buf[LOG_FLAGS_BUF_SIZE];
 	ssize_t ret;
 
 	if (log->cur_entry >= log->nr_entries)
@@ -366,14 +421,15 @@ int log_seek_next_entry(struct log *log, struct log_write_entry *entry,
 	} else {
 		log->cur_pos += read_size;
 	}
+	flags = le64_to_cpu(entry->flags);
+	entry_flags_to_str(flags, flags_buf);
 	if (log_writes_verbose > 1)
-		printf("seek entry %d@%llu: %llu, size %llu, flags %llu\n",
+		printf("seek entry %d@%llu: %llu, size %llu, flags 0x%llx(%s)\n",
 		       (int)log->cur_entry - 1, log->cur_pos / log->sectorsize,
 		       (unsigned long long)le64_to_cpu(entry->sector),
 		       (unsigned long long)le64_to_cpu(entry->nr_sectors),
-		       (unsigned long long)le64_to_cpu(entry->flags));
+		       (unsigned long long)flags, flags_buf);
 
-	flags = le64_to_cpu(entry->flags);
 	read_size = le64_to_cpu(entry->nr_sectors) * log->sectorsize;
 	if (!read_size || (flags & LOG_DISCARD_FLAG))
 		return 0;
-- 
2.16.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-03-28  4:40 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-06  8:15 [PATCH 1/3] fstests: log-writes: Add support to output human readable flags Qu Wenruo
2018-03-06  8:15 ` [PATCH 2/3] fstests: log-writes: Add support for METADATA flag Qu Wenruo
2018-03-06  8:15 ` [PATCH 3/3] fstests: btrfs: Add test case to check v1 space cache corruption Qu Wenruo
2018-03-06  9:03   ` Amir Goldstein
2018-03-06  9:25     ` Qu Wenruo
2018-03-06 10:12   ` Filipe Manana
2018-03-06 10:53     ` Qu Wenruo
2018-03-06 11:06       ` Nikolay Borisov
2018-03-06 12:22       ` Filipe Manana
2018-03-06 12:07     ` Qu Wenruo
2018-03-06 12:25       ` Filipe Manana
2018-03-06  8:35 ` [PATCH 1/3] fstests: log-writes: Add support to output human readable flags Amir Goldstein
2018-03-06  8:51 ` [PATCH v2 " Qu Wenruo
2018-03-28  4:40 Qu Wenruo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.