All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents
@ 2018-03-13 18:47 fdmanana
  2018-03-13 22:48 ` [PATCH 1/2 v2] " fdmanana
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: fdmanana @ 2018-03-13 18:47 UTC (permalink / raw)
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

Under some cases the filesystem checker reports an error when it finds
checksum items for an extent that is referenced by an inode as a prealloc
extent. Such cases are not an error when the extent is actually shared
(was cloned/reflinked) with other inodes and was written through one of
those other inodes.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foo
  $ xfs_io -c "falloc 0 256K" /mnt/foo
  $ sync

  $ xfs_io -c "pwrite -S 0xab -b 0 256K" /mnt/foo
  $ touch /mnt/bar
  $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
  $ xfs_io -c "fsync" /mnt/bar

  <power fail>
  $ mount /dev/sdb /mnt
  $ umount /mnt

  $ btrfs check /dev/sdc
  Checking filesystem on /dev/sdb
  UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
  checking extents
  checking free space cache
  checking fs roots
  root 5 inode 257 errors 800, odd csum item
  ERROR: errors found in fs roots
  found 688128 bytes used, error(s) found
  total csum bytes: 256
  total tree bytes: 163840
  total fs tree bytes: 65536
  total extent tree bytes: 16384
  btree space waste bytes: 138819
  file data blocks allocated: 10747904
   referenced 10747904
  $ echo $?
  1

So tech check to not report such cases as errors by checking if the
extent is shared with other inodes and if so, consider it an error the
existence of checksum items only if all those other inodes are referencing
the extent as a prealloc extent.
This case can be hit often when running the generic/475 testcase from
fstests.

A test case will follow in a separate patch.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 check/main.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 268 insertions(+), 2 deletions(-)

diff --git a/check/main.c b/check/main.c
index 392195ca..bb816311 100644
--- a/check/main.c
+++ b/check/main.c
@@ -1424,6 +1424,264 @@ static int process_inode_extref(struct extent_buffer *eb,
 
 }
 
+/*
+ * Check if the inode referenced by the given data reference uses the extent
+ * at disk_bytenr as a non-prealloc extent.
+ *
+ * Returns 1 if true, 0 if false and < 0 on error.
+ */
+static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
+				   u64 ino,
+				   u64 disk_bytenr,
+				   struct btrfs_extent_data_ref *dref,
+				   struct extent_buffer *eb)
+{
+	u64 rootid = btrfs_extent_data_ref_root(eb, dref);
+	u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
+	u64 offset = btrfs_extent_data_ref_offset(eb, dref);
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	struct btrfs_path path;
+	int ret;
+
+	if (objectid == ino)
+		return 0;
+
+	btrfs_init_path(&path);
+	key.objectid = rootid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	root = btrfs_read_fs_root(fs_info, &key);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto out;
+	}
+
+	key.objectid = objectid;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = offset;
+	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+	if (ret > 0) {
+		fprintf(stderr,
+			"Missing file extent item for inode %llu, root %llu, offset %llu",
+			objectid, rootid, offset);
+		ret = -ENOENT;
+	}
+	if (ret < 0)
+		goto out;
+
+	while (true) {
+		struct btrfs_file_extent_item *fi;
+		int extent_type;
+
+		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+			ret = btrfs_next_leaf(fs_info->extent_root, &path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				break;
+		}
+
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+		if (key.objectid != objectid ||
+		    key.type != BTRFS_EXTENT_DATA_KEY)
+			break;
+
+		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+				    struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(path.nodes[0], fi);
+		if (extent_type != BTRFS_FILE_EXTENT_REG &&
+		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
+			goto next;
+
+		if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
+		    disk_bytenr)
+			break;
+
+		if (extent_type == BTRFS_FILE_EXTENT_REG) {
+			ret = 1;
+			goto out;
+		}
+next:
+		path.slots[0]++;
+	}
+	ret = 0;
+ out:
+	btrfs_release_path(&path);
+	return ret;
+}
+
+/*
+ * Check if a shared data reference points to a node that has a file extent item
+ * pointing to the extent at disk_bytenr that is not of type prealloc and is
+ * associated to an inode with a number different from ino.
+ *
+ * Returns 1 if true, 0 if false and < 0 on error.
+ */
+static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
+					  u64 ino,
+					  u64 parent,
+					  u64 disk_bytenr)
+{
+	struct extent_buffer *eb;
+	u32 nr;
+	int i;
+	int ret = 0;
+
+	eb = read_tree_block(fs_info, parent, 0);
+	if (!extent_buffer_uptodate(eb)) {
+		ret = -EIO;
+		goto out;
+	}
+
+	nr = btrfs_header_nritems(eb);
+	for (i = 0; i < nr; i++) {
+		struct btrfs_key key;
+		struct btrfs_file_extent_item *fi;
+		int extent_type;
+
+		btrfs_item_key_to_cpu(eb, &key, i);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		if (key.objectid == ino)
+			continue;
+
+		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+
+		extent_type = btrfs_file_extent_type(eb, fi);
+		if (extent_type != BTRFS_FILE_EXTENT_REG &&
+		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
+			continue;
+
+		if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
+		    extent_type == BTRFS_FILE_EXTENT_REG) {
+			ret = 1;
+			break;
+		}
+	}
+ out:
+	free_extent_buffer(eb);
+	return ret;
+}
+
+/*
+ * Check if a prealloc extent is shared by other inodes and if any inode has
+ * already written to that extent. This is to avoid emitting invalid warnings
+ * about odd csum items (a inode has an extent entirely marked as prealloc
+ * but another inode shares it and has already written to it).
+ *
+ * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
+ * at least one other inode has written to it, and < 0 on error.
+ */
+static int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
+					 u64 ino,
+					 u64 disk_bytenr,
+					 u64 num_bytes)
+{
+	struct btrfs_path path;
+	struct btrfs_key key;
+	int ret;
+	struct btrfs_extent_item *ei;
+	u32 item_size;
+	unsigned long ptr;
+	unsigned long end;
+
+	key.objectid = disk_bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+
+	btrfs_init_path(&path);
+	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
+	if (ret > 0) {
+		fprintf(stderr,
+			"Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
+			disk_bytenr, num_bytes);
+		ret = -ENOENT;
+	}
+	if (ret < 0)
+		goto out;
+
+	/* First check all inline refs. */
+	ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
+			    struct btrfs_extent_item);
+	item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
+	while (ptr < end) {
+		struct btrfs_extent_inline_ref *iref;
+		int type;
+
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
+		ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
+		       type == BTRFS_SHARED_DATA_REF_KEY);
+
+		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+			struct btrfs_extent_data_ref *dref;
+
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
+						      dref, path.nodes[0]);
+			if (ret != 0)
+				goto out;
+		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+			u64 parent;
+
+			parent = btrfs_extent_inline_ref_offset(path.nodes[0],
+								iref);
+			ret = check_prealloc_shared_data_ref(fs_info,
+							     ino,
+							     parent,
+							     disk_bytenr);
+			if (ret != 0)
+				goto out;
+		}
+
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+
+	/* Now check if there are any non-inlined refs. */
+	path.slots[0]++;
+	while (true) {
+		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+			ret = btrfs_next_leaf(fs_info->extent_root, &path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				ret = 0;
+				break;
+			}
+		}
+
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+		if (key.objectid != disk_bytenr)
+			break;
+
+		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+			struct btrfs_extent_data_ref *dref;
+
+			dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
+					      struct btrfs_extent_data_ref);
+			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
+						      dref, path.nodes[0]);
+			if (ret != 0)
+				goto out;
+		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+			ret = check_prealloc_shared_data_ref(fs_info,
+							     ino,
+							     key.offset,
+							     disk_bytenr);
+			if (ret != 0)
+				goto out;
+		}
+
+		path.slots[0]++;
+	}
+out:
+	btrfs_release_path(&path);
+	return ret;
+}
+
 static int process_file_extent(struct btrfs_root *root,
 				struct extent_buffer *eb,
 				int slot, struct btrfs_key *key,
@@ -1515,8 +1773,16 @@ static int process_file_extent(struct btrfs_root *root,
 			if (found < num_bytes)
 				rec->some_csum_missing = 1;
 		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
-			if (found > 0)
-				rec->errors |= I_ERR_ODD_CSUM_ITEM;
+			if (found > 0) {
+				ret = check_prealloc_extent_written(root->fs_info,
+								    rec->ino,
+								    disk_bytenr,
+								    num_bytes);
+				if (ret < 0)
+					return ret;
+				if (ret == 0)
+					rec->errors |= I_ERR_ODD_CSUM_ITEM;
+			}
 		}
 	}
 	return 0;
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 1/2 v2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-13 18:47 [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents fdmanana
@ 2018-03-13 22:48 ` fdmanana
  2018-03-15  9:04   ` Nikolay Borisov
  2018-03-14  1:32 ` [PATCH 1/2] " Qu Wenruo
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 16+ messages in thread
From: fdmanana @ 2018-03-13 22:48 UTC (permalink / raw)
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

Under some cases the filesystem checker reports an error when it finds
checksum items for an extent that is referenced by an inode as a prealloc
extent. Such cases are not an error when the extent is actually shared
(was cloned/reflinked) with other inodes and was written through one of
those other inodes.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foo
  $ xfs_io -c "falloc 0 256K" /mnt/foo
  $ sync

  $ xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
  $ touch /mnt/bar
  $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
  $ xfs_io -c "fsync" /mnt/bar

  <power fail>
  $ mount /dev/sdb /mnt
  $ umount /mnt

  $ btrfs check /dev/sdc
  Checking filesystem on /dev/sdb
  UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
  checking extents
  checking free space cache
  checking fs roots
  root 5 inode 257 errors 800, odd csum item
  ERROR: errors found in fs roots
  found 688128 bytes used, error(s) found
  total csum bytes: 256
  total tree bytes: 163840
  total fs tree bytes: 65536
  total extent tree bytes: 16384
  btree space waste bytes: 138819
  file data blocks allocated: 10747904
   referenced 10747904
  $ echo $?
  1

So teach check to not report such cases as errors by checking if the
extent is shared with other inodes and if so, consider it an error the
existence of checksum items only if all those other inodes are referencing
the extent as a prealloc extent.
This case can be hit often when running the generic/475 testcase from
fstests.

A test case will follow in a separate patch.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---

V2: Made stuff work with lowmem mode as well.
    Added a comment about the limitations of the current check.
    Removed filtering by inode number since it was unreliable as we can
    have different inodes with same inode number but in different roots
    (so opted to drop the filtering instead of filtering by root as well,
    to keep it simpler).

 check/main.c        |  11 ++-
 check/mode-common.c | 258 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 check/mode-common.h |   3 +
 check/mode-lowmem.c |  14 ++-
 4 files changed, 281 insertions(+), 5 deletions(-)

diff --git a/check/main.c b/check/main.c
index 392195ca..88e6c1e9 100644
--- a/check/main.c
+++ b/check/main.c
@@ -1515,8 +1515,15 @@ static int process_file_extent(struct btrfs_root *root,
 			if (found < num_bytes)
 				rec->some_csum_missing = 1;
 		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
-			if (found > 0)
-				rec->errors |= I_ERR_ODD_CSUM_ITEM;
+			if (found > 0) {
+				ret = check_prealloc_extent_written(root->fs_info,
+								    disk_bytenr,
+								    num_bytes);
+				if (ret < 0)
+					return ret;
+				if (ret == 0)
+					rec->errors |= I_ERR_ODD_CSUM_ITEM;
+			}
 		}
 	}
 	return 0;
diff --git a/check/mode-common.c b/check/mode-common.c
index 1b56a968..559cd11d 100644
--- a/check/mode-common.c
+++ b/check/mode-common.c
@@ -24,6 +24,264 @@
 #include "check/mode-common.h"
 
 /*
+ * Check if the inode referenced by the given data reference uses the extent
+ * at disk_bytenr as a non-prealloc extent.
+ *
+ * Returns 1 if true, 0 if false and < 0 on error.
+ */
+static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
+				   u64 disk_bytenr,
+				   struct btrfs_extent_data_ref *dref,
+				   struct extent_buffer *eb)
+{
+	u64 rootid = btrfs_extent_data_ref_root(eb, dref);
+	u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
+	u64 offset = btrfs_extent_data_ref_offset(eb, dref);
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	struct btrfs_path path;
+	int ret;
+
+	btrfs_init_path(&path);
+	key.objectid = rootid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	root = btrfs_read_fs_root(fs_info, &key);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto out;
+	}
+
+	key.objectid = objectid;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = offset;
+	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+	if (ret > 0) {
+		fprintf(stderr,
+			"Missing file extent item for inode %llu, root %llu, offset %llu",
+			objectid, rootid, offset);
+		ret = -ENOENT;
+	}
+	if (ret < 0)
+		goto out;
+
+	while (true) {
+		struct btrfs_file_extent_item *fi;
+		int extent_type;
+
+		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+			ret = btrfs_next_leaf(fs_info->extent_root, &path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				break;
+		}
+
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+		if (key.objectid != objectid ||
+		    key.type != BTRFS_EXTENT_DATA_KEY)
+			break;
+
+		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+				    struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(path.nodes[0], fi);
+		if (extent_type != BTRFS_FILE_EXTENT_REG &&
+		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
+			goto next;
+
+		if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
+		    disk_bytenr)
+			break;
+
+		if (extent_type == BTRFS_FILE_EXTENT_REG) {
+			ret = 1;
+			goto out;
+		}
+next:
+		path.slots[0]++;
+	}
+	ret = 0;
+ out:
+	btrfs_release_path(&path);
+	return ret;
+}
+
+/*
+ * Check if a shared data reference points to a node that has a file extent item
+ * pointing to the extent at @disk_bytenr that is not of type prealloc.
+ *
+ * Returns 1 if true, 0 if false and < 0 on error.
+ */
+static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
+					  u64 parent,
+					  u64 disk_bytenr)
+{
+	struct extent_buffer *eb;
+	u32 nr;
+	int i;
+	int ret = 0;
+
+	eb = read_tree_block(fs_info, parent, 0);
+	if (!extent_buffer_uptodate(eb)) {
+		ret = -EIO;
+		goto out;
+	}
+
+	nr = btrfs_header_nritems(eb);
+	for (i = 0; i < nr; i++) {
+		struct btrfs_key key;
+		struct btrfs_file_extent_item *fi;
+		int extent_type;
+
+		btrfs_item_key_to_cpu(eb, &key, i);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(eb, fi);
+		if (extent_type != BTRFS_FILE_EXTENT_REG &&
+		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
+			continue;
+
+		if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
+		    extent_type == BTRFS_FILE_EXTENT_REG) {
+			ret = 1;
+			break;
+		}
+	}
+ out:
+	free_extent_buffer(eb);
+	return ret;
+}
+
+/*
+ * Check if a prealloc extent is shared by multiple inodes and if any inode has
+ * already written to that extent. This is to avoid emitting invalid warnings
+ * about odd csum items (a inode has an extent entirely marked as prealloc
+ * but another inode shares it and has already written to it).
+ *
+ * Note: right now it does not check if the number of checksum items in the
+ * csum tree matches the number of bytes written into the ex-prealloc extent.
+ * It's complex to deal with that because the prealloc extent might have been
+ * partially written through multiple inodes and we would have to keep track of
+ * ranges, merging them and notice ranges that fully or partially overlap, to
+ * avoid false reports of csum items missing for areas of the prealloc extent
+ * that were not written to - for example if we have a 1M prealloc extent, we
+ * can have only the first half of it written, but 2 different inodes refer to
+ * the its first half (through reflinks/cloning), so keeping a counter of bytes
+ * covered by checksum items is not enough, as the correct value would be 512K
+ * and not 1M (whence the need to track ranges).
+ *
+ * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
+ * at least one other inode has written to it, and < 0 on error.
+ */
+int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
+				  u64 disk_bytenr,
+				  u64 num_bytes)
+{
+	struct btrfs_path path;
+	struct btrfs_key key;
+	int ret;
+	struct btrfs_extent_item *ei;
+	u32 item_size;
+	unsigned long ptr;
+	unsigned long end;
+
+	key.objectid = disk_bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+
+	btrfs_init_path(&path);
+	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
+	if (ret > 0) {
+		fprintf(stderr,
+			"Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
+			disk_bytenr, num_bytes);
+		ret = -ENOENT;
+	}
+	if (ret < 0)
+		goto out;
+
+	/* First check all inline refs. */
+	ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
+			    struct btrfs_extent_item);
+	item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
+	while (ptr < end) {
+		struct btrfs_extent_inline_ref *iref;
+		int type;
+
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
+		ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
+		       type == BTRFS_SHARED_DATA_REF_KEY);
+
+		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+			struct btrfs_extent_data_ref *dref;
+
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			ret = check_prealloc_data_ref(fs_info, disk_bytenr,
+						      dref, path.nodes[0]);
+			if (ret != 0)
+				goto out;
+		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+			u64 parent;
+
+			parent = btrfs_extent_inline_ref_offset(path.nodes[0],
+								iref);
+			ret = check_prealloc_shared_data_ref(fs_info,
+							     parent,
+							     disk_bytenr);
+			if (ret != 0)
+				goto out;
+		}
+
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+
+	/* Now check if there are any non-inlined refs. */
+	path.slots[0]++;
+	while (true) {
+		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+			ret = btrfs_next_leaf(fs_info->extent_root, &path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				ret = 0;
+				break;
+			}
+		}
+
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+		if (key.objectid != disk_bytenr)
+			break;
+
+		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+			struct btrfs_extent_data_ref *dref;
+
+			dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
+					      struct btrfs_extent_data_ref);
+			ret = check_prealloc_data_ref(fs_info, disk_bytenr,
+						      dref, path.nodes[0]);
+			if (ret != 0)
+				goto out;
+		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+			ret = check_prealloc_shared_data_ref(fs_info,
+							     key.offset,
+							     disk_bytenr);
+			if (ret != 0)
+				goto out;
+		}
+
+		path.slots[0]++;
+	}
+out:
+	btrfs_release_path(&path);
+	return ret;
+}
+
+/*
  * Search in csum tree to find how many bytes of range [@start, @start + @len)
  * has the corresponding csum item.
  *
diff --git a/check/mode-common.h b/check/mode-common.h
index ffae782b..79e1b0cf 100644
--- a/check/mode-common.h
+++ b/check/mode-common.h
@@ -80,6 +80,9 @@ static inline int fs_root_objectid(u64 objectid)
 	return is_fstree(objectid);
 }
 
+int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
+				  u64 disk_bytenr,
+				  u64 num_bytes);
 int count_csum_range(struct btrfs_fs_info *fs_info, u64 start,
 		     u64 len, u64 *found);
 int insert_inode_item(struct btrfs_trans_handle *trans,
diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c
index 2424cdc2..02806377 100644
--- a/check/mode-lowmem.c
+++ b/check/mode-lowmem.c
@@ -1511,9 +1511,17 @@ static int check_file_extent(struct btrfs_root *root, struct btrfs_key *fkey,
 		      csum_found, search_len);
 	} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC &&
 		   csum_found > 0) {
-		err |= ODD_CSUM_ITEM;
-		error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have csum, but has: %llu",
-		      root->objectid, fkey->objectid, fkey->offset, csum_found);
+		ret = check_prealloc_extent_written(root->fs_info,
+						    disk_bytenr,
+						    disk_num_bytes);
+		if (ret < 0)
+			return ret;
+		if (ret == 0) {
+			err |= ODD_CSUM_ITEM;
+			error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have csum, but has: %llu",
+			      root->objectid, fkey->objectid, fkey->offset,
+			      csum_found);
+		}
 	}
 
 	/* Check EXTENT_DATA hole */
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-13 18:47 [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents fdmanana
  2018-03-13 22:48 ` [PATCH 1/2 v2] " fdmanana
@ 2018-03-14  1:32 ` Qu Wenruo
  2018-03-14  1:58   ` Qu Wenruo
                     ` (2 more replies)
  2018-03-14  8:19 ` Nikolay Borisov
                   ` (2 subsequent siblings)
  4 siblings, 3 replies; 16+ messages in thread
From: Qu Wenruo @ 2018-03-14  1:32 UTC (permalink / raw)
  To: fdmanana, linux-btrfs


[-- Attachment #1.1: Type: text/plain, Size: 11684 bytes --]



On 2018年03月14日 02:47, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> Under some cases the filesystem checker reports an error when it finds
> checksum items for an extent that is referenced by an inode as a prealloc
> extent. Such cases are not an error when the extent is actually shared
> (was cloned/reflinked) with other inodes and was written through one of
> those other inodes.
> 
> Example:
> 
>   $ mkfs.btrfs -f /dev/sdb
>   $ mount /dev/sdb /mnt
> 
>   $ touch /mnt/foo
>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>   $ sync
> 
>   $ xfs_io -c "pwrite -S 0xab -b 0 256K" /mnt/foo
>   $ touch /mnt/bar
>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>   $ xfs_io -c "fsync" /mnt/bar
> 
>   <power fail>
>   $ mount /dev/sdb /mnt
>   $ umount /mnt
> 
>   $ btrfs check /dev/sdc
>   Checking filesystem on /dev/sdb
>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>   checking extents
>   checking free space cache
>   checking fs roots
>   root 5 inode 257 errors 800, odd csum item
>   ERROR: errors found in fs roots
>   found 688128 bytes used, error(s) found
>   total csum bytes: 256
>   total tree bytes: 163840
>   total fs tree bytes: 65536
>   total extent tree bytes: 16384
>   btree space waste bytes: 138819
>   file data blocks allocated: 10747904
>    referenced 10747904
>   $ echo $?
>   1
> 
> So tech check to not report such cases as errors by checking if the
> extent is shared with other inodes and if so, consider it an error the
> existence of checksum items only if all those other inodes are referencing
> the extent as a prealloc extent.
> This case can be hit often when running the generic/475 testcase from
> fstests.
> 
> A test case will follow in a separate patch.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>

Looks good overall, but still some small nitpicks.

One is, the fix is only for original mode, while the fix itself has
nothing mode dependent, so it's better to put the check into
check/mode-original.[ch] so lowmem mode can also benefit from it.

> ---
>  check/main.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 268 insertions(+), 2 deletions(-)
> 
> diff --git a/check/main.c b/check/main.c
> index 392195ca..bb816311 100644
> --- a/check/main.c
> +++ b/check/main.c
> @@ -1424,6 +1424,264 @@ static int process_inode_extref(struct extent_buffer *eb,
>  
>  }
>  
> +/*
> + * Check if the inode referenced by the given data reference uses the extent
> + * at disk_bytenr as a non-prealloc extent.
> + *
> + * Returns 1 if true, 0 if false and < 0 on error.
> + */

The check itself (along with check_prealloc_shared_data_ref) is good
enough to detect any regular file extents inside the prealloc extent.

But when it finds any regular extents, it doesn't check if it's covered
by csum correctly.

For example:

|<-------------- prealloc extent range -------------------------->|
|<- the *only* regular extent ->|
|<-------- range covered by csum -------->|
                                |<- ODD ->|

Or
|<-------------- prealloc extent range -------------------------->|
|<- the *only* regular extent ->|
|<- csum ->|
           |<------ ODD ------->|

So at least in theory, the best solution is not to just check if there
is any regular extents inside the large prealloc extent, but also check
how many csum bytes the regular extents contribute, and then compare it
to the result of count_csum_range().

Thanks,
Qu

> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
> +				   u64 ino,
> +				   u64 disk_bytenr,
> +				   struct btrfs_extent_data_ref *dref,
> +				   struct extent_buffer *eb)
> +{
> +	u64 rootid = btrfs_extent_data_ref_root(eb, dref);
> +	u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
> +	u64 offset = btrfs_extent_data_ref_offset(eb, dref);
> +	struct btrfs_root *root;
> +	struct btrfs_key key;
> +	struct btrfs_path path;
> +	int ret;
> +
> +	if (objectid == ino)
> +		return 0;
> +
> +	btrfs_init_path(&path);
> +	key.objectid = rootid;
> +	key.type = BTRFS_ROOT_ITEM_KEY;
> +	key.offset = (u64)-1;
> +	root = btrfs_read_fs_root(fs_info, &key);
> +	if (IS_ERR(root)) {
> +		ret = PTR_ERR(root);
> +		goto out;
> +	}
> +
> +	key.objectid = objectid;
> +	key.type = BTRFS_EXTENT_DATA_KEY;
> +	key.offset = offset;
> +	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
> +	if (ret > 0) {
> +		fprintf(stderr,
> +			"Missing file extent item for inode %llu, root %llu, offset %llu",
> +			objectid, rootid, offset);
> +		ret = -ENOENT;
> +	}
> +	if (ret < 0)
> +		goto out;
> +
> +	while (true) {
> +		struct btrfs_file_extent_item *fi;
> +		int extent_type;
> +
> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
> +			if (ret < 0)
> +				goto out;
> +			if (ret > 0)
> +				break;
> +		}
> +
> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
> +		if (key.objectid != objectid ||
> +		    key.type != BTRFS_EXTENT_DATA_KEY)
> +			break;
> +
> +		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +				    struct btrfs_file_extent_item);
> +		extent_type = btrfs_file_extent_type(path.nodes[0], fi);
> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
> +			goto next;
> +
> +		if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
> +		    disk_bytenr)
> +			break;
> +
> +		if (extent_type == BTRFS_FILE_EXTENT_REG) {
> +			ret = 1;
> +			goto out;
> +		}
> +next:
> +		path.slots[0]++;
> +	}
> +	ret = 0;
> + out:
> +	btrfs_release_path(&path);
> +	return ret;
> +}
> +
> +/*
> + * Check if a shared data reference points to a node that has a file extent item
> + * pointing to the extent at disk_bytenr that is not of type prealloc and is
> + * associated to an inode with a number different from ino.
> + *
> + * Returns 1 if true, 0 if false and < 0 on error.
> + */
> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
> +					  u64 ino,
> +					  u64 parent,
> +					  u64 disk_bytenr)
> +{
> +	struct extent_buffer *eb;
> +	u32 nr;
> +	int i;
> +	int ret = 0;
> +
> +	eb = read_tree_block(fs_info, parent, 0);
> +	if (!extent_buffer_uptodate(eb)) {
> +		ret = -EIO;
> +		goto out;
> +	}
> +
> +	nr = btrfs_header_nritems(eb);
> +	for (i = 0; i < nr; i++) {
> +		struct btrfs_key key;
> +		struct btrfs_file_extent_item *fi;
> +		int extent_type;
> +
> +		btrfs_item_key_to_cpu(eb, &key, i);
> +		if (key.type != BTRFS_EXTENT_DATA_KEY)
> +			continue;
> +		if (key.objectid == ino)
> +			continue;
> +
> +		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
> +
> +		extent_type = btrfs_file_extent_type(eb, fi);
> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
> +			continue;
> +
> +		if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
> +		    extent_type == BTRFS_FILE_EXTENT_REG) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> + out:
> +	free_extent_buffer(eb);
> +	return ret;
> +}
> +
> +/*
> + * Check if a prealloc extent is shared by other inodes and if any inode has
> + * already written to that extent. This is to avoid emitting invalid warnings
> + * about odd csum items (a inode has an extent entirely marked as prealloc
> + * but another inode shares it and has already written to it).
> + *
> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
> + * at least one other inode has written to it, and < 0 on error.
> + */
> +static int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
> +					 u64 ino,
> +					 u64 disk_bytenr,
> +					 u64 num_bytes)
> +{
> +	struct btrfs_path path;
> +	struct btrfs_key key;
> +	int ret;
> +	struct btrfs_extent_item *ei;
> +	u32 item_size;
> +	unsigned long ptr;
> +	unsigned long end;
> +
> +	key.objectid = disk_bytenr;
> +	key.type = BTRFS_EXTENT_ITEM_KEY;
> +	key.offset = num_bytes;
> +
> +	btrfs_init_path(&path);
> +	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
> +	if (ret > 0) {
> +		fprintf(stderr,
> +			"Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
> +			disk_bytenr, num_bytes);
> +		ret = -ENOENT;
> +	}
> +	if (ret < 0)
> +		goto out;
> +
> +	/* First check all inline refs. */
> +	ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +			    struct btrfs_extent_item);
> +	item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
> +	ptr = (unsigned long)(ei + 1);
> +	end = (unsigned long)ei + item_size;
> +	while (ptr < end) {
> +		struct btrfs_extent_inline_ref *iref;
> +		int type;
> +
> +		iref = (struct btrfs_extent_inline_ref *)ptr;
> +		type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
> +		ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
> +		       type == BTRFS_SHARED_DATA_REF_KEY);
> +
> +		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
> +			struct btrfs_extent_data_ref *dref;
> +
> +			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
> +			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
> +						      dref, path.nodes[0]);
> +			if (ret != 0)
> +				goto out;
> +		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
> +			u64 parent;
> +
> +			parent = btrfs_extent_inline_ref_offset(path.nodes[0],
> +								iref);
> +			ret = check_prealloc_shared_data_ref(fs_info,
> +							     ino,
> +							     parent,
> +							     disk_bytenr);
> +			if (ret != 0)
> +				goto out;
> +		}
> +
> +		ptr += btrfs_extent_inline_ref_size(type);
> +	}
> +
> +	/* Now check if there are any non-inlined refs. */
> +	path.slots[0]++;
> +	while (true) {
> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
> +			if (ret < 0)
> +				goto out;
> +			if (ret > 0) {
> +				ret = 0;
> +				break;
> +			}
> +		}
> +
> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
> +		if (key.objectid != disk_bytenr)
> +			break;
> +
> +		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
> +			struct btrfs_extent_data_ref *dref;
> +
> +			dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +					      struct btrfs_extent_data_ref);
> +			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
> +						      dref, path.nodes[0]);
> +			if (ret != 0)
> +				goto out;
> +		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
> +			ret = check_prealloc_shared_data_ref(fs_info,
> +							     ino,
> +							     key.offset,
> +							     disk_bytenr);
> +			if (ret != 0)
> +				goto out;
> +		}
> +
> +		path.slots[0]++;
> +	}
> +out:
> +	btrfs_release_path(&path);
> +	return ret;
> +}
> +
>  static int process_file_extent(struct btrfs_root *root,
>  				struct extent_buffer *eb,
>  				int slot, struct btrfs_key *key,
> @@ -1515,8 +1773,16 @@ static int process_file_extent(struct btrfs_root *root,
>  			if (found < num_bytes)
>  				rec->some_csum_missing = 1;
>  		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
> -			if (found > 0)
> -				rec->errors |= I_ERR_ODD_CSUM_ITEM;
> +			if (found > 0) {
> +				ret = check_prealloc_extent_written(root->fs_info,
> +								    rec->ino,
> +								    disk_bytenr,
> +								    num_bytes);
> +				if (ret < 0)
> +					return ret;
> +				if (ret == 0)
> +					rec->errors |= I_ERR_ODD_CSUM_ITEM;
> +			}
>  		}
>  	}
>  	return 0;
> 


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 520 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-14  1:32 ` [PATCH 1/2] " Qu Wenruo
@ 2018-03-14  1:58   ` Qu Wenruo
  2018-03-14  1:59   ` Lu Fengqi
  2018-03-14 10:33   ` Filipe Manana
  2 siblings, 0 replies; 16+ messages in thread
From: Qu Wenruo @ 2018-03-14  1:58 UTC (permalink / raw)
  To: fdmanana, linux-btrfs


[-- Attachment #1.1: Type: text/plain, Size: 12237 bytes --]



On 2018年03月14日 09:32, Qu Wenruo wrote:
> 
> 
> On 2018年03月14日 02:47, fdmanana@kernel.org wrote:
>> From: Filipe Manana <fdmanana@suse.com>
>>
>> Under some cases the filesystem checker reports an error when it finds
>> checksum items for an extent that is referenced by an inode as a prealloc
>> extent. Such cases are not an error when the extent is actually shared
>> (was cloned/reflinked) with other inodes and was written through one of
>> those other inodes.
>>
>> Example:
>>
>>   $ mkfs.btrfs -f /dev/sdb
>>   $ mount /dev/sdb /mnt
>>
>>   $ touch /mnt/foo
>>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>>   $ sync
>>
>>   $ xfs_io -c "pwrite -S 0xab -b 0 256K" /mnt/foo
>>   $ touch /mnt/bar
>>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>>   $ xfs_io -c "fsync" /mnt/bar
>>
>>   <power fail>
>>   $ mount /dev/sdb /mnt
>>   $ umount /mnt
>>
>>   $ btrfs check /dev/sdc
>>   Checking filesystem on /dev/sdb
>>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>>   checking extents
>>   checking free space cache
>>   checking fs roots
>>   root 5 inode 257 errors 800, odd csum item
>>   ERROR: errors found in fs roots
>>   found 688128 bytes used, error(s) found
>>   total csum bytes: 256
>>   total tree bytes: 163840
>>   total fs tree bytes: 65536
>>   total extent tree bytes: 16384
>>   btree space waste bytes: 138819
>>   file data blocks allocated: 10747904
>>    referenced 10747904
>>   $ echo $?
>>   1
>>
>> So tech check to not report such cases as errors by checking if the
>> extent is shared with other inodes and if so, consider it an error the
>> existence of checksum items only if all those other inodes are referencing
>> the extent as a prealloc extent.
>> This case can be hit often when running the generic/475 testcase from
>> fstests.
>>
>> A test case will follow in a separate patch.
>>
>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> 
> Looks good overall, but still some small nitpicks.
> 
> One is, the fix is only for original mode, while the fix itself has
> nothing mode dependent, so it's better to put the check into
> check/mode-original.[ch] so lowmem mode can also benefit from it.
  ^^^^^^^^^^^^^^^^^^^
  I mean check/mode-common

Sorry for that.

Thanks,
Qu

> 
>> ---
>>  check/main.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 268 insertions(+), 2 deletions(-)
>>
>> diff --git a/check/main.c b/check/main.c
>> index 392195ca..bb816311 100644
>> --- a/check/main.c
>> +++ b/check/main.c
>> @@ -1424,6 +1424,264 @@ static int process_inode_extref(struct extent_buffer *eb,
>>  
>>  }
>>  
>> +/*
>> + * Check if the inode referenced by the given data reference uses the extent
>> + * at disk_bytenr as a non-prealloc extent.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
> 
> The check itself (along with check_prealloc_shared_data_ref) is good
> enough to detect any regular file extents inside the prealloc extent.
> 
> But when it finds any regular extents, it doesn't check if it's covered
> by csum correctly.
> 
> For example:
> 
> |<-------------- prealloc extent range -------------------------->|
> |<- the *only* regular extent ->|
> |<-------- range covered by csum -------->|
>                                 |<- ODD ->|
> 
> Or
> |<-------------- prealloc extent range -------------------------->|
> |<- the *only* regular extent ->|
> |<- csum ->|
>            |<------ ODD ------->|
> 
> So at least in theory, the best solution is not to just check if there
> is any regular extents inside the large prealloc extent, but also check
> how many csum bytes the regular extents contribute, and then compare it
> to the result of count_csum_range().
> 
> Thanks,
> Qu
> 
>> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
>> +				   u64 ino,
>> +				   u64 disk_bytenr,
>> +				   struct btrfs_extent_data_ref *dref,
>> +				   struct extent_buffer *eb)
>> +{
>> +	u64 rootid = btrfs_extent_data_ref_root(eb, dref);
>> +	u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
>> +	u64 offset = btrfs_extent_data_ref_offset(eb, dref);
>> +	struct btrfs_root *root;
>> +	struct btrfs_key key;
>> +	struct btrfs_path path;
>> +	int ret;
>> +
>> +	if (objectid == ino)
>> +		return 0;
>> +
>> +	btrfs_init_path(&path);
>> +	key.objectid = rootid;
>> +	key.type = BTRFS_ROOT_ITEM_KEY;
>> +	key.offset = (u64)-1;
>> +	root = btrfs_read_fs_root(fs_info, &key);
>> +	if (IS_ERR(root)) {
>> +		ret = PTR_ERR(root);
>> +		goto out;
>> +	}
>> +
>> +	key.objectid = objectid;
>> +	key.type = BTRFS_EXTENT_DATA_KEY;
>> +	key.offset = offset;
>> +	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
>> +	if (ret > 0) {
>> +		fprintf(stderr,
>> +			"Missing file extent item for inode %llu, root %llu, offset %llu",
>> +			objectid, rootid, offset);
>> +		ret = -ENOENT;
>> +	}
>> +	if (ret < 0)
>> +		goto out;
>> +
>> +	while (true) {
>> +		struct btrfs_file_extent_item *fi;
>> +		int extent_type;
>> +
>> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
>> +			if (ret < 0)
>> +				goto out;
>> +			if (ret > 0)
>> +				break;
>> +		}
>> +
>> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +		if (key.objectid != objectid ||
>> +		    key.type != BTRFS_EXTENT_DATA_KEY)
>> +			break;
>> +
>> +		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +				    struct btrfs_file_extent_item);
>> +		extent_type = btrfs_file_extent_type(path.nodes[0], fi);
>> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +			goto next;
>> +
>> +		if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
>> +		    disk_bytenr)
>> +			break;
>> +
>> +		if (extent_type == BTRFS_FILE_EXTENT_REG) {
>> +			ret = 1;
>> +			goto out;
>> +		}
>> +next:
>> +		path.slots[0]++;
>> +	}
>> +	ret = 0;
>> + out:
>> +	btrfs_release_path(&path);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Check if a shared data reference points to a node that has a file extent item
>> + * pointing to the extent at disk_bytenr that is not of type prealloc and is
>> + * associated to an inode with a number different from ino.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
>> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
>> +					  u64 ino,
>> +					  u64 parent,
>> +					  u64 disk_bytenr)
>> +{
>> +	struct extent_buffer *eb;
>> +	u32 nr;
>> +	int i;
>> +	int ret = 0;
>> +
>> +	eb = read_tree_block(fs_info, parent, 0);
>> +	if (!extent_buffer_uptodate(eb)) {
>> +		ret = -EIO;
>> +		goto out;
>> +	}
>> +
>> +	nr = btrfs_header_nritems(eb);
>> +	for (i = 0; i < nr; i++) {
>> +		struct btrfs_key key;
>> +		struct btrfs_file_extent_item *fi;
>> +		int extent_type;
>> +
>> +		btrfs_item_key_to_cpu(eb, &key, i);
>> +		if (key.type != BTRFS_EXTENT_DATA_KEY)
>> +			continue;
>> +		if (key.objectid == ino)
>> +			continue;
>> +
>> +		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
>> +
>> +		extent_type = btrfs_file_extent_type(eb, fi);
>> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +			continue;
>> +
>> +		if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
>> +		    extent_type == BTRFS_FILE_EXTENT_REG) {
>> +			ret = 1;
>> +			break;
>> +		}
>> +	}
>> + out:
>> +	free_extent_buffer(eb);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Check if a prealloc extent is shared by other inodes and if any inode has
>> + * already written to that extent. This is to avoid emitting invalid warnings
>> + * about odd csum items (a inode has an extent entirely marked as prealloc
>> + * but another inode shares it and has already written to it).
>> + *
>> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
>> + * at least one other inode has written to it, and < 0 on error.
>> + */
>> +static int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
>> +					 u64 ino,
>> +					 u64 disk_bytenr,
>> +					 u64 num_bytes)
>> +{
>> +	struct btrfs_path path;
>> +	struct btrfs_key key;
>> +	int ret;
>> +	struct btrfs_extent_item *ei;
>> +	u32 item_size;
>> +	unsigned long ptr;
>> +	unsigned long end;
>> +
>> +	key.objectid = disk_bytenr;
>> +	key.type = BTRFS_EXTENT_ITEM_KEY;
>> +	key.offset = num_bytes;
>> +
>> +	btrfs_init_path(&path);
>> +	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
>> +	if (ret > 0) {
>> +		fprintf(stderr,
>> +			"Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
>> +			disk_bytenr, num_bytes);
>> +		ret = -ENOENT;
>> +	}
>> +	if (ret < 0)
>> +		goto out;
>> +
>> +	/* First check all inline refs. */
>> +	ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +			    struct btrfs_extent_item);
>> +	item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
>> +	ptr = (unsigned long)(ei + 1);
>> +	end = (unsigned long)ei + item_size;
>> +	while (ptr < end) {
>> +		struct btrfs_extent_inline_ref *iref;
>> +		int type;
>> +
>> +		iref = (struct btrfs_extent_inline_ref *)ptr;
>> +		type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
>> +		ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
>> +		       type == BTRFS_SHARED_DATA_REF_KEY);
>> +
>> +		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +			struct btrfs_extent_data_ref *dref;
>> +
>> +			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
>> +			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>> +						      dref, path.nodes[0]);
>> +			if (ret != 0)
>> +				goto out;
>> +		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
>> +			u64 parent;
>> +
>> +			parent = btrfs_extent_inline_ref_offset(path.nodes[0],
>> +								iref);
>> +			ret = check_prealloc_shared_data_ref(fs_info,
>> +							     ino,
>> +							     parent,
>> +							     disk_bytenr);
>> +			if (ret != 0)
>> +				goto out;
>> +		}
>> +
>> +		ptr += btrfs_extent_inline_ref_size(type);
>> +	}
>> +
>> +	/* Now check if there are any non-inlined refs. */
>> +	path.slots[0]++;
>> +	while (true) {
>> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
>> +			if (ret < 0)
>> +				goto out;
>> +			if (ret > 0) {
>> +				ret = 0;
>> +				break;
>> +			}
>> +		}
>> +
>> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +		if (key.objectid != disk_bytenr)
>> +			break;
>> +
>> +		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +			struct btrfs_extent_data_ref *dref;
>> +
>> +			dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +					      struct btrfs_extent_data_ref);
>> +			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>> +						      dref, path.nodes[0]);
>> +			if (ret != 0)
>> +				goto out;
>> +		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
>> +			ret = check_prealloc_shared_data_ref(fs_info,
>> +							     ino,
>> +							     key.offset,
>> +							     disk_bytenr);
>> +			if (ret != 0)
>> +				goto out;
>> +		}
>> +
>> +		path.slots[0]++;
>> +	}
>> +out:
>> +	btrfs_release_path(&path);
>> +	return ret;
>> +}
>> +
>>  static int process_file_extent(struct btrfs_root *root,
>>  				struct extent_buffer *eb,
>>  				int slot, struct btrfs_key *key,
>> @@ -1515,8 +1773,16 @@ static int process_file_extent(struct btrfs_root *root,
>>  			if (found < num_bytes)
>>  				rec->some_csum_missing = 1;
>>  		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
>> -			if (found > 0)
>> -				rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +			if (found > 0) {
>> +				ret = check_prealloc_extent_written(root->fs_info,
>> +								    rec->ino,
>> +								    disk_bytenr,
>> +								    num_bytes);
>> +				if (ret < 0)
>> +					return ret;
>> +				if (ret == 0)
>> +					rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +			}
>>  		}
>>  	}
>>  	return 0;
>>
> 


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 520 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-14  1:32 ` [PATCH 1/2] " Qu Wenruo
  2018-03-14  1:58   ` Qu Wenruo
@ 2018-03-14  1:59   ` Lu Fengqi
  2018-03-14 10:33   ` Filipe Manana
  2 siblings, 0 replies; 16+ messages in thread
From: Lu Fengqi @ 2018-03-14  1:59 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: fdmanana, linux-btrfs

On Wed, Mar 14, 2018 at 09:32:50AM +0800, Qu Wenruo wrote:
>
>
>On 2018年03月14日 02:47, fdmanana@kernel.org wrote:
>> From: Filipe Manana <fdmanana@suse.com>
>> 
>> Under some cases the filesystem checker reports an error when it finds
>> checksum items for an extent that is referenced by an inode as a prealloc
>> extent. Such cases are not an error when the extent is actually shared
>> (was cloned/reflinked) with other inodes and was written through one of
>> those other inodes.
>> 
>> Example:
>> 
>>   $ mkfs.btrfs -f /dev/sdb
>>   $ mount /dev/sdb /mnt
>> 
>>   $ touch /mnt/foo
>>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>>   $ sync
>> 
>>   $ xfs_io -c "pwrite -S 0xab -b 0 256K" /mnt/foo
>>   $ touch /mnt/bar
>>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>>   $ xfs_io -c "fsync" /mnt/bar
>> 
>>   <power fail>
>>   $ mount /dev/sdb /mnt
>>   $ umount /mnt
>> 
>>   $ btrfs check /dev/sdc
>>   Checking filesystem on /dev/sdb
>>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>>   checking extents
>>   checking free space cache
>>   checking fs roots
>>   root 5 inode 257 errors 800, odd csum item
>>   ERROR: errors found in fs roots
>>   found 688128 bytes used, error(s) found
>>   total csum bytes: 256
>>   total tree bytes: 163840
>>   total fs tree bytes: 65536
>>   total extent tree bytes: 16384
>>   btree space waste bytes: 138819
>>   file data blocks allocated: 10747904
>>    referenced 10747904
>>   $ echo $?
>>   1
>> 
>> So tech check to not report such cases as errors by checking if the
>> extent is shared with other inodes and if so, consider it an error the
>> existence of checksum items only if all those other inodes are referencing
>> the extent as a prealloc extent.
>> This case can be hit often when running the generic/475 testcase from
>> fstests.
>> 
>> A test case will follow in a separate patch.
>> 
>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>
>Looks good overall, but still some small nitpicks.
>
>One is, the fix is only for original mode, while the fix itself has
>nothing mode dependent, so it's better to put the check into
>check/mode-original.[ch] so lowmem mode can also benefit from it.

Fix a tyro:
s/mode-original/mode-common

>
>> ---
>>  check/main.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 268 insertions(+), 2 deletions(-)
>> 
>> diff --git a/check/main.c b/check/main.c
>> index 392195ca..bb816311 100644
>> --- a/check/main.c
>> +++ b/check/main.c
>> @@ -1424,6 +1424,264 @@ static int process_inode_extref(struct extent_buffer *eb,
>>  
>>  }
>>  
>> +/*
>> + * Check if the inode referenced by the given data reference uses the extent
>> + * at disk_bytenr as a non-prealloc extent.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
>
>The check itself (along with check_prealloc_shared_data_ref) is good
>enough to detect any regular file extents inside the prealloc extent.
>
>But when it finds any regular extents, it doesn't check if it's covered
>by csum correctly.
>
>For example:
>
>|<-------------- prealloc extent range -------------------------->|
>|<- the *only* regular extent ->|
>|<-------- range covered by csum -------->|
>                                |<- ODD ->|
>
>Or
>|<-------------- prealloc extent range -------------------------->|
>|<- the *only* regular extent ->|
>|<- csum ->|
>           |<------ ODD ------->|
>
>So at least in theory, the best solution is not to just check if there
>is any regular extents inside the large prealloc extent, but also check
>how many csum bytes the regular extents contribute, and then compare it
>to the result of count_csum_range().
>
>Thanks,
>Qu
>
>> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
>> +				   u64 ino,
>> +				   u64 disk_bytenr,
>> +				   struct btrfs_extent_data_ref *dref,
>> +				   struct extent_buffer *eb)
>> +{
>> +	u64 rootid = btrfs_extent_data_ref_root(eb, dref);
>> +	u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
>> +	u64 offset = btrfs_extent_data_ref_offset(eb, dref);
>> +	struct btrfs_root *root;
>> +	struct btrfs_key key;
>> +	struct btrfs_path path;
>> +	int ret;
>> +
>> +	if (objectid == ino)
>> +		return 0;
>> +
>> +	btrfs_init_path(&path);
>> +	key.objectid = rootid;
>> +	key.type = BTRFS_ROOT_ITEM_KEY;
>> +	key.offset = (u64)-1;
>> +	root = btrfs_read_fs_root(fs_info, &key);
>> +	if (IS_ERR(root)) {
>> +		ret = PTR_ERR(root);
>> +		goto out;
>> +	}
>> +
>> +	key.objectid = objectid;
>> +	key.type = BTRFS_EXTENT_DATA_KEY;
>> +	key.offset = offset;
>> +	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
>> +	if (ret > 0) {
>> +		fprintf(stderr,
>> +			"Missing file extent item for inode %llu, root %llu, offset %llu",
>> +			objectid, rootid, offset);
>> +		ret = -ENOENT;
>> +	}
>> +	if (ret < 0)
>> +		goto out;
>> +
>> +	while (true) {
>> +		struct btrfs_file_extent_item *fi;
>> +		int extent_type;
>> +
>> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
>> +			if (ret < 0)
>> +				goto out;
>> +			if (ret > 0)
>> +				break;
>> +		}
>> +
>> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +		if (key.objectid != objectid ||
>> +		    key.type != BTRFS_EXTENT_DATA_KEY)
>> +			break;
>> +
>> +		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +				    struct btrfs_file_extent_item);
>> +		extent_type = btrfs_file_extent_type(path.nodes[0], fi);
>> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +			goto next;
>> +
>> +		if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
>> +		    disk_bytenr)
>> +			break;
>> +
>> +		if (extent_type == BTRFS_FILE_EXTENT_REG) {
>> +			ret = 1;
>> +			goto out;
>> +		}
>> +next:
>> +		path.slots[0]++;
>> +	}
>> +	ret = 0;
>> + out:
>> +	btrfs_release_path(&path);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Check if a shared data reference points to a node that has a file extent item
>> + * pointing to the extent at disk_bytenr that is not of type prealloc and is
>> + * associated to an inode with a number different from ino.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
>> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
>> +					  u64 ino,
>> +					  u64 parent,
>> +					  u64 disk_bytenr)
>> +{
>> +	struct extent_buffer *eb;
>> +	u32 nr;
>> +	int i;
>> +	int ret = 0;
>> +
>> +	eb = read_tree_block(fs_info, parent, 0);
>> +	if (!extent_buffer_uptodate(eb)) {
>> +		ret = -EIO;
>> +		goto out;
>> +	}
>> +
>> +	nr = btrfs_header_nritems(eb);
>> +	for (i = 0; i < nr; i++) {
>> +		struct btrfs_key key;
>> +		struct btrfs_file_extent_item *fi;
>> +		int extent_type;
>> +
>> +		btrfs_item_key_to_cpu(eb, &key, i);
>> +		if (key.type != BTRFS_EXTENT_DATA_KEY)
>> +			continue;
>> +		if (key.objectid == ino)
>> +			continue;
>> +
>> +		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
>> +
>> +		extent_type = btrfs_file_extent_type(eb, fi);
>> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +			continue;
>> +
>> +		if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
>> +		    extent_type == BTRFS_FILE_EXTENT_REG) {
>> +			ret = 1;
>> +			break;
>> +		}
>> +	}
>> + out:
>> +	free_extent_buffer(eb);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Check if a prealloc extent is shared by other inodes and if any inode has
>> + * already written to that extent. This is to avoid emitting invalid warnings
>> + * about odd csum items (a inode has an extent entirely marked as prealloc
>> + * but another inode shares it and has already written to it).
>> + *
>> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
>> + * at least one other inode has written to it, and < 0 on error.
>> + */
>> +static int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
>> +					 u64 ino,
>> +					 u64 disk_bytenr,
>> +					 u64 num_bytes)
>> +{
>> +	struct btrfs_path path;
>> +	struct btrfs_key key;
>> +	int ret;
>> +	struct btrfs_extent_item *ei;
>> +	u32 item_size;
>> +	unsigned long ptr;
>> +	unsigned long end;
>> +
>> +	key.objectid = disk_bytenr;
>> +	key.type = BTRFS_EXTENT_ITEM_KEY;
>> +	key.offset = num_bytes;
>> +
>> +	btrfs_init_path(&path);
>> +	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
>> +	if (ret > 0) {
>> +		fprintf(stderr,
>> +			"Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
>> +			disk_bytenr, num_bytes);
>> +		ret = -ENOENT;
>> +	}
>> +	if (ret < 0)
>> +		goto out;
>> +
>> +	/* First check all inline refs. */
>> +	ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +			    struct btrfs_extent_item);
>> +	item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
>> +	ptr = (unsigned long)(ei + 1);
>> +	end = (unsigned long)ei + item_size;
>> +	while (ptr < end) {
>> +		struct btrfs_extent_inline_ref *iref;
>> +		int type;
>> +
>> +		iref = (struct btrfs_extent_inline_ref *)ptr;
>> +		type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
>> +		ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
>> +		       type == BTRFS_SHARED_DATA_REF_KEY);
>> +
>> +		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +			struct btrfs_extent_data_ref *dref;
>> +
>> +			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
>> +			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>> +						      dref, path.nodes[0]);
>> +			if (ret != 0)
>> +				goto out;
>> +		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
>> +			u64 parent;
>> +
>> +			parent = btrfs_extent_inline_ref_offset(path.nodes[0],
>> +								iref);
>> +			ret = check_prealloc_shared_data_ref(fs_info,
>> +							     ino,
>> +							     parent,
>> +							     disk_bytenr);
>> +			if (ret != 0)
>> +				goto out;
>> +		}
>> +
>> +		ptr += btrfs_extent_inline_ref_size(type);
>> +	}
>> +
>> +	/* Now check if there are any non-inlined refs. */
>> +	path.slots[0]++;
>> +	while (true) {
>> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
>> +			if (ret < 0)
>> +				goto out;
>> +			if (ret > 0) {
>> +				ret = 0;
>> +				break;
>> +			}
>> +		}
>> +
>> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +		if (key.objectid != disk_bytenr)
>> +			break;
>> +
>> +		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +			struct btrfs_extent_data_ref *dref;
>> +
>> +			dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +					      struct btrfs_extent_data_ref);
>> +			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>> +						      dref, path.nodes[0]);
>> +			if (ret != 0)
>> +				goto out;
>> +		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
>> +			ret = check_prealloc_shared_data_ref(fs_info,
>> +							     ino,
>> +							     key.offset,
>> +							     disk_bytenr);
>> +			if (ret != 0)
>> +				goto out;
>> +		}
>> +
>> +		path.slots[0]++;
>> +	}
>> +out:
>> +	btrfs_release_path(&path);
>> +	return ret;
>> +}
>> +
>>  static int process_file_extent(struct btrfs_root *root,
>>  				struct extent_buffer *eb,
>>  				int slot, struct btrfs_key *key,
>> @@ -1515,8 +1773,16 @@ static int process_file_extent(struct btrfs_root *root,
>>  			if (found < num_bytes)
>>  				rec->some_csum_missing = 1;
>>  		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
>> -			if (found > 0)
>> -				rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +			if (found > 0) {
>> +				ret = check_prealloc_extent_written(root->fs_info,
>> +								    rec->ino,
>> +								    disk_bytenr,
>> +								    num_bytes);
>> +				if (ret < 0)
>> +					return ret;
>> +				if (ret == 0)
>> +					rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +			}
>>  		}
>>  	}
>>  	return 0;
>> 
>




-- 
Thanks,
Lu



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-13 18:47 [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents fdmanana
  2018-03-13 22:48 ` [PATCH 1/2 v2] " fdmanana
  2018-03-14  1:32 ` [PATCH 1/2] " Qu Wenruo
@ 2018-03-14  8:19 ` Nikolay Borisov
  2018-03-14 10:34   ` Filipe Manana
  2018-03-14  9:19 ` Nikolay Borisov
  2018-03-14 20:11 ` [PATCH 1/2 v3] " fdmanana
  4 siblings, 1 reply; 16+ messages in thread
From: Nikolay Borisov @ 2018-03-14  8:19 UTC (permalink / raw)
  To: fdmanana, linux-btrfs



On 13.03.2018 20:47, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> Under some cases the filesystem checker reports an error when it finds
> checksum items for an extent that is referenced by an inode as a prealloc
> extent. Such cases are not an error when the extent is actually shared
> (was cloned/reflinked) with other inodes and was written through one of
> those other inodes.
> 
> Example:
> 
>   $ mkfs.btrfs -f /dev/sdb
>   $ mount /dev/sdb /mnt
> 
>   $ touch /mnt/foo
>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>   $ sync
> 
>   $ xfs_io -c "pwrite -S 0xab -b 0 256K" /mnt/foo
>   $ touch /mnt/bar
>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>   $ xfs_io -c "fsync" /mnt/bar
> 
>   <power fail>
>   $ mount /dev/sdb /mnt
>   $ umount /mnt
> 
>   $ btrfs check /dev/sdc
>   Checking filesystem on /dev/sdb
>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>   checking extents
>   checking free space cache
>   checking fs roots
>   root 5 inode 257 errors 800, odd csum item
>   ERROR: errors found in fs roots
>   found 688128 bytes used, error(s) found
>   total csum bytes: 256
>   total tree bytes: 163840
>   total fs tree bytes: 65536
>   total extent tree bytes: 16384
>   btree space waste bytes: 138819
>   file data blocks allocated: 10747904
>    referenced 10747904
>   $ echo $?
>   1
> 
> So tech check to not report such cases as errors by checking if the
> extent is shared with other inodes and if so, consider it an error the
> existence of checksum items only if all those other inodes are referencing
> the extent as a prealloc extent.
> This case can be hit often when running the generic/475 testcase from
> fstests.
> 
> A test case will follow in a separate patch.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
>  check/main.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 268 insertions(+), 2 deletions(-)
> 
> diff --git a/check/main.c b/check/main.c
> index 392195ca..bb816311 100644
> --- a/check/main.c
> +++ b/check/main.c
> @@ -1424,6 +1424,264 @@ static int process_inode_extref(struct extent_buffer *eb,
>  
>  }
>  
> +/*
> + * Check if the inode referenced by the given data reference uses the extent
> + * at disk_bytenr as a non-prealloc extent.
> + *
> + * Returns 1 if true, 0 if false and < 0 on error.
> + */
> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
> +				   u64 ino,
> +				   u64 disk_bytenr,
> +				   struct btrfs_extent_data_ref *dref,
> +				   struct extent_buffer *eb)
> +{
> +	u64 rootid = btrfs_extent_data_ref_root(eb, dref);
> +	u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);

nit: Shouldn't this ino be equal to the ino passed to the function? If
so objectid can be removed and when setting up the key for the second
search you just assigne ino to its objectid

> +	u64 offset = btrfs_extent_data_ref_offset(eb, dref);
> +	struct btrfs_root *root;
> +	struct btrfs_key key;
> +	struct btrfs_path path;
> +	int ret;
> +
> +	if (objectid == ino)
> +		return 0;
> +
> +	btrfs_init_path(&path);
> +	key.objectid = rootid;
> +	key.type = BTRFS_ROOT_ITEM_KEY;
> +	key.offset = (u64)-1;
> +	root = btrfs_read_fs_root(fs_info, &key);
> +	if (IS_ERR(root)) {
> +		ret = PTR_ERR(root);
> +		goto out;
> +	}
> +
> +	key.objectid = objectid;
> +	key.type = BTRFS_EXTENT_DATA_KEY;
> +	key.offset = offset;
> +	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
> +	if (ret > 0) {
> +		fprintf(stderr,
> +			"Missing file extent item for inode %llu, root %llu, offset %llu",
> +			objectid, rootid, offset);
> +		ret = -ENOENT;
> +	}
> +	if (ret < 0)
> +		goto out;
> +
> +	while (true) {
> +		struct btrfs_file_extent_item *fi;
> +		int extent_type;
> +
> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
> +			if (ret < 0)
> +				goto out;
> +			if (ret > 0)
> +				break;
> +		}
> +
> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
> +		if (key.objectid != objectid ||
> +		    key.type != BTRFS_EXTENT_DATA_KEY)
> +			break;
> +
> +		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +				    struct btrfs_file_extent_item);
> +		extent_type = btrfs_file_extent_type(path.nodes[0], fi);
> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
> +			goto next;
> +
> +		if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
> +		    disk_bytenr)
> +			break;
> +
> +		if (extent_type == BTRFS_FILE_EXTENT_REG) {
> +			ret = 1;
> +			goto out;
> +		}
> +next:
> +		path.slots[0]++;
> +	}
> +	ret = 0;
> + out:
> +	btrfs_release_path(&path);
> +	return ret;
> +}
> +
> +/*
> + * Check if a shared data reference points to a node that has a file extent item
> + * pointing to the extent at disk_bytenr that is not of type prealloc and is
> + * associated to an inode with a number different from ino.
> + *
> + * Returns 1 if true, 0 if false and < 0 on error.
> + */
> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
> +					  u64 ino,
> +					  u64 parent,
> +					  u64 disk_bytenr)
> +{
> +	struct extent_buffer *eb;
> +	u32 nr;
> +	int i;
> +	int ret = 0;
> +
> +	eb = read_tree_block(fs_info, parent, 0);
> +	if (!extent_buffer_uptodate(eb)) {
> +		ret = -EIO;
> +		goto out;
> +	}
> +
> +	nr = btrfs_header_nritems(eb);
> +	for (i = 0; i < nr; i++) {
> +		struct btrfs_key key;
> +		struct btrfs_file_extent_item *fi;
> +		int extent_type;
> +
> +		btrfs_item_key_to_cpu(eb, &key, i);
> +		if (key.type != BTRFS_EXTENT_DATA_KEY)
> +			continue;
> +		if (key.objectid == ino)
> +			continue;
> +
> +		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
> +
> +		extent_type = btrfs_file_extent_type(eb, fi);
> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
> +			continue;
> +
> +		if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
> +		    extent_type == BTRFS_FILE_EXTENT_REG) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> + out:
> +	free_extent_buffer(eb);
> +	return ret;
> +}
> +
> +/*
> + * Check if a prealloc extent is shared by other inodes and if any inode has
> + * already written to that extent. This is to avoid emitting invalid warnings
> + * about odd csum items (a inode has an extent entirely marked as prealloc
> + * but another inode shares it and has already written to it).
> + *
> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
> + * at least one other inode has written to it, and < 0 on error.
> + */
> +static int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
> +					 u64 ino,
> +					 u64 disk_bytenr,
> +					 u64 num_bytes)
> +{
> +	struct btrfs_path path;
> +	struct btrfs_key key;
> +	int ret;
> +	struct btrfs_extent_item *ei;
> +	u32 item_size;
> +	unsigned long ptr;
> +	unsigned long end;
> +
> +	key.objectid = disk_bytenr;
> +	key.type = BTRFS_EXTENT_ITEM_KEY;
> +	key.offset = num_bytes;
> +
> +	btrfs_init_path(&path);
> +	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
> +	if (ret > 0) {
> +		fprintf(stderr,
> +			"Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
> +			disk_bytenr, num_bytes);
> +		ret = -ENOENT;
> +	}
> +	if (ret < 0)
> +		goto out;
> +
> +	/* First check all inline refs. */
> +	ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +			    struct btrfs_extent_item);
> +	item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
> +	ptr = (unsigned long)(ei + 1);
> +	end = (unsigned long)ei + item_size;
> +	while (ptr < end) {
> +		struct btrfs_extent_inline_ref *iref;
> +		int type;
> +
> +		iref = (struct btrfs_extent_inline_ref *)ptr;
> +		type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
> +		ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
> +		       type == BTRFS_SHARED_DATA_REF_KEY);
> +
> +		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
> +			struct btrfs_extent_data_ref *dref;
> +
> +			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
> +			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
> +						      dref, path.nodes[0]);
> +			if (ret != 0)
> +				goto out;
> +		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
> +			u64 parent;
> +
> +			parent = btrfs_extent_inline_ref_offset(path.nodes[0],
> +								iref);
> +			ret = check_prealloc_shared_data_ref(fs_info,
> +							     ino,
> +							     parent,
> +							     disk_bytenr);
> +			if (ret != 0)
> +				goto out;
> +		}
> +
> +		ptr += btrfs_extent_inline_ref_size(type);
> +	}
> +
> +	/* Now check if there are any non-inlined refs. */
> +	path.slots[0]++;
> +	while (true) {
> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
> +			if (ret < 0)
> +				goto out;
> +			if (ret > 0) {
> +				ret = 0;
> +				break;
> +			}
> +		}
> +
> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
> +		if (key.objectid != disk_bytenr)
> +			break;
> +
> +		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
> +			struct btrfs_extent_data_ref *dref;
> +
> +			dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +					      struct btrfs_extent_data_ref);
> +			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
> +						      dref, path.nodes[0]);
> +			if (ret != 0)
> +				goto out;
> +		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
> +			ret = check_prealloc_shared_data_ref(fs_info,
> +							     ino,
> +							     key.offset,
> +							     disk_bytenr);
> +			if (ret != 0)
> +				goto out;
> +		}
> +
> +		path.slots[0]++;
> +	}
> +out:
> +	btrfs_release_path(&path);
> +	return ret;
> +}
> +
>  static int process_file_extent(struct btrfs_root *root,
>  				struct extent_buffer *eb,
>  				int slot, struct btrfs_key *key,
> @@ -1515,8 +1773,16 @@ static int process_file_extent(struct btrfs_root *root,
>  			if (found < num_bytes)
>  				rec->some_csum_missing = 1;
>  		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
> -			if (found > 0)
> -				rec->errors |= I_ERR_ODD_CSUM_ITEM;
> +			if (found > 0) {
> +				ret = check_prealloc_extent_written(root->fs_info,
> +								    rec->ino,
> +								    disk_bytenr,
> +								    num_bytes);
> +				if (ret < 0)
> +					return ret;
> +				if (ret == 0)
> +					rec->errors |= I_ERR_ODD_CSUM_ITEM;
> +			}
>  		}
>  	}
>  	return 0;
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-13 18:47 [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents fdmanana
                   ` (2 preceding siblings ...)
  2018-03-14  8:19 ` Nikolay Borisov
@ 2018-03-14  9:19 ` Nikolay Borisov
  2018-03-14 10:35   ` Filipe Manana
  2018-03-14 20:11 ` [PATCH 1/2 v3] " fdmanana
  4 siblings, 1 reply; 16+ messages in thread
From: Nikolay Borisov @ 2018-03-14  9:19 UTC (permalink / raw)
  To: fdmanana, linux-btrfs



On 13.03.2018 20:47, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> Under some cases the filesystem checker reports an error when it finds
> checksum items for an extent that is referenced by an inode as a prealloc
> extent. Such cases are not an error when the extent is actually shared
> (was cloned/reflinked) with other inodes and was written through one of
> those other inodes.
> 
> Example:
> 
>   $ mkfs.btrfs -f /dev/sdb
>   $ mount /dev/sdb /mnt
> 
>   $ touch /mnt/foo
>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>   $ sync
> 
>   $ xfs_io -c "pwrite -S 0xab -b 0 256K" /mnt/foo
                                  ^^
nit: You are actually missing the argument for the -b switch, how big
should the blocks be ? Same thing applies to the commit message of your test

>   $ touch /mnt/bar
>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>   $ xfs_io -c "fsync" /mnt/bar
> 
>   <power fail>
>   $ mount /dev/sdb /mnt
>   $ umount /mnt
> 
>   $ btrfs check /dev/sdc
>   Checking filesystem on /dev/sdb
>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>   checking extents
>   checking free space cache
>   checking fs roots
>   root 5 inode 257 errors 800, odd csum item
>   ERROR: errors found in fs roots
>   found 688128 bytes used, error(s) found
>   total csum bytes: 256
>   total tree bytes: 163840
>   total fs tree bytes: 65536
>   total extent tree bytes: 16384
>   btree space waste bytes: 138819
>   file data blocks allocated: 10747904
>    referenced 10747904
>   $ echo $?
>   1
> 
> So tech check to not report such cases as errors by checking if the
> extent is shared with other inodes and if so, consider it an error the
> existence of checksum items only if all those other inodes are referencing
> the extent as a prealloc extent.
> This case can be hit often when running the generic/475 testcase from
> fstests.
> 
> A test case will follow in a separate patch.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
>  check/main.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 268 insertions(+), 2 deletions(-)
> 
> diff --git a/check/main.c b/check/main.c
> index 392195ca..bb816311 100644
> --- a/check/main.c
> +++ b/check/main.c
> @@ -1424,6 +1424,264 @@ static int process_inode_extref(struct extent_buffer *eb,
>  
>  }
>  
> +/*
> + * Check if the inode referenced by the given data reference uses the extent
> + * at disk_bytenr as a non-prealloc extent.
> + *
> + * Returns 1 if true, 0 if false and < 0 on error.
> + */
> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
> +				   u64 ino,
> +				   u64 disk_bytenr,
> +				   struct btrfs_extent_data_ref *dref,
> +				   struct extent_buffer *eb)
> +{
> +	u64 rootid = btrfs_extent_data_ref_root(eb, dref);
> +	u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
> +	u64 offset = btrfs_extent_data_ref_offset(eb, dref);
> +	struct btrfs_root *root;
> +	struct btrfs_key key;
> +	struct btrfs_path path;
> +	int ret;
> +
> +	if (objectid == ino)
> +		return 0;
> +
> +	btrfs_init_path(&path);
> +	key.objectid = rootid;
> +	key.type = BTRFS_ROOT_ITEM_KEY;
> +	key.offset = (u64)-1;
> +	root = btrfs_read_fs_root(fs_info, &key);
> +	if (IS_ERR(root)) {
> +		ret = PTR_ERR(root);
> +		goto out;
> +	}
> +
> +	key.objectid = objectid;
> +	key.type = BTRFS_EXTENT_DATA_KEY;
> +	key.offset = offset;
> +	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
> +	if (ret > 0) {
> +		fprintf(stderr,
> +			"Missing file extent item for inode %llu, root %llu, offset %llu",
> +			objectid, rootid, offset);
> +		ret = -ENOENT;
> +	}
> +	if (ret < 0)
> +		goto out;
> +
> +	while (true) {
> +		struct btrfs_file_extent_item *fi;
> +		int extent_type;
> +
> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
> +			if (ret < 0)
> +				goto out;
> +			if (ret > 0)
> +				break;
> +		}
> +
> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
> +		if (key.objectid != objectid ||
> +		    key.type != BTRFS_EXTENT_DATA_KEY)
> +			break;
> +
> +		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +				    struct btrfs_file_extent_item);
> +		extent_type = btrfs_file_extent_type(path.nodes[0], fi);
> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
> +			goto next;
> +
> +		if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
> +		    disk_bytenr)
> +			break;
> +
> +		if (extent_type == BTRFS_FILE_EXTENT_REG) {
> +			ret = 1;
> +			goto out;
> +		}
> +next:
> +		path.slots[0]++;
> +	}
> +	ret = 0;
> + out:
> +	btrfs_release_path(&path);
> +	return ret;
> +}
> +
> +/*
> + * Check if a shared data reference points to a node that has a file extent item
> + * pointing to the extent at disk_bytenr that is not of type prealloc and is
> + * associated to an inode with a number different from ino.
> + *
> + * Returns 1 if true, 0 if false and < 0 on error.
> + */
> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
> +					  u64 ino,
> +					  u64 parent,
> +					  u64 disk_bytenr)
> +{
> +	struct extent_buffer *eb;
> +	u32 nr;
> +	int i;
> +	int ret = 0;
> +
> +	eb = read_tree_block(fs_info, parent, 0);
> +	if (!extent_buffer_uptodate(eb)) {
> +		ret = -EIO;
> +		goto out;
> +	}
> +
> +	nr = btrfs_header_nritems(eb);
> +	for (i = 0; i < nr; i++) {
> +		struct btrfs_key key;
> +		struct btrfs_file_extent_item *fi;
> +		int extent_type;
> +
> +		btrfs_item_key_to_cpu(eb, &key, i);
> +		if (key.type != BTRFS_EXTENT_DATA_KEY)
> +			continue;
> +		if (key.objectid == ino)
> +			continue;
> +
> +		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
> +
> +		extent_type = btrfs_file_extent_type(eb, fi);
> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
> +			continue;
> +
> +		if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
> +		    extent_type == BTRFS_FILE_EXTENT_REG) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> + out:
> +	free_extent_buffer(eb);
> +	return ret;
> +}
> +
> +/*
> + * Check if a prealloc extent is shared by other inodes and if any inode has
> + * already written to that extent. This is to avoid emitting invalid warnings
> + * about odd csum items (a inode has an extent entirely marked as prealloc
> + * but another inode shares it and has already written to it).
> + *
> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
> + * at least one other inode has written to it, and < 0 on error.
> + */
> +static int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
> +					 u64 ino,
> +					 u64 disk_bytenr,
> +					 u64 num_bytes)
> +{
> +	struct btrfs_path path;
> +	struct btrfs_key key;
> +	int ret;
> +	struct btrfs_extent_item *ei;
> +	u32 item_size;
> +	unsigned long ptr;
> +	unsigned long end;
> +
> +	key.objectid = disk_bytenr;
> +	key.type = BTRFS_EXTENT_ITEM_KEY;
> +	key.offset = num_bytes;
> +
> +	btrfs_init_path(&path);
> +	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
> +	if (ret > 0) {
> +		fprintf(stderr,
> +			"Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
> +			disk_bytenr, num_bytes);
> +		ret = -ENOENT;
> +	}
> +	if (ret < 0)
> +		goto out;
> +
> +	/* First check all inline refs. */
> +	ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +			    struct btrfs_extent_item);
> +	item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
> +	ptr = (unsigned long)(ei + 1);
> +	end = (unsigned long)ei + item_size;
> +	while (ptr < end) {
> +		struct btrfs_extent_inline_ref *iref;
> +		int type;
> +
> +		iref = (struct btrfs_extent_inline_ref *)ptr;
> +		type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
> +		ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
> +		       type == BTRFS_SHARED_DATA_REF_KEY);
> +
> +		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
> +			struct btrfs_extent_data_ref *dref;
> +
> +			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
> +			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
> +						      dref, path.nodes[0]);
> +			if (ret != 0)
> +				goto out;
> +		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
> +			u64 parent;
> +
> +			parent = btrfs_extent_inline_ref_offset(path.nodes[0],
> +								iref);
> +			ret = check_prealloc_shared_data_ref(fs_info,
> +							     ino,
> +							     parent,
> +							     disk_bytenr);
> +			if (ret != 0)
> +				goto out;
> +		}
> +
> +		ptr += btrfs_extent_inline_ref_size(type);
> +	}
> +
> +	/* Now check if there are any non-inlined refs. */
> +	path.slots[0]++;
> +	while (true) {
> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
> +			if (ret < 0)
> +				goto out;
> +			if (ret > 0) {
> +				ret = 0;
> +				break;
> +			}
> +		}
> +
> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
> +		if (key.objectid != disk_bytenr)
> +			break;
> +
> +		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
> +			struct btrfs_extent_data_ref *dref;
> +
> +			dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +					      struct btrfs_extent_data_ref);
> +			ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
> +						      dref, path.nodes[0]);
> +			if (ret != 0)
> +				goto out;
> +		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
> +			ret = check_prealloc_shared_data_ref(fs_info,
> +							     ino,
> +							     key.offset,
> +							     disk_bytenr);
> +			if (ret != 0)
> +				goto out;
> +		}
> +
> +		path.slots[0]++;
> +	}
> +out:
> +	btrfs_release_path(&path);
> +	return ret;
> +}
> +
>  static int process_file_extent(struct btrfs_root *root,
>  				struct extent_buffer *eb,
>  				int slot, struct btrfs_key *key,
> @@ -1515,8 +1773,16 @@ static int process_file_extent(struct btrfs_root *root,
>  			if (found < num_bytes)
>  				rec->some_csum_missing = 1;
>  		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
> -			if (found > 0)
> -				rec->errors |= I_ERR_ODD_CSUM_ITEM;
> +			if (found > 0) {
> +				ret = check_prealloc_extent_written(root->fs_info,
> +								    rec->ino,
> +								    disk_bytenr,
> +								    num_bytes);
> +				if (ret < 0)
> +					return ret;
> +				if (ret == 0)
> +					rec->errors |= I_ERR_ODD_CSUM_ITEM;
> +			}
>  		}
>  	}
>  	return 0;
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-14  1:32 ` [PATCH 1/2] " Qu Wenruo
  2018-03-14  1:58   ` Qu Wenruo
  2018-03-14  1:59   ` Lu Fengqi
@ 2018-03-14 10:33   ` Filipe Manana
  2018-03-14 10:54     ` Qu Wenruo
  2 siblings, 1 reply; 16+ messages in thread
From: Filipe Manana @ 2018-03-14 10:33 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Wed, Mar 14, 2018 at 1:32 AM, Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>
>
> On 2018年03月14日 02:47, fdmanana@kernel.org wrote:
>> From: Filipe Manana <fdmanana@suse.com>
>>
>> Under some cases the filesystem checker reports an error when it finds
>> checksum items for an extent that is referenced by an inode as a prealloc
>> extent. Such cases are not an error when the extent is actually shared
>> (was cloned/reflinked) with other inodes and was written through one of
>> those other inodes.
>>
>> Example:
>>
>>   $ mkfs.btrfs -f /dev/sdb
>>   $ mount /dev/sdb /mnt
>>
>>   $ touch /mnt/foo
>>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>>   $ sync
>>
>>   $ xfs_io -c "pwrite -S 0xab -b 0 256K" /mnt/foo
>>   $ touch /mnt/bar
>>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>>   $ xfs_io -c "fsync" /mnt/bar
>>
>>   <power fail>
>>   $ mount /dev/sdb /mnt
>>   $ umount /mnt
>>
>>   $ btrfs check /dev/sdc
>>   Checking filesystem on /dev/sdb
>>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>>   checking extents
>>   checking free space cache
>>   checking fs roots
>>   root 5 inode 257 errors 800, odd csum item
>>   ERROR: errors found in fs roots
>>   found 688128 bytes used, error(s) found
>>   total csum bytes: 256
>>   total tree bytes: 163840
>>   total fs tree bytes: 65536
>>   total extent tree bytes: 16384
>>   btree space waste bytes: 138819
>>   file data blocks allocated: 10747904
>>    referenced 10747904
>>   $ echo $?
>>   1
>>
>> So tech check to not report such cases as errors by checking if the
>> extent is shared with other inodes and if so, consider it an error the
>> existence of checksum items only if all those other inodes are referencing
>> the extent as a prealloc extent.
>> This case can be hit often when running the generic/475 testcase from
>> fstests.
>>
>> A test case will follow in a separate patch.
>>
>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>
> Looks good overall, but still some small nitpicks.
>
> One is, the fix is only for original mode, while the fix itself has
> nothing mode dependent, so it's better to put the check into
> check/mode-original.[ch] so lowmem mode can also benefit from it.

Ok, I started doing development on a release branch where the split
doesn't exist, them moved to the development branch and wrongly
assumed that shared code would be in main.c.
>
>> ---
>>  check/main.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 268 insertions(+), 2 deletions(-)
>>
>> diff --git a/check/main.c b/check/main.c
>> index 392195ca..bb816311 100644
>> --- a/check/main.c
>> +++ b/check/main.c
>> @@ -1424,6 +1424,264 @@ static int process_inode_extref(struct extent_buffer *eb,
>>
>>  }
>>
>> +/*
>> + * Check if the inode referenced by the given data reference uses the extent
>> + * at disk_bytenr as a non-prealloc extent.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
>
> The check itself (along with check_prealloc_shared_data_ref) is good
> enough to detect any regular file extents inside the prealloc extent.
>
> But when it finds any regular extents, it doesn't check if it's covered
> by csum correctly.
>
> For example:
>
> |<-------------- prealloc extent range -------------------------->|
> |<- the *only* regular extent ->|
> |<-------- range covered by csum -------->|
>                                 |<- ODD ->|
>
> Or
> |<-------------- prealloc extent range -------------------------->|
> |<- the *only* regular extent ->|
> |<- csum ->|
>            |<------ ODD ------->|
>
> So at least in theory, the best solution is not to just check if there
> is any regular extents inside the large prealloc extent, but also check
> how many csum bytes the regular extents contribute, and then compare it
> to the result of count_csum_range().

Yes, I considered that initially but then moved to this simpler, but
less accurate solution.
So the problem is more complex then you think, as a prealloc extent
can be partially written and then reflinked multiple times and
fsync'ed.
For example:

create inode A with prealloc extent [0, 256k[
sync
write into prealloc extent, range [0, 128k[ through inode A
reflink prealloc extent into inode B
reflink prealloc extent into inode C
reflink prealloc extent into inode D
fsync inodes B, C and D
power fail
remount to replay log

Now using a simple counter to count how many bytes the are covered by
chekcsum items we would have, we would get 384K (128K * 3), more then
the size of the extent.
So we would have to track ranges and then ranges that partially
overlap (due to non-zero offsets in the extent items).

And this could go even more complex. It's doable, but I wanted to keep
things simple for now.

thanks

>
> Thanks,
> Qu
>
>> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
>> +                                u64 ino,
>> +                                u64 disk_bytenr,
>> +                                struct btrfs_extent_data_ref *dref,
>> +                                struct extent_buffer *eb)
>> +{
>> +     u64 rootid = btrfs_extent_data_ref_root(eb, dref);
>> +     u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
>> +     u64 offset = btrfs_extent_data_ref_offset(eb, dref);
>> +     struct btrfs_root *root;
>> +     struct btrfs_key key;
>> +     struct btrfs_path path;
>> +     int ret;
>> +
>> +     if (objectid == ino)
>> +             return 0;
>> +
>> +     btrfs_init_path(&path);
>> +     key.objectid = rootid;
>> +     key.type = BTRFS_ROOT_ITEM_KEY;
>> +     key.offset = (u64)-1;
>> +     root = btrfs_read_fs_root(fs_info, &key);
>> +     if (IS_ERR(root)) {
>> +             ret = PTR_ERR(root);
>> +             goto out;
>> +     }
>> +
>> +     key.objectid = objectid;
>> +     key.type = BTRFS_EXTENT_DATA_KEY;
>> +     key.offset = offset;
>> +     ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
>> +     if (ret > 0) {
>> +             fprintf(stderr,
>> +                     "Missing file extent item for inode %llu, root %llu, offset %llu",
>> +                     objectid, rootid, offset);
>> +             ret = -ENOENT;
>> +     }
>> +     if (ret < 0)
>> +             goto out;
>> +
>> +     while (true) {
>> +             struct btrfs_file_extent_item *fi;
>> +             int extent_type;
>> +
>> +             if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +                     ret = btrfs_next_leaf(fs_info->extent_root, &path);
>> +                     if (ret < 0)
>> +                             goto out;
>> +                     if (ret > 0)
>> +                             break;
>> +             }
>> +
>> +             btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +             if (key.objectid != objectid ||
>> +                 key.type != BTRFS_EXTENT_DATA_KEY)
>> +                     break;
>> +
>> +             fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                                 struct btrfs_file_extent_item);
>> +             extent_type = btrfs_file_extent_type(path.nodes[0], fi);
>> +             if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +                 extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +                     goto next;
>> +
>> +             if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
>> +                 disk_bytenr)
>> +                     break;
>> +
>> +             if (extent_type == BTRFS_FILE_EXTENT_REG) {
>> +                     ret = 1;
>> +                     goto out;
>> +             }
>> +next:
>> +             path.slots[0]++;
>> +     }
>> +     ret = 0;
>> + out:
>> +     btrfs_release_path(&path);
>> +     return ret;
>> +}
>> +
>> +/*
>> + * Check if a shared data reference points to a node that has a file extent item
>> + * pointing to the extent at disk_bytenr that is not of type prealloc and is
>> + * associated to an inode with a number different from ino.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
>> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
>> +                                       u64 ino,
>> +                                       u64 parent,
>> +                                       u64 disk_bytenr)
>> +{
>> +     struct extent_buffer *eb;
>> +     u32 nr;
>> +     int i;
>> +     int ret = 0;
>> +
>> +     eb = read_tree_block(fs_info, parent, 0);
>> +     if (!extent_buffer_uptodate(eb)) {
>> +             ret = -EIO;
>> +             goto out;
>> +     }
>> +
>> +     nr = btrfs_header_nritems(eb);
>> +     for (i = 0; i < nr; i++) {
>> +             struct btrfs_key key;
>> +             struct btrfs_file_extent_item *fi;
>> +             int extent_type;
>> +
>> +             btrfs_item_key_to_cpu(eb, &key, i);
>> +             if (key.type != BTRFS_EXTENT_DATA_KEY)
>> +                     continue;
>> +             if (key.objectid == ino)
>> +                     continue;
>> +
>> +             fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
>> +
>> +             extent_type = btrfs_file_extent_type(eb, fi);
>> +             if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +                 extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +                     continue;
>> +
>> +             if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
>> +                 extent_type == BTRFS_FILE_EXTENT_REG) {
>> +                     ret = 1;
>> +                     break;
>> +             }
>> +     }
>> + out:
>> +     free_extent_buffer(eb);
>> +     return ret;
>> +}
>> +
>> +/*
>> + * Check if a prealloc extent is shared by other inodes and if any inode has
>> + * already written to that extent. This is to avoid emitting invalid warnings
>> + * about odd csum items (a inode has an extent entirely marked as prealloc
>> + * but another inode shares it and has already written to it).
>> + *
>> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
>> + * at least one other inode has written to it, and < 0 on error.
>> + */
>> +static int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
>> +                                      u64 ino,
>> +                                      u64 disk_bytenr,
>> +                                      u64 num_bytes)
>> +{
>> +     struct btrfs_path path;
>> +     struct btrfs_key key;
>> +     int ret;
>> +     struct btrfs_extent_item *ei;
>> +     u32 item_size;
>> +     unsigned long ptr;
>> +     unsigned long end;
>> +
>> +     key.objectid = disk_bytenr;
>> +     key.type = BTRFS_EXTENT_ITEM_KEY;
>> +     key.offset = num_bytes;
>> +
>> +     btrfs_init_path(&path);
>> +     ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
>> +     if (ret > 0) {
>> +             fprintf(stderr,
>> +                     "Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
>> +                     disk_bytenr, num_bytes);
>> +             ret = -ENOENT;
>> +     }
>> +     if (ret < 0)
>> +             goto out;
>> +
>> +     /* First check all inline refs. */
>> +     ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                         struct btrfs_extent_item);
>> +     item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
>> +     ptr = (unsigned long)(ei + 1);
>> +     end = (unsigned long)ei + item_size;
>> +     while (ptr < end) {
>> +             struct btrfs_extent_inline_ref *iref;
>> +             int type;
>> +
>> +             iref = (struct btrfs_extent_inline_ref *)ptr;
>> +             type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
>> +             ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
>> +                    type == BTRFS_SHARED_DATA_REF_KEY);
>> +
>> +             if (type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +                     struct btrfs_extent_data_ref *dref;
>> +
>> +                     dref = (struct btrfs_extent_data_ref *)(&iref->offset);
>> +                     ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>> +                                                   dref, path.nodes[0]);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
>> +                     u64 parent;
>> +
>> +                     parent = btrfs_extent_inline_ref_offset(path.nodes[0],
>> +                                                             iref);
>> +                     ret = check_prealloc_shared_data_ref(fs_info,
>> +                                                          ino,
>> +                                                          parent,
>> +                                                          disk_bytenr);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             }
>> +
>> +             ptr += btrfs_extent_inline_ref_size(type);
>> +     }
>> +
>> +     /* Now check if there are any non-inlined refs. */
>> +     path.slots[0]++;
>> +     while (true) {
>> +             if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +                     ret = btrfs_next_leaf(fs_info->extent_root, &path);
>> +                     if (ret < 0)
>> +                             goto out;
>> +                     if (ret > 0) {
>> +                             ret = 0;
>> +                             break;
>> +                     }
>> +             }
>> +
>> +             btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +             if (key.objectid != disk_bytenr)
>> +                     break;
>> +
>> +             if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +                     struct btrfs_extent_data_ref *dref;
>> +
>> +                     dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                                           struct btrfs_extent_data_ref);
>> +                     ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>> +                                                   dref, path.nodes[0]);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
>> +                     ret = check_prealloc_shared_data_ref(fs_info,
>> +                                                          ino,
>> +                                                          key.offset,
>> +                                                          disk_bytenr);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             }
>> +
>> +             path.slots[0]++;
>> +     }
>> +out:
>> +     btrfs_release_path(&path);
>> +     return ret;
>> +}
>> +
>>  static int process_file_extent(struct btrfs_root *root,
>>                               struct extent_buffer *eb,
>>                               int slot, struct btrfs_key *key,
>> @@ -1515,8 +1773,16 @@ static int process_file_extent(struct btrfs_root *root,
>>                       if (found < num_bytes)
>>                               rec->some_csum_missing = 1;
>>               } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
>> -                     if (found > 0)
>> -                             rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +                     if (found > 0) {
>> +                             ret = check_prealloc_extent_written(root->fs_info,
>> +                                                                 rec->ino,
>> +                                                                 disk_bytenr,
>> +                                                                 num_bytes);
>> +                             if (ret < 0)
>> +                                     return ret;
>> +                             if (ret == 0)
>> +                                     rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +                     }
>>               }
>>       }
>>       return 0;
>>
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-14  8:19 ` Nikolay Borisov
@ 2018-03-14 10:34   ` Filipe Manana
  0 siblings, 0 replies; 16+ messages in thread
From: Filipe Manana @ 2018-03-14 10:34 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: linux-btrfs

On Wed, Mar 14, 2018 at 8:19 AM, Nikolay Borisov <nborisov@suse.com> wrote:
>
>
> On 13.03.2018 20:47, fdmanana@kernel.org wrote:
>> From: Filipe Manana <fdmanana@suse.com>
>>
>> Under some cases the filesystem checker reports an error when it finds
>> checksum items for an extent that is referenced by an inode as a prealloc
>> extent. Such cases are not an error when the extent is actually shared
>> (was cloned/reflinked) with other inodes and was written through one of
>> those other inodes.
>>
>> Example:
>>
>>   $ mkfs.btrfs -f /dev/sdb
>>   $ mount /dev/sdb /mnt
>>
>>   $ touch /mnt/foo
>>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>>   $ sync
>>
>>   $ xfs_io -c "pwrite -S 0xab -b 0 256K" /mnt/foo
>>   $ touch /mnt/bar
>>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>>   $ xfs_io -c "fsync" /mnt/bar
>>
>>   <power fail>
>>   $ mount /dev/sdb /mnt
>>   $ umount /mnt
>>
>>   $ btrfs check /dev/sdc
>>   Checking filesystem on /dev/sdb
>>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>>   checking extents
>>   checking free space cache
>>   checking fs roots
>>   root 5 inode 257 errors 800, odd csum item
>>   ERROR: errors found in fs roots
>>   found 688128 bytes used, error(s) found
>>   total csum bytes: 256
>>   total tree bytes: 163840
>>   total fs tree bytes: 65536
>>   total extent tree bytes: 16384
>>   btree space waste bytes: 138819
>>   file data blocks allocated: 10747904
>>    referenced 10747904
>>   $ echo $?
>>   1
>>
>> So tech check to not report such cases as errors by checking if the
>> extent is shared with other inodes and if so, consider it an error the
>> existence of checksum items only if all those other inodes are referencing
>> the extent as a prealloc extent.
>> This case can be hit often when running the generic/475 testcase from
>> fstests.
>>
>> A test case will follow in a separate patch.
>>
>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>> ---
>>  check/main.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 268 insertions(+), 2 deletions(-)
>>
>> diff --git a/check/main.c b/check/main.c
>> index 392195ca..bb816311 100644
>> --- a/check/main.c
>> +++ b/check/main.c
>> @@ -1424,6 +1424,264 @@ static int process_inode_extref(struct extent_buffer *eb,
>>
>>  }
>>
>> +/*
>> + * Check if the inode referenced by the given data reference uses the extent
>> + * at disk_bytenr as a non-prealloc extent.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
>> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
>> +                                u64 ino,
>> +                                u64 disk_bytenr,
>> +                                struct btrfs_extent_data_ref *dref,
>> +                                struct extent_buffer *eb)
>> +{
>> +     u64 rootid = btrfs_extent_data_ref_root(eb, dref);
>> +     u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
>
> nit: Shouldn't this ino be equal to the ino passed to the function? If
> so objectid can be removed and when setting up the key for the second
> search you just assigne ino to its objectid

No, we want to skip if it's equal, because we're only interested in
references for inodes other then the one being currently processed.

>
>> +     u64 offset = btrfs_extent_data_ref_offset(eb, dref);
>> +     struct btrfs_root *root;
>> +     struct btrfs_key key;
>> +     struct btrfs_path path;
>> +     int ret;
>> +
>> +     if (objectid == ino)
>> +             return 0;
>> +
>> +     btrfs_init_path(&path);
>> +     key.objectid = rootid;
>> +     key.type = BTRFS_ROOT_ITEM_KEY;
>> +     key.offset = (u64)-1;
>> +     root = btrfs_read_fs_root(fs_info, &key);
>> +     if (IS_ERR(root)) {
>> +             ret = PTR_ERR(root);
>> +             goto out;
>> +     }
>> +
>> +     key.objectid = objectid;
>> +     key.type = BTRFS_EXTENT_DATA_KEY;
>> +     key.offset = offset;
>> +     ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
>> +     if (ret > 0) {
>> +             fprintf(stderr,
>> +                     "Missing file extent item for inode %llu, root %llu, offset %llu",
>> +                     objectid, rootid, offset);
>> +             ret = -ENOENT;
>> +     }
>> +     if (ret < 0)
>> +             goto out;
>> +
>> +     while (true) {
>> +             struct btrfs_file_extent_item *fi;
>> +             int extent_type;
>> +
>> +             if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +                     ret = btrfs_next_leaf(fs_info->extent_root, &path);
>> +                     if (ret < 0)
>> +                             goto out;
>> +                     if (ret > 0)
>> +                             break;
>> +             }
>> +
>> +             btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +             if (key.objectid != objectid ||
>> +                 key.type != BTRFS_EXTENT_DATA_KEY)
>> +                     break;
>> +
>> +             fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                                 struct btrfs_file_extent_item);
>> +             extent_type = btrfs_file_extent_type(path.nodes[0], fi);
>> +             if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +                 extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +                     goto next;
>> +
>> +             if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
>> +                 disk_bytenr)
>> +                     break;
>> +
>> +             if (extent_type == BTRFS_FILE_EXTENT_REG) {
>> +                     ret = 1;
>> +                     goto out;
>> +             }
>> +next:
>> +             path.slots[0]++;
>> +     }
>> +     ret = 0;
>> + out:
>> +     btrfs_release_path(&path);
>> +     return ret;
>> +}
>> +
>> +/*
>> + * Check if a shared data reference points to a node that has a file extent item
>> + * pointing to the extent at disk_bytenr that is not of type prealloc and is
>> + * associated to an inode with a number different from ino.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
>> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
>> +                                       u64 ino,
>> +                                       u64 parent,
>> +                                       u64 disk_bytenr)
>> +{
>> +     struct extent_buffer *eb;
>> +     u32 nr;
>> +     int i;
>> +     int ret = 0;
>> +
>> +     eb = read_tree_block(fs_info, parent, 0);
>> +     if (!extent_buffer_uptodate(eb)) {
>> +             ret = -EIO;
>> +             goto out;
>> +     }
>> +
>> +     nr = btrfs_header_nritems(eb);
>> +     for (i = 0; i < nr; i++) {
>> +             struct btrfs_key key;
>> +             struct btrfs_file_extent_item *fi;
>> +             int extent_type;
>> +
>> +             btrfs_item_key_to_cpu(eb, &key, i);
>> +             if (key.type != BTRFS_EXTENT_DATA_KEY)
>> +                     continue;
>> +             if (key.objectid == ino)
>> +                     continue;
>> +
>> +             fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
>> +
>> +             extent_type = btrfs_file_extent_type(eb, fi);
>> +             if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +                 extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +                     continue;
>> +
>> +             if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
>> +                 extent_type == BTRFS_FILE_EXTENT_REG) {
>> +                     ret = 1;
>> +                     break;
>> +             }
>> +     }
>> + out:
>> +     free_extent_buffer(eb);
>> +     return ret;
>> +}
>> +
>> +/*
>> + * Check if a prealloc extent is shared by other inodes and if any inode has
>> + * already written to that extent. This is to avoid emitting invalid warnings
>> + * about odd csum items (a inode has an extent entirely marked as prealloc
>> + * but another inode shares it and has already written to it).
>> + *
>> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
>> + * at least one other inode has written to it, and < 0 on error.
>> + */
>> +static int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
>> +                                      u64 ino,
>> +                                      u64 disk_bytenr,
>> +                                      u64 num_bytes)
>> +{
>> +     struct btrfs_path path;
>> +     struct btrfs_key key;
>> +     int ret;
>> +     struct btrfs_extent_item *ei;
>> +     u32 item_size;
>> +     unsigned long ptr;
>> +     unsigned long end;
>> +
>> +     key.objectid = disk_bytenr;
>> +     key.type = BTRFS_EXTENT_ITEM_KEY;
>> +     key.offset = num_bytes;
>> +
>> +     btrfs_init_path(&path);
>> +     ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
>> +     if (ret > 0) {
>> +             fprintf(stderr,
>> +                     "Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
>> +                     disk_bytenr, num_bytes);
>> +             ret = -ENOENT;
>> +     }
>> +     if (ret < 0)
>> +             goto out;
>> +
>> +     /* First check all inline refs. */
>> +     ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                         struct btrfs_extent_item);
>> +     item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
>> +     ptr = (unsigned long)(ei + 1);
>> +     end = (unsigned long)ei + item_size;
>> +     while (ptr < end) {
>> +             struct btrfs_extent_inline_ref *iref;
>> +             int type;
>> +
>> +             iref = (struct btrfs_extent_inline_ref *)ptr;
>> +             type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
>> +             ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
>> +                    type == BTRFS_SHARED_DATA_REF_KEY);
>> +
>> +             if (type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +                     struct btrfs_extent_data_ref *dref;
>> +
>> +                     dref = (struct btrfs_extent_data_ref *)(&iref->offset);
>> +                     ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>> +                                                   dref, path.nodes[0]);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
>> +                     u64 parent;
>> +
>> +                     parent = btrfs_extent_inline_ref_offset(path.nodes[0],
>> +                                                             iref);
>> +                     ret = check_prealloc_shared_data_ref(fs_info,
>> +                                                          ino,
>> +                                                          parent,
>> +                                                          disk_bytenr);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             }
>> +
>> +             ptr += btrfs_extent_inline_ref_size(type);
>> +     }
>> +
>> +     /* Now check if there are any non-inlined refs. */
>> +     path.slots[0]++;
>> +     while (true) {
>> +             if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +                     ret = btrfs_next_leaf(fs_info->extent_root, &path);
>> +                     if (ret < 0)
>> +                             goto out;
>> +                     if (ret > 0) {
>> +                             ret = 0;
>> +                             break;
>> +                     }
>> +             }
>> +
>> +             btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +             if (key.objectid != disk_bytenr)
>> +                     break;
>> +
>> +             if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +                     struct btrfs_extent_data_ref *dref;
>> +
>> +                     dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                                           struct btrfs_extent_data_ref);
>> +                     ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>> +                                                   dref, path.nodes[0]);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
>> +                     ret = check_prealloc_shared_data_ref(fs_info,
>> +                                                          ino,
>> +                                                          key.offset,
>> +                                                          disk_bytenr);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             }
>> +
>> +             path.slots[0]++;
>> +     }
>> +out:
>> +     btrfs_release_path(&path);
>> +     return ret;
>> +}
>> +
>>  static int process_file_extent(struct btrfs_root *root,
>>                               struct extent_buffer *eb,
>>                               int slot, struct btrfs_key *key,
>> @@ -1515,8 +1773,16 @@ static int process_file_extent(struct btrfs_root *root,
>>                       if (found < num_bytes)
>>                               rec->some_csum_missing = 1;
>>               } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
>> -                     if (found > 0)
>> -                             rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +                     if (found > 0) {
>> +                             ret = check_prealloc_extent_written(root->fs_info,
>> +                                                                 rec->ino,
>> +                                                                 disk_bytenr,
>> +                                                                 num_bytes);
>> +                             if (ret < 0)
>> +                                     return ret;
>> +                             if (ret == 0)
>> +                                     rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +                     }
>>               }
>>       }
>>       return 0;
>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-14  9:19 ` Nikolay Borisov
@ 2018-03-14 10:35   ` Filipe Manana
  0 siblings, 0 replies; 16+ messages in thread
From: Filipe Manana @ 2018-03-14 10:35 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: linux-btrfs

On Wed, Mar 14, 2018 at 9:19 AM, Nikolay Borisov <nborisov@suse.com> wrote:
>
>
> On 13.03.2018 20:47, fdmanana@kernel.org wrote:
>> From: Filipe Manana <fdmanana@suse.com>
>>
>> Under some cases the filesystem checker reports an error when it finds
>> checksum items for an extent that is referenced by an inode as a prealloc
>> extent. Such cases are not an error when the extent is actually shared
>> (was cloned/reflinked) with other inodes and was written through one of
>> those other inodes.
>>
>> Example:
>>
>>   $ mkfs.btrfs -f /dev/sdb
>>   $ mount /dev/sdb /mnt
>>
>>   $ touch /mnt/foo
>>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>>   $ sync
>>
>>   $ xfs_io -c "pwrite -S 0xab -b 0 256K" /mnt/foo
>                                   ^^
> nit: You are actually missing the argument for the -b switch, how big
> should the blocks be ? Same thing applies to the commit message of your test

-b is there because originally I had a value there. It's irrelevant
the block size value, I simply forgot to delete -b.

>
>>   $ touch /mnt/bar
>>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>>   $ xfs_io -c "fsync" /mnt/bar
>>
>>   <power fail>
>>   $ mount /dev/sdb /mnt
>>   $ umount /mnt
>>
>>   $ btrfs check /dev/sdc
>>   Checking filesystem on /dev/sdb
>>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>>   checking extents
>>   checking free space cache
>>   checking fs roots
>>   root 5 inode 257 errors 800, odd csum item
>>   ERROR: errors found in fs roots
>>   found 688128 bytes used, error(s) found
>>   total csum bytes: 256
>>   total tree bytes: 163840
>>   total fs tree bytes: 65536
>>   total extent tree bytes: 16384
>>   btree space waste bytes: 138819
>>   file data blocks allocated: 10747904
>>    referenced 10747904
>>   $ echo $?
>>   1
>>
>> So tech check to not report such cases as errors by checking if the
>> extent is shared with other inodes and if so, consider it an error the
>> existence of checksum items only if all those other inodes are referencing
>> the extent as a prealloc extent.
>> This case can be hit often when running the generic/475 testcase from
>> fstests.
>>
>> A test case will follow in a separate patch.
>>
>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>> ---
>>  check/main.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 268 insertions(+), 2 deletions(-)
>>
>> diff --git a/check/main.c b/check/main.c
>> index 392195ca..bb816311 100644
>> --- a/check/main.c
>> +++ b/check/main.c
>> @@ -1424,6 +1424,264 @@ static int process_inode_extref(struct extent_buffer *eb,
>>
>>  }
>>
>> +/*
>> + * Check if the inode referenced by the given data reference uses the extent
>> + * at disk_bytenr as a non-prealloc extent.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
>> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
>> +                                u64 ino,
>> +                                u64 disk_bytenr,
>> +                                struct btrfs_extent_data_ref *dref,
>> +                                struct extent_buffer *eb)
>> +{
>> +     u64 rootid = btrfs_extent_data_ref_root(eb, dref);
>> +     u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
>> +     u64 offset = btrfs_extent_data_ref_offset(eb, dref);
>> +     struct btrfs_root *root;
>> +     struct btrfs_key key;
>> +     struct btrfs_path path;
>> +     int ret;
>> +
>> +     if (objectid == ino)
>> +             return 0;
>> +
>> +     btrfs_init_path(&path);
>> +     key.objectid = rootid;
>> +     key.type = BTRFS_ROOT_ITEM_KEY;
>> +     key.offset = (u64)-1;
>> +     root = btrfs_read_fs_root(fs_info, &key);
>> +     if (IS_ERR(root)) {
>> +             ret = PTR_ERR(root);
>> +             goto out;
>> +     }
>> +
>> +     key.objectid = objectid;
>> +     key.type = BTRFS_EXTENT_DATA_KEY;
>> +     key.offset = offset;
>> +     ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
>> +     if (ret > 0) {
>> +             fprintf(stderr,
>> +                     "Missing file extent item for inode %llu, root %llu, offset %llu",
>> +                     objectid, rootid, offset);
>> +             ret = -ENOENT;
>> +     }
>> +     if (ret < 0)
>> +             goto out;
>> +
>> +     while (true) {
>> +             struct btrfs_file_extent_item *fi;
>> +             int extent_type;
>> +
>> +             if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +                     ret = btrfs_next_leaf(fs_info->extent_root, &path);
>> +                     if (ret < 0)
>> +                             goto out;
>> +                     if (ret > 0)
>> +                             break;
>> +             }
>> +
>> +             btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +             if (key.objectid != objectid ||
>> +                 key.type != BTRFS_EXTENT_DATA_KEY)
>> +                     break;
>> +
>> +             fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                                 struct btrfs_file_extent_item);
>> +             extent_type = btrfs_file_extent_type(path.nodes[0], fi);
>> +             if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +                 extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +                     goto next;
>> +
>> +             if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
>> +                 disk_bytenr)
>> +                     break;
>> +
>> +             if (extent_type == BTRFS_FILE_EXTENT_REG) {
>> +                     ret = 1;
>> +                     goto out;
>> +             }
>> +next:
>> +             path.slots[0]++;
>> +     }
>> +     ret = 0;
>> + out:
>> +     btrfs_release_path(&path);
>> +     return ret;
>> +}
>> +
>> +/*
>> + * Check if a shared data reference points to a node that has a file extent item
>> + * pointing to the extent at disk_bytenr that is not of type prealloc and is
>> + * associated to an inode with a number different from ino.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
>> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
>> +                                       u64 ino,
>> +                                       u64 parent,
>> +                                       u64 disk_bytenr)
>> +{
>> +     struct extent_buffer *eb;
>> +     u32 nr;
>> +     int i;
>> +     int ret = 0;
>> +
>> +     eb = read_tree_block(fs_info, parent, 0);
>> +     if (!extent_buffer_uptodate(eb)) {
>> +             ret = -EIO;
>> +             goto out;
>> +     }
>> +
>> +     nr = btrfs_header_nritems(eb);
>> +     for (i = 0; i < nr; i++) {
>> +             struct btrfs_key key;
>> +             struct btrfs_file_extent_item *fi;
>> +             int extent_type;
>> +
>> +             btrfs_item_key_to_cpu(eb, &key, i);
>> +             if (key.type != BTRFS_EXTENT_DATA_KEY)
>> +                     continue;
>> +             if (key.objectid == ino)
>> +                     continue;
>> +
>> +             fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
>> +
>> +             extent_type = btrfs_file_extent_type(eb, fi);
>> +             if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +                 extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +                     continue;
>> +
>> +             if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
>> +                 extent_type == BTRFS_FILE_EXTENT_REG) {
>> +                     ret = 1;
>> +                     break;
>> +             }
>> +     }
>> + out:
>> +     free_extent_buffer(eb);
>> +     return ret;
>> +}
>> +
>> +/*
>> + * Check if a prealloc extent is shared by other inodes and if any inode has
>> + * already written to that extent. This is to avoid emitting invalid warnings
>> + * about odd csum items (a inode has an extent entirely marked as prealloc
>> + * but another inode shares it and has already written to it).
>> + *
>> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
>> + * at least one other inode has written to it, and < 0 on error.
>> + */
>> +static int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
>> +                                      u64 ino,
>> +                                      u64 disk_bytenr,
>> +                                      u64 num_bytes)
>> +{
>> +     struct btrfs_path path;
>> +     struct btrfs_key key;
>> +     int ret;
>> +     struct btrfs_extent_item *ei;
>> +     u32 item_size;
>> +     unsigned long ptr;
>> +     unsigned long end;
>> +
>> +     key.objectid = disk_bytenr;
>> +     key.type = BTRFS_EXTENT_ITEM_KEY;
>> +     key.offset = num_bytes;
>> +
>> +     btrfs_init_path(&path);
>> +     ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
>> +     if (ret > 0) {
>> +             fprintf(stderr,
>> +                     "Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
>> +                     disk_bytenr, num_bytes);
>> +             ret = -ENOENT;
>> +     }
>> +     if (ret < 0)
>> +             goto out;
>> +
>> +     /* First check all inline refs. */
>> +     ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                         struct btrfs_extent_item);
>> +     item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
>> +     ptr = (unsigned long)(ei + 1);
>> +     end = (unsigned long)ei + item_size;
>> +     while (ptr < end) {
>> +             struct btrfs_extent_inline_ref *iref;
>> +             int type;
>> +
>> +             iref = (struct btrfs_extent_inline_ref *)ptr;
>> +             type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
>> +             ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
>> +                    type == BTRFS_SHARED_DATA_REF_KEY);
>> +
>> +             if (type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +                     struct btrfs_extent_data_ref *dref;
>> +
>> +                     dref = (struct btrfs_extent_data_ref *)(&iref->offset);
>> +                     ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>> +                                                   dref, path.nodes[0]);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
>> +                     u64 parent;
>> +
>> +                     parent = btrfs_extent_inline_ref_offset(path.nodes[0],
>> +                                                             iref);
>> +                     ret = check_prealloc_shared_data_ref(fs_info,
>> +                                                          ino,
>> +                                                          parent,
>> +                                                          disk_bytenr);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             }
>> +
>> +             ptr += btrfs_extent_inline_ref_size(type);
>> +     }
>> +
>> +     /* Now check if there are any non-inlined refs. */
>> +     path.slots[0]++;
>> +     while (true) {
>> +             if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +                     ret = btrfs_next_leaf(fs_info->extent_root, &path);
>> +                     if (ret < 0)
>> +                             goto out;
>> +                     if (ret > 0) {
>> +                             ret = 0;
>> +                             break;
>> +                     }
>> +             }
>> +
>> +             btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +             if (key.objectid != disk_bytenr)
>> +                     break;
>> +
>> +             if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +                     struct btrfs_extent_data_ref *dref;
>> +
>> +                     dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                                           struct btrfs_extent_data_ref);
>> +                     ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>> +                                                   dref, path.nodes[0]);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
>> +                     ret = check_prealloc_shared_data_ref(fs_info,
>> +                                                          ino,
>> +                                                          key.offset,
>> +                                                          disk_bytenr);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             }
>> +
>> +             path.slots[0]++;
>> +     }
>> +out:
>> +     btrfs_release_path(&path);
>> +     return ret;
>> +}
>> +
>>  static int process_file_extent(struct btrfs_root *root,
>>                               struct extent_buffer *eb,
>>                               int slot, struct btrfs_key *key,
>> @@ -1515,8 +1773,16 @@ static int process_file_extent(struct btrfs_root *root,
>>                       if (found < num_bytes)
>>                               rec->some_csum_missing = 1;
>>               } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
>> -                     if (found > 0)
>> -                             rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +                     if (found > 0) {
>> +                             ret = check_prealloc_extent_written(root->fs_info,
>> +                                                                 rec->ino,
>> +                                                                 disk_bytenr,
>> +                                                                 num_bytes);
>> +                             if (ret < 0)
>> +                                     return ret;
>> +                             if (ret == 0)
>> +                                     rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +                     }
>>               }
>>       }
>>       return 0;
>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-14 10:33   ` Filipe Manana
@ 2018-03-14 10:54     ` Qu Wenruo
  0 siblings, 0 replies; 16+ messages in thread
From: Qu Wenruo @ 2018-03-14 10:54 UTC (permalink / raw)
  To: Filipe Manana; +Cc: linux-btrfs


[-- Attachment #1.1: Type: text/plain, Size: 17270 bytes --]



On 2018年03月14日 18:33, Filipe Manana wrote:
> On Wed, Mar 14, 2018 at 1:32 AM, Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>>
>>
>> On 2018年03月14日 02:47, fdmanana@kernel.org wrote:
>>> From: Filipe Manana <fdmanana@suse.com>
>>>
>>> Under some cases the filesystem checker reports an error when it finds
>>> checksum items for an extent that is referenced by an inode as a prealloc
>>> extent. Such cases are not an error when the extent is actually shared
>>> (was cloned/reflinked) with other inodes and was written through one of
>>> those other inodes.
>>>
>>> Example:
>>>
>>>   $ mkfs.btrfs -f /dev/sdb
>>>   $ mount /dev/sdb /mnt
>>>
>>>   $ touch /mnt/foo
>>>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>>>   $ sync
>>>
>>>   $ xfs_io -c "pwrite -S 0xab -b 0 256K" /mnt/foo
>>>   $ touch /mnt/bar
>>>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>>>   $ xfs_io -c "fsync" /mnt/bar
>>>
>>>   <power fail>
>>>   $ mount /dev/sdb /mnt
>>>   $ umount /mnt
>>>
>>>   $ btrfs check /dev/sdc
>>>   Checking filesystem on /dev/sdb
>>>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>>>   checking extents
>>>   checking free space cache
>>>   checking fs roots
>>>   root 5 inode 257 errors 800, odd csum item
>>>   ERROR: errors found in fs roots
>>>   found 688128 bytes used, error(s) found
>>>   total csum bytes: 256
>>>   total tree bytes: 163840
>>>   total fs tree bytes: 65536
>>>   total extent tree bytes: 16384
>>>   btree space waste bytes: 138819
>>>   file data blocks allocated: 10747904
>>>    referenced 10747904
>>>   $ echo $?
>>>   1
>>>
>>> So tech check to not report such cases as errors by checking if the
>>> extent is shared with other inodes and if so, consider it an error the
>>> existence of checksum items only if all those other inodes are referencing
>>> the extent as a prealloc extent.
>>> This case can be hit often when running the generic/475 testcase from
>>> fstests.
>>>
>>> A test case will follow in a separate patch.
>>>
>>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>>
>> Looks good overall, but still some small nitpicks.
>>
>> One is, the fix is only for original mode, while the fix itself has
>> nothing mode dependent, so it's better to put the check into
>> check/mode-original.[ch] so lowmem mode can also benefit from it.
> 
> Ok, I started doing development on a release branch where the split
> doesn't exist, them moved to the development branch and wrongly
> assumed that shared code would be in main.c.
>>
>>> ---
>>>  check/main.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>  1 file changed, 268 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/check/main.c b/check/main.c
>>> index 392195ca..bb816311 100644
>>> --- a/check/main.c
>>> +++ b/check/main.c
>>> @@ -1424,6 +1424,264 @@ static int process_inode_extref(struct extent_buffer *eb,
>>>
>>>  }
>>>
>>> +/*
>>> + * Check if the inode referenced by the given data reference uses the extent
>>> + * at disk_bytenr as a non-prealloc extent.
>>> + *
>>> + * Returns 1 if true, 0 if false and < 0 on error.
>>> + */
>>
>> The check itself (along with check_prealloc_shared_data_ref) is good
>> enough to detect any regular file extents inside the prealloc extent.
>>
>> But when it finds any regular extents, it doesn't check if it's covered
>> by csum correctly.
>>
>> For example:
>>
>> |<-------------- prealloc extent range -------------------------->|
>> |<- the *only* regular extent ->|
>> |<-------- range covered by csum -------->|
>>                                 |<- ODD ->|
>>
>> Or
>> |<-------------- prealloc extent range -------------------------->|
>> |<- the *only* regular extent ->|
>> |<- csum ->|
>>            |<------ ODD ------->|
>>
>> So at least in theory, the best solution is not to just check if there
>> is any regular extents inside the large prealloc extent, but also check
>> how many csum bytes the regular extents contribute, and then compare it
>> to the result of count_csum_range().
> 
> Yes, I considered that initially but then moved to this simpler, but
> less accurate solution.
> So the problem is more complex then you think, as a prealloc extent
> can be partially written and then reflinked multiple times and
> fsync'ed.
> For example:
> 
> create inode A with prealloc extent [0, 256k[
> sync
> write into prealloc extent, range [0, 128k[ through inode A
> reflink prealloc extent into inode B
> reflink prealloc extent into inode C
> reflink prealloc extent into inode D
> fsync inodes B, C and D
> power fail
> remount to replay log
> 
> Now using a simple counter to count how many bytes the are covered by
> chekcsum items we would have, we would get 384K (128K * 3), more then
> the size of the extent.
> So we would have to track ranges and then ranges that partially
> overlap (due to non-zero offsets in the extent items).

Yep, I also find that we need that overlap check.

> 
> And this could go even more complex. It's doable, but I wanted to keep
> things simple for now.

I'm mostly OK to keep it like this right now.
But still need some comment to info us about the possible problem for
later enhancement.

Thanks,
Qu

> 
> thanks
> 
>>
>> Thanks,
>> Qu
>>
>>> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
>>> +                                u64 ino,
>>> +                                u64 disk_bytenr,
>>> +                                struct btrfs_extent_data_ref *dref,
>>> +                                struct extent_buffer *eb)
>>> +{
>>> +     u64 rootid = btrfs_extent_data_ref_root(eb, dref);
>>> +     u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
>>> +     u64 offset = btrfs_extent_data_ref_offset(eb, dref);
>>> +     struct btrfs_root *root;
>>> +     struct btrfs_key key;
>>> +     struct btrfs_path path;
>>> +     int ret;
>>> +
>>> +     if (objectid == ino)
>>> +             return 0;
>>> +
>>> +     btrfs_init_path(&path);
>>> +     key.objectid = rootid;
>>> +     key.type = BTRFS_ROOT_ITEM_KEY;
>>> +     key.offset = (u64)-1;
>>> +     root = btrfs_read_fs_root(fs_info, &key);
>>> +     if (IS_ERR(root)) {
>>> +             ret = PTR_ERR(root);
>>> +             goto out;
>>> +     }
>>> +
>>> +     key.objectid = objectid;
>>> +     key.type = BTRFS_EXTENT_DATA_KEY;
>>> +     key.offset = offset;
>>> +     ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
>>> +     if (ret > 0) {
>>> +             fprintf(stderr,
>>> +                     "Missing file extent item for inode %llu, root %llu, offset %llu",
>>> +                     objectid, rootid, offset);
>>> +             ret = -ENOENT;
>>> +     }
>>> +     if (ret < 0)
>>> +             goto out;
>>> +
>>> +     while (true) {
>>> +             struct btrfs_file_extent_item *fi;
>>> +             int extent_type;
>>> +
>>> +             if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>>> +                     ret = btrfs_next_leaf(fs_info->extent_root, &path);
>>> +                     if (ret < 0)
>>> +                             goto out;
>>> +                     if (ret > 0)
>>> +                             break;
>>> +             }
>>> +
>>> +             btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>>> +             if (key.objectid != objectid ||
>>> +                 key.type != BTRFS_EXTENT_DATA_KEY)
>>> +                     break;
>>> +
>>> +             fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
>>> +                                 struct btrfs_file_extent_item);
>>> +             extent_type = btrfs_file_extent_type(path.nodes[0], fi);
>>> +             if (extent_type != BTRFS_FILE_EXTENT_REG &&
>>> +                 extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>>> +                     goto next;
>>> +
>>> +             if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
>>> +                 disk_bytenr)
>>> +                     break;
>>> +
>>> +             if (extent_type == BTRFS_FILE_EXTENT_REG) {
>>> +                     ret = 1;
>>> +                     goto out;
>>> +             }
>>> +next:
>>> +             path.slots[0]++;
>>> +     }
>>> +     ret = 0;
>>> + out:
>>> +     btrfs_release_path(&path);
>>> +     return ret;
>>> +}
>>> +
>>> +/*
>>> + * Check if a shared data reference points to a node that has a file extent item
>>> + * pointing to the extent at disk_bytenr that is not of type prealloc and is
>>> + * associated to an inode with a number different from ino.
>>> + *
>>> + * Returns 1 if true, 0 if false and < 0 on error.
>>> + */
>>> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
>>> +                                       u64 ino,
>>> +                                       u64 parent,
>>> +                                       u64 disk_bytenr)
>>> +{
>>> +     struct extent_buffer *eb;
>>> +     u32 nr;
>>> +     int i;
>>> +     int ret = 0;
>>> +
>>> +     eb = read_tree_block(fs_info, parent, 0);
>>> +     if (!extent_buffer_uptodate(eb)) {
>>> +             ret = -EIO;
>>> +             goto out;
>>> +     }
>>> +
>>> +     nr = btrfs_header_nritems(eb);
>>> +     for (i = 0; i < nr; i++) {
>>> +             struct btrfs_key key;
>>> +             struct btrfs_file_extent_item *fi;
>>> +             int extent_type;
>>> +
>>> +             btrfs_item_key_to_cpu(eb, &key, i);
>>> +             if (key.type != BTRFS_EXTENT_DATA_KEY)
>>> +                     continue;
>>> +             if (key.objectid == ino)
>>> +                     continue;
>>> +
>>> +             fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
>>> +
>>> +             extent_type = btrfs_file_extent_type(eb, fi);
>>> +             if (extent_type != BTRFS_FILE_EXTENT_REG &&
>>> +                 extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>>> +                     continue;
>>> +
>>> +             if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
>>> +                 extent_type == BTRFS_FILE_EXTENT_REG) {
>>> +                     ret = 1;
>>> +                     break;
>>> +             }
>>> +     }
>>> + out:
>>> +     free_extent_buffer(eb);
>>> +     return ret;
>>> +}
>>> +
>>> +/*
>>> + * Check if a prealloc extent is shared by other inodes and if any inode has
>>> + * already written to that extent. This is to avoid emitting invalid warnings
>>> + * about odd csum items (a inode has an extent entirely marked as prealloc
>>> + * but another inode shares it and has already written to it).
>>> + *
>>> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
>>> + * at least one other inode has written to it, and < 0 on error.
>>> + */
>>> +static int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
>>> +                                      u64 ino,
>>> +                                      u64 disk_bytenr,
>>> +                                      u64 num_bytes)
>>> +{
>>> +     struct btrfs_path path;
>>> +     struct btrfs_key key;
>>> +     int ret;
>>> +     struct btrfs_extent_item *ei;
>>> +     u32 item_size;
>>> +     unsigned long ptr;
>>> +     unsigned long end;
>>> +
>>> +     key.objectid = disk_bytenr;
>>> +     key.type = BTRFS_EXTENT_ITEM_KEY;
>>> +     key.offset = num_bytes;
>>> +
>>> +     btrfs_init_path(&path);
>>> +     ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
>>> +     if (ret > 0) {
>>> +             fprintf(stderr,
>>> +                     "Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
>>> +                     disk_bytenr, num_bytes);
>>> +             ret = -ENOENT;
>>> +     }
>>> +     if (ret < 0)
>>> +             goto out;
>>> +
>>> +     /* First check all inline refs. */
>>> +     ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
>>> +                         struct btrfs_extent_item);
>>> +     item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
>>> +     ptr = (unsigned long)(ei + 1);
>>> +     end = (unsigned long)ei + item_size;
>>> +     while (ptr < end) {
>>> +             struct btrfs_extent_inline_ref *iref;
>>> +             int type;
>>> +
>>> +             iref = (struct btrfs_extent_inline_ref *)ptr;
>>> +             type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
>>> +             ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
>>> +                    type == BTRFS_SHARED_DATA_REF_KEY);
>>> +
>>> +             if (type == BTRFS_EXTENT_DATA_REF_KEY) {
>>> +                     struct btrfs_extent_data_ref *dref;
>>> +
>>> +                     dref = (struct btrfs_extent_data_ref *)(&iref->offset);
>>> +                     ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>>> +                                                   dref, path.nodes[0]);
>>> +                     if (ret != 0)
>>> +                             goto out;
>>> +             } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
>>> +                     u64 parent;
>>> +
>>> +                     parent = btrfs_extent_inline_ref_offset(path.nodes[0],
>>> +                                                             iref);
>>> +                     ret = check_prealloc_shared_data_ref(fs_info,
>>> +                                                          ino,
>>> +                                                          parent,
>>> +                                                          disk_bytenr);
>>> +                     if (ret != 0)
>>> +                             goto out;
>>> +             }
>>> +
>>> +             ptr += btrfs_extent_inline_ref_size(type);
>>> +     }
>>> +
>>> +     /* Now check if there are any non-inlined refs. */
>>> +     path.slots[0]++;
>>> +     while (true) {
>>> +             if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>>> +                     ret = btrfs_next_leaf(fs_info->extent_root, &path);
>>> +                     if (ret < 0)
>>> +                             goto out;
>>> +                     if (ret > 0) {
>>> +                             ret = 0;
>>> +                             break;
>>> +                     }
>>> +             }
>>> +
>>> +             btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>>> +             if (key.objectid != disk_bytenr)
>>> +                     break;
>>> +
>>> +             if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
>>> +                     struct btrfs_extent_data_ref *dref;
>>> +
>>> +                     dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
>>> +                                           struct btrfs_extent_data_ref);
>>> +                     ret = check_prealloc_data_ref(fs_info, ino, disk_bytenr,
>>> +                                                   dref, path.nodes[0]);
>>> +                     if (ret != 0)
>>> +                             goto out;
>>> +             } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
>>> +                     ret = check_prealloc_shared_data_ref(fs_info,
>>> +                                                          ino,
>>> +                                                          key.offset,
>>> +                                                          disk_bytenr);
>>> +                     if (ret != 0)
>>> +                             goto out;
>>> +             }
>>> +
>>> +             path.slots[0]++;
>>> +     }
>>> +out:
>>> +     btrfs_release_path(&path);
>>> +     return ret;
>>> +}
>>> +
>>>  static int process_file_extent(struct btrfs_root *root,
>>>                               struct extent_buffer *eb,
>>>                               int slot, struct btrfs_key *key,
>>> @@ -1515,8 +1773,16 @@ static int process_file_extent(struct btrfs_root *root,
>>>                       if (found < num_bytes)
>>>                               rec->some_csum_missing = 1;
>>>               } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
>>> -                     if (found > 0)
>>> -                             rec->errors |= I_ERR_ODD_CSUM_ITEM;
>>> +                     if (found > 0) {
>>> +                             ret = check_prealloc_extent_written(root->fs_info,
>>> +                                                                 rec->ino,
>>> +                                                                 disk_bytenr,
>>> +                                                                 num_bytes);
>>> +                             if (ret < 0)
>>> +                                     return ret;
>>> +                             if (ret == 0)
>>> +                                     rec->errors |= I_ERR_ODD_CSUM_ITEM;
>>> +                     }
>>>               }
>>>       }
>>>       return 0;
>>>
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 520 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/2 v3] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-13 18:47 [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents fdmanana
                   ` (3 preceding siblings ...)
  2018-03-14  9:19 ` Nikolay Borisov
@ 2018-03-14 20:11 ` fdmanana
  2018-03-15 12:35   ` Nikolay Borisov
  4 siblings, 1 reply; 16+ messages in thread
From: fdmanana @ 2018-03-14 20:11 UTC (permalink / raw)
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

Under some cases the filesystem checker reports an error when it finds
checksum items for an extent that is referenced by an inode as a prealloc
extent. Such cases are not an error when the extent is actually shared
(was cloned/reflinked) with other inodes and was written through one of
those other inodes.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foo
  $ xfs_io -c "falloc 0 256K" /mnt/foo
  $ sync

  $ xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
  $ touch /mnt/bar
  $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
  $ xfs_io -c "fsync" /mnt/bar

  <power fail>
  $ mount /dev/sdb /mnt
  $ umount /mnt

  $ btrfs check /dev/sdc
  Checking filesystem on /dev/sdb
  UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
  checking extents
  checking free space cache
  checking fs roots
  root 5 inode 257 errors 800, odd csum item
  ERROR: errors found in fs roots
  found 688128 bytes used, error(s) found
  total csum bytes: 256
  total tree bytes: 163840
  total fs tree bytes: 65536
  total extent tree bytes: 16384
  btree space waste bytes: 138819
  file data blocks allocated: 10747904
   referenced 10747904
  $ echo $?
  1

So teach check to not report such cases as errors by checking if the
extent is shared with other inodes and if so, consider it an error the
existence of checksum items only if all those other inodes are referencing
the extent as a prealloc extent.
This case can be hit often when running the generic/475 testcase from
fstests.

A test case will follow in a separate patch.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---

V3: Replaced incorrect argument to a btrfs_next_leaf() call (extent_root -> root).

V2: Made stuff work with lowmem mode as well.
    Added a comment about the limitations of the current check.
    Removed filtering by inode number since it was unreliable as we can
    have different inodes with same inode number but in different roots
    (so opted to drop the filtering instead of filtering by root as well,
    to keep it simpler).


 check/main.c        |  11 ++-
 check/mode-common.c | 258 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 check/mode-common.h |   3 +
 check/mode-lowmem.c |  14 ++-
 4 files changed, 281 insertions(+), 5 deletions(-)

diff --git a/check/main.c b/check/main.c
index 392195ca..88e6c1e9 100644
--- a/check/main.c
+++ b/check/main.c
@@ -1515,8 +1515,15 @@ static int process_file_extent(struct btrfs_root *root,
 			if (found < num_bytes)
 				rec->some_csum_missing = 1;
 		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
-			if (found > 0)
-				rec->errors |= I_ERR_ODD_CSUM_ITEM;
+			if (found > 0) {
+				ret = check_prealloc_extent_written(root->fs_info,
+								    disk_bytenr,
+								    num_bytes);
+				if (ret < 0)
+					return ret;
+				if (ret == 0)
+					rec->errors |= I_ERR_ODD_CSUM_ITEM;
+			}
 		}
 	}
 	return 0;
diff --git a/check/mode-common.c b/check/mode-common.c
index 1b56a968..8a59e8a4 100644
--- a/check/mode-common.c
+++ b/check/mode-common.c
@@ -24,6 +24,264 @@
 #include "check/mode-common.h"
 
 /*
+ * Check if the inode referenced by the given data reference uses the extent
+ * at disk_bytenr as a non-prealloc extent.
+ *
+ * Returns 1 if true, 0 if false and < 0 on error.
+ */
+static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
+				   u64 disk_bytenr,
+				   struct btrfs_extent_data_ref *dref,
+				   struct extent_buffer *eb)
+{
+	u64 rootid = btrfs_extent_data_ref_root(eb, dref);
+	u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
+	u64 offset = btrfs_extent_data_ref_offset(eb, dref);
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	struct btrfs_path path;
+	int ret;
+
+	btrfs_init_path(&path);
+	key.objectid = rootid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	root = btrfs_read_fs_root(fs_info, &key);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto out;
+	}
+
+	key.objectid = objectid;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = offset;
+	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+	if (ret > 0) {
+		fprintf(stderr,
+			"Missing file extent item for inode %llu, root %llu, offset %llu",
+			objectid, rootid, offset);
+		ret = -ENOENT;
+	}
+	if (ret < 0)
+		goto out;
+
+	while (true) {
+		struct btrfs_file_extent_item *fi;
+		int extent_type;
+
+		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+			ret = btrfs_next_leaf(root, &path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				break;
+		}
+
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+		if (key.objectid != objectid ||
+		    key.type != BTRFS_EXTENT_DATA_KEY)
+			break;
+
+		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+				    struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(path.nodes[0], fi);
+		if (extent_type != BTRFS_FILE_EXTENT_REG &&
+		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
+			goto next;
+
+		if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
+		    disk_bytenr)
+			break;
+
+		if (extent_type == BTRFS_FILE_EXTENT_REG) {
+			ret = 1;
+			goto out;
+		}
+next:
+		path.slots[0]++;
+	}
+	ret = 0;
+ out:
+	btrfs_release_path(&path);
+	return ret;
+}
+
+/*
+ * Check if a shared data reference points to a node that has a file extent item
+ * pointing to the extent at @disk_bytenr that is not of type prealloc.
+ *
+ * Returns 1 if true, 0 if false and < 0 on error.
+ */
+static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
+					  u64 parent,
+					  u64 disk_bytenr)
+{
+	struct extent_buffer *eb;
+	u32 nr;
+	int i;
+	int ret = 0;
+
+	eb = read_tree_block(fs_info, parent, 0);
+	if (!extent_buffer_uptodate(eb)) {
+		ret = -EIO;
+		goto out;
+	}
+
+	nr = btrfs_header_nritems(eb);
+	for (i = 0; i < nr; i++) {
+		struct btrfs_key key;
+		struct btrfs_file_extent_item *fi;
+		int extent_type;
+
+		btrfs_item_key_to_cpu(eb, &key, i);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(eb, fi);
+		if (extent_type != BTRFS_FILE_EXTENT_REG &&
+		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
+			continue;
+
+		if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
+		    extent_type == BTRFS_FILE_EXTENT_REG) {
+			ret = 1;
+			break;
+		}
+	}
+ out:
+	free_extent_buffer(eb);
+	return ret;
+}
+
+/*
+ * Check if a prealloc extent is shared by multiple inodes and if any inode has
+ * already written to that extent. This is to avoid emitting invalid warnings
+ * about odd csum items (a inode has an extent entirely marked as prealloc
+ * but another inode shares it and has already written to it).
+ *
+ * Note: right now it does not check if the number of checksum items in the
+ * csum tree matches the number of bytes written into the ex-prealloc extent.
+ * It's complex to deal with that because the prealloc extent might have been
+ * partially written through multiple inodes and we would have to keep track of
+ * ranges, merging them and notice ranges that fully or partially overlap, to
+ * avoid false reports of csum items missing for areas of the prealloc extent
+ * that were not written to - for example if we have a 1M prealloc extent, we
+ * can have only the first half of it written, but 2 different inodes refer to
+ * the its first half (through reflinks/cloning), so keeping a counter of bytes
+ * covered by checksum items is not enough, as the correct value would be 512K
+ * and not 1M (whence the need to track ranges).
+ *
+ * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
+ * at least one other inode has written to it, and < 0 on error.
+ */
+int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
+				  u64 disk_bytenr,
+				  u64 num_bytes)
+{
+	struct btrfs_path path;
+	struct btrfs_key key;
+	int ret;
+	struct btrfs_extent_item *ei;
+	u32 item_size;
+	unsigned long ptr;
+	unsigned long end;
+
+	key.objectid = disk_bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+
+	btrfs_init_path(&path);
+	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
+	if (ret > 0) {
+		fprintf(stderr,
+			"Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
+			disk_bytenr, num_bytes);
+		ret = -ENOENT;
+	}
+	if (ret < 0)
+		goto out;
+
+	/* First check all inline refs. */
+	ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
+			    struct btrfs_extent_item);
+	item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
+	while (ptr < end) {
+		struct btrfs_extent_inline_ref *iref;
+		int type;
+
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
+		ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
+		       type == BTRFS_SHARED_DATA_REF_KEY);
+
+		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+			struct btrfs_extent_data_ref *dref;
+
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			ret = check_prealloc_data_ref(fs_info, disk_bytenr,
+						      dref, path.nodes[0]);
+			if (ret != 0)
+				goto out;
+		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+			u64 parent;
+
+			parent = btrfs_extent_inline_ref_offset(path.nodes[0],
+								iref);
+			ret = check_prealloc_shared_data_ref(fs_info,
+							     parent,
+							     disk_bytenr);
+			if (ret != 0)
+				goto out;
+		}
+
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+
+	/* Now check if there are any non-inlined refs. */
+	path.slots[0]++;
+	while (true) {
+		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+			ret = btrfs_next_leaf(fs_info->extent_root, &path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				ret = 0;
+				break;
+			}
+		}
+
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+		if (key.objectid != disk_bytenr)
+			break;
+
+		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+			struct btrfs_extent_data_ref *dref;
+
+			dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
+					      struct btrfs_extent_data_ref);
+			ret = check_prealloc_data_ref(fs_info, disk_bytenr,
+						      dref, path.nodes[0]);
+			if (ret != 0)
+				goto out;
+		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+			ret = check_prealloc_shared_data_ref(fs_info,
+							     key.offset,
+							     disk_bytenr);
+			if (ret != 0)
+				goto out;
+		}
+
+		path.slots[0]++;
+	}
+out:
+	btrfs_release_path(&path);
+	return ret;
+}
+
+/*
  * Search in csum tree to find how many bytes of range [@start, @start + @len)
  * has the corresponding csum item.
  *
diff --git a/check/mode-common.h b/check/mode-common.h
index ffae782b..79e1b0cf 100644
--- a/check/mode-common.h
+++ b/check/mode-common.h
@@ -80,6 +80,9 @@ static inline int fs_root_objectid(u64 objectid)
 	return is_fstree(objectid);
 }
 
+int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
+				  u64 disk_bytenr,
+				  u64 num_bytes);
 int count_csum_range(struct btrfs_fs_info *fs_info, u64 start,
 		     u64 len, u64 *found);
 int insert_inode_item(struct btrfs_trans_handle *trans,
diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c
index 2424cdc2..02806377 100644
--- a/check/mode-lowmem.c
+++ b/check/mode-lowmem.c
@@ -1511,9 +1511,17 @@ static int check_file_extent(struct btrfs_root *root, struct btrfs_key *fkey,
 		      csum_found, search_len);
 	} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC &&
 		   csum_found > 0) {
-		err |= ODD_CSUM_ITEM;
-		error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have csum, but has: %llu",
-		      root->objectid, fkey->objectid, fkey->offset, csum_found);
+		ret = check_prealloc_extent_written(root->fs_info,
+						    disk_bytenr,
+						    disk_num_bytes);
+		if (ret < 0)
+			return ret;
+		if (ret == 0) {
+			err |= ODD_CSUM_ITEM;
+			error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have csum, but has: %llu",
+			      root->objectid, fkey->objectid, fkey->offset,
+			      csum_found);
+		}
 	}
 
 	/* Check EXTENT_DATA hole */
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2 v2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-13 22:48 ` [PATCH 1/2 v2] " fdmanana
@ 2018-03-15  9:04   ` Nikolay Borisov
  2018-03-15  9:44     ` Filipe Manana
  0 siblings, 1 reply; 16+ messages in thread
From: Nikolay Borisov @ 2018-03-15  9:04 UTC (permalink / raw)
  To: fdmanana, linux-btrfs



On 14.03.2018 00:48, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> Under some cases the filesystem checker reports an error when it finds
> checksum items for an extent that is referenced by an inode as a prealloc
> extent. Such cases are not an error when the extent is actually shared
> (was cloned/reflinked) with other inodes and was written through one of
> those other inodes.
> 
> Example:
> 
>   $ mkfs.btrfs -f /dev/sdb
>   $ mount /dev/sdb /mnt
> 
>   $ touch /mnt/foo
>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>   $ sync
> 
>   $ xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
>   $ touch /mnt/bar
>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>   $ xfs_io -c "fsync" /mnt/bar
> 
>   <power fail>
>   $ mount /dev/sdb /mnt
>   $ umount /mnt
> 
>   $ btrfs check /dev/sdc
>   Checking filesystem on /dev/sdb
>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>   checking extents
>   checking free space cache
>   checking fs roots
>   root 5 inode 257 errors 800, odd csum item
>   ERROR: errors found in fs roots
>   found 688128 bytes used, error(s) found
>   total csum bytes: 256
>   total tree bytes: 163840
>   total fs tree bytes: 65536
>   total extent tree bytes: 16384
>   btree space waste bytes: 138819
>   file data blocks allocated: 10747904
>    referenced 10747904
>   $ echo $?
>   1
> 
> So teach check to not report such cases as errors by checking if the
> extent is shared with other inodes and if so, consider it an error the
> existence of checksum items only if all those other inodes are referencing
> the extent as a prealloc extent.
> This case can be hit often when running the generic/475 testcase from
> fstests.
> 
> A test case will follow in a separate patch.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
> 
> V2: Made stuff work with lowmem mode as well.
>     Added a comment about the limitations of the current check.
>     Removed filtering by inode number since it was unreliable as we can
>     have different inodes with same inode number but in different roots
>     (so opted to drop the filtering instead of filtering by root as well,
>     to keep it simpler).
> 
>  check/main.c        |  11 ++-
>  check/mode-common.c | 258 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  check/mode-common.h |   3 +
>  check/mode-lowmem.c |  14 ++-
>  4 files changed, 281 insertions(+), 5 deletions(-)
> 
> diff --git a/check/main.c b/check/main.c
> index 392195ca..88e6c1e9 100644
> --- a/check/main.c
> +++ b/check/main.c
> @@ -1515,8 +1515,15 @@ static int process_file_extent(struct btrfs_root *root,
>  			if (found < num_bytes)
>  				rec->some_csum_missing = 1;
>  		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
> -			if (found > 0)
> -				rec->errors |= I_ERR_ODD_CSUM_ITEM;
> +			if (found > 0) {
> +				ret = check_prealloc_extent_written(root->fs_info,
> +								    disk_bytenr,
> +								    num_bytes);
> +				if (ret < 0)
> +					return ret;
> +				if (ret == 0)
> +					rec->errors |= I_ERR_ODD_CSUM_ITEM;
> +			}
>  		}
>  	}
>  	return 0;
> diff --git a/check/mode-common.c b/check/mode-common.c
> index 1b56a968..559cd11d 100644
> --- a/check/mode-common.c
> +++ b/check/mode-common.c
> @@ -24,6 +24,264 @@
>  #include "check/mode-common.h"
>  
>  /*
> + * Check if the inode referenced by the given data reference uses the extent
> + * at disk_bytenr as a non-prealloc extent.
> + *
> + * Returns 1 if true, 0 if false and < 0 on error.
> + */
> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
> +				   u64 disk_bytenr,
> +				   struct btrfs_extent_data_ref *dref,
> +				   struct extent_buffer *eb)
> +{
> +	u64 rootid = btrfs_extent_data_ref_root(eb, dref);
> +	u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
> +	u64 offset = btrfs_extent_data_ref_offset(eb, dref);
> +	struct btrfs_root *root;
> +	struct btrfs_key key;
> +	struct btrfs_path path;
> +	int ret;
> +
> +	btrfs_init_path(&path);
> +	key.objectid = rootid;
> +	key.type = BTRFS_ROOT_ITEM_KEY;
> +	key.offset = (u64)-1;
> +	root = btrfs_read_fs_root(fs_info, &key);
> +	if (IS_ERR(root)) {
> +		ret = PTR_ERR(root);
> +		goto out;
> +	}
> +
> +	key.objectid = objectid;
> +	key.type = BTRFS_EXTENT_DATA_KEY;
> +	key.offset = offset;
> +	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
> +	if (ret > 0) {
> +		fprintf(stderr,
> +			"Missing file extent item for inode %llu, root %llu, offset %llu",
> +			objectid, rootid, offset);
> +		ret = -ENOENT;
> +	}
> +	if (ret < 0)
> +		goto out;
> +
> +	while (true) {
> +		struct btrfs_file_extent_item *fi;
> +		int extent_type;
> +
> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);

Shouldn't you use the read fs root rather than extent root in
btrfs_next_leaf?

> +			if (ret < 0)
> +				goto out;
> +			if (ret > 0)
> +				break;
> +		}
> +
> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
> +		if (key.objectid != objectid ||
> +		    key.type != BTRFS_EXTENT_DATA_KEY)
> +			break;
> +
> +		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +				    struct btrfs_file_extent_item);
> +		extent_type = btrfs_file_extent_type(path.nodes[0], fi);
> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
> +			goto next;
> +
> +		if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
> +		    disk_bytenr)
> +			break;
> +
> +		if (extent_type == BTRFS_FILE_EXTENT_REG) {
> +			ret = 1;
> +			goto out;
> +		}
> +next:
> +		path.slots[0]++;
> +	}
> +	ret = 0;
> + out:
> +	btrfs_release_path(&path);
> +	return ret;
> +}
> +
> +/*
> + * Check if a shared data reference points to a node that has a file extent item
> + * pointing to the extent at @disk_bytenr that is not of type prealloc.
> + *
> + * Returns 1 if true, 0 if false and < 0 on error.
> + */
> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
> +					  u64 parent,
> +					  u64 disk_bytenr)
> +{
> +	struct extent_buffer *eb;
> +	u32 nr;
> +	int i;
> +	int ret = 0;
> +
> +	eb = read_tree_block(fs_info, parent, 0);
> +	if (!extent_buffer_uptodate(eb)) {
> +		ret = -EIO;
> +		goto out;
> +	}
> +
> +	nr = btrfs_header_nritems(eb);
> +	for (i = 0; i < nr; i++) {
> +		struct btrfs_key key;
> +		struct btrfs_file_extent_item *fi;
> +		int extent_type;
> +
> +		btrfs_item_key_to_cpu(eb, &key, i);
> +		if (key.type != BTRFS_EXTENT_DATA_KEY)
> +			continue;
> +
> +		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
> +		extent_type = btrfs_file_extent_type(eb, fi);
> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
> +			continue;
> +
> +		if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
> +		    extent_type == BTRFS_FILE_EXTENT_REG) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> + out:
> +	free_extent_buffer(eb);
> +	return ret;
> +}
> +
> +/*
> + * Check if a prealloc extent is shared by multiple inodes and if any inode has
> + * already written to that extent. This is to avoid emitting invalid warnings
> + * about odd csum items (a inode has an extent entirely marked as prealloc
> + * but another inode shares it and has already written to it).
> + *
> + * Note: right now it does not check if the number of checksum items in the
> + * csum tree matches the number of bytes written into the ex-prealloc extent.
> + * It's complex to deal with that because the prealloc extent might have been
> + * partially written through multiple inodes and we would have to keep track of
> + * ranges, merging them and notice ranges that fully or partially overlap, to
> + * avoid false reports of csum items missing for areas of the prealloc extent
> + * that were not written to - for example if we have a 1M prealloc extent, we
> + * can have only the first half of it written, but 2 different inodes refer to
> + * the its first half (through reflinks/cloning), so keeping a counter of bytes
> + * covered by checksum items is not enough, as the correct value would be 512K
> + * and not 1M (whence the need to track ranges).
> + *
> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
> + * at least one other inode has written to it, and < 0 on error.
> + */
> +int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
> +				  u64 disk_bytenr,
> +				  u64 num_bytes)
> +{
> +	struct btrfs_path path;
> +	struct btrfs_key key;
> +	int ret;
> +	struct btrfs_extent_item *ei;
> +	u32 item_size;
> +	unsigned long ptr;
> +	unsigned long end;
> +
> +	key.objectid = disk_bytenr;
> +	key.type = BTRFS_EXTENT_ITEM_KEY;
> +	key.offset = num_bytes;
> +
> +	btrfs_init_path(&path);
> +	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
> +	if (ret > 0) {
> +		fprintf(stderr,
> +			"Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
> +			disk_bytenr, num_bytes);
> +		ret = -ENOENT;
> +	}
> +	if (ret < 0)
> +		goto out;
> +
> +	/* First check all inline refs. */
> +	ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +			    struct btrfs_extent_item);
> +	item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
> +	ptr = (unsigned long)(ei + 1);
> +	end = (unsigned long)ei + item_size;
> +	while (ptr < end) {
> +		struct btrfs_extent_inline_ref *iref;
> +		int type;
> +
> +		iref = (struct btrfs_extent_inline_ref *)ptr;
> +		type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
> +		ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
> +		       type == BTRFS_SHARED_DATA_REF_KEY);
> +
> +		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
> +			struct btrfs_extent_data_ref *dref;
> +
> +			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
> +			ret = check_prealloc_data_ref(fs_info, disk_bytenr,
> +						      dref, path.nodes[0]);
> +			if (ret != 0)
> +				goto out;
> +		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
> +			u64 parent;
> +
> +			parent = btrfs_extent_inline_ref_offset(path.nodes[0],
> +								iref);
> +			ret = check_prealloc_shared_data_ref(fs_info,
> +							     parent,
> +							     disk_bytenr);
> +			if (ret != 0)
> +				goto out;
> +		}
> +
> +		ptr += btrfs_extent_inline_ref_size(type);
> +	}
> +
> +	/* Now check if there are any non-inlined refs. */
> +	path.slots[0]++;
> +	while (true) {
> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
> +			if (ret < 0)
> +				goto out;
> +			if (ret > 0) {
> +				ret = 0;
> +				break;
> +			}
> +		}
> +
> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
> +		if (key.objectid != disk_bytenr)
> +			break;
> +
> +		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
> +			struct btrfs_extent_data_ref *dref;
> +
> +			dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +					      struct btrfs_extent_data_ref);
> +			ret = check_prealloc_data_ref(fs_info, disk_bytenr,
> +						      dref, path.nodes[0]);
> +			if (ret != 0)
> +				goto out;
> +		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
> +			ret = check_prealloc_shared_data_ref(fs_info,
> +							     key.offset,
> +							     disk_bytenr);
> +			if (ret != 0)
> +				goto out;
> +		}
> +
> +		path.slots[0]++;
> +	}
> +out:
> +	btrfs_release_path(&path);
> +	return ret;
> +}
> +
> +/*
>   * Search in csum tree to find how many bytes of range [@start, @start + @len)
>   * has the corresponding csum item.
>   *
> diff --git a/check/mode-common.h b/check/mode-common.h
> index ffae782b..79e1b0cf 100644
> --- a/check/mode-common.h
> +++ b/check/mode-common.h
> @@ -80,6 +80,9 @@ static inline int fs_root_objectid(u64 objectid)
>  	return is_fstree(objectid);
>  }
>  
> +int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
> +				  u64 disk_bytenr,
> +				  u64 num_bytes);
>  int count_csum_range(struct btrfs_fs_info *fs_info, u64 start,
>  		     u64 len, u64 *found);
>  int insert_inode_item(struct btrfs_trans_handle *trans,
> diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c
> index 2424cdc2..02806377 100644
> --- a/check/mode-lowmem.c
> +++ b/check/mode-lowmem.c
> @@ -1511,9 +1511,17 @@ static int check_file_extent(struct btrfs_root *root, struct btrfs_key *fkey,
>  		      csum_found, search_len);
>  	} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC &&
>  		   csum_found > 0) {
> -		err |= ODD_CSUM_ITEM;
> -		error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have csum, but has: %llu",
> -		      root->objectid, fkey->objectid, fkey->offset, csum_found);
> +		ret = check_prealloc_extent_written(root->fs_info,
> +						    disk_bytenr,
> +						    disk_num_bytes);
> +		if (ret < 0)
> +			return ret;
> +		if (ret == 0) {
> +			err |= ODD_CSUM_ITEM;
> +			error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have csum, but has: %llu",
> +			      root->objectid, fkey->objectid, fkey->offset,
> +			      csum_found);
> +		}
>  	}
>  
>  	/* Check EXTENT_DATA hole */
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2 v2] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-15  9:04   ` Nikolay Borisov
@ 2018-03-15  9:44     ` Filipe Manana
  0 siblings, 0 replies; 16+ messages in thread
From: Filipe Manana @ 2018-03-15  9:44 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: linux-btrfs

On Thu, Mar 15, 2018 at 9:04 AM, Nikolay Borisov <nborisov@suse.com> wrote:
>
>
> On 14.03.2018 00:48, fdmanana@kernel.org wrote:
>> From: Filipe Manana <fdmanana@suse.com>
>>
>> Under some cases the filesystem checker reports an error when it finds
>> checksum items for an extent that is referenced by an inode as a prealloc
>> extent. Such cases are not an error when the extent is actually shared
>> (was cloned/reflinked) with other inodes and was written through one of
>> those other inodes.
>>
>> Example:
>>
>>   $ mkfs.btrfs -f /dev/sdb
>>   $ mount /dev/sdb /mnt
>>
>>   $ touch /mnt/foo
>>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>>   $ sync
>>
>>   $ xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
>>   $ touch /mnt/bar
>>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>>   $ xfs_io -c "fsync" /mnt/bar
>>
>>   <power fail>
>>   $ mount /dev/sdb /mnt
>>   $ umount /mnt
>>
>>   $ btrfs check /dev/sdc
>>   Checking filesystem on /dev/sdb
>>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>>   checking extents
>>   checking free space cache
>>   checking fs roots
>>   root 5 inode 257 errors 800, odd csum item
>>   ERROR: errors found in fs roots
>>   found 688128 bytes used, error(s) found
>>   total csum bytes: 256
>>   total tree bytes: 163840
>>   total fs tree bytes: 65536
>>   total extent tree bytes: 16384
>>   btree space waste bytes: 138819
>>   file data blocks allocated: 10747904
>>    referenced 10747904
>>   $ echo $?
>>   1
>>
>> So teach check to not report such cases as errors by checking if the
>> extent is shared with other inodes and if so, consider it an error the
>> existence of checksum items only if all those other inodes are referencing
>> the extent as a prealloc extent.
>> This case can be hit often when running the generic/475 testcase from
>> fstests.
>>
>> A test case will follow in a separate patch.
>>
>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>> ---
>>
>> V2: Made stuff work with lowmem mode as well.
>>     Added a comment about the limitations of the current check.
>>     Removed filtering by inode number since it was unreliable as we can
>>     have different inodes with same inode number but in different roots
>>     (so opted to drop the filtering instead of filtering by root as well,
>>     to keep it simpler).
>>
>>  check/main.c        |  11 ++-
>>  check/mode-common.c | 258 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  check/mode-common.h |   3 +
>>  check/mode-lowmem.c |  14 ++-
>>  4 files changed, 281 insertions(+), 5 deletions(-)
>>
>> diff --git a/check/main.c b/check/main.c
>> index 392195ca..88e6c1e9 100644
>> --- a/check/main.c
>> +++ b/check/main.c
>> @@ -1515,8 +1515,15 @@ static int process_file_extent(struct btrfs_root *root,
>>                       if (found < num_bytes)
>>                               rec->some_csum_missing = 1;
>>               } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
>> -                     if (found > 0)
>> -                             rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +                     if (found > 0) {
>> +                             ret = check_prealloc_extent_written(root->fs_info,
>> +                                                                 disk_bytenr,
>> +                                                                 num_bytes);
>> +                             if (ret < 0)
>> +                                     return ret;
>> +                             if (ret == 0)
>> +                                     rec->errors |= I_ERR_ODD_CSUM_ITEM;
>> +                     }
>>               }
>>       }
>>       return 0;
>> diff --git a/check/mode-common.c b/check/mode-common.c
>> index 1b56a968..559cd11d 100644
>> --- a/check/mode-common.c
>> +++ b/check/mode-common.c
>> @@ -24,6 +24,264 @@
>>  #include "check/mode-common.h"
>>
>>  /*
>> + * Check if the inode referenced by the given data reference uses the extent
>> + * at disk_bytenr as a non-prealloc extent.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
>> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
>> +                                u64 disk_bytenr,
>> +                                struct btrfs_extent_data_ref *dref,
>> +                                struct extent_buffer *eb)
>> +{
>> +     u64 rootid = btrfs_extent_data_ref_root(eb, dref);
>> +     u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
>> +     u64 offset = btrfs_extent_data_ref_offset(eb, dref);
>> +     struct btrfs_root *root;
>> +     struct btrfs_key key;
>> +     struct btrfs_path path;
>> +     int ret;
>> +
>> +     btrfs_init_path(&path);
>> +     key.objectid = rootid;
>> +     key.type = BTRFS_ROOT_ITEM_KEY;
>> +     key.offset = (u64)-1;
>> +     root = btrfs_read_fs_root(fs_info, &key);
>> +     if (IS_ERR(root)) {
>> +             ret = PTR_ERR(root);
>> +             goto out;
>> +     }
>> +
>> +     key.objectid = objectid;
>> +     key.type = BTRFS_EXTENT_DATA_KEY;
>> +     key.offset = offset;
>> +     ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
>> +     if (ret > 0) {
>> +             fprintf(stderr,
>> +                     "Missing file extent item for inode %llu, root %llu, offset %llu",
>> +                     objectid, rootid, offset);
>> +             ret = -ENOENT;
>> +     }
>> +     if (ret < 0)
>> +             goto out;
>> +
>> +     while (true) {
>> +             struct btrfs_file_extent_item *fi;
>> +             int extent_type;
>> +
>> +             if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +                     ret = btrfs_next_leaf(fs_info->extent_root, &path);
>
> Shouldn't you use the read fs root rather than extent root in
> btrfs_next_leaf?

It should, copy paste error.
thanks

>
>> +                     if (ret < 0)
>> +                             goto out;
>> +                     if (ret > 0)
>> +                             break;
>> +             }
>> +
>> +             btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +             if (key.objectid != objectid ||
>> +                 key.type != BTRFS_EXTENT_DATA_KEY)
>> +                     break;
>> +
>> +             fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                                 struct btrfs_file_extent_item);
>> +             extent_type = btrfs_file_extent_type(path.nodes[0], fi);
>> +             if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +                 extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +                     goto next;
>> +
>> +             if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
>> +                 disk_bytenr)
>> +                     break;
>> +
>> +             if (extent_type == BTRFS_FILE_EXTENT_REG) {
>> +                     ret = 1;
>> +                     goto out;
>> +             }
>> +next:
>> +             path.slots[0]++;
>> +     }
>> +     ret = 0;
>> + out:
>> +     btrfs_release_path(&path);
>> +     return ret;
>> +}
>> +
>> +/*
>> + * Check if a shared data reference points to a node that has a file extent item
>> + * pointing to the extent at @disk_bytenr that is not of type prealloc.
>> + *
>> + * Returns 1 if true, 0 if false and < 0 on error.
>> + */
>> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
>> +                                       u64 parent,
>> +                                       u64 disk_bytenr)
>> +{
>> +     struct extent_buffer *eb;
>> +     u32 nr;
>> +     int i;
>> +     int ret = 0;
>> +
>> +     eb = read_tree_block(fs_info, parent, 0);
>> +     if (!extent_buffer_uptodate(eb)) {
>> +             ret = -EIO;
>> +             goto out;
>> +     }
>> +
>> +     nr = btrfs_header_nritems(eb);
>> +     for (i = 0; i < nr; i++) {
>> +             struct btrfs_key key;
>> +             struct btrfs_file_extent_item *fi;
>> +             int extent_type;
>> +
>> +             btrfs_item_key_to_cpu(eb, &key, i);
>> +             if (key.type != BTRFS_EXTENT_DATA_KEY)
>> +                     continue;
>> +
>> +             fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
>> +             extent_type = btrfs_file_extent_type(eb, fi);
>> +             if (extent_type != BTRFS_FILE_EXTENT_REG &&
>> +                 extent_type != BTRFS_FILE_EXTENT_PREALLOC)
>> +                     continue;
>> +
>> +             if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
>> +                 extent_type == BTRFS_FILE_EXTENT_REG) {
>> +                     ret = 1;
>> +                     break;
>> +             }
>> +     }
>> + out:
>> +     free_extent_buffer(eb);
>> +     return ret;
>> +}
>> +
>> +/*
>> + * Check if a prealloc extent is shared by multiple inodes and if any inode has
>> + * already written to that extent. This is to avoid emitting invalid warnings
>> + * about odd csum items (a inode has an extent entirely marked as prealloc
>> + * but another inode shares it and has already written to it).
>> + *
>> + * Note: right now it does not check if the number of checksum items in the
>> + * csum tree matches the number of bytes written into the ex-prealloc extent.
>> + * It's complex to deal with that because the prealloc extent might have been
>> + * partially written through multiple inodes and we would have to keep track of
>> + * ranges, merging them and notice ranges that fully or partially overlap, to
>> + * avoid false reports of csum items missing for areas of the prealloc extent
>> + * that were not written to - for example if we have a 1M prealloc extent, we
>> + * can have only the first half of it written, but 2 different inodes refer to
>> + * the its first half (through reflinks/cloning), so keeping a counter of bytes
>> + * covered by checksum items is not enough, as the correct value would be 512K
>> + * and not 1M (whence the need to track ranges).
>> + *
>> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
>> + * at least one other inode has written to it, and < 0 on error.
>> + */
>> +int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
>> +                               u64 disk_bytenr,
>> +                               u64 num_bytes)
>> +{
>> +     struct btrfs_path path;
>> +     struct btrfs_key key;
>> +     int ret;
>> +     struct btrfs_extent_item *ei;
>> +     u32 item_size;
>> +     unsigned long ptr;
>> +     unsigned long end;
>> +
>> +     key.objectid = disk_bytenr;
>> +     key.type = BTRFS_EXTENT_ITEM_KEY;
>> +     key.offset = num_bytes;
>> +
>> +     btrfs_init_path(&path);
>> +     ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
>> +     if (ret > 0) {
>> +             fprintf(stderr,
>> +                     "Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
>> +                     disk_bytenr, num_bytes);
>> +             ret = -ENOENT;
>> +     }
>> +     if (ret < 0)
>> +             goto out;
>> +
>> +     /* First check all inline refs. */
>> +     ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                         struct btrfs_extent_item);
>> +     item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
>> +     ptr = (unsigned long)(ei + 1);
>> +     end = (unsigned long)ei + item_size;
>> +     while (ptr < end) {
>> +             struct btrfs_extent_inline_ref *iref;
>> +             int type;
>> +
>> +             iref = (struct btrfs_extent_inline_ref *)ptr;
>> +             type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
>> +             ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
>> +                    type == BTRFS_SHARED_DATA_REF_KEY);
>> +
>> +             if (type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +                     struct btrfs_extent_data_ref *dref;
>> +
>> +                     dref = (struct btrfs_extent_data_ref *)(&iref->offset);
>> +                     ret = check_prealloc_data_ref(fs_info, disk_bytenr,
>> +                                                   dref, path.nodes[0]);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
>> +                     u64 parent;
>> +
>> +                     parent = btrfs_extent_inline_ref_offset(path.nodes[0],
>> +                                                             iref);
>> +                     ret = check_prealloc_shared_data_ref(fs_info,
>> +                                                          parent,
>> +                                                          disk_bytenr);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             }
>> +
>> +             ptr += btrfs_extent_inline_ref_size(type);
>> +     }
>> +
>> +     /* Now check if there are any non-inlined refs. */
>> +     path.slots[0]++;
>> +     while (true) {
>> +             if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
>> +                     ret = btrfs_next_leaf(fs_info->extent_root, &path);
>> +                     if (ret < 0)
>> +                             goto out;
>> +                     if (ret > 0) {
>> +                             ret = 0;
>> +                             break;
>> +                     }
>> +             }
>> +
>> +             btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
>> +             if (key.objectid != disk_bytenr)
>> +                     break;
>> +
>> +             if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
>> +                     struct btrfs_extent_data_ref *dref;
>> +
>> +                     dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
>> +                                           struct btrfs_extent_data_ref);
>> +                     ret = check_prealloc_data_ref(fs_info, disk_bytenr,
>> +                                                   dref, path.nodes[0]);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
>> +                     ret = check_prealloc_shared_data_ref(fs_info,
>> +                                                          key.offset,
>> +                                                          disk_bytenr);
>> +                     if (ret != 0)
>> +                             goto out;
>> +             }
>> +
>> +             path.slots[0]++;
>> +     }
>> +out:
>> +     btrfs_release_path(&path);
>> +     return ret;
>> +}
>> +
>> +/*
>>   * Search in csum tree to find how many bytes of range [@start, @start + @len)
>>   * has the corresponding csum item.
>>   *
>> diff --git a/check/mode-common.h b/check/mode-common.h
>> index ffae782b..79e1b0cf 100644
>> --- a/check/mode-common.h
>> +++ b/check/mode-common.h
>> @@ -80,6 +80,9 @@ static inline int fs_root_objectid(u64 objectid)
>>       return is_fstree(objectid);
>>  }
>>
>> +int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
>> +                               u64 disk_bytenr,
>> +                               u64 num_bytes);
>>  int count_csum_range(struct btrfs_fs_info *fs_info, u64 start,
>>                    u64 len, u64 *found);
>>  int insert_inode_item(struct btrfs_trans_handle *trans,
>> diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c
>> index 2424cdc2..02806377 100644
>> --- a/check/mode-lowmem.c
>> +++ b/check/mode-lowmem.c
>> @@ -1511,9 +1511,17 @@ static int check_file_extent(struct btrfs_root *root, struct btrfs_key *fkey,
>>                     csum_found, search_len);
>>       } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC &&
>>                  csum_found > 0) {
>> -             err |= ODD_CSUM_ITEM;
>> -             error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have csum, but has: %llu",
>> -                   root->objectid, fkey->objectid, fkey->offset, csum_found);
>> +             ret = check_prealloc_extent_written(root->fs_info,
>> +                                                 disk_bytenr,
>> +                                                 disk_num_bytes);
>> +             if (ret < 0)
>> +                     return ret;
>> +             if (ret == 0) {
>> +                     err |= ODD_CSUM_ITEM;
>> +                     error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have csum, but has: %llu",
>> +                           root->objectid, fkey->objectid, fkey->offset,
>> +                           csum_found);
>> +             }
>>       }
>>
>>       /* Check EXTENT_DATA hole */
>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2 v3] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-14 20:11 ` [PATCH 1/2 v3] " fdmanana
@ 2018-03-15 12:35   ` Nikolay Borisov
  2018-03-19 12:45     ` David Sterba
  0 siblings, 1 reply; 16+ messages in thread
From: Nikolay Borisov @ 2018-03-15 12:35 UTC (permalink / raw)
  To: fdmanana, linux-btrfs



On 14.03.2018 22:11, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> Under some cases the filesystem checker reports an error when it finds
> checksum items for an extent that is referenced by an inode as a prealloc
> extent. Such cases are not an error when the extent is actually shared
> (was cloned/reflinked) with other inodes and was written through one of
> those other inodes.
> 
> Example:
> 
>   $ mkfs.btrfs -f /dev/sdb
>   $ mount /dev/sdb /mnt
> 
>   $ touch /mnt/foo
>   $ xfs_io -c "falloc 0 256K" /mnt/foo
>   $ sync
> 
>   $ xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
>   $ touch /mnt/bar
>   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
>   $ xfs_io -c "fsync" /mnt/bar
> 
>   <power fail>
>   $ mount /dev/sdb /mnt
>   $ umount /mnt
> 
>   $ btrfs check /dev/sdc
>   Checking filesystem on /dev/sdb
>   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
>   checking extents
>   checking free space cache
>   checking fs roots
>   root 5 inode 257 errors 800, odd csum item
>   ERROR: errors found in fs roots
>   found 688128 bytes used, error(s) found
>   total csum bytes: 256
>   total tree bytes: 163840
>   total fs tree bytes: 65536
>   total extent tree bytes: 16384
>   btree space waste bytes: 138819
>   file data blocks allocated: 10747904
>    referenced 10747904
>   $ echo $?
>   1
> 
> So teach check to not report such cases as errors by checking if the
> extent is shared with other inodes and if so, consider it an error the
> existence of checksum items only if all those other inodes are referencing
> the extent as a prealloc extent.
> This case can be hit often when running the generic/475 testcase from
> fstests.
> 
> A test case will follow in a separate patch.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>

Reviewed-by: Nikolay Borisov <nborisov@suse.com>

> ---
> 
> V3: Replaced incorrect argument to a btrfs_next_leaf() call (extent_root -> root).
> 
> V2: Made stuff work with lowmem mode as well.
>     Added a comment about the limitations of the current check.
>     Removed filtering by inode number since it was unreliable as we can
>     have different inodes with same inode number but in different roots
>     (so opted to drop the filtering instead of filtering by root as well,
>     to keep it simpler).
> 
> 
>  check/main.c        |  11 ++-
>  check/mode-common.c | 258 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  check/mode-common.h |   3 +
>  check/mode-lowmem.c |  14 ++-
>  4 files changed, 281 insertions(+), 5 deletions(-)
> 
> diff --git a/check/main.c b/check/main.c
> index 392195ca..88e6c1e9 100644
> --- a/check/main.c
> +++ b/check/main.c
> @@ -1515,8 +1515,15 @@ static int process_file_extent(struct btrfs_root *root,
>  			if (found < num_bytes)
>  				rec->some_csum_missing = 1;
>  		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
> -			if (found > 0)
> -				rec->errors |= I_ERR_ODD_CSUM_ITEM;
> +			if (found > 0) {
> +				ret = check_prealloc_extent_written(root->fs_info,
> +								    disk_bytenr,
> +								    num_bytes);
> +				if (ret < 0)
> +					return ret;
> +				if (ret == 0)
> +					rec->errors |= I_ERR_ODD_CSUM_ITEM;
> +			}
>  		}
>  	}
>  	return 0;
> diff --git a/check/mode-common.c b/check/mode-common.c
> index 1b56a968..8a59e8a4 100644
> --- a/check/mode-common.c
> +++ b/check/mode-common.c
> @@ -24,6 +24,264 @@
>  #include "check/mode-common.h"
>  
>  /*
> + * Check if the inode referenced by the given data reference uses the extent
> + * at disk_bytenr as a non-prealloc extent.
> + *
> + * Returns 1 if true, 0 if false and < 0 on error.
> + */
> +static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
> +				   u64 disk_bytenr,
> +				   struct btrfs_extent_data_ref *dref,
> +				   struct extent_buffer *eb)
> +{
> +	u64 rootid = btrfs_extent_data_ref_root(eb, dref);
> +	u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
> +	u64 offset = btrfs_extent_data_ref_offset(eb, dref);
> +	struct btrfs_root *root;
> +	struct btrfs_key key;
> +	struct btrfs_path path;
> +	int ret;
> +
> +	btrfs_init_path(&path);
> +	key.objectid = rootid;
> +	key.type = BTRFS_ROOT_ITEM_KEY;
> +	key.offset = (u64)-1;
> +	root = btrfs_read_fs_root(fs_info, &key);
> +	if (IS_ERR(root)) {
> +		ret = PTR_ERR(root);
> +		goto out;
> +	}
> +
> +	key.objectid = objectid;
> +	key.type = BTRFS_EXTENT_DATA_KEY;
> +	key.offset = offset;
> +	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
> +	if (ret > 0) {
> +		fprintf(stderr,
> +			"Missing file extent item for inode %llu, root %llu, offset %llu",
> +			objectid, rootid, offset);
> +		ret = -ENOENT;
> +	}
> +	if (ret < 0)
> +		goto out;
> +
> +	while (true) {
> +		struct btrfs_file_extent_item *fi;
> +		int extent_type;
> +
> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
> +			ret = btrfs_next_leaf(root, &path);
> +			if (ret < 0)
> +				goto out;
> +			if (ret > 0)
> +				break;
> +		}
> +
> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
> +		if (key.objectid != objectid ||
> +		    key.type != BTRFS_EXTENT_DATA_KEY)
> +			break;
> +
> +		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +				    struct btrfs_file_extent_item);
> +		extent_type = btrfs_file_extent_type(path.nodes[0], fi);
> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
> +			goto next;
> +
> +		if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
> +		    disk_bytenr)
> +			break;
> +
> +		if (extent_type == BTRFS_FILE_EXTENT_REG) {
> +			ret = 1;
> +			goto out;
> +		}
> +next:
> +		path.slots[0]++;
> +	}
> +	ret = 0;
> + out:
> +	btrfs_release_path(&path);
> +	return ret;
> +}
> +
> +/*
> + * Check if a shared data reference points to a node that has a file extent item
> + * pointing to the extent at @disk_bytenr that is not of type prealloc.
> + *
> + * Returns 1 if true, 0 if false and < 0 on error.
> + */
> +static int check_prealloc_shared_data_ref(struct btrfs_fs_info *fs_info,
> +					  u64 parent,
> +					  u64 disk_bytenr)
> +{
> +	struct extent_buffer *eb;
> +	u32 nr;
> +	int i;
> +	int ret = 0;
> +
> +	eb = read_tree_block(fs_info, parent, 0);
> +	if (!extent_buffer_uptodate(eb)) {
> +		ret = -EIO;
> +		goto out;
> +	}
> +
> +	nr = btrfs_header_nritems(eb);
> +	for (i = 0; i < nr; i++) {
> +		struct btrfs_key key;
> +		struct btrfs_file_extent_item *fi;
> +		int extent_type;
> +
> +		btrfs_item_key_to_cpu(eb, &key, i);
> +		if (key.type != BTRFS_EXTENT_DATA_KEY)
> +			continue;
> +
> +		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
> +		extent_type = btrfs_file_extent_type(eb, fi);
> +		if (extent_type != BTRFS_FILE_EXTENT_REG &&
> +		    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
> +			continue;
> +
> +		if (btrfs_file_extent_disk_bytenr(eb, fi) == disk_bytenr &&
> +		    extent_type == BTRFS_FILE_EXTENT_REG) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> + out:
> +	free_extent_buffer(eb);
> +	return ret;
> +}
> +
> +/*
> + * Check if a prealloc extent is shared by multiple inodes and if any inode has
> + * already written to that extent. This is to avoid emitting invalid warnings
> + * about odd csum items (a inode has an extent entirely marked as prealloc
> + * but another inode shares it and has already written to it).
> + *
> + * Note: right now it does not check if the number of checksum items in the
> + * csum tree matches the number of bytes written into the ex-prealloc extent.
> + * It's complex to deal with that because the prealloc extent might have been
> + * partially written through multiple inodes and we would have to keep track of
> + * ranges, merging them and notice ranges that fully or partially overlap, to
> + * avoid false reports of csum items missing for areas of the prealloc extent
> + * that were not written to - for example if we have a 1M prealloc extent, we
> + * can have only the first half of it written, but 2 different inodes refer to
> + * the its first half (through reflinks/cloning), so keeping a counter of bytes
> + * covered by checksum items is not enough, as the correct value would be 512K
> + * and not 1M (whence the need to track ranges).
> + *
> + * Returns 0 if the prealloc extent was not written yet by any inode, 1 if
> + * at least one other inode has written to it, and < 0 on error.
> + */
> +int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
> +				  u64 disk_bytenr,
> +				  u64 num_bytes)
> +{
> +	struct btrfs_path path;
> +	struct btrfs_key key;
> +	int ret;
> +	struct btrfs_extent_item *ei;
> +	u32 item_size;
> +	unsigned long ptr;
> +	unsigned long end;
> +
> +	key.objectid = disk_bytenr;
> +	key.type = BTRFS_EXTENT_ITEM_KEY;
> +	key.offset = num_bytes;
> +
> +	btrfs_init_path(&path);
> +	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
> +	if (ret > 0) {
> +		fprintf(stderr,
> +			"Missing extent item in extent tree for disk_bytenr %llu, num_bytes %llu\n",
> +			disk_bytenr, num_bytes);
> +		ret = -ENOENT;
> +	}
> +	if (ret < 0)
> +		goto out;
> +
> +	/* First check all inline refs. */
> +	ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +			    struct btrfs_extent_item);
> +	item_size = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
> +	ptr = (unsigned long)(ei + 1);
> +	end = (unsigned long)ei + item_size;
> +	while (ptr < end) {
> +		struct btrfs_extent_inline_ref *iref;
> +		int type;
> +
> +		iref = (struct btrfs_extent_inline_ref *)ptr;
> +		type = btrfs_extent_inline_ref_type(path.nodes[0], iref);
> +		ASSERT(type == BTRFS_EXTENT_DATA_REF_KEY ||
> +		       type == BTRFS_SHARED_DATA_REF_KEY);
> +
> +		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
> +			struct btrfs_extent_data_ref *dref;
> +
> +			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
> +			ret = check_prealloc_data_ref(fs_info, disk_bytenr,
> +						      dref, path.nodes[0]);
> +			if (ret != 0)
> +				goto out;
> +		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
> +			u64 parent;
> +
> +			parent = btrfs_extent_inline_ref_offset(path.nodes[0],
> +								iref);
> +			ret = check_prealloc_shared_data_ref(fs_info,
> +							     parent,
> +							     disk_bytenr);
> +			if (ret != 0)
> +				goto out;
> +		}
> +
> +		ptr += btrfs_extent_inline_ref_size(type);
> +	}
> +
> +	/* Now check if there are any non-inlined refs. */
> +	path.slots[0]++;
> +	while (true) {
> +		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
> +			ret = btrfs_next_leaf(fs_info->extent_root, &path);
> +			if (ret < 0)
> +				goto out;
> +			if (ret > 0) {
> +				ret = 0;
> +				break;
> +			}
> +		}
> +
> +		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
> +		if (key.objectid != disk_bytenr)
> +			break;
> +
> +		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
> +			struct btrfs_extent_data_ref *dref;
> +
> +			dref = btrfs_item_ptr(path.nodes[0], path.slots[0],
> +					      struct btrfs_extent_data_ref);
> +			ret = check_prealloc_data_ref(fs_info, disk_bytenr,
> +						      dref, path.nodes[0]);
> +			if (ret != 0)
> +				goto out;
> +		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
> +			ret = check_prealloc_shared_data_ref(fs_info,
> +							     key.offset,
> +							     disk_bytenr);
> +			if (ret != 0)
> +				goto out;
> +		}
> +
> +		path.slots[0]++;
> +	}
> +out:
> +	btrfs_release_path(&path);
> +	return ret;
> +}
> +
> +/*
>   * Search in csum tree to find how many bytes of range [@start, @start + @len)
>   * has the corresponding csum item.
>   *
> diff --git a/check/mode-common.h b/check/mode-common.h
> index ffae782b..79e1b0cf 100644
> --- a/check/mode-common.h
> +++ b/check/mode-common.h
> @@ -80,6 +80,9 @@ static inline int fs_root_objectid(u64 objectid)
>  	return is_fstree(objectid);
>  }
>  
> +int check_prealloc_extent_written(struct btrfs_fs_info *fs_info,
> +				  u64 disk_bytenr,
> +				  u64 num_bytes);
>  int count_csum_range(struct btrfs_fs_info *fs_info, u64 start,
>  		     u64 len, u64 *found);
>  int insert_inode_item(struct btrfs_trans_handle *trans,
> diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c
> index 2424cdc2..02806377 100644
> --- a/check/mode-lowmem.c
> +++ b/check/mode-lowmem.c
> @@ -1511,9 +1511,17 @@ static int check_file_extent(struct btrfs_root *root, struct btrfs_key *fkey,
>  		      csum_found, search_len);
>  	} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC &&
>  		   csum_found > 0) {
> -		err |= ODD_CSUM_ITEM;
> -		error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have csum, but has: %llu",
> -		      root->objectid, fkey->objectid, fkey->offset, csum_found);
> +		ret = check_prealloc_extent_written(root->fs_info,
> +						    disk_bytenr,
> +						    disk_num_bytes);
> +		if (ret < 0)
> +			return ret;
> +		if (ret == 0) {
> +			err |= ODD_CSUM_ITEM;
> +			error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have csum, but has: %llu",
> +			      root->objectid, fkey->objectid, fkey->offset,
> +			      csum_found);
> +		}
>  	}
>  
>  	/* Check EXTENT_DATA hole */
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2 v3] Btrfs-progs: check, fix false error reports for shared prealloc extents
  2018-03-15 12:35   ` Nikolay Borisov
@ 2018-03-19 12:45     ` David Sterba
  0 siblings, 0 replies; 16+ messages in thread
From: David Sterba @ 2018-03-19 12:45 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: fdmanana, linux-btrfs

On Thu, Mar 15, 2018 at 02:35:36PM +0200, Nikolay Borisov wrote:
> On 14.03.2018 22:11, fdmanana@kernel.org wrote:
> > From: Filipe Manana <fdmanana@suse.com>
> > 
> > Under some cases the filesystem checker reports an error when it finds
> > checksum items for an extent that is referenced by an inode as a prealloc
> > extent. Such cases are not an error when the extent is actually shared
> > (was cloned/reflinked) with other inodes and was written through one of
> > those other inodes.
> > 
> > Example:
> > 
> >   $ mkfs.btrfs -f /dev/sdb
> >   $ mount /dev/sdb /mnt
> > 
> >   $ touch /mnt/foo
> >   $ xfs_io -c "falloc 0 256K" /mnt/foo
> >   $ sync
> > 
> >   $ xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
> >   $ touch /mnt/bar
> >   $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
> >   $ xfs_io -c "fsync" /mnt/bar
> > 
> >   <power fail>
> >   $ mount /dev/sdb /mnt
> >   $ umount /mnt
> > 
> >   $ btrfs check /dev/sdc
> >   Checking filesystem on /dev/sdb
> >   UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
> >   checking extents
> >   checking free space cache
> >   checking fs roots
> >   root 5 inode 257 errors 800, odd csum item
> >   ERROR: errors found in fs roots
> >   found 688128 bytes used, error(s) found
> >   total csum bytes: 256
> >   total tree bytes: 163840
> >   total fs tree bytes: 65536
> >   total extent tree bytes: 16384
> >   btree space waste bytes: 138819
> >   file data blocks allocated: 10747904
> >    referenced 10747904
> >   $ echo $?
> >   1
> > 
> > So teach check to not report such cases as errors by checking if the
> > extent is shared with other inodes and if so, consider it an error the
> > existence of checksum items only if all those other inodes are referencing
> > the extent as a prealloc extent.
> > This case can be hit often when running the generic/475 testcase from
> > fstests.
> > 
> > A test case will follow in a separate patch.
> > 
> > Signed-off-by: Filipe Manana <fdmanana@suse.com>
> 
> Reviewed-by: Nikolay Borisov <nborisov@suse.com>

Applied, thanks.

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2018-03-19 12:48 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-13 18:47 [PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents fdmanana
2018-03-13 22:48 ` [PATCH 1/2 v2] " fdmanana
2018-03-15  9:04   ` Nikolay Borisov
2018-03-15  9:44     ` Filipe Manana
2018-03-14  1:32 ` [PATCH 1/2] " Qu Wenruo
2018-03-14  1:58   ` Qu Wenruo
2018-03-14  1:59   ` Lu Fengqi
2018-03-14 10:33   ` Filipe Manana
2018-03-14 10:54     ` Qu Wenruo
2018-03-14  8:19 ` Nikolay Borisov
2018-03-14 10:34   ` Filipe Manana
2018-03-14  9:19 ` Nikolay Borisov
2018-03-14 10:35   ` Filipe Manana
2018-03-14 20:11 ` [PATCH 1/2 v3] " fdmanana
2018-03-15 12:35   ` Nikolay Borisov
2018-03-19 12:45     ` David Sterba

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.