linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Btrfs: check items for correctness as we search
@ 2011-03-16 20:04 Josef Bacik
  2011-03-16 20:50 ` Chris Mason
  2011-03-17 18:18 ` [PATCH] Btrfs: check items for correctness as we search V3 Josef Bacik
  0 siblings, 2 replies; 10+ messages in thread
From: Josef Bacik @ 2011-03-16 20:04 UTC (permalink / raw)
  To: linux-btrfs

Currently if we have corrupted items things will blow up in spectacular ways.
So as we read in blocks and they are leaves, check the entire leaf to make sure
all of the items are correct and point to valid parts in the leaf for the item
data the are responsible for.  If the item is corrupt we will kick back EIO and
not read any of the copies since they are likely to not be correct either.  This
will catch generic corruptions, it will be up to the individual callers of
btrfs_search_slot to make sure their items are right.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.c     |  123 --------------------------------------------------
 fs/btrfs/disk-io.c   |   92 +++++++++++++++++++++++++++++++++++++-
 fs/btrfs/extent_io.h |    1 +
 3 files changed, 92 insertions(+), 124 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b5baff0..73e5300 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -732,122 +732,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
 	return btrfs_item_offset_nr(leaf, nr - 1);
 }
 
-/*
- * extra debugging checks to make sure all the items in a key are
- * well formed and in the proper order
- */
-static int check_node(struct btrfs_root *root, struct btrfs_path *path,
-		      int level)
-{
-	struct extent_buffer *parent = NULL;
-	struct extent_buffer *node = path->nodes[level];
-	struct btrfs_disk_key parent_key;
-	struct btrfs_disk_key node_key;
-	int parent_slot;
-	int slot;
-	struct btrfs_key cpukey;
-	u32 nritems = btrfs_header_nritems(node);
-
-	if (path->nodes[level + 1])
-		parent = path->nodes[level + 1];
-
-	slot = path->slots[level];
-	BUG_ON(nritems == 0);
-	if (parent) {
-		parent_slot = path->slots[level + 1];
-		btrfs_node_key(parent, &parent_key, parent_slot);
-		btrfs_node_key(node, &node_key, 0);
-		BUG_ON(memcmp(&parent_key, &node_key,
-			      sizeof(struct btrfs_disk_key)));
-		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_bytenr(node));
-	}
-	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
-	if (slot != 0) {
-		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
-		btrfs_node_key(node, &node_key, slot);
-		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
-	}
-	if (slot < nritems - 1) {
-		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
-		btrfs_node_key(node, &node_key, slot);
-		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
-	}
-	return 0;
-}
-
-/*
- * extra checking to make sure all the items in a leaf are
- * well formed and in the proper order
- */
-static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
-		      int level)
-{
-	struct extent_buffer *leaf = path->nodes[level];
-	struct extent_buffer *parent = NULL;
-	int parent_slot;
-	struct btrfs_key cpukey;
-	struct btrfs_disk_key parent_key;
-	struct btrfs_disk_key leaf_key;
-	int slot = path->slots[0];
-
-	u32 nritems = btrfs_header_nritems(leaf);
-
-	if (path->nodes[level + 1])
-		parent = path->nodes[level + 1];
-
-	if (nritems == 0)
-		return 0;
-
-	if (parent) {
-		parent_slot = path->slots[level + 1];
-		btrfs_node_key(parent, &parent_key, parent_slot);
-		btrfs_item_key(leaf, &leaf_key, 0);
-
-		BUG_ON(memcmp(&parent_key, &leaf_key,
-		       sizeof(struct btrfs_disk_key)));
-		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_bytenr(leaf));
-	}
-	if (slot != 0 && slot < nritems - 1) {
-		btrfs_item_key(leaf, &leaf_key, slot);
-		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
-		if (comp_keys(&leaf_key, &cpukey) <= 0) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad key\n", slot);
-			BUG_ON(1);
-		}
-		if (btrfs_item_offset_nr(leaf, slot - 1) !=
-		       btrfs_item_end_nr(leaf, slot)) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad\n", slot);
-			BUG_ON(1);
-		}
-	}
-	if (slot < nritems - 1) {
-		btrfs_item_key(leaf, &leaf_key, slot);
-		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
-		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
-		if (btrfs_item_offset_nr(leaf, slot) !=
-			btrfs_item_end_nr(leaf, slot + 1)) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad\n", slot);
-			BUG_ON(1);
-		}
-	}
-	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
-	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
-	return 0;
-}
-
-static noinline int check_block(struct btrfs_root *root,
-				struct btrfs_path *path, int level)
-{
-	return 0;
-	if (level == 0)
-		return check_leaf(root, path, level);
-	return check_node(root, path, level);
-}
 
 /*
  * search for key in the extent_buffer.  The items start at offset p,
@@ -1188,7 +1072,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		}
 	}
 	/* double check we haven't messed things up */
-	check_block(root, path, level);
 	if (orig_ptr !=
 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
 		BUG();
@@ -1798,12 +1681,6 @@ cow_done:
 		if (!cow)
 			btrfs_unlock_up_safe(p, level + 1);
 
-		ret = check_block(root, p, level);
-		if (ret) {
-			ret = -1;
-			goto done;
-		}
-
 		ret = bin_search(b, key, level, &slot);
 
 		if (level != 0) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 495b1ac..1694782 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -331,6 +331,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		    !verify_parent_transid(io_tree, eb, parent_transid))
 			return ret;
 
+		/*
+		 * This buffer's crc is fine, but its contents are corrupted, so
+		 * there is no reason to read the other copies, they won't be
+		 * any less wrong.
+		 */
+		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+			return ret;
+
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
 		if (num_copies == 1)
@@ -419,6 +427,76 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	return ret;
 }
 
+#define CORRUPT(reason, eb, root, slot)				\
+	printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"	\
+	       "root=%llu, slot=%d\n", reason,			\
+	       (unsigned long long)btrfs_header_bytenr(eb),	\
+	       (unsigned long long)root->objectid, slot)
+
+/*
+ * extra checking to make sure all the items in a leaf are
+ * well formed and in the proper order
+ */
+static int check_leaf(struct btrfs_root *root,
+		      struct extent_buffer *leaf, int slot)
+{
+	struct btrfs_key key;
+	struct btrfs_key leaf_key;
+	u32 nritems = btrfs_header_nritems(leaf);
+
+	btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+	if (slot != 0 && slot < nritems - 1) {
+		btrfs_item_key_to_cpu(leaf, &key, slot - 1);
+		if (btrfs_comp_cpu_keys(&leaf_key, &key) <= 0) {
+			CORRUPT("offset bad key", leaf, root, slot);
+			return -EIO;
+		}
+		if (btrfs_item_offset_nr(leaf, slot - 1) !=
+		       btrfs_item_end_nr(leaf, slot)) {
+			CORRUPT("slot offset bad", leaf, root, slot);
+			return -EIO;
+		}
+	}
+	if (slot < nritems - 1) {
+		btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+		if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+			CORRUPT("offset bad key", leaf, root, slot);
+			return -EIO;
+		}
+		if (btrfs_item_offset_nr(leaf, slot) !=
+			btrfs_item_end_nr(leaf, slot + 1)) {
+			CORRUPT("slot offset bad", leaf, root, slot);
+			return -EIO;
+		}
+	}
+	if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+	    BTRFS_LEAF_DATA_SIZE(root)) {
+		CORRUPT("invalid offset size pair", leaf, root, slot);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static noinline int check_block(struct btrfs_root *root,
+				struct extent_buffer *eb)
+{
+	u32 nritems = btrfs_header_nritems(eb);
+	int slot;
+	int ret = 0;
+
+	if (nritems == 0)
+		return 0;
+
+	for (slot = 0; slot < nritems; slot++) {
+		ret = check_leaf(root, eb, slot);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
 {
@@ -485,8 +563,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	btrfs_set_buffer_lockdep_class(eb, found_level);
 
 	ret = csum_tree_block(root, eb, 1);
-	if (ret)
+	if (ret) {
+		ret = -EIO;
+		goto err;
+	}
+
+	/*
+	 * If this is a leaf block and it is corrupt, set the corrupt bit so
+	 * that we don't try and read the other copies of this block, just
+	 * return -EIO.
+	 */
+	if (found_level == 0 && check_block(root, eb)) {
+		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 		ret = -EIO;
+	}
 
 	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 	end = eb->start + end - 1;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9318dfe..f62c544 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -31,6 +31,7 @@
 #define EXTENT_BUFFER_UPTODATE 0
 #define EXTENT_BUFFER_BLOCKING 1
 #define EXTENT_BUFFER_DIRTY 2
+#define EXTENT_BUFFER_CORRUPT 3
 
 /* these are flags for extent_clear_unlock_delalloc */
 #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
-- 
1.7.2.3


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] Btrfs: check items for correctness as we search
  2011-03-16 20:04 [PATCH] Btrfs: check items for correctness as we search Josef Bacik
@ 2011-03-16 20:50 ` Chris Mason
  2011-03-16 21:05   ` Josef Bacik
  2011-03-17 18:18 ` [PATCH] Btrfs: check items for correctness as we search V3 Josef Bacik
  1 sibling, 1 reply; 10+ messages in thread
From: Chris Mason @ 2011-03-16 20:50 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs

Excerpts from Josef Bacik's message of 2011-03-16 16:04:23 -0400:
> Currently if we have corrupted items things will blow up in spectacular ways.
> So as we read in blocks and they are leaves, check the entire leaf to make sure
> all of the items are correct and point to valid parts in the leaf for the item
> data the are responsible for.  If the item is corrupt we will kick back EIO and
> not read any of the copies since they are likely to not be correct either.  This
> will catch generic corruptions, it will be up to the individual callers of
> btrfs_search_slot to make sure their items are right.  Thanks,

Thanks for working on this, a few comments below.

> 
> Signed-off-by: Josef Bacik <josef@redhat.com>

> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 495b1ac..1694782 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -331,6 +331,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
>              !verify_parent_transid(io_tree, eb, parent_transid))
>              return ret;
>  
> +        /*
> +         * This buffer's crc is fine, but its contents are corrupted, so
> +         * there is no reason to read the other copies, they won't be
> +         * any less wrong.
> +         */
> +        if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
> +            return ret;
> +

We should make sure to clear this when read_extent_buffer_pages starts?
At the very least it should get cleared if we delete the block.

>          num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
>                            eb->start, eb->len);
>          if (num_copies == 1)
> @@ -419,6 +427,76 @@ static int check_tree_block_fsid(struct btrfs_root *root,
>      return ret;
>  }
>  
> +#define CORRUPT(reason, eb, root, slot)                \
> +    printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"    \
> +           "root=%llu, slot=%d\n", reason,            \
> +           (unsigned long long)btrfs_header_bytenr(eb),    \
> +           (unsigned long long)root->objectid, slot)
> +
> +/*
> + * extra checking to make sure all the items in a leaf are
> + * well formed and in the proper order
> + */
> +static int check_leaf(struct btrfs_root *root,
> +              struct extent_buffer *leaf, int slot)
> +{
> +    struct btrfs_key key;
> +    struct btrfs_key leaf_key;
> +    u32 nritems = btrfs_header_nritems(leaf);
> +
> +    btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
> +    if (slot != 0 && slot < nritems - 1) {
> +        btrfs_item_key_to_cpu(leaf, &key, slot - 1);
> +        if (btrfs_comp_cpu_keys(&leaf_key, &key) <= 0) {
> +            CORRUPT("offset bad key", leaf, root, slot);
> +            return -EIO;
> +        }
> +        if (btrfs_item_offset_nr(leaf, slot - 1) !=
> +               btrfs_item_end_nr(leaf, slot)) {
> +            CORRUPT("slot offset bad", leaf, root, slot);
> +            return -EIO;
> +        }

Ok, this checks the item before our slot.

> +    }
> +    if (slot < nritems - 1) {
> +        btrfs_item_key_to_cpu(leaf, &key, slot + 1);
> +        if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
> +            CORRUPT("offset bad key", leaf, root, slot);
> +            return -EIO;
> +        }
> +        if (btrfs_item_offset_nr(leaf, slot) !=
> +            btrfs_item_end_nr(leaf, slot + 1)) {
> +            CORRUPT("slot offset bad", leaf, root, slot);
> +            return -EIO;
> +        }

And this checks the item after our slot

> +    }
> +    if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
> +        BTRFS_LEAF_DATA_SIZE(root)) {
> +        CORRUPT("invalid offset size pair", leaf, root, slot);
> +        return -EIO;

And this checks item zero.  But we're not checking to make sure
the offsets of the item headers are inside the leaf.  In your code they
all have to be consistent, but they might all point into funny places
(consistently).  I'm not sure if that is possible to do and have item
zero check out, but it seems like we could add one check to make sure
the offset is inside the block itself.

> +    }
> +
> +    return 0;
> +}
> +
> +static noinline int check_block(struct btrfs_root *root,
> +                struct extent_buffer *eb)
> +{
> +    u32 nritems = btrfs_header_nritems(eb);
> +    int slot;
> +    int ret = 0;
> +
> +    if (nritems == 0)
> +        return 0;
> +
> +    for (slot = 0; slot < nritems; slot++) {
> +        ret = check_leaf(root, eb, slot);
> +        if (ret)
> +            break;
> +    }

I might be missing something, but this looks like:

	for each item in the leaf {
		check the item before
		check the item after
		check item 0
	}

Why not:
	check item 0
	for each item in the leaf {
		check the item after
	}
	check the last item

> +
> +    return ret;
> +}
> +
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>  void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
>  {
> @@ -485,8 +563,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
>      btrfs_set_buffer_lockdep_class(eb, found_level);
>  
>      ret = csum_tree_block(root, eb, 1);
> -    if (ret)
> +    if (ret) {
> +        ret = -EIO;
> +        goto err;
> +    }
> +
> +    /*
> +     * If this is a leaf block and it is corrupt, set the corrupt bit so
> +     * that we don't try and read the other copies of this block, just
> +     * return -EIO.
> +     */
> +    if (found_level == 0 && check_block(root, eb)) {
> +        set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
>          ret = -EIO;
> +    }
>  
>      end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
>      end = eb->start + end - 1;
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 9318dfe..f62c544 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -31,6 +31,7 @@
>  #define EXTENT_BUFFER_UPTODATE 0
>  #define EXTENT_BUFFER_BLOCKING 1
>  #define EXTENT_BUFFER_DIRTY 2
> +#define EXTENT_BUFFER_CORRUPT 3
>  
>  /* these are flags for extent_clear_unlock_delalloc */
>  #define EXTENT_CLEAR_UNLOCK_PAGE 0x1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Btrfs: check items for correctness as we search
  2011-03-16 20:50 ` Chris Mason
@ 2011-03-16 21:05   ` Josef Bacik
  0 siblings, 0 replies; 10+ messages in thread
From: Josef Bacik @ 2011-03-16 21:05 UTC (permalink / raw)
  To: Chris Mason; +Cc: Josef Bacik, linux-btrfs

On Wed, Mar 16, 2011 at 04:50:30PM -0400, Chris Mason wrote:
> Excerpts from Josef Bacik's message of 2011-03-16 16:04:23 -0400:
> > Currently if we have corrupted items things will blow up in spectacular ways.
> > So as we read in blocks and they are leaves, check the entire leaf to make sure
> > all of the items are correct and point to valid parts in the leaf for the item
> > data the are responsible for.  If the item is corrupt we will kick back EIO and
> > not read any of the copies since they are likely to not be correct either.  This
> > will catch generic corruptions, it will be up to the individual callers of
> > btrfs_search_slot to make sure their items are right.  Thanks,
> 
> Thanks for working on this, a few comments below.
> 
> > 
> > Signed-off-by: Josef Bacik <josef@redhat.com>
> 
> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> > index 495b1ac..1694782 100644
> > --- a/fs/btrfs/disk-io.c
> > +++ b/fs/btrfs/disk-io.c
> > @@ -331,6 +331,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
> >              !verify_parent_transid(io_tree, eb, parent_transid))
> >              return ret;
> >  
> > +        /*
> > +         * This buffer's crc is fine, but its contents are corrupted, so
> > +         * there is no reason to read the other copies, they won't be
> > +         * any less wrong.
> > +         */
> > +        if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
> > +            return ret;
> > +
> 
> We should make sure to clear this when read_extent_buffer_pages starts?
> At the very least it should get cleared if we delete the block.
> 

Ok I can fix that.

> >          num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
> >                            eb->start, eb->len);
> >          if (num_copies == 1)
> > @@ -419,6 +427,76 @@ static int check_tree_block_fsid(struct btrfs_root *root,
> >      return ret;
> >  }
> >  
> > +#define CORRUPT(reason, eb, root, slot)                \
> > +    printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"    \
> > +           "root=%llu, slot=%d\n", reason,            \
> > +           (unsigned long long)btrfs_header_bytenr(eb),    \
> > +           (unsigned long long)root->objectid, slot)
> > +
> > +/*
> > + * extra checking to make sure all the items in a leaf are
> > + * well formed and in the proper order
> > + */
> > +static int check_leaf(struct btrfs_root *root,
> > +              struct extent_buffer *leaf, int slot)
> > +{
> > +    struct btrfs_key key;
> > +    struct btrfs_key leaf_key;
> > +    u32 nritems = btrfs_header_nritems(leaf);
> > +
> > +    btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
> > +    if (slot != 0 && slot < nritems - 1) {
> > +        btrfs_item_key_to_cpu(leaf, &key, slot - 1);
> > +        if (btrfs_comp_cpu_keys(&leaf_key, &key) <= 0) {
> > +            CORRUPT("offset bad key", leaf, root, slot);
> > +            return -EIO;
> > +        }
> > +        if (btrfs_item_offset_nr(leaf, slot - 1) !=
> > +               btrfs_item_end_nr(leaf, slot)) {
> > +            CORRUPT("slot offset bad", leaf, root, slot);
> > +            return -EIO;
> > +        }
> 
> Ok, this checks the item before our slot.
> 
> > +    }
> > +    if (slot < nritems - 1) {
> > +        btrfs_item_key_to_cpu(leaf, &key, slot + 1);
> > +        if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
> > +            CORRUPT("offset bad key", leaf, root, slot);
> > +            return -EIO;
> > +        }
> > +        if (btrfs_item_offset_nr(leaf, slot) !=
> > +            btrfs_item_end_nr(leaf, slot + 1)) {
> > +            CORRUPT("slot offset bad", leaf, root, slot);
> > +            return -EIO;
> > +        }
> 
> And this checks the item after our slot
> 
> > +    }
> > +    if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
> > +        BTRFS_LEAF_DATA_SIZE(root)) {
> > +        CORRUPT("invalid offset size pair", leaf, root, slot);
> > +        return -EIO;
> 
> And this checks item zero.  But we're not checking to make sure
> the offsets of the item headers are inside the leaf.  In your code they
> all have to be consistent, but they might all point into funny places
> (consistently).  I'm not sure if that is possible to do and have item
> zero check out, but it seems like we could add one check to make sure
> the offset is inside the block itself.
> 
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static noinline int check_block(struct btrfs_root *root,
> > +                struct extent_buffer *eb)
> > +{
> > +    u32 nritems = btrfs_header_nritems(eb);
> > +    int slot;
> > +    int ret = 0;
> > +
> > +    if (nritems == 0)
> > +        return 0;
> > +
> > +    for (slot = 0; slot < nritems; slot++) {
> > +        ret = check_leaf(root, eb, slot);
> > +        if (ret)
> > +            break;
> > +    }
> 
> I might be missing something, but this looks like:
> 
> 	for each item in the leaf {
> 		check the item before
> 		check the item after
> 		check item 0
> 	}
> 
> Why not:
> 	check item 0
> 	for each item in the leaf {
> 		check the item after
> 	}
> 	check the last item
> 

Right that sounds good.  I'll fix this up tomorrow and resend.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] Btrfs: check items for correctness as we search V3
  2011-03-16 20:04 [PATCH] Btrfs: check items for correctness as we search Josef Bacik
  2011-03-16 20:50 ` Chris Mason
@ 2011-03-17 18:18 ` Josef Bacik
  2011-03-17 19:12   ` Andrey Kuzmin
  1 sibling, 1 reply; 10+ messages in thread
From: Josef Bacik @ 2011-03-17 18:18 UTC (permalink / raw)
  To: linux-btrfs

Currently if we have corrupted items things will blow up in spectacular ways.
So as we read in blocks and they are leaves, check the entire leaf to make sure
all of the items are correct and point to valid parts in the leaf for the item
data the are responsible for.  If the item is corrupt we will kick back EIO and
not read any of the copies since they are likely to not be correct either.  This
will catch generic corruptions, it will be up to the individual callers of
btrfs_search_slot to make sure their items are right.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
V2->V3:
-Be smarter about checking the leaf
-Clear EXTENT_BUFFER_CORRUPT on read and delete

 fs/btrfs/ctree.c       |  123 ------------------------------------------------
 fs/btrfs/disk-io.c     |   90 ++++++++++++++++++++++++++++++++++-
 fs/btrfs/extent-tree.c |    5 ++
 fs/btrfs/extent_io.h   |    1 +
 4 files changed, 95 insertions(+), 124 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b5baff0..73e5300 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -732,122 +732,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
 	return btrfs_item_offset_nr(leaf, nr - 1);
 }
 
-/*
- * extra debugging checks to make sure all the items in a key are
- * well formed and in the proper order
- */
-static int check_node(struct btrfs_root *root, struct btrfs_path *path,
-		      int level)
-{
-	struct extent_buffer *parent = NULL;
-	struct extent_buffer *node = path->nodes[level];
-	struct btrfs_disk_key parent_key;
-	struct btrfs_disk_key node_key;
-	int parent_slot;
-	int slot;
-	struct btrfs_key cpukey;
-	u32 nritems = btrfs_header_nritems(node);
-
-	if (path->nodes[level + 1])
-		parent = path->nodes[level + 1];
-
-	slot = path->slots[level];
-	BUG_ON(nritems == 0);
-	if (parent) {
-		parent_slot = path->slots[level + 1];
-		btrfs_node_key(parent, &parent_key, parent_slot);
-		btrfs_node_key(node, &node_key, 0);
-		BUG_ON(memcmp(&parent_key, &node_key,
-			      sizeof(struct btrfs_disk_key)));
-		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_bytenr(node));
-	}
-	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
-	if (slot != 0) {
-		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
-		btrfs_node_key(node, &node_key, slot);
-		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
-	}
-	if (slot < nritems - 1) {
-		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
-		btrfs_node_key(node, &node_key, slot);
-		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
-	}
-	return 0;
-}
-
-/*
- * extra checking to make sure all the items in a leaf are
- * well formed and in the proper order
- */
-static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
-		      int level)
-{
-	struct extent_buffer *leaf = path->nodes[level];
-	struct extent_buffer *parent = NULL;
-	int parent_slot;
-	struct btrfs_key cpukey;
-	struct btrfs_disk_key parent_key;
-	struct btrfs_disk_key leaf_key;
-	int slot = path->slots[0];
-
-	u32 nritems = btrfs_header_nritems(leaf);
-
-	if (path->nodes[level + 1])
-		parent = path->nodes[level + 1];
-
-	if (nritems == 0)
-		return 0;
-
-	if (parent) {
-		parent_slot = path->slots[level + 1];
-		btrfs_node_key(parent, &parent_key, parent_slot);
-		btrfs_item_key(leaf, &leaf_key, 0);
-
-		BUG_ON(memcmp(&parent_key, &leaf_key,
-		       sizeof(struct btrfs_disk_key)));
-		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_bytenr(leaf));
-	}
-	if (slot != 0 && slot < nritems - 1) {
-		btrfs_item_key(leaf, &leaf_key, slot);
-		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
-		if (comp_keys(&leaf_key, &cpukey) <= 0) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad key\n", slot);
-			BUG_ON(1);
-		}
-		if (btrfs_item_offset_nr(leaf, slot - 1) !=
-		       btrfs_item_end_nr(leaf, slot)) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad\n", slot);
-			BUG_ON(1);
-		}
-	}
-	if (slot < nritems - 1) {
-		btrfs_item_key(leaf, &leaf_key, slot);
-		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
-		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
-		if (btrfs_item_offset_nr(leaf, slot) !=
-			btrfs_item_end_nr(leaf, slot + 1)) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad\n", slot);
-			BUG_ON(1);
-		}
-	}
-	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
-	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
-	return 0;
-}
-
-static noinline int check_block(struct btrfs_root *root,
-				struct btrfs_path *path, int level)
-{
-	return 0;
-	if (level == 0)
-		return check_leaf(root, path, level);
-	return check_node(root, path, level);
-}
 
 /*
  * search for key in the extent_buffer.  The items start at offset p,
@@ -1188,7 +1072,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		}
 	}
 	/* double check we haven't messed things up */
-	check_block(root, path, level);
 	if (orig_ptr !=
 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
 		BUG();
@@ -1798,12 +1681,6 @@ cow_done:
 		if (!cow)
 			btrfs_unlock_up_safe(p, level + 1);
 
-		ret = check_block(root, p, level);
-		if (ret) {
-			ret = -1;
-			goto done;
-		}
-
 		ret = bin_search(b, key, level, &slot);
 
 		if (level != 0) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 495b1ac..9f31e11 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -323,6 +323,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	int num_copies = 0;
 	int mirror_num = 0;
 
+	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 	while (1) {
 		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -331,6 +332,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		    !verify_parent_transid(io_tree, eb, parent_transid))
 			return ret;
 
+		/*
+		 * This buffer's crc is fine, but its contents are corrupted, so
+		 * there is no reason to read the other copies, they won't be
+		 * any less wrong.
+		 */
+		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+			return ret;
+
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
 		if (num_copies == 1)
@@ -419,6 +428,73 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	return ret;
 }
 
+#define CORRUPT(reason, eb, root, slot)				\
+	printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"	\
+	       "root=%llu, slot=%d\n", reason,			\
+	       (unsigned long long)btrfs_header_bytenr(eb),	\
+	       (unsigned long long)root->objectid, slot)
+
+static noinline int check_leaf(struct btrfs_root *root,
+			       struct extent_buffer *leaf)
+{
+	struct btrfs_key key;
+	struct btrfs_key leaf_key;
+	u32 nritems = btrfs_header_nritems(leaf);
+	int slot;
+
+	if (nritems == 0)
+		return 0;
+
+	/* Check the 0 item */
+	if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+	    BTRFS_LEAF_DATA_SIZE(root)) {
+		CORRUPT("invalid item offset size pair", leaf, root, 0);
+		return -EIO;
+	}
+
+	/*
+	 * Check to make sure each items keys are in the correct order and their
+	 * offsets make sense.  We only have to loop through nritems-1 because
+	 * we check the current slot against the next slot, which verifies the
+	 * next slot's offset+size makes sense and that the current's slot
+	 * offset is correct.
+	 */
+	for (slot = 0; slot < nritems - 1; slot++) {
+		btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+		btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+
+		/* Make sure the keys are in the right order */
+		if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+			CORRUPT("bad key order", leaf, root, slot);
+			return -EIO;
+		}
+
+		/*
+		 * Make sure the offset and ends are right, remember that the
+		 * item data starts at the end of the leaf and grows towards the
+		 * front.
+		 */
+		if (btrfs_item_offset_nr(leaf, slot) !=
+			btrfs_item_end_nr(leaf, slot + 1)) {
+			CORRUPT("slot offset bad", leaf, root, slot);
+			return -EIO;
+		}
+
+		/*
+		 * Check to make sure that we don't point outside of the leaf,
+		 * just incase all the items are consistent to eachother, but
+		 * all point outside of the leaf.
+		 */
+		if (btrfs_item_end_nr(leaf, slot) >
+		    BTRFS_LEAF_DATA_SIZE(root)) {
+			CORRUPT("slot end outside of leaf", leaf, root, slot);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
 {
@@ -485,8 +561,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	btrfs_set_buffer_lockdep_class(eb, found_level);
 
 	ret = csum_tree_block(root, eb, 1);
-	if (ret)
+	if (ret) {
 		ret = -EIO;
+		goto err;
+	}
+
+	/*
+	 * If this is a leaf block and it is corrupt, set the corrupt bit so
+	 * that we don't try and read the other copies of this block, just
+	 * return -EIO.
+	 */
+	if (found_level == 0 && check_leaf(root, eb)) {
+		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+		ret = -EIO;
+	}
 
 	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 	end = eb->start + end - 1;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a8f4e8d..cd794c3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4754,6 +4754,11 @@ pin:
 		}
 	}
 out:
+	/*
+	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
+	 * anymore.
+	 */
+	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
 	btrfs_put_block_group(cache);
 }
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9318dfe..f62c544 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -31,6 +31,7 @@
 #define EXTENT_BUFFER_UPTODATE 0
 #define EXTENT_BUFFER_BLOCKING 1
 #define EXTENT_BUFFER_DIRTY 2
+#define EXTENT_BUFFER_CORRUPT 3
 
 /* these are flags for extent_clear_unlock_delalloc */
 #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
-- 
1.7.2.3


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] Btrfs: check items for correctness as we search V3
  2011-03-17 18:18 ` [PATCH] Btrfs: check items for correctness as we search V3 Josef Bacik
@ 2011-03-17 19:12   ` Andrey Kuzmin
  2011-03-18  0:52     ` Chris Mason
  0 siblings, 1 reply; 10+ messages in thread
From: Andrey Kuzmin @ 2011-03-17 19:12 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs

On Thu, Mar 17, 2011 at 9:18 PM, Josef Bacik <josef@redhat.com> wrote:
> Currently if we have corrupted items things will blow up in spectacul=
ar ways.
> So as we read in blocks and they are leaves, check the entire leaf to=
 make sure
> all of the items are correct and point to valid parts in the leaf for=
 the item
> data the are responsible for. =C2=A0If the item is corrupt we will ki=
ck back EIO and
> not read any of the copies since they are likely to not be correct ei=
ther. =C2=A0This
> will catch generic corruptions, it will be up to the individual calle=
rs of
> btrfs_search_slot to make sure their items are right. =C2=A0Thanks,
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 495b1ac..9f31e11 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -323,6 +323,7 @@ static int btree_read_extent_buffer_pages(struct =
btrfs_root *root,
> =C2=A0 =C2=A0 =C2=A0 =C2=A0int num_copies =3D 0;
> =C2=A0 =C2=A0 =C2=A0 =C2=A0int mirror_num =3D 0;
>
> + =C2=A0 =C2=A0 =C2=A0 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
> =C2=A0 =C2=A0 =C2=A0 =C2=A0io_tree =3D &BTRFS_I(root->fs_info->btree_=
inode)->io_tree;
> =C2=A0 =C2=A0 =C2=A0 =C2=A0while (1) {
> =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0ret =3D read_e=
xtent_buffer_pages(io_tree, eb, start, 1,
> @@ -331,6 +332,14 @@ static int btree_read_extent_buffer_pages(struct=
 btrfs_root *root,
> =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0=
!verify_parent_transid(io_tree, eb, parent_transid))
> =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0=
 =C2=A0 =C2=A0return ret;
>
> + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 /*
> + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0* This buffe=
r's crc is fine, but its contents are corrupted, so
> + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0* there is n=
o reason to read the other copies, they won't be
> + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0* any less w=
rong.
> + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0*/

This sounds like an overstatement to me. You may be dealing with an
error pattern CRC faled to catch, so giving up reading a mirror at
this point seems premature.

Regards,
Andrey
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" =
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Btrfs: check items for correctness as we search V3
  2011-03-17 19:12   ` Andrey Kuzmin
@ 2011-03-18  0:52     ` Chris Mason
  2011-03-18 13:05       ` Andrey Kuzmin
  0 siblings, 1 reply; 10+ messages in thread
From: Chris Mason @ 2011-03-18  0:52 UTC (permalink / raw)
  To: Andrey Kuzmin; +Cc: Josef Bacik, linux-btrfs

Excerpts from Andrey Kuzmin's message of 2011-03-17 15:12:32 -0400:
> On Thu, Mar 17, 2011 at 9:18 PM, Josef Bacik <josef@redhat.com> wrote=
:
> > Currently if we have corrupted items things will blow up in spectac=
ular ways.
> > So as we read in blocks and they are leaves, check the entire leaf =
to make sure
> > all of the items are correct and point to valid parts in the leaf f=
or the item
> > data the are responsible for. =C2=A0If the item is corrupt we will =
kick back EIO and
> > not read any of the copies since they are likely to not be correct =
either. =C2=A0This
> > will catch generic corruptions, it will be up to the individual cal=
lers of
> > btrfs_search_slot to make sure their items are right. =C2=A0Thanks,
> >
> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> > index 495b1ac..9f31e11 100644
> > --- a/fs/btrfs/disk-io.c
> > +++ b/fs/btrfs/disk-io.c
> > @@ -323,6 +323,7 @@ static int btree_read_extent_buffer_pages(struc=
t btrfs_root *root,
> > =C2=A0 =C2=A0 =C2=A0 =C2=A0int num_copies =3D 0;
> > =C2=A0 =C2=A0 =C2=A0 =C2=A0int mirror_num =3D 0;
> >
> > + =C2=A0 =C2=A0 =C2=A0 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags=
);
> > =C2=A0 =C2=A0 =C2=A0 =C2=A0io_tree =3D &BTRFS_I(root->fs_info->btre=
e_inode)->io_tree;
> > =C2=A0 =C2=A0 =C2=A0 =C2=A0while (1) {
> > =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0ret =3D read=
_extent_buffer_pages(io_tree, eb, start, 1,
> > @@ -331,6 +332,14 @@ static int btree_read_extent_buffer_pages(stru=
ct btrfs_root *root,
> > =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0=
!verify_parent_transid(io_tree, eb, parent_transid))
> > =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0=
 =C2=A0 =C2=A0return ret;
> >
> > + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 /*
> > + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0* This buf=
fer's crc is fine, but its contents are corrupted, so
> > + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0* there is=
 no reason to read the other copies, they won't be
> > + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0* any less=
 wrong.
> > + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0*/
>=20
> This sounds like an overstatement to me. You may be dealing with an
> error pattern CRC faled to catch, so giving up reading a mirror at
> this point seems premature.

But we have no way to tell which one is more correct, at least not
without a full fsck.

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" =
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Btrfs: check items for correctness as we search V3
  2011-03-18  0:52     ` Chris Mason
@ 2011-03-18 13:05       ` Andrey Kuzmin
  0 siblings, 0 replies; 10+ messages in thread
From: Andrey Kuzmin @ 2011-03-18 13:05 UTC (permalink / raw)
  To: Chris Mason; +Cc: Josef Bacik, linux-btrfs

On Fri, Mar 18, 2011 at 3:52 AM, Chris Mason <chris.mason@oracle.com> w=
rote:
> Excerpts from Andrey Kuzmin's message of 2011-03-17 15:12:32 -0400:
>> On Thu, Mar 17, 2011 at 9:18 PM, Josef Bacik <josef@redhat.com> wrot=
e:
>> > Currently if we have corrupted items things will blow up in specta=
cular ways.
>> > So as we read in blocks and they are leaves, check the entire leaf=
 to make sure
>> > all of the items are correct and point to valid parts in the leaf =
for the item
>> > data the are responsible for. =C2=A0If the item is corrupt we will=
 kick back EIO and
>> > not read any of the copies since they are likely to not be correct=
 either. =C2=A0This
>> > will catch generic corruptions, it will be up to the individual ca=
llers of
>> > btrfs_search_slot to make sure their items are right. =C2=A0Thanks=
,
>> >
>> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> > index 495b1ac..9f31e11 100644
>> > --- a/fs/btrfs/disk-io.c
>> > +++ b/fs/btrfs/disk-io.c
>> > @@ -323,6 +323,7 @@ static int btree_read_extent_buffer_pages(stru=
ct btrfs_root *root,
>> > =C2=A0 =C2=A0 =C2=A0 =C2=A0int num_copies =3D 0;
>> > =C2=A0 =C2=A0 =C2=A0 =C2=A0int mirror_num =3D 0;
>> >
>> > + =C2=A0 =C2=A0 =C2=A0 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflag=
s);
>> > =C2=A0 =C2=A0 =C2=A0 =C2=A0io_tree =3D &BTRFS_I(root->fs_info->btr=
ee_inode)->io_tree;
>> > =C2=A0 =C2=A0 =C2=A0 =C2=A0while (1) {
>> > =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0ret =3D rea=
d_extent_buffer_pages(io_tree, eb, start, 1,
>> > @@ -331,6 +332,14 @@ static int btree_read_extent_buffer_pages(str=
uct btrfs_root *root,
>> > =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=
=A0!verify_parent_transid(io_tree, eb, parent_transid))
>> > =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=
=A0 =C2=A0 =C2=A0return ret;
>> >
>> > + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 /*
>> > + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0* This bu=
ffer's crc is fine, but its contents are corrupted, so
>> > + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0* there i=
s no reason to read the other copies, they won't be
>> > + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0* any les=
s wrong.
>> > + =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0*/
>>
>> This sounds like an overstatement to me. You may be dealing with an
>> error pattern CRC faled to catch, so giving up reading a mirror at
>> this point seems premature.
>
> But we have no way to tell which one is more correct, at least not
> without a full fsck.

Voting with two participants (would be better to have at least three,
though theory says even this is insufficient in the presence of
failures  :)) is naturally deficient, so you are right in general
except one particular case: when 2nd copy passes CRC _and_
verification, and two copies differ by a bit pattern undetectable by
CRC in use.

This is a corner case, of course, but the price to pay for false
positive (full fsck with associated downtime) is high enough to make
it worth a deeper dive.

Regards,
Andrey

>
> -chris
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" =
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Btrfs: check items for correctness as we search
       [not found]     ` <AANLkTi=CAuTL-wH-KmnqhoJH1hHhucFOzSukE5rJ2PUf@mail.gmail.com>
@ 2011-03-16 18:54       ` Josef Bacik
  0 siblings, 0 replies; 10+ messages in thread
From: Josef Bacik @ 2011-03-16 18:54 UTC (permalink / raw)
  To: Andrey Kuzmin; +Cc: Josef Bacik, linux-btrfs

On Wed, Mar 16, 2011 at 09:34:29PM +0300, Andrey Kuzmin wrote:
> On Wed, Mar 16, 2011 at 9:29 PM, Josef Bacik <josef@redhat.com> wrote:
> 
> > On Wed, Mar 16, 2011 at 09:23:58PM +0300, Andrey Kuzmin wrote:
> > > On Wed, Mar 16, 2011 at 8:44 PM, Josef Bacik <josef@redhat.com> wrote:
> > >
> > > > Currently if we have corrupted items things will blow up in spectacular
> > > > ways.
> > > > So as we search check the item that we are pointing at currently to
> > make
> > > > sure it
> > > >
> > >
> > > Will these checks run multiple times on the same slot? If yes, why not
> > have
> > > an in-memory bit 'checked' and skip checks of already checked items? Or
> > > simply check once when loading from disk?
> > >
> >
> > I thought about this but I have some reservations.  In order to make sure
> > the
> > block is actually right we'd have to check the entire thing when it's read
> > from
> > disk, which isn't too bad, but what happens when only one item in the block
> > is
> > corrupt?
> 
> For example if just the xattr item for an inode is corrupt, this
> > method would keep you from reading the inode item in the same block which
> > could
> > be just fine.
> 
> 
> I've lost you here. If you check once on load, you can just reread and
> that's it.
> 

Hrm I'll be a little clearer.  If in one block we have 4 valid items and one
bogus one, with my current patch we'd only fail if we tried to use the bogus
one, the valid items would work fine.

However if we check on read, we have to verify the _entire_ block, so if 1 item
is broken the entire block is thrown out, and if there were valid items in that
block we won't be able to read those items either.

It seemed to me a better choice to let the user limp along if at all possible,
but I'm not convinced that I'm right, so I could be persuaded to do it on read.
I'll see what other people think.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Btrfs: check items for correctness as we search
       [not found] ` <AANLkTikTnYr8hPWCQpVFBzHyA9gSPijiiRv5LBXXpeti@mail.gmail.com>
@ 2011-03-16 18:29   ` Josef Bacik
       [not found]     ` <AANLkTi=CAuTL-wH-KmnqhoJH1hHhucFOzSukE5rJ2PUf@mail.gmail.com>
  0 siblings, 1 reply; 10+ messages in thread
From: Josef Bacik @ 2011-03-16 18:29 UTC (permalink / raw)
  To: Andrey Kuzmin; +Cc: Josef Bacik, linux-btrfs

On Wed, Mar 16, 2011 at 09:23:58PM +0300, Andrey Kuzmin wrote:
> On Wed, Mar 16, 2011 at 8:44 PM, Josef Bacik <josef@redhat.com> wrote:
> 
> > Currently if we have corrupted items things will blow up in spectacular
> > ways.
> > So as we search check the item that we are pointing at currently to make
> > sure it
> >
> 
> Will these checks run multiple times on the same slot? If yes, why not have
> an in-memory bit 'checked' and skip checks of already checked items? Or
> simply check once when loading from disk?
> 

I thought about this but I have some reservations.  In order to make sure the
block is actually right we'd have to check the entire thing when it's read from
disk, which isn't too bad, but what happens when only one item in the block is
corrupt?  For example if just the xattr item for an inode is corrupt, this
method would keep you from reading the inode item in the same block which could
be just fine.  If this is acceptable for everybody then I don't mind doing it
this way, but the current implementation makes sure we only check what we are
actually interested in so we can have the best possible chance of limping along
if only part of the block is bogus.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] Btrfs: check items for correctness as we search
@ 2011-03-16 17:44 Josef Bacik
       [not found] ` <AANLkTikTnYr8hPWCQpVFBzHyA9gSPijiiRv5LBXXpeti@mail.gmail.com>
  0 siblings, 1 reply; 10+ messages in thread
From: Josef Bacik @ 2011-03-16 17:44 UTC (permalink / raw)
  To: linux-btrfs

Currently if we have corrupted items things will blow up in spectacular ways.
So as we search check the item that we are pointing at currently to make sure it
passes sanity checks before moving on.  This will catch corruptions in the base
item, the more complex item checking will be up to various consumers of
btrfs_search_slot, but for now this gets us good sanity checking.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.c |   88 +++++++++++++++++++++++++++++++++++++++---------------
 1 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b5baff0..f973ada 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -732,6 +732,12 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
 	return btrfs_item_offset_nr(leaf, nr - 1);
 }
 
+#define CORRUPT(reason, eb, root, level, slot)					\
+	printk(KERN_CRIT "btrfs: corrupt %s, %s: block=%llu, root=%llu, "	\
+	       "level=%d, slot=%d\n", level ? "node" : "leaf", reason,		\
+	       (unsigned long long)btrfs_header_bytenr(eb),			\
+	       (unsigned long long)root->objectid, level, slot)
+
 /*
  * extra debugging checks to make sure all the items in a key are
  * well formed and in the proper order
@@ -757,21 +763,42 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 		parent_slot = path->slots[level + 1];
 		btrfs_node_key(parent, &parent_key, parent_slot);
 		btrfs_node_key(node, &node_key, 0);
-		BUG_ON(memcmp(&parent_key, &node_key,
-			      sizeof(struct btrfs_disk_key)));
-		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_bytenr(node));
+		if (memcmp(&parent_key, &node_key,
+			   sizeof(struct btrfs_disk_key))) {
+			CORRUPT("invalid node key", node, root, level, slot);
+			return -EIO;
+		}
+		if (btrfs_node_blockptr(parent, parent_slot) !=
+		    btrfs_header_bytenr(node)) {
+			CORRUPT("blockptr is not right", node, root, level,
+				slot);
+			return -EIO;
+		}
 	}
-	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
+
+	if (nritems > BTRFS_NODEPTRS_PER_BLOCK(root)) {
+		CORRUPT("wrong nritems count", node, root, level, slot);
+		return -EIO;
+	}
+
 	if (slot != 0) {
 		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
 		btrfs_node_key(node, &node_key, slot);
-		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
+		if (comp_keys(&node_key, &cpukey) <= 0) {
+			CORRUPT("keys in wrong order", node, root, level,
+				slot);
+			return -EIO;
+		}
 	}
+
 	if (slot < nritems - 1) {
 		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
 		btrfs_node_key(node, &node_key, slot);
-		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
+		if (comp_keys(&node_key, &cpukey) >= 0) {
+			CORRUPT("keys in wrong order", node, root, level,
+				slot);
+			return -EIO;
+		}
 	}
 	return 0;
 }
@@ -804,24 +831,31 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		btrfs_node_key(parent, &parent_key, parent_slot);
 		btrfs_item_key(leaf, &leaf_key, 0);
 
-		BUG_ON(memcmp(&parent_key, &leaf_key,
-		       sizeof(struct btrfs_disk_key)));
-		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_bytenr(leaf));
+		if (memcmp(&parent_key, &leaf_key,
+			   sizeof(struct btrfs_disk_key))) {
+			CORRUPT("bad first key", leaf, root, level, slot);
+			return -EIO;
+		}
+
+		if (btrfs_node_blockptr(parent, parent_slot) !=
+		    btrfs_header_bytenr(leaf)) {
+			CORRUPT("bad block ptr", leaf, root, level, slot);
+			return -EIO;
+		}
 	}
 	if (slot != 0 && slot < nritems - 1) {
 		btrfs_item_key(leaf, &leaf_key, slot);
 		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
 		if (comp_keys(&leaf_key, &cpukey) <= 0) {
 			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad key\n", slot);
-			BUG_ON(1);
+			CORRUPT("offset bad key", leaf, root, level, slot);
+			return -EIO;
 		}
 		if (btrfs_item_offset_nr(leaf, slot - 1) !=
 		       btrfs_item_end_nr(leaf, slot)) {
 			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad\n", slot);
-			BUG_ON(1);
+			CORRUPT("slot offset bad", leaf, root, level, slot);
+			return -EIO;
 		}
 	}
 	if (slot < nritems - 1) {
@@ -831,19 +865,22 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		if (btrfs_item_offset_nr(leaf, slot) !=
 			btrfs_item_end_nr(leaf, slot + 1)) {
 			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad\n", slot);
-			BUG_ON(1);
+			CORRUPT("slot offset bad", leaf, root, level, slot);
+			return -EIO;
 		}
 	}
-	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
-	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
+	if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+	    BTRFS_LEAF_DATA_SIZE(root)) {
+		CORRUPT("invalid offset size pair", leaf, root, level, slot);
+		return -EIO;
+	}
+
 	return 0;
 }
 
 static noinline int check_block(struct btrfs_root *root,
 				struct btrfs_path *path, int level)
 {
-	return 0;
 	if (level == 0)
 		return check_leaf(root, path, level);
 	return check_node(root, path, level);
@@ -1188,7 +1225,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		}
 	}
 	/* double check we haven't messed things up */
-	check_block(root, path, level);
 	if (orig_ptr !=
 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
 		BUG();
@@ -1799,10 +1835,8 @@ cow_done:
 			btrfs_unlock_up_safe(p, level + 1);
 
 		ret = check_block(root, p, level);
-		if (ret) {
-			ret = -1;
+		if (ret)
 			goto done;
-		}
 
 		ret = bin_search(b, key, level, &slot);
 
@@ -4442,6 +4476,12 @@ again:
 		if (!path->skip_locking)
 			path->locks[level] = 1;
 
+		ret = check_block(root, path, level);
+		if (ret) {
+			btrfs_release_path(root, path);
+			goto done;
+		}
+
 		if (!level)
 			break;
 
-- 
1.7.2.3


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2011-03-18 13:05 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-03-16 20:04 [PATCH] Btrfs: check items for correctness as we search Josef Bacik
2011-03-16 20:50 ` Chris Mason
2011-03-16 21:05   ` Josef Bacik
2011-03-17 18:18 ` [PATCH] Btrfs: check items for correctness as we search V3 Josef Bacik
2011-03-17 19:12   ` Andrey Kuzmin
2011-03-18  0:52     ` Chris Mason
2011-03-18 13:05       ` Andrey Kuzmin
  -- strict thread matches above, loose matches on Subject: below --
2011-03-16 17:44 [PATCH] Btrfs: check items for correctness as we search Josef Bacik
     [not found] ` <AANLkTikTnYr8hPWCQpVFBzHyA9gSPijiiRv5LBXXpeti@mail.gmail.com>
2011-03-16 18:29   ` Josef Bacik
     [not found]     ` <AANLkTi=CAuTL-wH-KmnqhoJH1hHhucFOzSukE5rJ2PUf@mail.gmail.com>
2011-03-16 18:54       ` Josef Bacik

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).