linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/5] Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss
@ 2014-01-15 12:00 Miao Xie
  2014-01-15 12:00 ` [PATCH 2/5] Btrfs: cleanup the redundant code for the block group allocation and init Miao Xie
                   ` (4 more replies)
  0 siblings, 5 replies; 18+ messages in thread
From: Miao Xie @ 2014-01-15 12:00 UTC (permalink / raw)
  To: linux-btrfs

It is better that the position of the lock is close to the data which is
protected by it, because they may be in the same cache line, we will load
less cache lines when we access them. So we rearrange the members' position
of btrfs_space_info structure to make the lock be closer to the its data.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ctree.h | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 54ab861..1667c9a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1094,7 +1094,7 @@ struct btrfs_qgroup_limit_item {
 } __attribute__ ((__packed__));
 
 struct btrfs_space_info {
-	u64 flags;
+	spinlock_t lock;
 
 	u64 total_bytes;	/* total bytes in the space,
 				   this doesn't take mirrors into account */
@@ -1104,14 +1104,25 @@ struct btrfs_space_info {
 				   transaction finishes */
 	u64 bytes_reserved;	/* total bytes the allocator has reserved for
 				   current allocations */
-	u64 bytes_readonly;	/* total bytes that are read only */
-
 	u64 bytes_may_use;	/* number of bytes that may be used for
 				   delalloc/allocations */
+	u64 bytes_readonly;	/* total bytes that are read only */
+
+	unsigned int full:1;	/* indicates that we cannot allocate any more
+				   chunks for this space */
+	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
+
+	unsigned int flush:1;		/* set if we are trying to make space */
+
+	unsigned int force_alloc;	/* set if we need to force a chunk
+					   alloc for this space */
+
 	u64 disk_used;		/* total bytes used on disk */
 	u64 disk_total;		/* total bytes on disk, takes mirrors into
 				   account */
 
+	u64 flags;
+
 	/*
 	 * bytes_pinned is kept in line with what is actually pinned, as in
 	 * we've called update_block_group and dropped the bytes_used counter
@@ -1124,21 +1135,11 @@ struct btrfs_space_info {
 	 */
 	struct percpu_counter total_bytes_pinned;
 
-	unsigned int full:1;	/* indicates that we cannot allocate any more
-				   chunks for this space */
-	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
-
-	unsigned int flush:1;		/* set if we are trying to make space */
-
-	unsigned int force_alloc;	/* set if we need to force a chunk
-					   alloc for this space */
-
 	struct list_head list;
 
+	struct rw_semaphore groups_sem;
 	/* for block groups in our same type */
 	struct list_head block_groups[BTRFS_NR_RAID_TYPES];
-	spinlock_t lock;
-	struct rw_semaphore groups_sem;
 	wait_queue_head_t wait;
 };
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/5] Btrfs: cleanup the redundant code for the block group allocation and init
  2014-01-15 12:00 [PATCH 1/5] Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss Miao Xie
@ 2014-01-15 12:00 ` Miao Xie
  2014-01-15 12:00 ` [PATCH 3/5] Btrfs: cleanup the code of used_block_group in find_free_extent() Miao Xie
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 18+ messages in thread
From: Miao Xie @ 2014-01-15 12:00 UTC (permalink / raw)
  To: linux-btrfs

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c | 94 +++++++++++++++++++++++---------------------------
 1 file changed, 44 insertions(+), 50 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 45d98d0..69a79ed 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8362,6 +8362,41 @@ static void __link_block_group(struct btrfs_space_info *space_info,
 	up_write(&space_info->groups_sem);
 }
 
+static struct btrfs_block_group_cache *
+btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = kzalloc(sizeof(*cache), GFP_NOFS);
+	if (!cache)
+		return NULL;
+
+	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+					GFP_NOFS);
+	if (!cache->free_space_ctl) {
+		kfree(cache);
+		return NULL;
+	}
+
+	cache->key.objectid = start;
+	cache->key.offset = size;
+	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+
+	cache->sectorsize = root->sectorsize;
+	cache->fs_info = root->fs_info;
+	cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       start);
+	atomic_set(&cache->count, 1);
+	spin_lock_init(&cache->lock);
+	INIT_LIST_HEAD(&cache->list);
+	INIT_LIST_HEAD(&cache->cluster_list);
+	INIT_LIST_HEAD(&cache->new_bg_list);
+	btrfs_init_free_space_ctl(cache);
+
+	return cache;
+}
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
@@ -8397,26 +8432,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			break;
 		if (ret != 0)
 			goto error;
+
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		cache = kzalloc(sizeof(*cache), GFP_NOFS);
+
+		cache = btrfs_create_block_group_cache(root, found_key.objectid,
+						       found_key.offset);
 		if (!cache) {
 			ret = -ENOMEM;
 			goto error;
 		}
-		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
-						GFP_NOFS);
-		if (!cache->free_space_ctl) {
-			kfree(cache);
-			ret = -ENOMEM;
-			goto error;
-		}
-
-		atomic_set(&cache->count, 1);
-		spin_lock_init(&cache->lock);
-		cache->fs_info = info;
-		INIT_LIST_HEAD(&cache->list);
-		INIT_LIST_HEAD(&cache->cluster_list);
 
 		if (need_clear) {
 			/*
@@ -8437,16 +8462,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
-		memcpy(&cache->key, &found_key, sizeof(found_key));
+		cache->flags = btrfs_block_group_flags(&cache->item);
 
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(path);
-		cache->flags = btrfs_block_group_flags(&cache->item);
-		cache->sectorsize = root->sectorsize;
-		cache->full_stripe_len = btrfs_full_stripe_len(root,
-					       &root->fs_info->mapping_tree,
-					       found_key.objectid);
-		btrfs_init_free_space_ctl(cache);
 
 		/*
 		 * We need to exclude the super stripes now so that the space
@@ -8460,8 +8479,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			 * case.
 			 */
 			free_excluded_extents(root, cache);
-			kfree(cache->free_space_ctl);
-			kfree(cache);
+			btrfs_put_block_group(cache);
 			goto error;
 		}
 
@@ -8592,38 +8610,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	root->fs_info->last_trans_log_full_commit = trans->transid;
 
-	cache = kzalloc(sizeof(*cache), GFP_NOFS);
+	cache = btrfs_create_block_group_cache(root, chunk_offset, size);
 	if (!cache)
 		return -ENOMEM;
-	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
-					GFP_NOFS);
-	if (!cache->free_space_ctl) {
-		kfree(cache);
-		return -ENOMEM;
-	}
-
-	cache->key.objectid = chunk_offset;
-	cache->key.offset = size;
-	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
-	cache->sectorsize = root->sectorsize;
-	cache->fs_info = root->fs_info;
-	cache->full_stripe_len = btrfs_full_stripe_len(root,
-					       &root->fs_info->mapping_tree,
-					       chunk_offset);
-
-	atomic_set(&cache->count, 1);
-	spin_lock_init(&cache->lock);
-	INIT_LIST_HEAD(&cache->list);
-	INIT_LIST_HEAD(&cache->cluster_list);
-	INIT_LIST_HEAD(&cache->new_bg_list);
-
-	btrfs_init_free_space_ctl(cache);
 
 	btrfs_set_block_group_used(&cache->item, bytes_used);
 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
-	cache->flags = type;
 	btrfs_set_block_group_flags(&cache->item, type);
 
+	cache->flags = type;
 	cache->last_byte_to_unpin = (u64)-1;
 	cache->cached = BTRFS_CACHE_FINISHED;
 	ret = exclude_super_stripes(root, cache);
@@ -8633,8 +8628,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 		 * case.
 		 */
 		free_excluded_extents(root, cache);
-		kfree(cache->free_space_ctl);
-		kfree(cache);
+		btrfs_put_block_group(cache);
 		return ret;
 	}
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/5] Btrfs: cleanup the code of used_block_group in find_free_extent()
  2014-01-15 12:00 [PATCH 1/5] Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss Miao Xie
  2014-01-15 12:00 ` [PATCH 2/5] Btrfs: cleanup the redundant code for the block group allocation and init Miao Xie
@ 2014-01-15 12:00 ` Miao Xie
  2014-01-15 12:00 ` [PATCH 4/5] Btrfs: fix wrong block group in trace during the free space allocation Miao Xie
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 18+ messages in thread
From: Miao Xie @ 2014-01-15 12:00 UTC (permalink / raw)
  To: linux-btrfs

used_block_group is just used for the space cluster which doesn't
belong to the current block group, the other place needn't use it.
Or the logic of code seems unclear.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c | 33 +++++++++++++--------------------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 69a79ed..a47efc1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6179,7 +6179,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
 	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
-	struct btrfs_block_group_cache *used_block_group;
 	u64 search_start = 0;
 	u64 max_extent_size = 0;
 	int empty_cluster = 2 * 1024 * 1024;
@@ -6241,7 +6240,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	if (search_start == hint_byte) {
 		block_group = btrfs_lookup_block_group(root->fs_info,
 						       search_start);
-		used_block_group = block_group;
 		/*
 		 * we don't want to use the block group if it doesn't match our
 		 * allocation bits, or if its not cached.
@@ -6278,7 +6276,6 @@ search:
 		u64 offset;
 		int cached;
 
-		used_block_group = block_group;
 		btrfs_get_block_group(block_group);
 		search_start = block_group->key.objectid;
 
@@ -6322,6 +6319,7 @@ have_block_group:
 		 * lets look there
 		 */
 		if (last_ptr) {
+			struct btrfs_block_group_cache *used_block_group;
 			unsigned long aligned_cluster;
 			/*
 			 * the refill lock keeps out other
@@ -6332,10 +6330,8 @@ have_block_group:
 			if (used_block_group != block_group &&
 			    (!used_block_group ||
 			     used_block_group->ro ||
-			     !block_group_bits(used_block_group, flags))) {
-				used_block_group = block_group;
+			     !block_group_bits(used_block_group, flags)))
 				goto refill_cluster;
-			}
 
 			if (used_block_group != block_group)
 				btrfs_get_block_group(used_block_group);
@@ -6350,16 +6346,17 @@ have_block_group:
 				spin_unlock(&last_ptr->refill_lock);
 				trace_btrfs_reserve_extent_cluster(root,
 					block_group, search_start, num_bytes);
+				if (used_block_group != block_group) {
+					btrfs_put_block_group(block_group);
+					block_group = used_block_group;
+				}
 				goto checks;
 			}
 
 			WARN_ON(last_ptr->block_group != used_block_group);
-			if (used_block_group != block_group) {
+			if (used_block_group != block_group)
 				btrfs_put_block_group(used_block_group);
-				used_block_group = block_group;
-			}
 refill_cluster:
-			BUG_ON(used_block_group != block_group);
 			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
 			 * set up a new clusters, so lets just skip it
 			 * and let the allocator find whatever block
@@ -6478,25 +6475,25 @@ unclustered_alloc:
 			goto loop;
 		}
 checks:
-		search_start = stripe_align(root, used_block_group,
+		search_start = stripe_align(root, block_group,
 					    offset, num_bytes);
 
 		/* move on to the next group */
 		if (search_start + num_bytes >
-		    used_block_group->key.objectid + used_block_group->key.offset) {
-			btrfs_add_free_space(used_block_group, offset, num_bytes);
+		    block_group->key.objectid + block_group->key.offset) {
+			btrfs_add_free_space(block_group, offset, num_bytes);
 			goto loop;
 		}
 
 		if (offset < search_start)
-			btrfs_add_free_space(used_block_group, offset,
+			btrfs_add_free_space(block_group, offset,
 					     search_start - offset);
 		BUG_ON(offset > search_start);
 
-		ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
+		ret = btrfs_update_reserved_bytes(block_group, num_bytes,
 						  alloc_type);
 		if (ret == -EAGAIN) {
-			btrfs_add_free_space(used_block_group, offset, num_bytes);
+			btrfs_add_free_space(block_group, offset, num_bytes);
 			goto loop;
 		}
 
@@ -6506,16 +6503,12 @@ checks:
 
 		trace_btrfs_reserve_extent(orig_root, block_group,
 					   search_start, num_bytes);
-		if (used_block_group != block_group)
-			btrfs_put_block_group(used_block_group);
 		btrfs_put_block_group(block_group);
 		break;
 loop:
 		failed_cluster_refill = false;
 		failed_alloc = false;
 		BUG_ON(index != get_block_group_index(block_group));
-		if (used_block_group != block_group)
-			btrfs_put_block_group(used_block_group);
 		btrfs_put_block_group(block_group);
 	}
 	up_read(&space_info->groups_sem);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/5] Btrfs: fix wrong block group in trace during the free space allocation
  2014-01-15 12:00 [PATCH 1/5] Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss Miao Xie
  2014-01-15 12:00 ` [PATCH 2/5] Btrfs: cleanup the redundant code for the block group allocation and init Miao Xie
  2014-01-15 12:00 ` [PATCH 3/5] Btrfs: cleanup the code of used_block_group in find_free_extent() Miao Xie
@ 2014-01-15 12:00 ` Miao Xie
  2014-01-15 12:00 ` [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed Miao Xie
  2014-01-15 15:56 ` [PATCH 1/5] Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss David Sterba
  4 siblings, 0 replies; 18+ messages in thread
From: Miao Xie @ 2014-01-15 12:00 UTC (permalink / raw)
  To: linux-btrfs

We allocate the free space from the former block group, not the current
one, so should use the former one to output the trace information.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a47efc1..3664cfb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6345,7 +6345,8 @@ have_block_group:
 				/* we have a block, we're done */
 				spin_unlock(&last_ptr->refill_lock);
 				trace_btrfs_reserve_extent_cluster(root,
-					block_group, search_start, num_bytes);
+						used_block_group,
+						search_start, num_bytes);
 				if (used_block_group != block_group) {
 					btrfs_put_block_group(block_group);
 					block_group = used_block_group;
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed
  2014-01-15 12:00 [PATCH 1/5] Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss Miao Xie
                   ` (2 preceding siblings ...)
  2014-01-15 12:00 ` [PATCH 4/5] Btrfs: fix wrong block group in trace during the free space allocation Miao Xie
@ 2014-01-15 12:00 ` Miao Xie
  2014-01-16  5:54   ` Liu Bo
                     ` (5 more replies)
  2014-01-15 15:56 ` [PATCH 1/5] Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss David Sterba
  4 siblings, 6 replies; 18+ messages in thread
From: Miao Xie @ 2014-01-15 12:00 UTC (permalink / raw)
  To: linux-btrfs

When we mounted the filesystem after the crash, we got the following
message:
  BTRFS error (device xxx): block group 4315938816 has wrong amount of free space
  BTRFS error (device xxx): failed to load free space cache for block group 4315938816

It is because we didn't update the metadata of the allocated space until
the file data was written into the disk. During this time, there was no
information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time, those
spaces were lost.

In ordered to fix this problem, I use a state tree for every block group
to record those allocated spaces. We record the information when they are
allocated, and clean up the information after the metadata update. Besides
that, we also introduce a read-write semaphore to avoid the race between
the allocation and the free space cache write out.

Only data block groups had this problem, so the above change is just
for data space allocation.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ctree.h            | 15 ++++++++++++++-
 fs/btrfs/disk-io.c          |  2 +-
 fs/btrfs/extent-tree.c      | 24 ++++++++++++++++++++----
 fs/btrfs/free-space-cache.c | 42 ++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/inode.c            | 42 +++++++++++++++++++++++++++++++++++-------
 5 files changed, 108 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1667c9a..f58e1f7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1244,6 +1244,12 @@ struct btrfs_block_group_cache {
 	/* free space cache stuff */
 	struct btrfs_free_space_ctl *free_space_ctl;
 
+	/*
+	 * It is used to record the extents that are allocated for
+	 * the data, but don/t update its metadata.
+	 */
+	struct extent_io_tree pinned_extents;
+
 	/* block group cache stuff */
 	struct rb_node cache_node;
 
@@ -1540,6 +1546,13 @@ struct btrfs_fs_info {
 	 */
 	struct list_head space_info;
 
+	/*
+	 * It is just used for the delayed data space allocation
+	 * because only the data space allocation can be done during
+	 * we write out the free space cache.
+	 */
+	struct rw_semaphore data_rwsem;
+
 	struct btrfs_space_info *data_sinfo;
 
 	struct reloc_control *reloc_ctl;
@@ -3183,7 +3196,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 				   struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
 			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
-			 struct btrfs_key *ins, int is_data);
+			 struct btrfs_key *ins, int is_data, bool need_pin);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8072cfa..426b558 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2276,7 +2276,6 @@ int open_ctree(struct super_block *sb,
 	fs_info->pinned_extents = &fs_info->freed_extents[0];
 	fs_info->do_barriers = 1;
 
-
 	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->ordered_extent_flush_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
@@ -2287,6 +2286,7 @@ int open_ctree(struct super_block *sb,
 	init_rwsem(&fs_info->extent_commit_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
+	init_rwsem(&fs_info->data_rwsem);
 	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
 	fs_info->dev_replace.lock_owner = 0;
 	atomic_set(&fs_info->dev_replace.nesting_level, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3664cfb..7b07876 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6173,7 +6173,7 @@ enum btrfs_loop_type {
 static noinline int find_free_extent(struct btrfs_root *orig_root,
 				     u64 num_bytes, u64 empty_size,
 				     u64 hint_byte, struct btrfs_key *ins,
-				     u64 flags)
+				     u64 flags, bool need_pin)
 {
 	int ret = 0;
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -6502,6 +6502,16 @@ checks:
 		ins->objectid = search_start;
 		ins->offset = num_bytes;
 
+		if (need_pin) {
+			ASSERT(search_start >= block_group->key.objectid &&
+			       search_start < block_group->key.objectid +
+					      block_group->key.offset);
+			set_extent_dirty(&block_group->pinned_extents,
+					 search_start,
+					 search_start + num_bytes - 1,
+					 GFP_NOFS);
+		}
+
 		trace_btrfs_reserve_extent(orig_root, block_group,
 					   search_start, num_bytes);
 		btrfs_put_block_group(block_group);
@@ -6614,17 +6624,20 @@ again:
 int btrfs_reserve_extent(struct btrfs_root *root,
 			 u64 num_bytes, u64 min_alloc_size,
 			 u64 empty_size, u64 hint_byte,
-			 struct btrfs_key *ins, int is_data)
+			 struct btrfs_key *ins, int is_data, bool need_pin)
 {
 	bool final_tried = false;
 	u64 flags;
 	int ret;
 
 	flags = btrfs_get_alloc_profile(root, is_data);
+
+	if (need_pin)
+		down_read(&root->fs_info->data_rwsem);
 again:
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
-			       flags);
+			       flags, need_pin);
 
 	if (ret == -ENOSPC) {
 		if (!final_tried && ins->offset) {
@@ -6645,6 +6658,8 @@ again:
 		}
 	}
 
+	if (need_pin)
+		up_read(&root->fs_info->data_rwsem);
 	return ret;
 }
 
@@ -7016,7 +7031,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		return ERR_CAST(block_rsv);
 
 	ret = btrfs_reserve_extent(root, blocksize, blocksize,
-				   empty_size, hint, &ins, 0);
+				   empty_size, hint, &ins, 0, false);
 	if (ret) {
 		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
 		return ERR_PTR(ret);
@@ -8387,6 +8402,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 	INIT_LIST_HEAD(&cache->cluster_list);
 	INIT_LIST_HEAD(&cache->new_bg_list);
 	btrfs_init_free_space_ctl(cache);
+	extent_io_tree_init(&cache->pinned_extents, NULL);
 
 	return cache;
 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 057be95..486e12a3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -875,6 +875,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	struct rb_node *node;
 	struct list_head *pos, *n;
 	struct extent_state *cached_state = NULL;
+	struct extent_state *pinned_extent = NULL;
 	struct btrfs_free_cluster *cluster = NULL;
 	struct extent_io_tree *unpin = NULL;
 	struct io_ctl io_ctl;
@@ -948,17 +949,17 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * so we don't leak the space
 	 */
 
+	if (!block_group)
+		goto bitmap;
 	/*
 	 * We shouldn't have switched the pinned extents yet so this is the
 	 * right one
 	 */
 	unpin = root->fs_info->pinned_extents;
 
-	if (block_group)
-		start = block_group->key.objectid;
+	start = block_group->key.objectid;
 
-	while (block_group && (start < block_group->key.objectid +
-			       block_group->key.offset)) {
+	while (start < block_group->key.objectid + block_group->key.offset) {
 		ret = find_first_extent_bit(unpin, start,
 					    &extent_start, &extent_end,
 					    EXTENT_DIRTY, NULL);
@@ -985,6 +986,33 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 		start = extent_end;
 	}
 
+	if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+		goto bitmap;
+
+	start = block_group->key.objectid;
+	unpin = &block_group->pinned_extents;
+	while (1) {
+		ret = find_first_extent_bit(unpin, start,
+					    &extent_start, &extent_end,
+					    EXTENT_DIRTY, &pinned_extent);
+		if (ret) {
+			ret = 0;
+			break;
+		}
+
+		len = extent_end - extent_start + 1;
+
+		entries++;
+		ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
+		if (ret) {
+			free_extent_state(pinned_extent);
+			goto out_nospc;
+		}
+
+		start = extent_end + 1;
+	}
+	free_extent_state(pinned_extent);
+bitmap:
 	/* Write out the bitmaps */
 	list_for_each_safe(pos, n, &bitmap_list) {
 		struct btrfs_free_space *entry =
@@ -1097,6 +1125,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 	if (IS_ERR(inode))
 		return 0;
 
+	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+		down_write(&root->fs_info->data_rwsem);
+
 	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
 				      path, block_group->key.objectid);
 	if (ret) {
@@ -1111,6 +1142,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 #endif
 	}
 
+	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+		up_write(&root->fs_info->data_rwsem);
+
 	iput(inode);
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1a7744..8172ca6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -592,6 +592,28 @@ free_pages_out:
 	goto out;
 }
 
+static void btrfs_unpin_data_extent(struct btrfs_root *root, u64 start,
+				    u64 len)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = btrfs_lookup_block_group(root->fs_info, start);
+	BUG_ON(!cache);
+	clear_extent_dirty(&cache->pinned_extents, start, start + len - 1,
+			   GFP_NOFS);
+	btrfs_put_block_group(cache);
+}
+
+/* it is not used for free space cache file */
+static void btrfs_free_reserved_data_extent(struct btrfs_root *root, u64 start,
+					    u64 len)
+{
+	down_read(&root->fs_info->data_rwsem);
+	btrfs_unpin_data_extent(root, start, len);
+	btrfs_free_reserved_extent(root, start, len);
+	up_read(&root->fs_info->data_rwsem);
+}
+
 /*
  * phase two of compressed writeback.  This is the ordered portion
  * of the code, which only gets called in the order the work was
@@ -666,7 +688,7 @@ retry:
 		ret = btrfs_reserve_extent(root,
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
-					   0, alloc_hint, &ins, 1);
+					   0, alloc_hint, &ins, 1, true);
 		if (ret) {
 			int i;
 
@@ -767,7 +789,7 @@ retry:
 out:
 	return ret;
 out_free_reserve:
-	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+	btrfs_free_reserved_data_extent(root, ins.objectid, ins.offset);
 out_free:
 	extent_clear_unlock_delalloc(inode, async_extent->start,
 				     async_extent->start +
@@ -889,7 +911,7 @@ static noinline int cow_file_range(struct inode *inode,
 		cur_alloc_size = disk_num_bytes;
 		ret = btrfs_reserve_extent(root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
-					   &ins, 1);
+					   &ins, 1, true);
 		if (ret < 0)
 			goto out_unlock;
 
@@ -967,6 +989,7 @@ out:
 	return ret;
 
 out_reserve:
+	btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
 	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
 out_unlock:
 	extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@ -2647,6 +2670,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 						logical_len, logical_len,
 						compress_type, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
+		BUG_ON(nolock);
+		btrfs_unpin_data_extent(root, ordered_extent->start,
+					ordered_extent->disk_len);
 	}
 	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
 			   ordered_extent->file_offset, ordered_extent->len,
@@ -2698,8 +2724,9 @@ out:
 		if ((ret || !logical_len) &&
 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
 		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
-			btrfs_free_reserved_extent(root, ordered_extent->start,
-						   ordered_extent->disk_len);
+			btrfs_free_reserved_data_extent(root,
+							ordered_extent->start,
+							ordered_extent->disk_len);
 	}
 
 
@@ -6342,7 +6369,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 
 	alloc_hint = get_extent_allocation_hint(inode, start, len);
 	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
-				   alloc_hint, &ins, 1);
+				   alloc_hint, &ins, 1, true);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -6356,6 +6383,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
 					   ins.offset, ins.offset, 0);
 	if (ret) {
+		btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
 		free_extent_map(em);
 		return ERR_PTR(ret);
@@ -8507,7 +8535,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
 		cur_bytes = max(cur_bytes, min_size);
 		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
-					   *alloc_hint, &ins, 1);
+					   *alloc_hint, &ins, 1, false);
 		if (ret) {
 			if (own_trans)
 				btrfs_end_transaction(trans, root);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/5] Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss
  2014-01-15 12:00 [PATCH 1/5] Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss Miao Xie
                   ` (3 preceding siblings ...)
  2014-01-15 12:00 ` [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed Miao Xie
@ 2014-01-15 15:56 ` David Sterba
  4 siblings, 0 replies; 18+ messages in thread
From: David Sterba @ 2014-01-15 15:56 UTC (permalink / raw)
  To: Miao Xie; +Cc: linux-btrfs

On Wed, Jan 15, 2014 at 08:00:54PM +0800, Miao Xie wrote:
> It is better that the position of the lock is close to the data which is
> protected by it, because they may be in the same cache line, we will load
> less cache lines when we access them. So we rearrange the members' position
> of btrfs_space_info structure to make the lock be closer to the its data.

I like that
Reviewed-by: David Sterba <dsterba@suse.cz>

New structure layout according to pahole looks reasonable. We can try to
split 'groups_sem' and 'wait' to 2 cachelines, but this would need more
shuffling and maybe some measurement if this wins any perf percents.

struct btrfs_space_info {
        spinlock_t                 lock;                 /*     0     4 */

        /* XXX 4 bytes hole, try to pack */

        u64                        total_bytes;          /*     8     8 */
        u64                        bytes_used;           /*    16     8 */
        u64                        bytes_pinned;         /*    24     8 */
        u64                        bytes_reserved;       /*    32     8 */
        u64                        bytes_may_use;        /*    40     8 */
        u64                        bytes_readonly;       /*    48     8 */
        unsigned int               full:1;               /*    56:31  4 */
        unsigned int               chunk_alloc:1;        /*    56:30  4 */
        unsigned int               flush:1;              /*    56:29  4 */

        /* XXX 29 bits hole, try to pack */

        unsigned int               force_alloc;          /*    60     4 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        u64                        disk_used;            /*    64     8 */
        u64                        disk_total;           /*    72     8 */
        u64                        flags;                /*    80     8 */
        struct percpu_counter      total_bytes_pinned;   /*    88    40 */
        /* --- cacheline 2 boundary (128 bytes) --- */
        struct list_head           list;                 /*   128    16 */
        struct rw_semaphore        groups_sem;           /*   144    32 */
        struct list_head           block_groups[7];      /*   176   112 */
        /* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
        wait_queue_head_t          wait;                 /*   288    24 */

        /* size: 312, cachelines: 5, members: 19 */
        /* sum members: 308, holes: 1, sum holes: 4 */
        /* bit holes: 1, sum bit holes: 29 bits */
        /* last cacheline: 56 bytes */
};

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed
  2014-01-15 12:00 ` [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed Miao Xie
@ 2014-01-16  5:54   ` Liu Bo
  2014-03-01 18:05   ` Alex Lyakas
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 18+ messages in thread
From: Liu Bo @ 2014-01-16  5:54 UTC (permalink / raw)
  To: Miao Xie; +Cc: linux-btrfs

On Wed, Jan 15, 2014 at 08:00:58PM +0800, Miao Xie wrote:
> When we mounted the filesystem after the crash, we got the following
> message:
>   BTRFS error (device xxx): block group 4315938816 has wrong amount of free space
>   BTRFS error (device xxx): failed to load free space cache for block group 4315938816

The code itself is fine.
But I'm not sure if it's worth doing so, I mean those memory allocation and lock
acquire and release in our core endio path.

To fallback to rebuild space cache is fairly acceptable to me in the case of
crash.

-liubo

> 
> It is because we didn't update the metadata of the allocated space until
> the file data was written into the disk. During this time, there was no
> information about the allocated spaces in either the extent tree nor the
> free space cache. when we wrote out the free space cache at this time, those
> spaces were lost.
> 
> In ordered to fix this problem, I use a state tree for every block group
> to record those allocated spaces. We record the information when they are
> allocated, and clean up the information after the metadata update. Besides
> that, we also introduce a read-write semaphore to avoid the race between
> the allocation and the free space cache write out.
> 
> Only data block groups had this problem, so the above change is just
> for data space allocation.
> 
> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
> ---
>  fs/btrfs/ctree.h            | 15 ++++++++++++++-
>  fs/btrfs/disk-io.c          |  2 +-
>  fs/btrfs/extent-tree.c      | 24 ++++++++++++++++++++----
>  fs/btrfs/free-space-cache.c | 42 ++++++++++++++++++++++++++++++++++++++----
>  fs/btrfs/inode.c            | 42 +++++++++++++++++++++++++++++++++++-------
>  5 files changed, 108 insertions(+), 17 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 1667c9a..f58e1f7 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1244,6 +1244,12 @@ struct btrfs_block_group_cache {
>  	/* free space cache stuff */
>  	struct btrfs_free_space_ctl *free_space_ctl;
>  
> +	/*
> +	 * It is used to record the extents that are allocated for
> +	 * the data, but don/t update its metadata.
> +	 */
> +	struct extent_io_tree pinned_extents;
> +
>  	/* block group cache stuff */
>  	struct rb_node cache_node;
>  
> @@ -1540,6 +1546,13 @@ struct btrfs_fs_info {
>  	 */
>  	struct list_head space_info;
>  
> +	/*
> +	 * It is just used for the delayed data space allocation
> +	 * because only the data space allocation can be done during
> +	 * we write out the free space cache.
> +	 */
> +	struct rw_semaphore data_rwsem;
> +
>  	struct btrfs_space_info *data_sinfo;
>  
>  	struct reloc_control *reloc_ctl;
> @@ -3183,7 +3196,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
>  				   struct btrfs_key *ins);
>  int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
>  			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
> -			 struct btrfs_key *ins, int is_data);
> +			 struct btrfs_key *ins, int is_data, bool need_pin);
>  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>  		  struct extent_buffer *buf, int full_backref, int for_cow);
>  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 8072cfa..426b558 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2276,7 +2276,6 @@ int open_ctree(struct super_block *sb,
>  	fs_info->pinned_extents = &fs_info->freed_extents[0];
>  	fs_info->do_barriers = 1;
>  
> -
>  	mutex_init(&fs_info->ordered_operations_mutex);
>  	mutex_init(&fs_info->ordered_extent_flush_mutex);
>  	mutex_init(&fs_info->tree_log_mutex);
> @@ -2287,6 +2286,7 @@ int open_ctree(struct super_block *sb,
>  	init_rwsem(&fs_info->extent_commit_sem);
>  	init_rwsem(&fs_info->cleanup_work_sem);
>  	init_rwsem(&fs_info->subvol_sem);
> +	init_rwsem(&fs_info->data_rwsem);
>  	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
>  	fs_info->dev_replace.lock_owner = 0;
>  	atomic_set(&fs_info->dev_replace.nesting_level, 0);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 3664cfb..7b07876 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -6173,7 +6173,7 @@ enum btrfs_loop_type {
>  static noinline int find_free_extent(struct btrfs_root *orig_root,
>  				     u64 num_bytes, u64 empty_size,
>  				     u64 hint_byte, struct btrfs_key *ins,
> -				     u64 flags)
> +				     u64 flags, bool need_pin)
>  {
>  	int ret = 0;
>  	struct btrfs_root *root = orig_root->fs_info->extent_root;
> @@ -6502,6 +6502,16 @@ checks:
>  		ins->objectid = search_start;
>  		ins->offset = num_bytes;
>  
> +		if (need_pin) {
> +			ASSERT(search_start >= block_group->key.objectid &&
> +			       search_start < block_group->key.objectid +
> +					      block_group->key.offset);
> +			set_extent_dirty(&block_group->pinned_extents,
> +					 search_start,
> +					 search_start + num_bytes - 1,
> +					 GFP_NOFS);
> +		}
> +
>  		trace_btrfs_reserve_extent(orig_root, block_group,
>  					   search_start, num_bytes);
>  		btrfs_put_block_group(block_group);
> @@ -6614,17 +6624,20 @@ again:
>  int btrfs_reserve_extent(struct btrfs_root *root,
>  			 u64 num_bytes, u64 min_alloc_size,
>  			 u64 empty_size, u64 hint_byte,
> -			 struct btrfs_key *ins, int is_data)
> +			 struct btrfs_key *ins, int is_data, bool need_pin)
>  {
>  	bool final_tried = false;
>  	u64 flags;
>  	int ret;
>  
>  	flags = btrfs_get_alloc_profile(root, is_data);
> +
> +	if (need_pin)
> +		down_read(&root->fs_info->data_rwsem);
>  again:
>  	WARN_ON(num_bytes < root->sectorsize);
>  	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
> -			       flags);
> +			       flags, need_pin);
>  
>  	if (ret == -ENOSPC) {
>  		if (!final_tried && ins->offset) {
> @@ -6645,6 +6658,8 @@ again:
>  		}
>  	}
>  
> +	if (need_pin)
> +		up_read(&root->fs_info->data_rwsem);
>  	return ret;
>  }
>  
> @@ -7016,7 +7031,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
>  		return ERR_CAST(block_rsv);
>  
>  	ret = btrfs_reserve_extent(root, blocksize, blocksize,
> -				   empty_size, hint, &ins, 0);
> +				   empty_size, hint, &ins, 0, false);
>  	if (ret) {
>  		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
>  		return ERR_PTR(ret);
> @@ -8387,6 +8402,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
>  	INIT_LIST_HEAD(&cache->cluster_list);
>  	INIT_LIST_HEAD(&cache->new_bg_list);
>  	btrfs_init_free_space_ctl(cache);
> +	extent_io_tree_init(&cache->pinned_extents, NULL);
>  
>  	return cache;
>  }
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 057be95..486e12a3 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -875,6 +875,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>  	struct rb_node *node;
>  	struct list_head *pos, *n;
>  	struct extent_state *cached_state = NULL;
> +	struct extent_state *pinned_extent = NULL;
>  	struct btrfs_free_cluster *cluster = NULL;
>  	struct extent_io_tree *unpin = NULL;
>  	struct io_ctl io_ctl;
> @@ -948,17 +949,17 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>  	 * so we don't leak the space
>  	 */
>  
> +	if (!block_group)
> +		goto bitmap;
>  	/*
>  	 * We shouldn't have switched the pinned extents yet so this is the
>  	 * right one
>  	 */
>  	unpin = root->fs_info->pinned_extents;
>  
> -	if (block_group)
> -		start = block_group->key.objectid;
> +	start = block_group->key.objectid;
>  
> -	while (block_group && (start < block_group->key.objectid +
> -			       block_group->key.offset)) {
> +	while (start < block_group->key.objectid + block_group->key.offset) {
>  		ret = find_first_extent_bit(unpin, start,
>  					    &extent_start, &extent_end,
>  					    EXTENT_DIRTY, NULL);
> @@ -985,6 +986,33 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>  		start = extent_end;
>  	}
>  
> +	if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
> +		goto bitmap;
> +
> +	start = block_group->key.objectid;
> +	unpin = &block_group->pinned_extents;
> +	while (1) {
> +		ret = find_first_extent_bit(unpin, start,
> +					    &extent_start, &extent_end,
> +					    EXTENT_DIRTY, &pinned_extent);
> +		if (ret) {
> +			ret = 0;
> +			break;
> +		}
> +
> +		len = extent_end - extent_start + 1;
> +
> +		entries++;
> +		ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
> +		if (ret) {
> +			free_extent_state(pinned_extent);
> +			goto out_nospc;
> +		}
> +
> +		start = extent_end + 1;
> +	}
> +	free_extent_state(pinned_extent);
> +bitmap:
>  	/* Write out the bitmaps */
>  	list_for_each_safe(pos, n, &bitmap_list) {
>  		struct btrfs_free_space *entry =
> @@ -1097,6 +1125,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>  	if (IS_ERR(inode))
>  		return 0;
>  
> +	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
> +		down_write(&root->fs_info->data_rwsem);
> +
>  	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
>  				      path, block_group->key.objectid);
>  	if (ret) {
> @@ -1111,6 +1142,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>  #endif
>  	}
>  
> +	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
> +		up_write(&root->fs_info->data_rwsem);
> +
>  	iput(inode);
>  	return ret;
>  }
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index f1a7744..8172ca6 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -592,6 +592,28 @@ free_pages_out:
>  	goto out;
>  }
>  
> +static void btrfs_unpin_data_extent(struct btrfs_root *root, u64 start,
> +				    u64 len)
> +{
> +	struct btrfs_block_group_cache *cache;
> +
> +	cache = btrfs_lookup_block_group(root->fs_info, start);
> +	BUG_ON(!cache);
> +	clear_extent_dirty(&cache->pinned_extents, start, start + len - 1,
> +			   GFP_NOFS);
> +	btrfs_put_block_group(cache);
> +}
> +
> +/* it is not used for free space cache file */
> +static void btrfs_free_reserved_data_extent(struct btrfs_root *root, u64 start,
> +					    u64 len)
> +{
> +	down_read(&root->fs_info->data_rwsem);
> +	btrfs_unpin_data_extent(root, start, len);
> +	btrfs_free_reserved_extent(root, start, len);
> +	up_read(&root->fs_info->data_rwsem);
> +}
> +
>  /*
>   * phase two of compressed writeback.  This is the ordered portion
>   * of the code, which only gets called in the order the work was
> @@ -666,7 +688,7 @@ retry:
>  		ret = btrfs_reserve_extent(root,
>  					   async_extent->compressed_size,
>  					   async_extent->compressed_size,
> -					   0, alloc_hint, &ins, 1);
> +					   0, alloc_hint, &ins, 1, true);
>  		if (ret) {
>  			int i;
>  
> @@ -767,7 +789,7 @@ retry:
>  out:
>  	return ret;
>  out_free_reserve:
> -	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
> +	btrfs_free_reserved_data_extent(root, ins.objectid, ins.offset);
>  out_free:
>  	extent_clear_unlock_delalloc(inode, async_extent->start,
>  				     async_extent->start +
> @@ -889,7 +911,7 @@ static noinline int cow_file_range(struct inode *inode,
>  		cur_alloc_size = disk_num_bytes;
>  		ret = btrfs_reserve_extent(root, cur_alloc_size,
>  					   root->sectorsize, 0, alloc_hint,
> -					   &ins, 1);
> +					   &ins, 1, true);
>  		if (ret < 0)
>  			goto out_unlock;
>  
> @@ -967,6 +989,7 @@ out:
>  	return ret;
>  
>  out_reserve:
> +	btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>  	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>  out_unlock:
>  	extent_clear_unlock_delalloc(inode, start, end, locked_page,
> @@ -2647,6 +2670,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
>  						logical_len, logical_len,
>  						compress_type, 0, 0,
>  						BTRFS_FILE_EXTENT_REG);
> +		BUG_ON(nolock);
> +		btrfs_unpin_data_extent(root, ordered_extent->start,
> +					ordered_extent->disk_len);
>  	}
>  	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
>  			   ordered_extent->file_offset, ordered_extent->len,
> @@ -2698,8 +2724,9 @@ out:
>  		if ((ret || !logical_len) &&
>  		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
>  		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
> -			btrfs_free_reserved_extent(root, ordered_extent->start,
> -						   ordered_extent->disk_len);
> +			btrfs_free_reserved_data_extent(root,
> +							ordered_extent->start,
> +							ordered_extent->disk_len);
>  	}
>  
>  
> @@ -6342,7 +6369,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>  
>  	alloc_hint = get_extent_allocation_hint(inode, start, len);
>  	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
> -				   alloc_hint, &ins, 1);
> +				   alloc_hint, &ins, 1, true);
>  	if (ret)
>  		return ERR_PTR(ret);
>  
> @@ -6356,6 +6383,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>  	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
>  					   ins.offset, ins.offset, 0);
>  	if (ret) {
> +		btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>  		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>  		free_extent_map(em);
>  		return ERR_PTR(ret);
> @@ -8507,7 +8535,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
>  		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
>  		cur_bytes = max(cur_bytes, min_size);
>  		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
> -					   *alloc_hint, &ins, 1);
> +					   *alloc_hint, &ins, 1, false);
>  		if (ret) {
>  			if (own_trans)
>  				btrfs_end_transaction(trans, root);
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed
  2014-01-15 12:00 ` [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed Miao Xie
  2014-01-16  5:54   ` Liu Bo
@ 2014-03-01 18:05   ` Alex Lyakas
  2014-03-04  6:04     ` Miao Xie
  2014-03-04 15:19   ` Josef Bacik
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Alex Lyakas @ 2014-03-01 18:05 UTC (permalink / raw)
  To: Miao Xie; +Cc: linux-btrfs

Hi Miao,

On Wed, Jan 15, 2014 at 2:00 PM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> When we mounted the filesystem after the crash, we got the following
> message:
>   BTRFS error (device xxx): block group 4315938816 has wrong amount of free space
>   BTRFS error (device xxx): failed to load free space cache for block group 4315938816
>
> It is because we didn't update the metadata of the allocated space until
> the file data was written into the disk. During this time, there was no
> information about the allocated spaces in either the extent tree nor the
> free space cache. when we wrote out the free space cache at this time, those
> spaces were lost.
Can you please clarify more about the problem?
So I understand that we allocate something from the free space cache.
So after that, the free space cache does not account for this extent
anymore. On the other hand the extent tree also does not account for
it (yet). We need to add a delayed reference, which will be run at
transaction commit and update the extent tree. But free-space cache is
also written out during transaction commit. So how the issue happens?
Can you perhaps post a flow of events?

Thanks.
Alex.


>
> In ordered to fix this problem, I use a state tree for every block group
> to record those allocated spaces. We record the information when they are
> allocated, and clean up the information after the metadata update. Besides
> that, we also introduce a read-write semaphore to avoid the race between
> the allocation and the free space cache write out.
>
> Only data block groups had this problem, so the above change is just
> for data space allocation.
>
> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
> ---
>  fs/btrfs/ctree.h            | 15 ++++++++++++++-
>  fs/btrfs/disk-io.c          |  2 +-
>  fs/btrfs/extent-tree.c      | 24 ++++++++++++++++++++----
>  fs/btrfs/free-space-cache.c | 42 ++++++++++++++++++++++++++++++++++++++----
>  fs/btrfs/inode.c            | 42 +++++++++++++++++++++++++++++++++++-------
>  5 files changed, 108 insertions(+), 17 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 1667c9a..f58e1f7 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1244,6 +1244,12 @@ struct btrfs_block_group_cache {
>         /* free space cache stuff */
>         struct btrfs_free_space_ctl *free_space_ctl;
>
> +       /*
> +        * It is used to record the extents that are allocated for
> +        * the data, but don/t update its metadata.
> +        */
> +       struct extent_io_tree pinned_extents;
> +
>         /* block group cache stuff */
>         struct rb_node cache_node;
>
> @@ -1540,6 +1546,13 @@ struct btrfs_fs_info {
>          */
>         struct list_head space_info;
>
> +       /*
> +        * It is just used for the delayed data space allocation
> +        * because only the data space allocation can be done during
> +        * we write out the free space cache.
> +        */
> +       struct rw_semaphore data_rwsem;
> +
>         struct btrfs_space_info *data_sinfo;
>
>         struct reloc_control *reloc_ctl;
> @@ -3183,7 +3196,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
>                                    struct btrfs_key *ins);
>  int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
>                          u64 min_alloc_size, u64 empty_size, u64 hint_byte,
> -                        struct btrfs_key *ins, int is_data);
> +                        struct btrfs_key *ins, int is_data, bool need_pin);
>  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>                   struct extent_buffer *buf, int full_backref, int for_cow);
>  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 8072cfa..426b558 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2276,7 +2276,6 @@ int open_ctree(struct super_block *sb,
>         fs_info->pinned_extents = &fs_info->freed_extents[0];
>         fs_info->do_barriers = 1;
>
> -
>         mutex_init(&fs_info->ordered_operations_mutex);
>         mutex_init(&fs_info->ordered_extent_flush_mutex);
>         mutex_init(&fs_info->tree_log_mutex);
> @@ -2287,6 +2286,7 @@ int open_ctree(struct super_block *sb,
>         init_rwsem(&fs_info->extent_commit_sem);
>         init_rwsem(&fs_info->cleanup_work_sem);
>         init_rwsem(&fs_info->subvol_sem);
> +       init_rwsem(&fs_info->data_rwsem);
>         sema_init(&fs_info->uuid_tree_rescan_sem, 1);
>         fs_info->dev_replace.lock_owner = 0;
>         atomic_set(&fs_info->dev_replace.nesting_level, 0);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 3664cfb..7b07876 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -6173,7 +6173,7 @@ enum btrfs_loop_type {
>  static noinline int find_free_extent(struct btrfs_root *orig_root,
>                                      u64 num_bytes, u64 empty_size,
>                                      u64 hint_byte, struct btrfs_key *ins,
> -                                    u64 flags)
> +                                    u64 flags, bool need_pin)
>  {
>         int ret = 0;
>         struct btrfs_root *root = orig_root->fs_info->extent_root;
> @@ -6502,6 +6502,16 @@ checks:
>                 ins->objectid = search_start;
>                 ins->offset = num_bytes;
>
> +               if (need_pin) {
> +                       ASSERT(search_start >= block_group->key.objectid &&
> +                              search_start < block_group->key.objectid +
> +                                             block_group->key.offset);
> +                       set_extent_dirty(&block_group->pinned_extents,
> +                                        search_start,
> +                                        search_start + num_bytes - 1,
> +                                        GFP_NOFS);
> +               }
> +
>                 trace_btrfs_reserve_extent(orig_root, block_group,
>                                            search_start, num_bytes);
>                 btrfs_put_block_group(block_group);
> @@ -6614,17 +6624,20 @@ again:
>  int btrfs_reserve_extent(struct btrfs_root *root,
>                          u64 num_bytes, u64 min_alloc_size,
>                          u64 empty_size, u64 hint_byte,
> -                        struct btrfs_key *ins, int is_data)
> +                        struct btrfs_key *ins, int is_data, bool need_pin)
>  {
>         bool final_tried = false;
>         u64 flags;
>         int ret;
>
>         flags = btrfs_get_alloc_profile(root, is_data);
> +
> +       if (need_pin)
> +               down_read(&root->fs_info->data_rwsem);
>  again:
>         WARN_ON(num_bytes < root->sectorsize);
>         ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
> -                              flags);
> +                              flags, need_pin);
>
>         if (ret == -ENOSPC) {
>                 if (!final_tried && ins->offset) {
> @@ -6645,6 +6658,8 @@ again:
>                 }
>         }
>
> +       if (need_pin)
> +               up_read(&root->fs_info->data_rwsem);
>         return ret;
>  }
>
> @@ -7016,7 +7031,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
>                 return ERR_CAST(block_rsv);
>
>         ret = btrfs_reserve_extent(root, blocksize, blocksize,
> -                                  empty_size, hint, &ins, 0);
> +                                  empty_size, hint, &ins, 0, false);
>         if (ret) {
>                 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
>                 return ERR_PTR(ret);
> @@ -8387,6 +8402,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
>         INIT_LIST_HEAD(&cache->cluster_list);
>         INIT_LIST_HEAD(&cache->new_bg_list);
>         btrfs_init_free_space_ctl(cache);
> +       extent_io_tree_init(&cache->pinned_extents, NULL);
>
>         return cache;
>  }
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 057be95..486e12a3 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -875,6 +875,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>         struct rb_node *node;
>         struct list_head *pos, *n;
>         struct extent_state *cached_state = NULL;
> +       struct extent_state *pinned_extent = NULL;
>         struct btrfs_free_cluster *cluster = NULL;
>         struct extent_io_tree *unpin = NULL;
>         struct io_ctl io_ctl;
> @@ -948,17 +949,17 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>          * so we don't leak the space
>          */
>
> +       if (!block_group)
> +               goto bitmap;
>         /*
>          * We shouldn't have switched the pinned extents yet so this is the
>          * right one
>          */
>         unpin = root->fs_info->pinned_extents;
>
> -       if (block_group)
> -               start = block_group->key.objectid;
> +       start = block_group->key.objectid;
>
> -       while (block_group && (start < block_group->key.objectid +
> -                              block_group->key.offset)) {
> +       while (start < block_group->key.objectid + block_group->key.offset) {
>                 ret = find_first_extent_bit(unpin, start,
>                                             &extent_start, &extent_end,
>                                             EXTENT_DIRTY, NULL);
> @@ -985,6 +986,33 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>                 start = extent_end;
>         }
>
> +       if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
> +               goto bitmap;
> +
> +       start = block_group->key.objectid;
> +       unpin = &block_group->pinned_extents;
> +       while (1) {
> +               ret = find_first_extent_bit(unpin, start,
> +                                           &extent_start, &extent_end,
> +                                           EXTENT_DIRTY, &pinned_extent);
> +               if (ret) {
> +                       ret = 0;
> +                       break;
> +               }
> +
> +               len = extent_end - extent_start + 1;
> +
> +               entries++;
> +               ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
> +               if (ret) {
> +                       free_extent_state(pinned_extent);
> +                       goto out_nospc;
> +               }
> +
> +               start = extent_end + 1;
> +       }
> +       free_extent_state(pinned_extent);
> +bitmap:
>         /* Write out the bitmaps */
>         list_for_each_safe(pos, n, &bitmap_list) {
>                 struct btrfs_free_space *entry =
> @@ -1097,6 +1125,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>         if (IS_ERR(inode))
>                 return 0;
>
> +       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
> +               down_write(&root->fs_info->data_rwsem);
> +
>         ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
>                                       path, block_group->key.objectid);
>         if (ret) {
> @@ -1111,6 +1142,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>  #endif
>         }
>
> +       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
> +               up_write(&root->fs_info->data_rwsem);
> +
>         iput(inode);
>         return ret;
>  }
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index f1a7744..8172ca6 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -592,6 +592,28 @@ free_pages_out:
>         goto out;
>  }
>
> +static void btrfs_unpin_data_extent(struct btrfs_root *root, u64 start,
> +                                   u64 len)
> +{
> +       struct btrfs_block_group_cache *cache;
> +
> +       cache = btrfs_lookup_block_group(root->fs_info, start);
> +       BUG_ON(!cache);
> +       clear_extent_dirty(&cache->pinned_extents, start, start + len - 1,
> +                          GFP_NOFS);
> +       btrfs_put_block_group(cache);
> +}
> +
> +/* it is not used for free space cache file */
> +static void btrfs_free_reserved_data_extent(struct btrfs_root *root, u64 start,
> +                                           u64 len)
> +{
> +       down_read(&root->fs_info->data_rwsem);
> +       btrfs_unpin_data_extent(root, start, len);
> +       btrfs_free_reserved_extent(root, start, len);
> +       up_read(&root->fs_info->data_rwsem);
> +}
> +
>  /*
>   * phase two of compressed writeback.  This is the ordered portion
>   * of the code, which only gets called in the order the work was
> @@ -666,7 +688,7 @@ retry:
>                 ret = btrfs_reserve_extent(root,
>                                            async_extent->compressed_size,
>                                            async_extent->compressed_size,
> -                                          0, alloc_hint, &ins, 1);
> +                                          0, alloc_hint, &ins, 1, true);
>                 if (ret) {
>                         int i;
>
> @@ -767,7 +789,7 @@ retry:
>  out:
>         return ret;
>  out_free_reserve:
> -       btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
> +       btrfs_free_reserved_data_extent(root, ins.objectid, ins.offset);
>  out_free:
>         extent_clear_unlock_delalloc(inode, async_extent->start,
>                                      async_extent->start +
> @@ -889,7 +911,7 @@ static noinline int cow_file_range(struct inode *inode,
>                 cur_alloc_size = disk_num_bytes;
>                 ret = btrfs_reserve_extent(root, cur_alloc_size,
>                                            root->sectorsize, 0, alloc_hint,
> -                                          &ins, 1);
> +                                          &ins, 1, true);
>                 if (ret < 0)
>                         goto out_unlock;
>
> @@ -967,6 +989,7 @@ out:
>         return ret;
>
>  out_reserve:
> +       btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>         btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>  out_unlock:
>         extent_clear_unlock_delalloc(inode, start, end, locked_page,
> @@ -2647,6 +2670,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
>                                                 logical_len, logical_len,
>                                                 compress_type, 0, 0,
>                                                 BTRFS_FILE_EXTENT_REG);
> +               BUG_ON(nolock);
> +               btrfs_unpin_data_extent(root, ordered_extent->start,
> +                                       ordered_extent->disk_len);
>         }
>         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
>                            ordered_extent->file_offset, ordered_extent->len,
> @@ -2698,8 +2724,9 @@ out:
>                 if ((ret || !logical_len) &&
>                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
>                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
> -                       btrfs_free_reserved_extent(root, ordered_extent->start,
> -                                                  ordered_extent->disk_len);
> +                       btrfs_free_reserved_data_extent(root,
> +                                                       ordered_extent->start,
> +                                                       ordered_extent->disk_len);
>         }
>
>
> @@ -6342,7 +6369,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>
>         alloc_hint = get_extent_allocation_hint(inode, start, len);
>         ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
> -                                  alloc_hint, &ins, 1);
> +                                  alloc_hint, &ins, 1, true);
>         if (ret)
>                 return ERR_PTR(ret);
>
> @@ -6356,6 +6383,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>         ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
>                                            ins.offset, ins.offset, 0);
>         if (ret) {
> +               btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>                 free_extent_map(em);
>                 return ERR_PTR(ret);
> @@ -8507,7 +8535,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
>                 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
>                 cur_bytes = max(cur_bytes, min_size);
>                 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
> -                                          *alloc_hint, &ins, 1);
> +                                          *alloc_hint, &ins, 1, false);
>                 if (ret) {
>                         if (own_trans)
>                                 btrfs_end_transaction(trans, root);
> --
> 1.8.3.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed
  2014-03-01 18:05   ` Alex Lyakas
@ 2014-03-04  6:04     ` Miao Xie
  2014-03-08 16:48       ` Alex Lyakas
  0 siblings, 1 reply; 18+ messages in thread
From: Miao Xie @ 2014-03-04  6:04 UTC (permalink / raw)
  To: Alex Lyakas; +Cc: linux-btrfs

On 	sat, 1 Mar 2014 20:05:01 +0200, Alex Lyakas wrote:
> Hi Miao,
> 
> On Wed, Jan 15, 2014 at 2:00 PM, Miao Xie <miaox@cn.fujitsu.com> wrote:
>> When we mounted the filesystem after the crash, we got the following
>> message:
>>   BTRFS error (device xxx): block group 4315938816 has wrong amount of free space
>>   BTRFS error (device xxx): failed to load free space cache for block group 4315938816
>>
>> It is because we didn't update the metadata of the allocated space until
>> the file data was written into the disk. During this time, there was no
>> information about the allocated spaces in either the extent tree nor the
>> free space cache. when we wrote out the free space cache at this time, those
>> spaces were lost.
> Can you please clarify more about the problem?
> So I understand that we allocate something from the free space cache.
> So after that, the free space cache does not account for this extent
> anymore. On the other hand the extent tree also does not account for
> it (yet). We need to add a delayed reference, which will be run at
> transaction commit and update the extent tree. But free-space cache is
> also written out during transaction commit. So how the issue happens?
> Can you perhaps post a flow of events?

	Task1				Task2
	btrfs_writepages()
	  alloc data space
	    (The allocated space was
	     removed from the free
	     space cache)
	  submit_bio()
					btrfs_commit_transaction()
					  write out space cache
					  ...
					  commit transaction completed
					system crash
	 (end_io() wasn't executed)

The system crashed before end_io was executed, so no file references to the
allocated space, and the extent tree also does not account for it. That space
was lost.

Thanks
Miao
> 
> Thanks.
> Alex.
> 
> 
>>
>> In ordered to fix this problem, I use a state tree for every block group
>> to record those allocated spaces. We record the information when they are
>> allocated, and clean up the information after the metadata update. Besides
>> that, we also introduce a read-write semaphore to avoid the race between
>> the allocation and the free space cache write out.
>>
>> Only data block groups had this problem, so the above change is just
>> for data space allocation.
>>
>> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
>> ---
>>  fs/btrfs/ctree.h            | 15 ++++++++++++++-
>>  fs/btrfs/disk-io.c          |  2 +-
>>  fs/btrfs/extent-tree.c      | 24 ++++++++++++++++++++----
>>  fs/btrfs/free-space-cache.c | 42 ++++++++++++++++++++++++++++++++++++++----
>>  fs/btrfs/inode.c            | 42 +++++++++++++++++++++++++++++++++++-------
>>  5 files changed, 108 insertions(+), 17 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 1667c9a..f58e1f7 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -1244,6 +1244,12 @@ struct btrfs_block_group_cache {
>>         /* free space cache stuff */
>>         struct btrfs_free_space_ctl *free_space_ctl;
>>
>> +       /*
>> +        * It is used to record the extents that are allocated for
>> +        * the data, but don/t update its metadata.
>> +        */
>> +       struct extent_io_tree pinned_extents;
>> +
>>         /* block group cache stuff */
>>         struct rb_node cache_node;
>>
>> @@ -1540,6 +1546,13 @@ struct btrfs_fs_info {
>>          */
>>         struct list_head space_info;
>>
>> +       /*
>> +        * It is just used for the delayed data space allocation
>> +        * because only the data space allocation can be done during
>> +        * we write out the free space cache.
>> +        */
>> +       struct rw_semaphore data_rwsem;
>> +
>>         struct btrfs_space_info *data_sinfo;
>>
>>         struct reloc_control *reloc_ctl;
>> @@ -3183,7 +3196,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
>>                                    struct btrfs_key *ins);
>>  int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
>>                          u64 min_alloc_size, u64 empty_size, u64 hint_byte,
>> -                        struct btrfs_key *ins, int is_data);
>> +                        struct btrfs_key *ins, int is_data, bool need_pin);
>>  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>>                   struct extent_buffer *buf, int full_backref, int for_cow);
>>  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 8072cfa..426b558 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -2276,7 +2276,6 @@ int open_ctree(struct super_block *sb,
>>         fs_info->pinned_extents = &fs_info->freed_extents[0];
>>         fs_info->do_barriers = 1;
>>
>> -
>>         mutex_init(&fs_info->ordered_operations_mutex);
>>         mutex_init(&fs_info->ordered_extent_flush_mutex);
>>         mutex_init(&fs_info->tree_log_mutex);
>> @@ -2287,6 +2286,7 @@ int open_ctree(struct super_block *sb,
>>         init_rwsem(&fs_info->extent_commit_sem);
>>         init_rwsem(&fs_info->cleanup_work_sem);
>>         init_rwsem(&fs_info->subvol_sem);
>> +       init_rwsem(&fs_info->data_rwsem);
>>         sema_init(&fs_info->uuid_tree_rescan_sem, 1);
>>         fs_info->dev_replace.lock_owner = 0;
>>         atomic_set(&fs_info->dev_replace.nesting_level, 0);
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index 3664cfb..7b07876 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -6173,7 +6173,7 @@ enum btrfs_loop_type {
>>  static noinline int find_free_extent(struct btrfs_root *orig_root,
>>                                      u64 num_bytes, u64 empty_size,
>>                                      u64 hint_byte, struct btrfs_key *ins,
>> -                                    u64 flags)
>> +                                    u64 flags, bool need_pin)
>>  {
>>         int ret = 0;
>>         struct btrfs_root *root = orig_root->fs_info->extent_root;
>> @@ -6502,6 +6502,16 @@ checks:
>>                 ins->objectid = search_start;
>>                 ins->offset = num_bytes;
>>
>> +               if (need_pin) {
>> +                       ASSERT(search_start >= block_group->key.objectid &&
>> +                              search_start < block_group->key.objectid +
>> +                                             block_group->key.offset);
>> +                       set_extent_dirty(&block_group->pinned_extents,
>> +                                        search_start,
>> +                                        search_start + num_bytes - 1,
>> +                                        GFP_NOFS);
>> +               }
>> +
>>                 trace_btrfs_reserve_extent(orig_root, block_group,
>>                                            search_start, num_bytes);
>>                 btrfs_put_block_group(block_group);
>> @@ -6614,17 +6624,20 @@ again:
>>  int btrfs_reserve_extent(struct btrfs_root *root,
>>                          u64 num_bytes, u64 min_alloc_size,
>>                          u64 empty_size, u64 hint_byte,
>> -                        struct btrfs_key *ins, int is_data)
>> +                        struct btrfs_key *ins, int is_data, bool need_pin)
>>  {
>>         bool final_tried = false;
>>         u64 flags;
>>         int ret;
>>
>>         flags = btrfs_get_alloc_profile(root, is_data);
>> +
>> +       if (need_pin)
>> +               down_read(&root->fs_info->data_rwsem);
>>  again:
>>         WARN_ON(num_bytes < root->sectorsize);
>>         ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
>> -                              flags);
>> +                              flags, need_pin);
>>
>>         if (ret == -ENOSPC) {
>>                 if (!final_tried && ins->offset) {
>> @@ -6645,6 +6658,8 @@ again:
>>                 }
>>         }
>>
>> +       if (need_pin)
>> +               up_read(&root->fs_info->data_rwsem);
>>         return ret;
>>  }
>>
>> @@ -7016,7 +7031,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
>>                 return ERR_CAST(block_rsv);
>>
>>         ret = btrfs_reserve_extent(root, blocksize, blocksize,
>> -                                  empty_size, hint, &ins, 0);
>> +                                  empty_size, hint, &ins, 0, false);
>>         if (ret) {
>>                 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
>>                 return ERR_PTR(ret);
>> @@ -8387,6 +8402,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
>>         INIT_LIST_HEAD(&cache->cluster_list);
>>         INIT_LIST_HEAD(&cache->new_bg_list);
>>         btrfs_init_free_space_ctl(cache);
>> +       extent_io_tree_init(&cache->pinned_extents, NULL);
>>
>>         return cache;
>>  }
>> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
>> index 057be95..486e12a3 100644
>> --- a/fs/btrfs/free-space-cache.c
>> +++ b/fs/btrfs/free-space-cache.c
>> @@ -875,6 +875,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>>         struct rb_node *node;
>>         struct list_head *pos, *n;
>>         struct extent_state *cached_state = NULL;
>> +       struct extent_state *pinned_extent = NULL;
>>         struct btrfs_free_cluster *cluster = NULL;
>>         struct extent_io_tree *unpin = NULL;
>>         struct io_ctl io_ctl;
>> @@ -948,17 +949,17 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>>          * so we don't leak the space
>>          */
>>
>> +       if (!block_group)
>> +               goto bitmap;
>>         /*
>>          * We shouldn't have switched the pinned extents yet so this is the
>>          * right one
>>          */
>>         unpin = root->fs_info->pinned_extents;
>>
>> -       if (block_group)
>> -               start = block_group->key.objectid;
>> +       start = block_group->key.objectid;
>>
>> -       while (block_group && (start < block_group->key.objectid +
>> -                              block_group->key.offset)) {
>> +       while (start < block_group->key.objectid + block_group->key.offset) {
>>                 ret = find_first_extent_bit(unpin, start,
>>                                             &extent_start, &extent_end,
>>                                             EXTENT_DIRTY, NULL);
>> @@ -985,6 +986,33 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>>                 start = extent_end;
>>         }
>>
>> +       if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
>> +               goto bitmap;
>> +
>> +       start = block_group->key.objectid;
>> +       unpin = &block_group->pinned_extents;
>> +       while (1) {
>> +               ret = find_first_extent_bit(unpin, start,
>> +                                           &extent_start, &extent_end,
>> +                                           EXTENT_DIRTY, &pinned_extent);
>> +               if (ret) {
>> +                       ret = 0;
>> +                       break;
>> +               }
>> +
>> +               len = extent_end - extent_start + 1;
>> +
>> +               entries++;
>> +               ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
>> +               if (ret) {
>> +                       free_extent_state(pinned_extent);
>> +                       goto out_nospc;
>> +               }
>> +
>> +               start = extent_end + 1;
>> +       }
>> +       free_extent_state(pinned_extent);
>> +bitmap:
>>         /* Write out the bitmaps */
>>         list_for_each_safe(pos, n, &bitmap_list) {
>>                 struct btrfs_free_space *entry =
>> @@ -1097,6 +1125,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>>         if (IS_ERR(inode))
>>                 return 0;
>>
>> +       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
>> +               down_write(&root->fs_info->data_rwsem);
>> +
>>         ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
>>                                       path, block_group->key.objectid);
>>         if (ret) {
>> @@ -1111,6 +1142,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>>  #endif
>>         }
>>
>> +       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
>> +               up_write(&root->fs_info->data_rwsem);
>> +
>>         iput(inode);
>>         return ret;
>>  }
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index f1a7744..8172ca6 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -592,6 +592,28 @@ free_pages_out:
>>         goto out;
>>  }
>>
>> +static void btrfs_unpin_data_extent(struct btrfs_root *root, u64 start,
>> +                                   u64 len)
>> +{
>> +       struct btrfs_block_group_cache *cache;
>> +
>> +       cache = btrfs_lookup_block_group(root->fs_info, start);
>> +       BUG_ON(!cache);
>> +       clear_extent_dirty(&cache->pinned_extents, start, start + len - 1,
>> +                          GFP_NOFS);
>> +       btrfs_put_block_group(cache);
>> +}
>> +
>> +/* it is not used for free space cache file */
>> +static void btrfs_free_reserved_data_extent(struct btrfs_root *root, u64 start,
>> +                                           u64 len)
>> +{
>> +       down_read(&root->fs_info->data_rwsem);
>> +       btrfs_unpin_data_extent(root, start, len);
>> +       btrfs_free_reserved_extent(root, start, len);
>> +       up_read(&root->fs_info->data_rwsem);
>> +}
>> +
>>  /*
>>   * phase two of compressed writeback.  This is the ordered portion
>>   * of the code, which only gets called in the order the work was
>> @@ -666,7 +688,7 @@ retry:
>>                 ret = btrfs_reserve_extent(root,
>>                                            async_extent->compressed_size,
>>                                            async_extent->compressed_size,
>> -                                          0, alloc_hint, &ins, 1);
>> +                                          0, alloc_hint, &ins, 1, true);
>>                 if (ret) {
>>                         int i;
>>
>> @@ -767,7 +789,7 @@ retry:
>>  out:
>>         return ret;
>>  out_free_reserve:
>> -       btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>> +       btrfs_free_reserved_data_extent(root, ins.objectid, ins.offset);
>>  out_free:
>>         extent_clear_unlock_delalloc(inode, async_extent->start,
>>                                      async_extent->start +
>> @@ -889,7 +911,7 @@ static noinline int cow_file_range(struct inode *inode,
>>                 cur_alloc_size = disk_num_bytes;
>>                 ret = btrfs_reserve_extent(root, cur_alloc_size,
>>                                            root->sectorsize, 0, alloc_hint,
>> -                                          &ins, 1);
>> +                                          &ins, 1, true);
>>                 if (ret < 0)
>>                         goto out_unlock;
>>
>> @@ -967,6 +989,7 @@ out:
>>         return ret;
>>
>>  out_reserve:
>> +       btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>>         btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>>  out_unlock:
>>         extent_clear_unlock_delalloc(inode, start, end, locked_page,
>> @@ -2647,6 +2670,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
>>                                                 logical_len, logical_len,
>>                                                 compress_type, 0, 0,
>>                                                 BTRFS_FILE_EXTENT_REG);
>> +               BUG_ON(nolock);
>> +               btrfs_unpin_data_extent(root, ordered_extent->start,
>> +                                       ordered_extent->disk_len);
>>         }
>>         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
>>                            ordered_extent->file_offset, ordered_extent->len,
>> @@ -2698,8 +2724,9 @@ out:
>>                 if ((ret || !logical_len) &&
>>                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
>>                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
>> -                       btrfs_free_reserved_extent(root, ordered_extent->start,
>> -                                                  ordered_extent->disk_len);
>> +                       btrfs_free_reserved_data_extent(root,
>> +                                                       ordered_extent->start,
>> +                                                       ordered_extent->disk_len);
>>         }
>>
>>
>> @@ -6342,7 +6369,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>>
>>         alloc_hint = get_extent_allocation_hint(inode, start, len);
>>         ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
>> -                                  alloc_hint, &ins, 1);
>> +                                  alloc_hint, &ins, 1, true);
>>         if (ret)
>>                 return ERR_PTR(ret);
>>
>> @@ -6356,6 +6383,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>>         ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
>>                                            ins.offset, ins.offset, 0);
>>         if (ret) {
>> +               btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>>                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>>                 free_extent_map(em);
>>                 return ERR_PTR(ret);
>> @@ -8507,7 +8535,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
>>                 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
>>                 cur_bytes = max(cur_bytes, min_size);
>>                 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
>> -                                          *alloc_hint, &ins, 1);
>> +                                          *alloc_hint, &ins, 1, false);
>>                 if (ret) {
>>                         if (own_trans)
>>                                 btrfs_end_transaction(trans, root);
>> --
>> 1.8.3.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed
  2014-01-15 12:00 ` [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed Miao Xie
  2014-01-16  5:54   ` Liu Bo
  2014-03-01 18:05   ` Alex Lyakas
@ 2014-03-04 15:19   ` Josef Bacik
  2014-03-05  7:02     ` Miao Xie
  2014-03-26  6:36   ` miaox
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Josef Bacik @ 2014-03-04 15:19 UTC (permalink / raw)
  To: Miao Xie, linux-btrfs

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 01/15/2014 07:00 AM, Miao Xie wrote:
> When we mounted the filesystem after the crash, we got the
> following message: BTRFS error (device xxx): block group 4315938816
> has wrong amount of free space BTRFS error (device xxx): failed to
> load free space cache for block group 4315938816
> 
> It is because we didn't update the metadata of the allocated space
> until the file data was written into the disk. During this time,
> there was no information about the allocated spaces in either the
> extent tree nor the free space cache. when we wrote out the free
> space cache at this time, those spaces were lost.
> 
> In ordered to fix this problem, I use a state tree for every block
> group to record those allocated spaces. We record the information
> when they are allocated, and clean up the information after the
> metadata update. Besides that, we also introduce a read-write
> semaphore to avoid the race between the allocation and the free
> space cache write out.
> 
> Only data block groups had this problem, so the above change is
> just for data space allocation.
> 

I didn't like this idea at first but I've come around to it.  The only
thing is the data_rwsem thing, we don't need it as we are protected by
the transaction being blocked when we do writeout, so nobody can data
allocations during this time.  Thanks,

Josef

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1
Comment: Using GnuPG with Thunderbird - http://www.enigmail.net/

iQIcBAEBAgAGBQJTFe73AAoJEANb+wAKly3BMMMP/1yY3WBGzaz5oTyNRRSgnVvl
sjk7WSN2A0mhrl6lQOBlTvhgFo3gDNAtIwJaJIfWxWXZfM5i1hXSViFIc5NND3lF
Jm10oaCPiTaYM0S5jqj8oeSXFyb1ny+/D4C1OfC/eRVpu9juO3bFUZYcM1XMFUcI
mJxXg9Mqi1C6TOocCIdQI7ijXgm8xEPauaQy71EJZmgSjjVAXFG9BHay26L2a3Yu
XIlnjyHMcgIFZXGVHQ+45S2pwWVgVZBIHLcKFSDFy4aupq/+EAN15oxdeTLBG8hJ
RFJh15wLQguawlirc7boPzEugSieixbbUjn6CZqikVZke1g1fjGvrTAjiacpTDxv
jskrCPYaXYWKMqGjugsVSI8GSonuWmmVRBsg4k+52U9AYM8wapjL+RHzBXU1cBqu
zfyqaJhBj7EOQIu2oQDT2ZF9E+XA8dLcrysasMS+CYmcmR1Bs1fzpP2W3DoOg68F
YoYuXnaAvIkvUQSsPUNGPjCn+iGCq65ZwiwnwF93RN7zlFIQt12DSn7l/NKnT/p9
1jvLkQ2MFmb6QD264mAOL5TULWyyB0AXsitIBJOMs0XWXZheLqqpv1clcaZacBFH
P1yN51+d8DJmF0x0j1HacWi62WnGCh8GzR5ef5Pqi11/11xknbPugrzfFI9v1vR7
STfhGs/0sg7oQWYgarOj
=Y2df
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed
  2014-03-04 15:19   ` Josef Bacik
@ 2014-03-05  7:02     ` Miao Xie
  2014-03-05 15:02       ` Josef Bacik
  0 siblings, 1 reply; 18+ messages in thread
From: Miao Xie @ 2014-03-05  7:02 UTC (permalink / raw)
  To: Josef Bacik, linux-btrfs

On Tue, 4 Mar 2014 10:19:20 -0500, Josef Bacik wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> On 01/15/2014 07:00 AM, Miao Xie wrote:
>> When we mounted the filesystem after the crash, we got the
>> following message: BTRFS error (device xxx): block group 4315938816
>> has wrong amount of free space BTRFS error (device xxx): failed to
>> load free space cache for block group 4315938816
>>
>> It is because we didn't update the metadata of the allocated space
>> until the file data was written into the disk. During this time,
>> there was no information about the allocated spaces in either the
>> extent tree nor the free space cache. when we wrote out the free
>> space cache at this time, those spaces were lost.
>>
>> In ordered to fix this problem, I use a state tree for every block
>> group to record those allocated spaces. We record the information
>> when they are allocated, and clean up the information after the
>> metadata update. Besides that, we also introduce a read-write
>> semaphore to avoid the race between the allocation and the free
>> space cache write out.
>>
>> Only data block groups had this problem, so the above change is
>> just for data space allocation.
>>
> 
> I didn't like this idea at first but I've come around to it.  The only
> thing is the data_rwsem thing, we don't need it as we are protected by
> the transaction being blocked when we do writeout, so nobody can data
> allocations during this time.  Thanks,

But this protection was removed by the patch

  Commit ID: 00361589d2eebd90fca022148c763e40d3e90871

Thanks
Miao

> 
> Josef
> 
> -----BEGIN PGP SIGNATURE-----
> Version: GnuPG v1
> Comment: Using GnuPG with Thunderbird - http://www.enigmail.net/
> 
> iQIcBAEBAgAGBQJTFe73AAoJEANb+wAKly3BMMMP/1yY3WBGzaz5oTyNRRSgnVvl
> sjk7WSN2A0mhrl6lQOBlTvhgFo3gDNAtIwJaJIfWxWXZfM5i1hXSViFIc5NND3lF
> Jm10oaCPiTaYM0S5jqj8oeSXFyb1ny+/D4C1OfC/eRVpu9juO3bFUZYcM1XMFUcI
> mJxXg9Mqi1C6TOocCIdQI7ijXgm8xEPauaQy71EJZmgSjjVAXFG9BHay26L2a3Yu
> XIlnjyHMcgIFZXGVHQ+45S2pwWVgVZBIHLcKFSDFy4aupq/+EAN15oxdeTLBG8hJ
> RFJh15wLQguawlirc7boPzEugSieixbbUjn6CZqikVZke1g1fjGvrTAjiacpTDxv
> jskrCPYaXYWKMqGjugsVSI8GSonuWmmVRBsg4k+52U9AYM8wapjL+RHzBXU1cBqu
> zfyqaJhBj7EOQIu2oQDT2ZF9E+XA8dLcrysasMS+CYmcmR1Bs1fzpP2W3DoOg68F
> YoYuXnaAvIkvUQSsPUNGPjCn+iGCq65ZwiwnwF93RN7zlFIQt12DSn7l/NKnT/p9
> 1jvLkQ2MFmb6QD264mAOL5TULWyyB0AXsitIBJOMs0XWXZheLqqpv1clcaZacBFH
> P1yN51+d8DJmF0x0j1HacWi62WnGCh8GzR5ef5Pqi11/11xknbPugrzfFI9v1vR7
> STfhGs/0sg7oQWYgarOj
> =Y2df
> -----END PGP SIGNATURE-----
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed
  2014-03-05  7:02     ` Miao Xie
@ 2014-03-05 15:02       ` Josef Bacik
  0 siblings, 0 replies; 18+ messages in thread
From: Josef Bacik @ 2014-03-05 15:02 UTC (permalink / raw)
  To: miaox, linux-btrfs

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 03/05/2014 02:02 AM, Miao Xie wrote:
> On Tue, 4 Mar 2014 10:19:20 -0500, Josef Bacik wrote: On 01/15/2014
> 07:00 AM, Miao Xie wrote:
>>>> When we mounted the filesystem after the crash, we got the 
>>>> following message: BTRFS error (device xxx): block group
>>>> 4315938816 has wrong amount of free space BTRFS error (device
>>>> xxx): failed to load free space cache for block group
>>>> 4315938816
>>>> 
>>>> It is because we didn't update the metadata of the allocated
>>>> space until the file data was written into the disk. During
>>>> this time, there was no information about the allocated
>>>> spaces in either the extent tree nor the free space cache.
>>>> when we wrote out the free space cache at this time, those
>>>> spaces were lost.
>>>> 
>>>> In ordered to fix this problem, I use a state tree for every
>>>> block group to record those allocated spaces. We record the
>>>> information when they are allocated, and clean up the
>>>> information after the metadata update. Besides that, we also
>>>> introduce a read-write semaphore to avoid the race between
>>>> the allocation and the free space cache write out.
>>>> 
>>>> Only data block groups had this problem, so the above change
>>>> is just for data space allocation.
>>>> 
> 
> I didn't like this idea at first but I've come around to it.  The
> only thing is the data_rwsem thing, we don't need it as we are
> protected by the transaction being blocked when we do writeout, so
> nobody can data allocations during this time.  Thanks,
> 
>> But this protection was removed by the patch
> 
>> Commit ID: 00361589d2eebd90fca022148c763e40d3e90871
> 

Excellent point, then I'm good overall, I'll pull it in once I finish
qgroups.  Thanks,

Josef
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1
Comment: Using GnuPG with Thunderbird - http://www.enigmail.net/

iQIcBAEBAgAGBQJTFzyJAAoJEANb+wAKly3BFHMP+wctKXDuKxP4kI27bfRlfJmV
QB5i6qXr4WvQQw04FF3BtxCW9Z36l/1cFLFK0OaQ8Q54+4s0FUXNAynXFkf41TJz
XWhkjTq0hJmzM95tB2+B2HwtEDI6m0l6fnOhsyiiSHF8V1g7V9VRwkeqpB9ZGu4/
ZryAUkSJGXFDvtFgTmRvOAclwD0R6oCXCw3f/AgtZQL1H9ucaogI2XKmllllcsFC
jOIbn7L+gtFmTAJdvSFlRQAYaV2g69rf0Q5bVHZMAaLKN5rMIXBC2xFOCUxwg3si
ZyOl72ojaGbCt7MI/s2X0uZ5d+xWYjaG+tF2K+XLjFoIUkny3RndFfU6pKk54gHP
9O/GXiilq2t3qZUn3zMuLXIG+cCaYTt3QsHnNyJisqOVLL95LbIvsINm0Xgu6bF/
cJ0acApJr6y2EdEbfVU/mrB06K81bxnZez8rOgyFXKD4yWoTMG23xttKZHG5LbdT
xkwrJWPeJ77mI0+V/MPWePgomcH5cDs0IckSOXXwy8gZF4HJzVbXklcq22BfgIQA
SLVgJYxqIlzIHYHZPisWJHUwFXe4C58YCDP2w4FI32g5LZuzhlus/wFI9Tg1543w
sYf23ZYxlDUYAiD+zb+UipEA4CLtdZgZGonoG9lxK9JkgU2VHzXuTgty2+B0F9kt
l3B5jpy1H2cP2TUskkbZ
=E0en
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed
  2014-03-04  6:04     ` Miao Xie
@ 2014-03-08 16:48       ` Alex Lyakas
  0 siblings, 0 replies; 18+ messages in thread
From: Alex Lyakas @ 2014-03-08 16:48 UTC (permalink / raw)
  To: Miao Xie; +Cc: linux-btrfs

Thanks, Miao,
so the problem is that cow_file_range() joins transaction, allocates
space through btrfs_reserve_extent(), then detaches from transaction.
And then btrfs_finish_ordered_io() joins transaction again, adds a
delayed ref and detaches from transaction. So if between these two,
the transaction commits and we crash, then yes, the allocation is
lost.

Alex.


On Tue, Mar 4, 2014 at 8:04 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> On      sat, 1 Mar 2014 20:05:01 +0200, Alex Lyakas wrote:
>> Hi Miao,
>>
>> On Wed, Jan 15, 2014 at 2:00 PM, Miao Xie <miaox@cn.fujitsu.com> wrote:
>>> When we mounted the filesystem after the crash, we got the following
>>> message:
>>>   BTRFS error (device xxx): block group 4315938816 has wrong amount of free space
>>>   BTRFS error (device xxx): failed to load free space cache for block group 4315938816
>>>
>>> It is because we didn't update the metadata of the allocated space until
>>> the file data was written into the disk. During this time, there was no
>>> information about the allocated spaces in either the extent tree nor the
>>> free space cache. when we wrote out the free space cache at this time, those
>>> spaces were lost.
>> Can you please clarify more about the problem?
>> So I understand that we allocate something from the free space cache.
>> So after that, the free space cache does not account for this extent
>> anymore. On the other hand the extent tree also does not account for
>> it (yet). We need to add a delayed reference, which will be run at
>> transaction commit and update the extent tree. But free-space cache is
>> also written out during transaction commit. So how the issue happens?
>> Can you perhaps post a flow of events?
>
>         Task1                           Task2
>         btrfs_writepages()
>           alloc data space
>             (The allocated space was
>              removed from the free
>              space cache)
>           submit_bio()
>                                         btrfs_commit_transaction()
>                                           write out space cache
>                                           ...
>                                           commit transaction completed
>                                         system crash
>          (end_io() wasn't executed)
>
> The system crashed before end_io was executed, so no file references to the
> allocated space, and the extent tree also does not account for it. That space
> was lost.
>
> Thanks
> Miao
>>
>> Thanks.
>> Alex.
>>
>>
>>>
>>> In ordered to fix this problem, I use a state tree for every block group
>>> to record those allocated spaces. We record the information when they are
>>> allocated, and clean up the information after the metadata update. Besides
>>> that, we also introduce a read-write semaphore to avoid the race between
>>> the allocation and the free space cache write out.
>>>
>>> Only data block groups had this problem, so the above change is just
>>> for data space allocation.
>>>
>>> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
>>> ---
>>>  fs/btrfs/ctree.h            | 15 ++++++++++++++-
>>>  fs/btrfs/disk-io.c          |  2 +-
>>>  fs/btrfs/extent-tree.c      | 24 ++++++++++++++++++++----
>>>  fs/btrfs/free-space-cache.c | 42 ++++++++++++++++++++++++++++++++++++++----
>>>  fs/btrfs/inode.c            | 42 +++++++++++++++++++++++++++++++++++-------
>>>  5 files changed, 108 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>>> index 1667c9a..f58e1f7 100644
>>> --- a/fs/btrfs/ctree.h
>>> +++ b/fs/btrfs/ctree.h
>>> @@ -1244,6 +1244,12 @@ struct btrfs_block_group_cache {
>>>         /* free space cache stuff */
>>>         struct btrfs_free_space_ctl *free_space_ctl;
>>>
>>> +       /*
>>> +        * It is used to record the extents that are allocated for
>>> +        * the data, but don/t update its metadata.
>>> +        */
>>> +       struct extent_io_tree pinned_extents;
>>> +
>>>         /* block group cache stuff */
>>>         struct rb_node cache_node;
>>>
>>> @@ -1540,6 +1546,13 @@ struct btrfs_fs_info {
>>>          */
>>>         struct list_head space_info;
>>>
>>> +       /*
>>> +        * It is just used for the delayed data space allocation
>>> +        * because only the data space allocation can be done during
>>> +        * we write out the free space cache.
>>> +        */
>>> +       struct rw_semaphore data_rwsem;
>>> +
>>>         struct btrfs_space_info *data_sinfo;
>>>
>>>         struct reloc_control *reloc_ctl;
>>> @@ -3183,7 +3196,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
>>>                                    struct btrfs_key *ins);
>>>  int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
>>>                          u64 min_alloc_size, u64 empty_size, u64 hint_byte,
>>> -                        struct btrfs_key *ins, int is_data);
>>> +                        struct btrfs_key *ins, int is_data, bool need_pin);
>>>  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>>>                   struct extent_buffer *buf, int full_backref, int for_cow);
>>>  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>>> index 8072cfa..426b558 100644
>>> --- a/fs/btrfs/disk-io.c
>>> +++ b/fs/btrfs/disk-io.c
>>> @@ -2276,7 +2276,6 @@ int open_ctree(struct super_block *sb,
>>>         fs_info->pinned_extents = &fs_info->freed_extents[0];
>>>         fs_info->do_barriers = 1;
>>>
>>> -
>>>         mutex_init(&fs_info->ordered_operations_mutex);
>>>         mutex_init(&fs_info->ordered_extent_flush_mutex);
>>>         mutex_init(&fs_info->tree_log_mutex);
>>> @@ -2287,6 +2286,7 @@ int open_ctree(struct super_block *sb,
>>>         init_rwsem(&fs_info->extent_commit_sem);
>>>         init_rwsem(&fs_info->cleanup_work_sem);
>>>         init_rwsem(&fs_info->subvol_sem);
>>> +       init_rwsem(&fs_info->data_rwsem);
>>>         sema_init(&fs_info->uuid_tree_rescan_sem, 1);
>>>         fs_info->dev_replace.lock_owner = 0;
>>>         atomic_set(&fs_info->dev_replace.nesting_level, 0);
>>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>>> index 3664cfb..7b07876 100644
>>> --- a/fs/btrfs/extent-tree.c
>>> +++ b/fs/btrfs/extent-tree.c
>>> @@ -6173,7 +6173,7 @@ enum btrfs_loop_type {
>>>  static noinline int find_free_extent(struct btrfs_root *orig_root,
>>>                                      u64 num_bytes, u64 empty_size,
>>>                                      u64 hint_byte, struct btrfs_key *ins,
>>> -                                    u64 flags)
>>> +                                    u64 flags, bool need_pin)
>>>  {
>>>         int ret = 0;
>>>         struct btrfs_root *root = orig_root->fs_info->extent_root;
>>> @@ -6502,6 +6502,16 @@ checks:
>>>                 ins->objectid = search_start;
>>>                 ins->offset = num_bytes;
>>>
>>> +               if (need_pin) {
>>> +                       ASSERT(search_start >= block_group->key.objectid &&
>>> +                              search_start < block_group->key.objectid +
>>> +                                             block_group->key.offset);
>>> +                       set_extent_dirty(&block_group->pinned_extents,
>>> +                                        search_start,
>>> +                                        search_start + num_bytes - 1,
>>> +                                        GFP_NOFS);
>>> +               }
>>> +
>>>                 trace_btrfs_reserve_extent(orig_root, block_group,
>>>                                            search_start, num_bytes);
>>>                 btrfs_put_block_group(block_group);
>>> @@ -6614,17 +6624,20 @@ again:
>>>  int btrfs_reserve_extent(struct btrfs_root *root,
>>>                          u64 num_bytes, u64 min_alloc_size,
>>>                          u64 empty_size, u64 hint_byte,
>>> -                        struct btrfs_key *ins, int is_data)
>>> +                        struct btrfs_key *ins, int is_data, bool need_pin)
>>>  {
>>>         bool final_tried = false;
>>>         u64 flags;
>>>         int ret;
>>>
>>>         flags = btrfs_get_alloc_profile(root, is_data);
>>> +
>>> +       if (need_pin)
>>> +               down_read(&root->fs_info->data_rwsem);
>>>  again:
>>>         WARN_ON(num_bytes < root->sectorsize);
>>>         ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
>>> -                              flags);
>>> +                              flags, need_pin);
>>>
>>>         if (ret == -ENOSPC) {
>>>                 if (!final_tried && ins->offset) {
>>> @@ -6645,6 +6658,8 @@ again:
>>>                 }
>>>         }
>>>
>>> +       if (need_pin)
>>> +               up_read(&root->fs_info->data_rwsem);
>>>         return ret;
>>>  }
>>>
>>> @@ -7016,7 +7031,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
>>>                 return ERR_CAST(block_rsv);
>>>
>>>         ret = btrfs_reserve_extent(root, blocksize, blocksize,
>>> -                                  empty_size, hint, &ins, 0);
>>> +                                  empty_size, hint, &ins, 0, false);
>>>         if (ret) {
>>>                 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
>>>                 return ERR_PTR(ret);
>>> @@ -8387,6 +8402,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
>>>         INIT_LIST_HEAD(&cache->cluster_list);
>>>         INIT_LIST_HEAD(&cache->new_bg_list);
>>>         btrfs_init_free_space_ctl(cache);
>>> +       extent_io_tree_init(&cache->pinned_extents, NULL);
>>>
>>>         return cache;
>>>  }
>>> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
>>> index 057be95..486e12a3 100644
>>> --- a/fs/btrfs/free-space-cache.c
>>> +++ b/fs/btrfs/free-space-cache.c
>>> @@ -875,6 +875,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>>>         struct rb_node *node;
>>>         struct list_head *pos, *n;
>>>         struct extent_state *cached_state = NULL;
>>> +       struct extent_state *pinned_extent = NULL;
>>>         struct btrfs_free_cluster *cluster = NULL;
>>>         struct extent_io_tree *unpin = NULL;
>>>         struct io_ctl io_ctl;
>>> @@ -948,17 +949,17 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>>>          * so we don't leak the space
>>>          */
>>>
>>> +       if (!block_group)
>>> +               goto bitmap;
>>>         /*
>>>          * We shouldn't have switched the pinned extents yet so this is the
>>>          * right one
>>>          */
>>>         unpin = root->fs_info->pinned_extents;
>>>
>>> -       if (block_group)
>>> -               start = block_group->key.objectid;
>>> +       start = block_group->key.objectid;
>>>
>>> -       while (block_group && (start < block_group->key.objectid +
>>> -                              block_group->key.offset)) {
>>> +       while (start < block_group->key.objectid + block_group->key.offset) {
>>>                 ret = find_first_extent_bit(unpin, start,
>>>                                             &extent_start, &extent_end,
>>>                                             EXTENT_DIRTY, NULL);
>>> @@ -985,6 +986,33 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>>>                 start = extent_end;
>>>         }
>>>
>>> +       if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
>>> +               goto bitmap;
>>> +
>>> +       start = block_group->key.objectid;
>>> +       unpin = &block_group->pinned_extents;
>>> +       while (1) {
>>> +               ret = find_first_extent_bit(unpin, start,
>>> +                                           &extent_start, &extent_end,
>>> +                                           EXTENT_DIRTY, &pinned_extent);
>>> +               if (ret) {
>>> +                       ret = 0;
>>> +                       break;
>>> +               }
>>> +
>>> +               len = extent_end - extent_start + 1;
>>> +
>>> +               entries++;
>>> +               ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
>>> +               if (ret) {
>>> +                       free_extent_state(pinned_extent);
>>> +                       goto out_nospc;
>>> +               }
>>> +
>>> +               start = extent_end + 1;
>>> +       }
>>> +       free_extent_state(pinned_extent);
>>> +bitmap:
>>>         /* Write out the bitmaps */
>>>         list_for_each_safe(pos, n, &bitmap_list) {
>>>                 struct btrfs_free_space *entry =
>>> @@ -1097,6 +1125,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>>>         if (IS_ERR(inode))
>>>                 return 0;
>>>
>>> +       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
>>> +               down_write(&root->fs_info->data_rwsem);
>>> +
>>>         ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
>>>                                       path, block_group->key.objectid);
>>>         if (ret) {
>>> @@ -1111,6 +1142,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>>>  #endif
>>>         }
>>>
>>> +       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
>>> +               up_write(&root->fs_info->data_rwsem);
>>> +
>>>         iput(inode);
>>>         return ret;
>>>  }
>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>> index f1a7744..8172ca6 100644
>>> --- a/fs/btrfs/inode.c
>>> +++ b/fs/btrfs/inode.c
>>> @@ -592,6 +592,28 @@ free_pages_out:
>>>         goto out;
>>>  }
>>>
>>> +static void btrfs_unpin_data_extent(struct btrfs_root *root, u64 start,
>>> +                                   u64 len)
>>> +{
>>> +       struct btrfs_block_group_cache *cache;
>>> +
>>> +       cache = btrfs_lookup_block_group(root->fs_info, start);
>>> +       BUG_ON(!cache);
>>> +       clear_extent_dirty(&cache->pinned_extents, start, start + len - 1,
>>> +                          GFP_NOFS);
>>> +       btrfs_put_block_group(cache);
>>> +}
>>> +
>>> +/* it is not used for free space cache file */
>>> +static void btrfs_free_reserved_data_extent(struct btrfs_root *root, u64 start,
>>> +                                           u64 len)
>>> +{
>>> +       down_read(&root->fs_info->data_rwsem);
>>> +       btrfs_unpin_data_extent(root, start, len);
>>> +       btrfs_free_reserved_extent(root, start, len);
>>> +       up_read(&root->fs_info->data_rwsem);
>>> +}
>>> +
>>>  /*
>>>   * phase two of compressed writeback.  This is the ordered portion
>>>   * of the code, which only gets called in the order the work was
>>> @@ -666,7 +688,7 @@ retry:
>>>                 ret = btrfs_reserve_extent(root,
>>>                                            async_extent->compressed_size,
>>>                                            async_extent->compressed_size,
>>> -                                          0, alloc_hint, &ins, 1);
>>> +                                          0, alloc_hint, &ins, 1, true);
>>>                 if (ret) {
>>>                         int i;
>>>
>>> @@ -767,7 +789,7 @@ retry:
>>>  out:
>>>         return ret;
>>>  out_free_reserve:
>>> -       btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>>> +       btrfs_free_reserved_data_extent(root, ins.objectid, ins.offset);
>>>  out_free:
>>>         extent_clear_unlock_delalloc(inode, async_extent->start,
>>>                                      async_extent->start +
>>> @@ -889,7 +911,7 @@ static noinline int cow_file_range(struct inode *inode,
>>>                 cur_alloc_size = disk_num_bytes;
>>>                 ret = btrfs_reserve_extent(root, cur_alloc_size,
>>>                                            root->sectorsize, 0, alloc_hint,
>>> -                                          &ins, 1);
>>> +                                          &ins, 1, true);
>>>                 if (ret < 0)
>>>                         goto out_unlock;
>>>
>>> @@ -967,6 +989,7 @@ out:
>>>         return ret;
>>>
>>>  out_reserve:
>>> +       btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>>>         btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>>>  out_unlock:
>>>         extent_clear_unlock_delalloc(inode, start, end, locked_page,
>>> @@ -2647,6 +2670,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
>>>                                                 logical_len, logical_len,
>>>                                                 compress_type, 0, 0,
>>>                                                 BTRFS_FILE_EXTENT_REG);
>>> +               BUG_ON(nolock);
>>> +               btrfs_unpin_data_extent(root, ordered_extent->start,
>>> +                                       ordered_extent->disk_len);
>>>         }
>>>         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
>>>                            ordered_extent->file_offset, ordered_extent->len,
>>> @@ -2698,8 +2724,9 @@ out:
>>>                 if ((ret || !logical_len) &&
>>>                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
>>>                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
>>> -                       btrfs_free_reserved_extent(root, ordered_extent->start,
>>> -                                                  ordered_extent->disk_len);
>>> +                       btrfs_free_reserved_data_extent(root,
>>> +                                                       ordered_extent->start,
>>> +                                                       ordered_extent->disk_len);
>>>         }
>>>
>>>
>>> @@ -6342,7 +6369,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>>>
>>>         alloc_hint = get_extent_allocation_hint(inode, start, len);
>>>         ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
>>> -                                  alloc_hint, &ins, 1);
>>> +                                  alloc_hint, &ins, 1, true);
>>>         if (ret)
>>>                 return ERR_PTR(ret);
>>>
>>> @@ -6356,6 +6383,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>>>         ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
>>>                                            ins.offset, ins.offset, 0);
>>>         if (ret) {
>>> +               btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>>>                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>>>                 free_extent_map(em);
>>>                 return ERR_PTR(ret);
>>> @@ -8507,7 +8535,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
>>>                 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
>>>                 cur_bytes = max(cur_bytes, min_size);
>>>                 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
>>> -                                          *alloc_hint, &ins, 1);
>>> +                                          *alloc_hint, &ins, 1, false);
>>>                 if (ret) {
>>>                         if (own_trans)
>>>                                 btrfs_end_transaction(trans, root);
>>> --
>>> 1.8.3.1
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed
  2014-01-15 12:00 ` [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed Miao Xie
                     ` (2 preceding siblings ...)
  2014-03-04 15:19   ` Josef Bacik
@ 2014-03-26  6:36   ` miaox
  2014-04-24  5:31   ` [PATCH V2 1/2] Btrfs: output warning instead of error when loading free space cache failed Miao Xie
  2014-05-19  1:33   ` [RFC PATCH 5/5] " Chris Mason
  5 siblings, 0 replies; 18+ messages in thread
From: miaox @ 2014-03-26  6:36 UTC (permalink / raw)
  To: Josef Bacik, Chris Mason; +Cc: linux-btrfs

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 13663 bytes --]

Hi Chris, Josef

This patch is an important bug fix patch, could you merge it into your tree?

Thanks
Miao

On 	wed, 15 Jan 2014 20:00:58 +0800, Miao Xie wrote:
> When we mounted the filesystem after the crash, we got the following
> message:
>   BTRFS error (device xxx): block group 4315938816 has wrong amount of free space
>   BTRFS error (device xxx): failed to load free space cache for block group 4315938816
> 
> It is because we didn't update the metadata of the allocated space until
> the file data was written into the disk. During this time, there was no
> information about the allocated spaces in either the extent tree nor the
> free space cache. when we wrote out the free space cache at this time, those
> spaces were lost.
> 
> In ordered to fix this problem, I use a state tree for every block group
> to record those allocated spaces. We record the information when they are
> allocated, and clean up the information after the metadata update. Besides
> that, we also introduce a read-write semaphore to avoid the race between
> the allocation and the free space cache write out.
> 
> Only data block groups had this problem, so the above change is just
> for data space allocation.
> 
> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
> ---
>  fs/btrfs/ctree.h            | 15 ++++++++++++++-
>  fs/btrfs/disk-io.c          |  2 +-
>  fs/btrfs/extent-tree.c      | 24 ++++++++++++++++++++----
>  fs/btrfs/free-space-cache.c | 42 ++++++++++++++++++++++++++++++++++++++----
>  fs/btrfs/inode.c            | 42 +++++++++++++++++++++++++++++++++++-------
>  5 files changed, 108 insertions(+), 17 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 1667c9a..f58e1f7 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1244,6 +1244,12 @@ struct btrfs_block_group_cache {
>  	/* free space cache stuff */
>  	struct btrfs_free_space_ctl *free_space_ctl;
>  
> +	/*
> +	 * It is used to record the extents that are allocated for
> +	 * the data, but don/t update its metadata.
> +	 */
> +	struct extent_io_tree pinned_extents;
> +
>  	/* block group cache stuff */
>  	struct rb_node cache_node;
>  
> @@ -1540,6 +1546,13 @@ struct btrfs_fs_info {
>  	 */
>  	struct list_head space_info;
>  
> +	/*
> +	 * It is just used for the delayed data space allocation
> +	 * because only the data space allocation can be done during
> +	 * we write out the free space cache.
> +	 */
> +	struct rw_semaphore data_rwsem;
> +
>  	struct btrfs_space_info *data_sinfo;
>  
>  	struct reloc_control *reloc_ctl;
> @@ -3183,7 +3196,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
>  				   struct btrfs_key *ins);
>  int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
>  			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
> -			 struct btrfs_key *ins, int is_data);
> +			 struct btrfs_key *ins, int is_data, bool need_pin);
>  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>  		  struct extent_buffer *buf, int full_backref, int for_cow);
>  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 8072cfa..426b558 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2276,7 +2276,6 @@ int open_ctree(struct super_block *sb,
>  	fs_info->pinned_extents = &fs_info->freed_extents[0];
>  	fs_info->do_barriers = 1;
>  
> -
>  	mutex_init(&fs_info->ordered_operations_mutex);
>  	mutex_init(&fs_info->ordered_extent_flush_mutex);
>  	mutex_init(&fs_info->tree_log_mutex);
> @@ -2287,6 +2286,7 @@ int open_ctree(struct super_block *sb,
>  	init_rwsem(&fs_info->extent_commit_sem);
>  	init_rwsem(&fs_info->cleanup_work_sem);
>  	init_rwsem(&fs_info->subvol_sem);
> +	init_rwsem(&fs_info->data_rwsem);
>  	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
>  	fs_info->dev_replace.lock_owner = 0;
>  	atomic_set(&fs_info->dev_replace.nesting_level, 0);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 3664cfb..7b07876 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -6173,7 +6173,7 @@ enum btrfs_loop_type {
>  static noinline int find_free_extent(struct btrfs_root *orig_root,
>  				     u64 num_bytes, u64 empty_size,
>  				     u64 hint_byte, struct btrfs_key *ins,
> -				     u64 flags)
> +				     u64 flags, bool need_pin)
>  {
>  	int ret = 0;
>  	struct btrfs_root *root = orig_root->fs_info->extent_root;
> @@ -6502,6 +6502,16 @@ checks:
>  		ins->objectid = search_start;
>  		ins->offset = num_bytes;
>  
> +		if (need_pin) {
> +			ASSERT(search_start >= block_group->key.objectid &&
> +			       search_start < block_group->key.objectid +
> +					      block_group->key.offset);
> +			set_extent_dirty(&block_group->pinned_extents,
> +					 search_start,
> +					 search_start + num_bytes - 1,
> +					 GFP_NOFS);
> +		}
> +
>  		trace_btrfs_reserve_extent(orig_root, block_group,
>  					   search_start, num_bytes);
>  		btrfs_put_block_group(block_group);
> @@ -6614,17 +6624,20 @@ again:
>  int btrfs_reserve_extent(struct btrfs_root *root,
>  			 u64 num_bytes, u64 min_alloc_size,
>  			 u64 empty_size, u64 hint_byte,
> -			 struct btrfs_key *ins, int is_data)
> +			 struct btrfs_key *ins, int is_data, bool need_pin)
>  {
>  	bool final_tried = false;
>  	u64 flags;
>  	int ret;
>  
>  	flags = btrfs_get_alloc_profile(root, is_data);
> +
> +	if (need_pin)
> +		down_read(&root->fs_info->data_rwsem);
>  again:
>  	WARN_ON(num_bytes < root->sectorsize);
>  	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
> -			       flags);
> +			       flags, need_pin);
>  
>  	if (ret == -ENOSPC) {
>  		if (!final_tried && ins->offset) {
> @@ -6645,6 +6658,8 @@ again:
>  		}
>  	}
>  
> +	if (need_pin)
> +		up_read(&root->fs_info->data_rwsem);
>  	return ret;
>  }
>  
> @@ -7016,7 +7031,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
>  		return ERR_CAST(block_rsv);
>  
>  	ret = btrfs_reserve_extent(root, blocksize, blocksize,
> -				   empty_size, hint, &ins, 0);
> +				   empty_size, hint, &ins, 0, false);
>  	if (ret) {
>  		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
>  		return ERR_PTR(ret);
> @@ -8387,6 +8402,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
>  	INIT_LIST_HEAD(&cache->cluster_list);
>  	INIT_LIST_HEAD(&cache->new_bg_list);
>  	btrfs_init_free_space_ctl(cache);
> +	extent_io_tree_init(&cache->pinned_extents, NULL);
>  
>  	return cache;
>  }
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 057be95..486e12a3 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -875,6 +875,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>  	struct rb_node *node;
>  	struct list_head *pos, *n;
>  	struct extent_state *cached_state = NULL;
> +	struct extent_state *pinned_extent = NULL;
>  	struct btrfs_free_cluster *cluster = NULL;
>  	struct extent_io_tree *unpin = NULL;
>  	struct io_ctl io_ctl;
> @@ -948,17 +949,17 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>  	 * so we don't leak the space
>  	 */
>  
> +	if (!block_group)
> +		goto bitmap;
>  	/*
>  	 * We shouldn't have switched the pinned extents yet so this is the
>  	 * right one
>  	 */
>  	unpin = root->fs_info->pinned_extents;
>  
> -	if (block_group)
> -		start = block_group->key.objectid;
> +	start = block_group->key.objectid;
>  
> -	while (block_group && (start < block_group->key.objectid +
> -			       block_group->key.offset)) {
> +	while (start < block_group->key.objectid + block_group->key.offset) {
>  		ret = find_first_extent_bit(unpin, start,
>  					    &extent_start, &extent_end,
>  					    EXTENT_DIRTY, NULL);
> @@ -985,6 +986,33 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>  		start = extent_end;
>  	}
>  
> +	if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
> +		goto bitmap;
> +
> +	start = block_group->key.objectid;
> +	unpin = &block_group->pinned_extents;
> +	while (1) {
> +		ret = find_first_extent_bit(unpin, start,
> +					    &extent_start, &extent_end,
> +					    EXTENT_DIRTY, &pinned_extent);
> +		if (ret) {
> +			ret = 0;
> +			break;
> +		}
> +
> +		len = extent_end - extent_start + 1;
> +
> +		entries++;
> +		ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
> +		if (ret) {
> +			free_extent_state(pinned_extent);
> +			goto out_nospc;
> +		}
> +
> +		start = extent_end + 1;
> +	}
> +	free_extent_state(pinned_extent);
> +bitmap:
>  	/* Write out the bitmaps */
>  	list_for_each_safe(pos, n, &bitmap_list) {
>  		struct btrfs_free_space *entry =
> @@ -1097,6 +1125,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>  	if (IS_ERR(inode))
>  		return 0;
>  
> +	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
> +		down_write(&root->fs_info->data_rwsem);
> +
>  	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
>  				      path, block_group->key.objectid);
>  	if (ret) {
> @@ -1111,6 +1142,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>  #endif
>  	}
>  
> +	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
> +		up_write(&root->fs_info->data_rwsem);
> +
>  	iput(inode);
>  	return ret;
>  }
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index f1a7744..8172ca6 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -592,6 +592,28 @@ free_pages_out:
>  	goto out;
>  }
>  
> +static void btrfs_unpin_data_extent(struct btrfs_root *root, u64 start,
> +				    u64 len)
> +{
> +	struct btrfs_block_group_cache *cache;
> +
> +	cache = btrfs_lookup_block_group(root->fs_info, start);
> +	BUG_ON(!cache);
> +	clear_extent_dirty(&cache->pinned_extents, start, start + len - 1,
> +			   GFP_NOFS);
> +	btrfs_put_block_group(cache);
> +}
> +
> +/* it is not used for free space cache file */
> +static void btrfs_free_reserved_data_extent(struct btrfs_root *root, u64 start,
> +					    u64 len)
> +{
> +	down_read(&root->fs_info->data_rwsem);
> +	btrfs_unpin_data_extent(root, start, len);
> +	btrfs_free_reserved_extent(root, start, len);
> +	up_read(&root->fs_info->data_rwsem);
> +}
> +
>  /*
>   * phase two of compressed writeback.  This is the ordered portion
>   * of the code, which only gets called in the order the work was
> @@ -666,7 +688,7 @@ retry:
>  		ret = btrfs_reserve_extent(root,
>  					   async_extent->compressed_size,
>  					   async_extent->compressed_size,
> -					   0, alloc_hint, &ins, 1);
> +					   0, alloc_hint, &ins, 1, true);
>  		if (ret) {
>  			int i;
>  
> @@ -767,7 +789,7 @@ retry:
>  out:
>  	return ret;
>  out_free_reserve:
> -	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
> +	btrfs_free_reserved_data_extent(root, ins.objectid, ins.offset);
>  out_free:
>  	extent_clear_unlock_delalloc(inode, async_extent->start,
>  				     async_extent->start +
> @@ -889,7 +911,7 @@ static noinline int cow_file_range(struct inode *inode,
>  		cur_alloc_size = disk_num_bytes;
>  		ret = btrfs_reserve_extent(root, cur_alloc_size,
>  					   root->sectorsize, 0, alloc_hint,
> -					   &ins, 1);
> +					   &ins, 1, true);
>  		if (ret < 0)
>  			goto out_unlock;
>  
> @@ -967,6 +989,7 @@ out:
>  	return ret;
>  
>  out_reserve:
> +	btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>  	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>  out_unlock:
>  	extent_clear_unlock_delalloc(inode, start, end, locked_page,
> @@ -2647,6 +2670,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
>  						logical_len, logical_len,
>  						compress_type, 0, 0,
>  						BTRFS_FILE_EXTENT_REG);
> +		BUG_ON(nolock);
> +		btrfs_unpin_data_extent(root, ordered_extent->start,
> +					ordered_extent->disk_len);
>  	}
>  	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
>  			   ordered_extent->file_offset, ordered_extent->len,
> @@ -2698,8 +2724,9 @@ out:
>  		if ((ret || !logical_len) &&
>  		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
>  		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
> -			btrfs_free_reserved_extent(root, ordered_extent->start,
> -						   ordered_extent->disk_len);
> +			btrfs_free_reserved_data_extent(root,
> +							ordered_extent->start,
> +							ordered_extent->disk_len);
>  	}
>  
>  
> @@ -6342,7 +6369,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>  
>  	alloc_hint = get_extent_allocation_hint(inode, start, len);
>  	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
> -				   alloc_hint, &ins, 1);
> +				   alloc_hint, &ins, 1, true);
>  	if (ret)
>  		return ERR_PTR(ret);
>  
> @@ -6356,6 +6383,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>  	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
>  					   ins.offset, ins.offset, 0);
>  	if (ret) {
> +		btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>  		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>  		free_extent_map(em);
>  		return ERR_PTR(ret);
> @@ -8507,7 +8535,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
>  		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
>  		cur_bytes = max(cur_bytes, min_size);
>  		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
> -					   *alloc_hint, &ins, 1);
> +					   *alloc_hint, &ins, 1, false);
>  		if (ret) {
>  			if (own_trans)
>  				btrfs_end_transaction(trans, root);
> 
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±ý»k~ÏâžØ^n‡r¡ö¦zË\x1aëh™¨è­Ú&£ûàz¿äz¹Þ—ú+€Ê+zf£¢·hšˆ§~†­†Ûiÿÿïêÿ‘êçz_è®\x0fæj:+v‰¨þ)ߣøm

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH V2 1/2] Btrfs: output warning instead of error when loading free space cache failed
  2014-01-15 12:00 ` [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed Miao Xie
                     ` (3 preceding siblings ...)
  2014-03-26  6:36   ` miaox
@ 2014-04-24  5:31   ` Miao Xie
  2014-04-24  5:31     ` [PATCH V2 2/2] Btrfs: fix broken free space cache after the system crashed Miao Xie
  2014-05-19  1:33   ` [RFC PATCH 5/5] " Chris Mason
  5 siblings, 1 reply; 18+ messages in thread
From: Miao Xie @ 2014-04-24  5:31 UTC (permalink / raw)
  To: linux-btrfs

If we fail to load a free space cache, we can rebuild it from the extent tree,
so it is not a serious error, we should not output a error message that
would make the users uncomfortable. This patch uses warning message instead
of it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2:
- New patch
---
 fs/btrfs/free-space-cache.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 73f3de7..a6bd654 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -831,7 +831,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 
 	if (!matched) {
 		__btrfs_remove_free_space_cache(ctl);
-		btrfs_err(fs_info, "block group %llu has wrong amount of free space",
+		btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
 			block_group->key.objectid);
 		ret = -1;
 	}
@@ -843,7 +843,7 @@ out:
 		spin_unlock(&block_group->lock);
 		ret = 0;
 
-		btrfs_err(fs_info, "failed to load free space cache for block group %llu",
+		btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
 			block_group->key.objectid);
 	}
 
-- 
1.9.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH V2 2/2] Btrfs: fix broken free space cache after the system crashed
  2014-04-24  5:31   ` [PATCH V2 1/2] Btrfs: output warning instead of error when loading free space cache failed Miao Xie
@ 2014-04-24  5:31     ` Miao Xie
  0 siblings, 0 replies; 18+ messages in thread
From: Miao Xie @ 2014-04-24  5:31 UTC (permalink / raw)
  To: linux-btrfs

When we mounted the filesystem after the crash, we got the following
message:
  BTRFS error (device xxx): block group xxxx has wrong amount of free space
  BTRFS error (device xxx): failed to load free space cache for block group xxx

It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.

There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
  space cache
- account the size of the allocated space that is used to store the file
  data, if the size is not zero, don't write out the free space cache.

The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2
- use the other method to fix the problem.
---
 fs/btrfs/ctree.h            |  13 ++++-
 fs/btrfs/extent-tree.c      | 135 ++++++++++++++++++++++++++++++++++----------
 fs/btrfs/free-space-cache.c |  20 +++++++
 fs/btrfs/inode.c            |  41 ++++++++++----
 4 files changed, 165 insertions(+), 44 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2a9d32e..6ffd77b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1235,11 +1235,19 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
+	u64 delalloc_bytes;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
 	u64 cache_generation;
 
+	/*
+	 * It is just used for the delayed data space allocation because
+	 * only the data space allocation and the relative metadata update
+	 * can be done cross the transaction.
+	 */
+	struct rw_semaphore data_rwsem;
+
 	/* for raid56, this is a full stripe, without parity */
 	unsigned long full_stripe_len;
 
@@ -3252,7 +3260,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 				   struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
 			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
-			 struct btrfs_key *ins, int is_data);
+			 struct btrfs_key *ins, int is_data, int delalloc);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -3266,7 +3274,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
 		      u64 owner, u64 offset, int for_cow);
 
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
+			       int delalloc);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
 				       u64 start, u64 len);
 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c6b6a6e..d049b7e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -103,7 +103,8 @@ static int find_next_key(struct btrfs_path *path, int level,
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 			    int dump_block_groups);
 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-				       u64 num_bytes, int reserve);
+				       u64 num_bytes, int reserve,
+				       int delalloc);
 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
 			       u64 num_bytes);
 int btrfs_pin_extent(struct btrfs_root *root,
@@ -3162,7 +3163,8 @@ again:
 
 	spin_lock(&block_group->lock);
 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
-	    !btrfs_test_opt(root, SPACE_CACHE)) {
+	    !btrfs_test_opt(root, SPACE_CACHE) ||
+	    block_group->delalloc_bytes) {
 		/*
 		 * don't bother trying to write stuff out _if_
 		 * a) we're not cached,
@@ -5412,6 +5414,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
  * @cache:	The cache we are manipulating
  * @num_bytes:	The number of bytes in question
  * @reserve:	One of the reservation enums
+ * @delalloc:   The blocks are allocated for the delalloc write
  *
  * This is called by the allocator when it reserves space, or by somebody who is
  * freeing space that was never actually used on disk.  For example if you
@@ -5430,7 +5433,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
  * succeeds.
  */
 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-				       u64 num_bytes, int reserve)
+				       u64 num_bytes, int reserve, int delalloc)
 {
 	struct btrfs_space_info *space_info = cache->space_info;
 	int ret = 0;
@@ -5449,12 +5452,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 						num_bytes, 0);
 				space_info->bytes_may_use -= num_bytes;
 			}
+
+			if (delalloc)
+				cache->delalloc_bytes += num_bytes;
 		}
 	} else {
 		if (cache->ro)
 			space_info->bytes_readonly += num_bytes;
 		cache->reserved -= num_bytes;
 		space_info->bytes_reserved -= num_bytes;
+
+		if (delalloc)
+			cache->delalloc_bytes -= num_bytes;
 	}
 	spin_unlock(&cache->lock);
 	spin_unlock(&space_info->lock);
@@ -5980,7 +5989,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
 
 		btrfs_add_free_space(cache, buf->start, buf->len);
-		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
 		trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
 		pin = 0;
 	}
@@ -6135,6 +6144,62 @@ enum btrfs_loop_type {
 	LOOP_NO_EMPTY_SIZE = 3,
 };
 
+static inline void
+btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+		       int delalloc)
+{
+	btrfs_get_block_group(cache);
+	if (delalloc)
+		down_read(&cache->data_rwsem);
+}
+
+static struct btrfs_block_group_cache *
+btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+		   struct btrfs_free_cluster *cluster,
+		   int delalloc)
+{
+	struct btrfs_block_group_cache *used_bg;
+	bool locked = false;
+again:
+	spin_lock(&cluster->refill_lock);
+	if (locked) {
+		if (used_bg == cluster->block_group)
+			return used_bg;
+
+		up_read(&used_bg->data_rwsem);
+		btrfs_put_block_group(used_bg);
+	}
+
+	used_bg = cluster->block_group;
+	if (!used_bg)
+		return NULL;
+
+	if (used_bg == block_group)
+		return used_bg;
+
+	btrfs_get_block_group(used_bg);
+
+	if (!delalloc)
+		return used_bg;
+
+	if (down_read_trylock(&used_bg->data_rwsem))
+		return used_bg;
+
+	spin_unlock(&cluster->refill_lock);
+	down_read(&used_bg->data_rwsem);
+	locked = true;
+	goto again;
+}
+
+static inline void
+btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+			 int delalloc)
+{
+	if (delalloc)
+		up_read(&cache->data_rwsem);
+	btrfs_put_block_group(cache);
+}
+
 /*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
@@ -6149,7 +6214,7 @@ enum btrfs_loop_type {
 static noinline int find_free_extent(struct btrfs_root *orig_root,
 				     u64 num_bytes, u64 empty_size,
 				     u64 hint_byte, struct btrfs_key *ins,
-				     u64 flags)
+				     u64 flags, int delalloc)
 {
 	int ret = 0;
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -6237,6 +6302,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 				up_read(&space_info->groups_sem);
 			} else {
 				index = get_block_group_index(block_group);
+				btrfs_grab_block_group(block_group, delalloc);
 				goto have_block_group;
 			}
 		} else if (block_group) {
@@ -6251,7 +6317,7 @@ search:
 		u64 offset;
 		int cached;
 
-		btrfs_get_block_group(block_group);
+		btrfs_grab_block_group(block_group, delalloc);
 		search_start = block_group->key.objectid;
 
 		/*
@@ -6299,16 +6365,16 @@ have_block_group:
 			 * the refill lock keeps out other
 			 * people trying to start a new cluster
 			 */
-			spin_lock(&last_ptr->refill_lock);
-			used_block_group = last_ptr->block_group;
-			if (used_block_group != block_group &&
-			    (!used_block_group ||
-			     used_block_group->ro ||
-			     !block_group_bits(used_block_group, flags)))
+			used_block_group = btrfs_lock_cluster(block_group,
+							      last_ptr,
+							      delalloc);
+			if (!used_block_group)
 				goto refill_cluster;
 
-			if (used_block_group != block_group)
-				btrfs_get_block_group(used_block_group);
+			if (used_block_group != block_group &&
+			    (used_block_group->ro ||
+			     !block_group_bits(used_block_group, flags)))
+				goto release_cluster;
 
 			offset = btrfs_alloc_from_cluster(used_block_group,
 						last_ptr,
@@ -6322,16 +6388,15 @@ have_block_group:
 						used_block_group,
 						search_start, num_bytes);
 				if (used_block_group != block_group) {
-					btrfs_put_block_group(block_group);
+					btrfs_release_block_group(block_group,
+								  delalloc);
 					block_group = used_block_group;
 				}
 				goto checks;
 			}
 
 			WARN_ON(last_ptr->block_group != used_block_group);
-			if (used_block_group != block_group)
-				btrfs_put_block_group(used_block_group);
-refill_cluster:
+release_cluster:
 			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
 			 * set up a new clusters, so lets just skip it
 			 * and let the allocator find whatever block
@@ -6348,8 +6413,10 @@ refill_cluster:
 			 * succeeding in the unclustered
 			 * allocation.  */
 			if (loop >= LOOP_NO_EMPTY_SIZE &&
-			    last_ptr->block_group != block_group) {
+			    used_block_group != block_group) {
 				spin_unlock(&last_ptr->refill_lock);
+				btrfs_release_block_group(used_block_group,
+							  delalloc);
 				goto unclustered_alloc;
 			}
 
@@ -6359,6 +6426,10 @@ refill_cluster:
 			 */
 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
+			if (used_block_group != block_group)
+				btrfs_release_block_group(used_block_group,
+							  delalloc);
+refill_cluster:
 			if (loop >= LOOP_NO_EMPTY_SIZE) {
 				spin_unlock(&last_ptr->refill_lock);
 				goto unclustered_alloc;
@@ -6466,7 +6537,7 @@ checks:
 		BUG_ON(offset > search_start);
 
 		ret = btrfs_update_reserved_bytes(block_group, num_bytes,
-						  alloc_type);
+						  alloc_type, delalloc);
 		if (ret == -EAGAIN) {
 			btrfs_add_free_space(block_group, offset, num_bytes);
 			goto loop;
@@ -6478,13 +6549,13 @@ checks:
 
 		trace_btrfs_reserve_extent(orig_root, block_group,
 					   search_start, num_bytes);
-		btrfs_put_block_group(block_group);
+		btrfs_release_block_group(block_group, delalloc);
 		break;
 loop:
 		failed_cluster_refill = false;
 		failed_alloc = false;
 		BUG_ON(index != get_block_group_index(block_group));
-		btrfs_put_block_group(block_group);
+		btrfs_release_block_group(block_group, delalloc);
 	}
 	up_read(&space_info->groups_sem);
 
@@ -6590,7 +6661,7 @@ again:
 int btrfs_reserve_extent(struct btrfs_root *root,
 			 u64 num_bytes, u64 min_alloc_size,
 			 u64 empty_size, u64 hint_byte,
-			 struct btrfs_key *ins, int is_data)
+			 struct btrfs_key *ins, int is_data, int delalloc)
 {
 	bool final_tried = false;
 	u64 flags;
@@ -6600,7 +6671,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
 again:
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
-			       flags);
+			       flags, delalloc);
 
 	if (ret == -ENOSPC) {
 		if (!final_tried && ins->offset) {
@@ -6625,7 +6696,8 @@ again:
 }
 
 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
-					u64 start, u64 len, int pin)
+					u64 start, u64 len,
+					int pin, int delalloc)
 {
 	struct btrfs_block_group_cache *cache;
 	int ret = 0;
@@ -6644,7 +6716,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
 		pin_down_extent(root, cache, start, len, 1);
 	else {
 		btrfs_add_free_space(cache, start, len);
-		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
 	}
 	btrfs_put_block_group(cache);
 
@@ -6654,15 +6726,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
 }
 
 int btrfs_free_reserved_extent(struct btrfs_root *root,
-					u64 start, u64 len)
+			       u64 start, u64 len, int delalloc)
 {
-	return __btrfs_free_reserved_extent(root, start, len, 0);
+	return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
 }
 
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
 				       u64 start, u64 len)
 {
-	return __btrfs_free_reserved_extent(root, start, len, 1);
+	return __btrfs_free_reserved_extent(root, start, len, 1, 0);
 }
 
 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -6859,7 +6931,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 		return -EINVAL;
 
 	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
-					  RESERVE_ALLOC_NO_ACCOUNT);
+					  RESERVE_ALLOC_NO_ACCOUNT, 0);
 	BUG_ON(ret); /* logic error */
 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
 					 0, owner, offset, ins, 1);
@@ -6992,7 +7064,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		return ERR_CAST(block_rsv);
 
 	ret = btrfs_reserve_extent(root, blocksize, blocksize,
-				   empty_size, hint, &ins, 0);
+				   empty_size, hint, &ins, 0, 0);
 	if (ret) {
 		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
 		return ERR_PTR(ret);
@@ -8381,6 +8453,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 					       start);
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
+	init_rwsem(&cache->data_rwsem);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
 	INIT_LIST_HEAD(&cache->new_bg_list);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a6bd654..2f6df5b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1091,12 +1091,29 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 		spin_unlock(&block_group->lock);
 		return 0;
 	}
+
+	if (block_group->delalloc_bytes) {
+		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+		spin_unlock(&block_group->lock);
+		return 0;
+	}
 	spin_unlock(&block_group->lock);
 
 	inode = lookup_free_space_inode(root, block_group, path);
 	if (IS_ERR(inode))
 		return 0;
 
+	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
+		down_write(&block_group->data_rwsem);
+		spin_lock(&block_group->lock);
+		if (block_group->delalloc_bytes) {
+			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+			spin_unlock(&block_group->lock);
+			goto out;
+		}
+		spin_unlock(&block_group->lock);
+	}
+
 	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
 				      path, block_group->key.objectid);
 	if (ret) {
@@ -1110,6 +1127,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 			block_group->key.objectid);
 #endif
 	}
+out:
+	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+		up_write(&block_group->data_rwsem);
 
 	iput(inode);
 	return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0ec8766..8928d4d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -685,7 +685,7 @@ retry:
 		ret = btrfs_reserve_extent(root,
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
-					   0, alloc_hint, &ins, 1);
+					   0, alloc_hint, &ins, 1, 1);
 		if (ret) {
 			int i;
 
@@ -786,7 +786,7 @@ retry:
 out:
 	return ret;
 out_free_reserve:
-	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_free:
 	extent_clear_unlock_delalloc(inode, async_extent->start,
 				     async_extent->start +
@@ -909,7 +909,7 @@ static noinline int cow_file_range(struct inode *inode,
 		cur_alloc_size = disk_num_bytes;
 		ret = btrfs_reserve_extent(root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
-					   &ins, 1);
+					   &ins, 1, 1);
 		if (ret < 0)
 			goto out_unlock;
 
@@ -987,7 +987,7 @@ out:
 	return ret;
 
 out_reserve:
-	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_unlock:
 	extent_clear_unlock_delalloc(inode, start, end, locked_page,
 				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
@@ -2572,6 +2572,21 @@ out_kfree:
 	return NULL;
 }
 
+static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+					 u64 start, u64 len)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = btrfs_lookup_block_group(root->fs_info, start);
+	ASSERT(cache);
+
+	spin_lock(&cache->lock);
+	cache->delalloc_bytes -= len;
+	spin_unlock(&cache->lock);
+
+	btrfs_put_block_group(cache);
+}
+
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -2670,6 +2685,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 						logical_len, logical_len,
 						compress_type, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
+		if (!ret)
+			btrfs_release_delalloc_bytes(root,
+						     ordered_extent->start,
+						     ordered_extent->disk_len);
 	}
 	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
 			   ordered_extent->file_offset, ordered_extent->len,
@@ -2722,7 +2741,7 @@ out:
 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
 		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
 			btrfs_free_reserved_extent(root, ordered_extent->start,
-						   ordered_extent->disk_len);
+						   ordered_extent->disk_len, 1);
 	}
 
 
@@ -6519,21 +6538,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 
 	alloc_hint = get_extent_allocation_hint(inode, start, len);
 	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
-				   alloc_hint, &ins, 1);
+				   alloc_hint, &ins, 1, 1);
 	if (ret)
 		return ERR_PTR(ret);
 
 	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
 			      ins.offset, ins.offset, ins.offset, 0);
 	if (IS_ERR(em)) {
-		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 		return em;
 	}
 
 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
 					   ins.offset, ins.offset, 0);
 	if (ret) {
-		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 		free_extent_map(em);
 		return ERR_PTR(ret);
 	}
@@ -7353,7 +7372,7 @@ free_ordered:
 		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
 			btrfs_free_reserved_extent(root, ordered->start,
-						   ordered->disk_len);
+						   ordered->disk_len, 1);
 		btrfs_put_ordered_extent(ordered);
 		btrfs_put_ordered_extent(ordered);
 	}
@@ -8734,7 +8753,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
 		cur_bytes = max(cur_bytes, min_size);
 		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
-					   *alloc_hint, &ins, 1);
+					   *alloc_hint, &ins, 1, 0);
 		if (ret) {
 			if (own_trans)
 				btrfs_end_transaction(trans, root);
@@ -8748,7 +8767,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 						  BTRFS_FILE_EXTENT_PREALLOC);
 		if (ret) {
 			btrfs_free_reserved_extent(root, ins.objectid,
-						   ins.offset);
+						   ins.offset, 0);
 			btrfs_abort_transaction(trans, root, ret);
 			if (own_trans)
 				btrfs_end_transaction(trans, root);
-- 
1.9.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed
  2014-01-15 12:00 ` [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed Miao Xie
                     ` (4 preceding siblings ...)
  2014-04-24  5:31   ` [PATCH V2 1/2] Btrfs: output warning instead of error when loading free space cache failed Miao Xie
@ 2014-05-19  1:33   ` Chris Mason
  2014-06-10  8:15     ` Alin Dobre
  5 siblings, 1 reply; 18+ messages in thread
From: Chris Mason @ 2014-05-19  1:33 UTC (permalink / raw)
  To: Miao Xie, linux-btrfs

On 01/15/2014 07:00 AM, Miao Xie wrote:
> When we mounted the filesystem after the crash, we got the following
> message:
>   BTRFS error (device xxx): block group 4315938816 has wrong amount of free space
>   BTRFS error (device xxx): failed to load free space cache for block group 4315938816
> 
> It is because we didn't update the metadata of the allocated space until
> the file data was written into the disk. During this time, there was no
> information about the allocated spaces in either the extent tree nor the
> free space cache. when we wrote out the free space cache at this time, those
> spaces were lost.
> 
> In ordered to fix this problem, I use a state tree for every block group
> to record those allocated spaces. We record the information when they are
> allocated, and clean up the information after the metadata update. Besides
> that, we also introduce a read-write semaphore to avoid the race between
> the allocation and the free space cache write out.
> 
> Only data block groups had this problem, so the above change is just
> for data space allocation.

I had this one in the integration branch, but lockdep reported troubles.  It
looks like lockdep is correct.  find_free_extent is nesting the cache rwsem
inside the groups rwsem, but btrfs_write_out_cache is holding the new cache
rwsem when it calls find_free_extent.

-chris

[ 2857.610731] ======================================================
[ 2857.623158] [ INFO: possible circular locking dependency detected ]
[ 2857.635771] 3.15.0-rc5-mason+ #43 Not tainted
[ 2857.644553] -------------------------------------------------------
[ 2857.657139] btrfs-transacti/19476 is trying to acquire lock:
[ 2857.668518]  (&found->groups_sem){++++..}, at: [<ffffffffa059dbc1>] find_free_extent+0x931/0xe20 [btrfs]
[ 2857.687771] 
[ 2857.687771] but task is already holding lock:
[ 2857.699566]  (&cache->data_rwsem){++++..}, at: [<ffffffffa05f78bf>] btrfs_write_out_cache+0x9f/0x170 [btrfs]
[ 2857.719480] 
[ 2857.719480] which lock already depends on the new lock.
[ 2857.719480] 
[ 2857.736021] 
[ 2857.736021] the existing dependency chain (in reverse order) is:
[ 2857.751120] 
-> #1 (&cache->data_rwsem){++++..}:
[ 2857.760823]        [<ffffffff810a14fe>] lock_acquire+0x8e/0x110
[ 2857.772772]        [<ffffffff81649cf7>] down_read+0x47/0x60
[ 2857.784028]        [<ffffffffa059db2c>] find_free_extent+0x89c/0xe20 [btrfs]
[ 2857.798253]        [<ffffffffa059e11b>] btrfs_reserve_extent+0x6b/0x140 [btrfs]
[ 2857.813041]        [<ffffffffa05b73ec>] cow_file_range+0x13c/0x460 [btrfs]
[ 2857.826892]        [<ffffffffa05bc097>] run_delalloc_range+0x347/0x380 [btrfs]
[ 2857.841510]        [<ffffffffa05d3f3d>] __extent_writepage+0x70d/0x870 [btrfs]
[ 2857.856129]        [<ffffffffa05d456a>] extent_write_cache_pages.clone.6+0x30a/0x410 [btrfs]
[ 2857.873185]        [<ffffffffa05d46c2>] extent_writepages+0x52/0x70 [btrfs]
[ 2857.887224]        [<ffffffffa05b3d57>] btrfs_writepages+0x27/0x30 [btrfs]
[ 2857.901078]        [<ffffffff81142543>] do_writepages+0x23/0x40
[ 2857.913034]        [<ffffffff81135b99>] __filemap_fdatawrite_range+0x59/0x60
[ 2857.927240]        [<ffffffff81135dec>] filemap_flush+0x1c/0x20
[ 2857.939215]        [<ffffffffa05b2502>] btrfs_run_delalloc_work+0x72/0xa0 [btrfs]
[ 2857.954367]        [<ffffffffa05e05fe>] normal_work_helper+0x6e/0x2d0 [btrfs]
[ 2857.968749]        [<ffffffff8106b9e2>] process_one_work+0x1d2/0x550
[ 2857.981561]        [<ffffffff8106cd8f>] worker_thread+0x11f/0x3a0
[ 2857.993856]        [<ffffffff8107317e>] kthread+0xde/0x100
[ 2858.004936]        [<ffffffff8165436c>] ret_from_fork+0x7c/0xb0
[ 2858.016887] 
-> #0 (&found->groups_sem){++++..}:
[ 2858.026590]        [<ffffffff810a12de>] __lock_acquire+0x161e/0x17b0
[ 2858.039407]        [<ffffffff810a14fe>] lock_acquire+0x8e/0x110
[ 2858.051370]        [<ffffffff81649cf7>] down_read+0x47/0x60
[ 2858.062629]        [<ffffffffa059dbc1>] find_free_extent+0x931/0xe20 [btrfs]
[ 2858.076841]        [<ffffffffa059e11b>] btrfs_reserve_extent+0x6b/0x140 [btrfs]
[ 2858.091629]        [<ffffffffa059e307>] btrfs_alloc_free_block+0x117/0x420 [btrfs]
[ 2858.106940]        [<ffffffffa0589a5b>] __btrfs_cow_block+0x11b/0x530 [btrfs]
[ 2858.121331]        [<ffffffffa058a4a0>] btrfs_cow_block+0x130/0x1e0 [btrfs]
[ 2858.135375]        [<ffffffffa058c999>] btrfs_search_slot+0x219/0x9c0 [btrfs]
[ 2858.149760]        [<ffffffffa05f7595>] __btrfs_write_out_cache+0x755/0x970 [btrfs]
[ 2858.165245]        [<ffffffffa05f7958>] btrfs_write_out_cache+0x138/0x170 [btrfs]
[ 2858.180411]        [<ffffffffa059ccb0>] btrfs_write_dirty_block_groups+0x480/0x600 [btrfs]
[ 2858.197107]        [<ffffffffa05ae7af>] commit_cowonly_roots+0x19f/0x250 [btrfs]
[ 2858.212084]        [<ffffffffa05afbc0>] btrfs_commit_transaction+0x450/0xa60 [btrfs]
[ 2858.227738]        [<ffffffffa05aa8a6>] transaction_kthread+0x216/0x290 [btrfs]
[ 2858.242533]        [<ffffffff8107317e>] kthread+0xde/0x100
[ 2858.253617]        [<ffffffff8165436c>] ret_from_fork+0x7c/0xb0
[ 2858.265569] 
[ 2858.265569] other info that might help us debug this:
[ 2858.265569] 
[ 2858.281780]  Possible unsafe locking scenario:
[ 2858.281780] 
[ 2858.293750]        CPU0                    CPU1
[ 2858.302869]        ----                    ----
[ 2858.312000]   lock(&cache->data_rwsem);
[ 2858.319828]                                lock(&found->groups_sem);
[ 2858.332661]                                lock(&cache->data_rwsem);
[ 2858.345508]   lock(&found->groups_sem);
[ 2858.353300] 
[ 2858.353300]  *** DEADLOCK ***
[ 2858.353300] 
[ 2858.365337] 4 locks held by btrfs-transacti/19476:
[ 2858.374993]  #0:  (&fs_info->transaction_kthread_mutex){+.+...}, at: [<ffffffffa05aa740>] transaction_kthread+0xb0/0x290 [btrfs]
[ 2858.398451]  #1:  (&fs_info->reloc_mutex){+.+...}, at: [<ffffffffa05afaf0>] btrfs_commit_transaction+0x380/0xa60 [btrfs]
[ 2858.420535]  #2:  (&fs_info->tree_log_mutex){+.+...}, at: [<ffffffffa05afb66>] btrfs_commit_transaction+0x3f6/0xa60 [btrfs]
[ 2858.443135]  #3:  (&cache->data_rwsem){++++..}, at: [<ffffffffa05f78bf>] btrfs_write_out_cache+0x9f/0x170 [btrfs]
[ 2858.463953] 
[ 2858.463953] stack backtrace:
[ 2858.472807] CPU: 25 PID: 19476 Comm: btrfs-transacti Not tainted 3.15.0-rc5-mason+ #43
[ 2858.488772] Hardware name: ZTSYSTEMS Echo Ridge T4  /A9DRPF-10D, BIOS 1.07 05/10/2012
[ 2858.504564]  ffffffff820f1b10 ffff8807ff5f94e8 ffffffff8164585c 0000000000000001
[ 2858.519677]  ffffffff820e6170 ffff8807ff5f9538 ffffffff8109e322 0000000000000004
[ 2858.534761]  ffff8807ff5f9598 ffff8807ff5f9538 ffff8808444fd118 ffff8808444fc890
[ 2858.549850] Call Trace:
[ 2858.554830]  [<ffffffff8164585c>] dump_stack+0x51/0x6d
[ 2858.565168]  [<ffffffff8109e322>] print_circular_bug+0x212/0x310
[ 2858.577247]  [<ffffffff810a12de>] __lock_acquire+0x161e/0x17b0
[ 2858.588979]  [<ffffffff810a14fe>] lock_acquire+0x8e/0x110
[ 2858.599845]  [<ffffffffa059dbc1>] ? find_free_extent+0x931/0xe20 [btrfs]
[ 2858.613312]  [<ffffffff81649cf7>] down_read+0x47/0x60
[ 2858.623501]  [<ffffffffa059dbc1>] ? find_free_extent+0x931/0xe20 [btrfs]
[ 2858.636978]  [<ffffffffa059dbc1>] find_free_extent+0x931/0xe20 [btrfs]
[ 2858.650092]  [<ffffffff8164b60b>] ? _raw_spin_unlock+0x2b/0x40
[ 2858.661842]  [<ffffffffa059e11b>] btrfs_reserve_extent+0x6b/0x140 [btrfs]
[ 2858.675483]  [<ffffffffa059e307>] btrfs_alloc_free_block+0x117/0x420 [btrfs]
[ 2858.689644]  [<ffffffff810a01d0>] ? __lock_acquire+0x510/0x17b0
[ 2858.701564]  [<ffffffffa05cc600>] ? find_extent_buffer+0x10/0xf0 [btrfs]
[ 2858.715034]  [<ffffffffa0589a5b>] __btrfs_cow_block+0x11b/0x530 [btrfs]
[ 2858.728333]  [<ffffffffa058a4a0>] btrfs_cow_block+0x130/0x1e0 [btrfs]
[ 2858.741284]  [<ffffffffa058c999>] btrfs_search_slot+0x219/0x9c0 [btrfs]
[ 2858.754597]  [<ffffffffa05f7595>] __btrfs_write_out_cache+0x755/0x970 [btrfs]
[ 2858.768946]  [<ffffffffa05f78c7>] ? btrfs_write_out_cache+0xa7/0x170 [btrfs]
[ 2858.783108]  [<ffffffffa05f7900>] ? btrfs_write_out_cache+0xe0/0x170 [btrfs]
[ 2858.797289]  [<ffffffffa05f7958>] btrfs_write_out_cache+0x138/0x170 [btrfs]
[ 2858.811278]  [<ffffffff8164b60b>] ? _raw_spin_unlock+0x2b/0x40
[ 2858.823017]  [<ffffffffa059ccb0>] btrfs_write_dirty_block_groups+0x480/0x600 [btrfs]
[ 2858.838646]  [<ffffffffa05ae7af>] commit_cowonly_roots+0x19f/0x250 [btrfs]
[ 2858.852461]  [<ffffffffa05afbc0>] btrfs_commit_transaction+0x450/0xa60 [btrfs]
[ 2858.867043]  [<ffffffffa05aa826>] ? transaction_kthread+0x196/0x290 [btrfs]
[ 2858.881032]  [<ffffffffa05aa8a6>] transaction_kthread+0x216/0x290 [btrfs]
[ 2858.894679]  [<ffffffffa05aa690>] ? close_ctree+0x2d0/0x2d0 [btrfs]
[ 2858.907276]  [<ffffffff8107317e>] kthread+0xde/0x100
[ 2858.917280]  [<ffffffff810730a0>] ? __init_kthread_worker+0x70/0x70
[ 2858.929873]  [<ffffffff8165436c>] ret_from_fork+0x7c/0xb0
[ 2858.940745]  [<ffffffff810730a0>] ? __init_kthread_worker+0x70/0x70
[ 2893.109104] EXT4-fs (loop0): mounted filesystem with ordered data mode. Opts: (null)

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed
  2014-05-19  1:33   ` [RFC PATCH 5/5] " Chris Mason
@ 2014-06-10  8:15     ` Alin Dobre
  0 siblings, 0 replies; 18+ messages in thread
From: Alin Dobre @ 2014-06-10  8:15 UTC (permalink / raw)
  To: Chris Mason, Miao Xie, linux-btrfs

On 19/05/14 02:33, Chris Mason wrote:
> I had this one in the integration branch, but lockdep reported troubles.  It
> looks like lockdep is correct.  find_free_extent is nesting the cache rwsem
> inside the groups rwsem, but btrfs_write_out_cache is holding the new cache
> rwsem when it calls find_free_extent.

Is there a new patch for this problem? We have another issue that makes
us reboot the system quite often - I'm going to report that in a
separate thread - and after each machine restart, btrfs reports broken
free space cache. I know that's probably not that harmful, but it's
disturbing and it's not that easy to remount the filesystem to fix the
free space cache.

Cheers,
Alin.

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2014-06-10  9:04 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-15 12:00 [PATCH 1/5] Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss Miao Xie
2014-01-15 12:00 ` [PATCH 2/5] Btrfs: cleanup the redundant code for the block group allocation and init Miao Xie
2014-01-15 12:00 ` [PATCH 3/5] Btrfs: cleanup the code of used_block_group in find_free_extent() Miao Xie
2014-01-15 12:00 ` [PATCH 4/5] Btrfs: fix wrong block group in trace during the free space allocation Miao Xie
2014-01-15 12:00 ` [RFC PATCH 5/5] Btrfs: fix broken free space cache after the system crashed Miao Xie
2014-01-16  5:54   ` Liu Bo
2014-03-01 18:05   ` Alex Lyakas
2014-03-04  6:04     ` Miao Xie
2014-03-08 16:48       ` Alex Lyakas
2014-03-04 15:19   ` Josef Bacik
2014-03-05  7:02     ` Miao Xie
2014-03-05 15:02       ` Josef Bacik
2014-03-26  6:36   ` miaox
2014-04-24  5:31   ` [PATCH V2 1/2] Btrfs: output warning instead of error when loading free space cache failed Miao Xie
2014-04-24  5:31     ` [PATCH V2 2/2] Btrfs: fix broken free space cache after the system crashed Miao Xie
2014-05-19  1:33   ` [RFC PATCH 5/5] " Chris Mason
2014-06-10  8:15     ` Alin Dobre
2014-01-15 15:56 ` [PATCH 1/5] Btrfs: change the members' order of btrfs_space_info structure to reduce the cache miss David Sterba

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).