* [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
@ 2015-04-13 19:52 Chris Mason
  2015-04-13 19:52 ` [PATCH 1/4] btrfs: move struct io_ctl into ctree.h and rename it Chris Mason
                   ` (4 more replies)
  0 siblings, 5 replies; 26+ messages in thread
From: Chris Mason @ 2015-04-13 19:52 UTC (permalink / raw)
  To: linux-btrfs

Large filesystems with lots of block groups can suffer long stalls during
commit while we create and send down all of the block group caches.  The
more block groups dirtied in a transaction, the longer these stalls can be.
Some workloads average 10 seconds per commit, but see peak times much higher.

The first problem is that we write and wait for each block group cache
individually, so we aren't keeping the disk pipeline full.  This patch
set uses the io_ctl struct to start cache IO, and then waits on it in bulk.
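
Schematically, the writeout loop goes from write-and-wait per block group to
a start-everything-then-wait pass.  A simplified sketch using the helpers
added in patch 2 below (error handling, refcount details and the redirty
case are left out):

	LIST_HEAD(io);
	struct btrfs_block_group_cache *cache;
	int ret;

	/* pass 1: kick off cache IO for every dirty block group, no waiting */
	while (!list_empty(&cur_trans->dirty_bgs)) {
		cache = list_first_entry(&cur_trans->dirty_bgs,
					 struct btrfs_block_group_cache,
					 dirty_list);
		list_del_init(&cache->dirty_list);
		ret = btrfs_write_out_cache(root, trans, cache, path);
		if (ret == 0 && cache->io_ctl.inode)
			list_add_tail(&cache->io_list, &io);
	}

	/* pass 2: wait on all of it at once, keeping the disk pipeline full */
	while (!list_empty(&io)) {
		cache = list_first_entry(&io,
					 struct btrfs_block_group_cache,
					 io_list);
		list_del_init(&cache->io_list);
		btrfs_wait_cache_io(root, trans, cache, &cache->io_ctl,
				    path, cache->key.objectid);
		btrfs_put_block_group(cache);
	}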

The second problem is that we only allow cache writeout while new modifications
are blocked during the final stage of commit.  This adds some locking
so that cache writeout can happen very early in the commit, and any block
groups that are redirtied will be sent down during the final stages.

With both together, average commit stalls are under a second and our overall
performance is much smoother.



* [PATCH 1/4] btrfs: move struct io_ctl into ctree.h and rename it
  2015-04-13 19:52 [PATCH 0/4] btrfs: reduce block group cache writeout times during commit Chris Mason
@ 2015-04-13 19:52 ` Chris Mason
  2015-04-13 19:52 ` [PATCH 2/4] Btrfs: two stage dirty block group writeout Chris Mason
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Chris Mason @ 2015-04-13 19:52 UTC (permalink / raw)
  To: linux-btrfs

We'll need to put the io_ctl into the block_group cache struct, so
name it struct btrfs_io_ctl and move it into ctree.h

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h            | 11 +++++++++
 fs/btrfs/free-space-cache.c | 55 ++++++++++++++++++---------------------------
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6bf16d5..e305ccd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1256,6 +1256,17 @@ struct btrfs_caching_control {
 	atomic_t count;
 };
 
+struct btrfs_io_ctl {
+	void *cur, *orig;
+	struct page *page;
+	struct page **pages;
+	struct btrfs_root *root;
+	unsigned long size;
+	int index;
+	int num_pages;
+	unsigned check_crcs:1;
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index c514820..47c2adb 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -271,18 +271,7 @@ static int readahead_cache(struct inode *inode)
 	return 0;
 }
 
-struct io_ctl {
-	void *cur, *orig;
-	struct page *page;
-	struct page **pages;
-	struct btrfs_root *root;
-	unsigned long size;
-	int index;
-	int num_pages;
-	unsigned check_crcs:1;
-};
-
-static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
+static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 		       struct btrfs_root *root, int write)
 {
 	int num_pages;
@@ -298,7 +287,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
 	    (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
 		return -ENOSPC;
 
-	memset(io_ctl, 0, sizeof(struct io_ctl));
+	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
 
 	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
 	if (!io_ctl->pages)
@@ -311,12 +300,12 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
 	return 0;
 }
 
-static void io_ctl_free(struct io_ctl *io_ctl)
+static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 {
 	kfree(io_ctl->pages);
 }
 
-static void io_ctl_unmap_page(struct io_ctl *io_ctl)
+static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
 {
 	if (io_ctl->cur) {
 		kunmap(io_ctl->page);
@@ -325,7 +314,7 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
 	}
 }
 
-static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
+static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
 {
 	ASSERT(io_ctl->index < io_ctl->num_pages);
 	io_ctl->page = io_ctl->pages[io_ctl->index++];
@@ -336,7 +325,7 @@ static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
 		memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
 }
 
-static void io_ctl_drop_pages(struct io_ctl *io_ctl)
+static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
 {
 	int i;
 
@@ -351,7 +340,7 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
 	}
 }
 
-static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
+static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 				int uptodate)
 {
 	struct page *page;
@@ -385,7 +374,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
 	return 0;
 }
 
-static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
+static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 {
 	__le64 *val;
 
@@ -408,7 +397,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
 	io_ctl->cur += sizeof(u64);
 }
 
-static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
+static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 {
 	__le64 *gen;
 
@@ -437,7 +426,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
 	return 0;
 }
 
-static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
+static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
 {
 	u32 *tmp;
 	u32 crc = ~(u32)0;
@@ -461,7 +450,7 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
 	kunmap(io_ctl->pages[0]);
 }
 
-static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
+static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
 {
 	u32 *tmp, val;
 	u32 crc = ~(u32)0;
@@ -494,7 +483,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
 	return 0;
 }
 
-static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
+static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
 			    void *bitmap)
 {
 	struct btrfs_free_space_entry *entry;
@@ -524,7 +513,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
 	return 0;
 }
 
-static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
+static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
 {
 	if (!io_ctl->cur)
 		return -ENOSPC;
@@ -547,7 +536,7 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
 	return 0;
 }
 
-static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
+static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
 {
 	/*
 	 * If we're not on the boundary we know we've modified the page and we
@@ -564,7 +553,7 @@ static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
 	}
 }
 
-static int io_ctl_read_entry(struct io_ctl *io_ctl,
+static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
 			    struct btrfs_free_space *entry, u8 *type)
 {
 	struct btrfs_free_space_entry *e;
@@ -591,7 +580,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
 	return 0;
 }
 
-static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
+static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
 			      struct btrfs_free_space *entry)
 {
 	int ret;
@@ -650,7 +639,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 {
 	struct btrfs_free_space_header *header;
 	struct extent_buffer *leaf;
-	struct io_ctl io_ctl;
+	struct btrfs_io_ctl io_ctl;
 	struct btrfs_key key;
 	struct btrfs_free_space *e, *n;
 	LIST_HEAD(bitmaps);
@@ -879,7 +868,7 @@ out:
 }
 
 static noinline_for_stack
-int write_cache_extent_entries(struct io_ctl *io_ctl,
+int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 			      struct btrfs_free_space_ctl *ctl,
 			      struct btrfs_block_group_cache *block_group,
 			      int *entries, int *bitmaps,
@@ -1002,7 +991,7 @@ fail:
 static noinline_for_stack int
 write_pinned_extent_entries(struct btrfs_root *root,
 			    struct btrfs_block_group_cache *block_group,
-			    struct io_ctl *io_ctl,
+			    struct btrfs_io_ctl *io_ctl,
 			    int *entries)
 {
 	u64 start, extent_start, extent_end, len;
@@ -1052,7 +1041,7 @@ write_pinned_extent_entries(struct btrfs_root *root,
 }
 
 static noinline_for_stack int
-write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
+write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
 {
 	struct list_head *pos, *n;
 	int ret;
@@ -1086,7 +1075,7 @@ static int flush_dirty_cache(struct inode *inode)
 
 static void noinline_for_stack
 cleanup_write_cache_enospc(struct inode *inode,
-			   struct io_ctl *io_ctl,
+			   struct btrfs_io_ctl *io_ctl,
 			   struct extent_state **cached_state,
 			   struct list_head *bitmap_list)
 {
@@ -1123,7 +1112,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				   struct btrfs_path *path, u64 offset)
 {
 	struct extent_state *cached_state = NULL;
-	struct io_ctl io_ctl;
+	struct btrfs_io_ctl io_ctl;
 	LIST_HEAD(bitmap_list);
 	int entries = 0;
 	int bitmaps = 0;
-- 
1.8.1



* [PATCH 2/4] Btrfs: two stage dirty block group writeout
  2015-04-13 19:52 [PATCH 0/4] btrfs: reduce block group cache writeout times during commit Chris Mason
  2015-04-13 19:52 ` [PATCH 1/4] btrfs: move struct io_ctl into ctree.h and rename it Chris Mason
@ 2015-04-13 19:52 ` Chris Mason
  2015-04-13 19:52 ` [PATCH 3/4] Btrfs: don't use highmem for free space cache pages Chris Mason
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Chris Mason @ 2015-04-13 19:52 UTC (permalink / raw)
  To: linux-btrfs

Block group cache writeout is currently waiting on the pages for each
block group cache before moving on to writing the next one.  This commit
switches things around to send down all the caches and then wait on them
in batches.

The end result is much faster, since we're keeping the disk pipeline
full.
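
Put differently, __btrfs_write_out_cache now returns with the cache pages
under writeback instead of flushing and waiting itself, and the caller owns
the second half of the work.  A rough sketch of that contract, condensed
from the extent-tree.c hunk below (reusing its locals):

	cache->io_ctl.inode = NULL;
	ret = btrfs_write_out_cache(root, trans, cache, path);
	if (ret == 0 && cache->io_ctl.inode) {
		/*
		 * IO is in flight: keep our block group reference and
		 * remember the group so we can wait on it later
		 */
		should_put = 0;
		list_add_tail(&cache->io_list, &io);
	} else {
		/*
		 * the write failed or was skipped; the generation will be
		 * bad and the cache is simply rebuilt the next time it is
		 * loaded
		 */
		ret = 0;
	}

	/* ... once every dirty group has been sent down ... */
	btrfs_wait_cache_io(root, trans, cache, &cache->io_ctl,
			    path, cache->key.objectid);
	btrfs_put_block_group(cache);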

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h            |   6 ++
 fs/btrfs/extent-tree.c      |  57 +++++++++++++++++--
 fs/btrfs/free-space-cache.c | 131 +++++++++++++++++++++++++++++++++++---------
 fs/btrfs/free-space-cache.h |   8 ++-
 4 files changed, 170 insertions(+), 32 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e305ccd..1df0d9d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1261,9 +1261,12 @@ struct btrfs_io_ctl {
 	struct page *page;
 	struct page **pages;
 	struct btrfs_root *root;
+	struct inode *inode;
 	unsigned long size;
 	int index;
 	int num_pages;
+	int entries;
+	int bitmaps;
 	unsigned check_crcs:1;
 };
 
@@ -1332,6 +1335,9 @@ struct btrfs_block_group_cache {
 
 	/* For dirty block groups */
 	struct list_head dirty_list;
+	struct list_head io_list;
+
+	struct btrfs_io_ctl io_ctl;
 };
 
 /* delayed seq elem */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d4b3d680..40c9513 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3388,7 +3388,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	int ret = 0;
+	int should_put;
 	struct btrfs_path *path;
+	LIST_HEAD(io);
+	int num_started = 0;
+	int num_waited = 0;
 
 	if (list_empty(&cur_trans->dirty_bgs))
 		return 0;
@@ -3407,16 +3411,60 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		cache = list_first_entry(&cur_trans->dirty_bgs,
 					 struct btrfs_block_group_cache,
 					 dirty_list);
+
+		/*
+		 * this can happen if cache_save_setup re-dirties a block
+		 * group that is already under IO.  Just wait for it to
+		 * finish and then do it all again
+		 */
+		if (!list_empty(&cache->io_list)) {
+			list_del_init(&cache->io_list);
+			btrfs_wait_cache_io(root, trans, cache,
+					    &cache->io_ctl, path,
+					    cache->key.objectid);
+			btrfs_put_block_group(cache);
+			num_waited++;
+		}
+
 		list_del_init(&cache->dirty_list);
+		should_put = 1;
+
 		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
 			cache_save_setup(cache, trans, path);
+
 		if (!ret)
-			ret = btrfs_run_delayed_refs(trans, root,
-						     (unsigned long) -1);
-		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
-			btrfs_write_out_cache(root, trans, cache, path);
+			ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
+
+		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+			cache->io_ctl.inode = NULL;
+			ret = btrfs_write_out_cache(root, trans, cache, path);
+			if (ret == 0 && cache->io_ctl.inode) {
+				num_started++;
+				should_put = 0;
+				list_add_tail(&cache->io_list, &io);
+			} else {
+				/*
+				 * if we failed to write the cache, the
+				 * generation will be bad and life goes on
+				 */
+				ret = 0;
+			}
+		}
 		if (!ret)
 			ret = write_one_cache_group(trans, root, path, cache);
+
+		/* if it's not on the io list, we need to put the block group */
+		if (should_put)
+			btrfs_put_block_group(cache);
+	}
+
+	while (!list_empty(&io)) {
+		cache = list_first_entry(&io, struct btrfs_block_group_cache,
+					 io_list);
+		list_del_init(&cache->io_list);
+		num_waited++;
+		btrfs_wait_cache_io(root, trans, cache,
+				    &cache->io_ctl, path, cache->key.objectid);
 		btrfs_put_block_group(cache);
 	}
 
@@ -9013,6 +9061,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 	INIT_LIST_HEAD(&cache->bg_list);
 	INIT_LIST_HEAD(&cache->ro_list);
 	INIT_LIST_HEAD(&cache->dirty_list);
+	INIT_LIST_HEAD(&cache->io_list);
 	btrfs_init_free_space_ctl(cache);
 	atomic_set(&cache->trimming, 0);
 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 47c2adb..6886ae0 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -170,13 +170,13 @@ static int __create_free_space_inode(struct btrfs_root *root,
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 	key.offset = offset;
 	key.type = 0;
-
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      sizeof(struct btrfs_free_space_header));
 	if (ret < 0) {
 		btrfs_release_path(path);
 		return ret;
 	}
+
 	leaf = path->nodes[0];
 	header = btrfs_item_ptr(leaf, path->slots[0],
 				struct btrfs_free_space_header);
@@ -296,6 +296,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 	io_ctl->num_pages = num_pages;
 	io_ctl->root = root;
 	io_ctl->check_crcs = check_crcs;
+	io_ctl->inode = inode;
 
 	return 0;
 }
@@ -303,6 +304,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 {
 	kfree(io_ctl->pages);
+	io_ctl->pages = NULL;
 }
 
 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
@@ -1092,6 +1094,61 @@ cleanup_write_cache_enospc(struct inode *inode,
 			     GFP_NOFS);
 }
 
+int btrfs_wait_cache_io(struct btrfs_root *root,
+			struct btrfs_trans_handle *trans,
+			struct btrfs_block_group_cache *block_group,
+			struct btrfs_io_ctl *io_ctl,
+			struct btrfs_path *path, u64 offset)
+{
+	int ret;
+	struct inode *inode = io_ctl->inode;
+
+	root = root->fs_info->tree_root;
+
+	/* Flush the dirty pages in the cache file. */
+	ret = flush_dirty_cache(inode);
+	if (ret)
+		goto out;
+
+	/* Update the cache item to tell everyone this cache file is valid. */
+	ret = update_cache_item(trans, root, inode, path, offset,
+				io_ctl->entries, io_ctl->bitmaps);
+out:
+	io_ctl_free(io_ctl);
+	if (ret) {
+		invalidate_inode_pages2(inode->i_mapping);
+		BTRFS_I(inode)->generation = 0;
+		if (block_group) {
+#ifdef DEBUG
+			btrfs_err(root->fs_info,
+				"failed to write free space cache for block group %llu",
+				block_group->key.objectid);
+#endif
+		}
+	}
+	btrfs_update_inode(trans, root, inode);
+
+	if (block_group) {
+		spin_lock(&block_group->lock);
+
+		/*
+		 * only mark this as written if we didn't get put back on
+		 * the dirty list while waiting for IO.
+		 */
+		if (!ret && list_empty(&block_group->dirty_list))
+			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+		else if (ret)
+			block_group->disk_cache_state = BTRFS_DC_ERROR;
+
+		spin_unlock(&block_group->lock);
+		io_ctl->inode = NULL;
+		iput(inode);
+	}
+
+	return ret;
+
+}
+
 /**
  * __btrfs_write_out_cache - write out cached info to an inode
  * @root - the root the inode belongs to
@@ -1108,20 +1165,22 @@ cleanup_write_cache_enospc(struct inode *inode,
 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				   struct btrfs_free_space_ctl *ctl,
 				   struct btrfs_block_group_cache *block_group,
+				   struct btrfs_io_ctl *io_ctl,
 				   struct btrfs_trans_handle *trans,
 				   struct btrfs_path *path, u64 offset)
 {
 	struct extent_state *cached_state = NULL;
-	struct btrfs_io_ctl io_ctl;
 	LIST_HEAD(bitmap_list);
 	int entries = 0;
 	int bitmaps = 0;
 	int ret;
+	int must_iput = 0;
 
 	if (!i_size_read(inode))
 		return -1;
 
-	ret = io_ctl_init(&io_ctl, inode, root, 1);
+	WARN_ON(io_ctl->pages);
+	ret = io_ctl_init(io_ctl, inode, root, 1);
 	if (ret)
 		return -1;
 
@@ -1134,22 +1193,23 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 			up_write(&block_group->data_rwsem);
 			BTRFS_I(inode)->generation = 0;
 			ret = 0;
+			must_iput = 1;
 			goto out;
 		}
 		spin_unlock(&block_group->lock);
 	}
 
 	/* Lock all pages first so we can lock the extent safely. */
-	io_ctl_prepare_pages(&io_ctl, inode, 0);
+	io_ctl_prepare_pages(io_ctl, inode, 0);
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
 			 0, &cached_state);
 
-	io_ctl_set_generation(&io_ctl, trans->transid);
+	io_ctl_set_generation(io_ctl, trans->transid);
 
 	mutex_lock(&ctl->cache_writeout_mutex);
 	/* Write out the extent entries in the free space cache */
-	ret = write_cache_extent_entries(&io_ctl, ctl,
+	ret = write_cache_extent_entries(io_ctl, ctl,
 					 block_group, &entries, &bitmaps,
 					 &bitmap_list);
 	if (ret) {
@@ -1162,7 +1222,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * they will be added into free space cache after the transaction is
 	 * committed, we shouldn't lose them.
 	 */
-	ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
+	ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
 	if (ret) {
 		mutex_unlock(&ctl->cache_writeout_mutex);
 		goto out_nospc;
@@ -1173,16 +1233,16 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * locked while doing it because a concurrent trim can be manipulating
 	 * or freeing the bitmap.
 	 */
-	ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+	ret = write_bitmap_entries(io_ctl, &bitmap_list);
 	mutex_unlock(&ctl->cache_writeout_mutex);
 	if (ret)
 		goto out_nospc;
 
 	/* Zero out the rest of the pages just to make sure */
-	io_ctl_zero_remaining_pages(&io_ctl);
+	io_ctl_zero_remaining_pages(io_ctl);
 
 	/* Everything is written out, now we dirty the pages in the file. */
-	ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+	ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
 				0, i_size_read(inode), &cached_state);
 	if (ret)
 		goto out_nospc;
@@ -1193,30 +1253,39 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * Release the pages and unlock the extent, we will flush
 	 * them out later
 	 */
-	io_ctl_drop_pages(&io_ctl);
+	io_ctl_drop_pages(io_ctl);
 
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
 			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
 
-	/* Flush the dirty pages in the cache file. */
-	ret = flush_dirty_cache(inode);
+	/*
+	 * at this point the pages are under IO and we're happy.
+	 * The caller is responsible for waiting on them and updating
+	 * the cache and the inode
+	 */
+	io_ctl->entries = entries;
+	io_ctl->bitmaps = bitmaps;
+
+	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
 	if (ret)
 		goto out;
 
-	/* Update the cache item to tell everyone this cache file is valid. */
-	ret = update_cache_item(trans, root, inode, path, offset,
-				entries, bitmaps);
+	return 0;
+
 out:
-	io_ctl_free(&io_ctl);
+	io_ctl->inode = NULL;
+	io_ctl_free(io_ctl);
 	if (ret) {
 		invalidate_inode_pages2(inode->i_mapping);
 		BTRFS_I(inode)->generation = 0;
 	}
 	btrfs_update_inode(trans, root, inode);
+	if (must_iput)
+		iput(inode);
 	return ret;
 
 out_nospc:
-	cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+	cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
 
 	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
 		up_write(&block_group->data_rwsem);
@@ -1232,7 +1301,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct inode *inode;
 	int ret = 0;
-	enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
 
 	root = root->fs_info->tree_root;
 
@@ -1253,22 +1321,28 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 	if (IS_ERR(inode))
 		return 0;
 
-	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
+	ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
+				      &block_group->io_ctl, trans,
 				      path, block_group->key.objectid);
 	if (ret) {
-		dcs = BTRFS_DC_ERROR;
-		ret = 0;
 #ifdef DEBUG
 		btrfs_err(root->fs_info,
 			"failed to write free space cache for block group %llu",
 			block_group->key.objectid);
 #endif
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_ERROR;
+		spin_unlock(&block_group->lock);
+
+		block_group->io_ctl.inode = NULL;
+		iput(inode);
 	}
 
-	spin_lock(&block_group->lock);
-	block_group->disk_cache_state = dcs;
-	spin_unlock(&block_group->lock);
-	iput(inode);
+	/*
+	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
+	 * to wait for IO and put the inode
+	 */
+
 	return ret;
 }
 
@@ -3331,11 +3405,14 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 {
 	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
 	int ret;
+	struct btrfs_io_ctl io_ctl;
 
 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
 		return 0;
 
-	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
+	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
+				      trans, path, 0) ||
+		btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
 	if (ret) {
 		btrfs_delalloc_release_metadata(inode, inode->i_size);
 #ifdef DEBUG
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 88b2238..c433986 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -48,6 +48,8 @@ struct btrfs_free_space_op {
 			   struct btrfs_free_space *info);
 };
 
+struct btrfs_io_ctl;
+
 struct inode *lookup_free_space_inode(struct btrfs_root *root,
 				      struct btrfs_block_group_cache
 				      *block_group, struct btrfs_path *path);
@@ -63,11 +65,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct inode *inode);
 int load_free_space_cache(struct btrfs_fs_info *fs_info,
 			  struct btrfs_block_group_cache *block_group);
+int btrfs_wait_cache_io(struct btrfs_root *root,
+			struct btrfs_trans_handle *trans,
+			struct btrfs_block_group_cache *block_group,
+			struct btrfs_io_ctl *io_ctl,
+			struct btrfs_path *path, u64 offset);
 int btrfs_write_out_cache(struct btrfs_root *root,
 			  struct btrfs_trans_handle *trans,
 			  struct btrfs_block_group_cache *block_group,
 			  struct btrfs_path *path);
-
 struct inode *lookup_free_ino_inode(struct btrfs_root *root,
 				    struct btrfs_path *path);
 int create_free_ino_inode(struct btrfs_root *root,
-- 
1.8.1



* [PATCH 3/4] Btrfs: don't use highmem for free space cache pages
  2015-04-13 19:52 [PATCH 0/4] btrfs: reduce block group cache writeout times during commit Chris Mason
  2015-04-13 19:52 ` [PATCH 1/4] btrfs: move struct io_ctl into ctree.h and rename it Chris Mason
  2015-04-13 19:52 ` [PATCH 2/4] Btrfs: two stage dirty block group writeout Chris Mason
@ 2015-04-13 19:52 ` Chris Mason
  2015-04-13 19:52 ` [PATCH 4/4] Btrfs: allow block group cache writeout outside critical section in commit Chris Mason
  2015-04-22 16:09 ` [PATCH 0/4] btrfs: reduce block group cache writeout times during commit Lutz Vieweg
  4 siblings, 0 replies; 26+ messages in thread
From: Chris Mason @ 2015-04-13 19:52 UTC (permalink / raw)
  To: linux-btrfs

In order to create the free space cache concurrently with FS modifications,
we need to take a few block group locks.

The cache code also does kmap, which would schedule with the locks held.
Instead of going through kmap_atomic, let's just use lowmem for the cache
pages.
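
This works because pages allocated without __GFP_HIGHMEM always have a
kernel virtual address, so page_address() can replace the kmap()/kunmap()
pair and nothing can sleep while the new locks are held.  Schematically:

	/*
	 * before: the page may live in highmem and needs a temporary
	 * kernel mapping; kmap() can sleep, so it must not be called
	 * with a spinlock held
	 */
	io_ctl->cur = kmap(io_ctl->page);
	/* ... use io_ctl->cur ... */
	kunmap(io_ctl->page);

	/*
	 * after: the cache pages come from lowmem, so they are always
	 * mapped and no kmap step (or sleep) is needed
	 */
	io_ctl->cur = page_address(io_ctl->page);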

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/free-space-cache.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6886ae0..83532a2 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -85,7 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
 	}
 
 	mapping_set_gfp_mask(inode->i_mapping,
-			mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+			mapping_gfp_mask(inode->i_mapping) &
+			~(__GFP_FS | __GFP_HIGHMEM));
 
 	return inode;
 }
@@ -310,7 +311,6 @@ static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
 {
 	if (io_ctl->cur) {
-		kunmap(io_ctl->page);
 		io_ctl->cur = NULL;
 		io_ctl->orig = NULL;
 	}
@@ -320,7 +320,7 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
 {
 	ASSERT(io_ctl->index < io_ctl->num_pages);
 	io_ctl->page = io_ctl->pages[io_ctl->index++];
-	io_ctl->cur = kmap(io_ctl->page);
+	io_ctl->cur = page_address(io_ctl->page);
 	io_ctl->orig = io_ctl->cur;
 	io_ctl->size = PAGE_CACHE_SIZE;
 	if (clear)
@@ -446,10 +446,9 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
 			      PAGE_CACHE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	io_ctl_unmap_page(io_ctl);
-	tmp = kmap(io_ctl->pages[0]);
+	tmp = page_address(io_ctl->pages[0]);
 	tmp += index;
 	*tmp = crc;
-	kunmap(io_ctl->pages[0]);
 }
 
 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
@@ -466,10 +465,9 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
 	if (index == 0)
 		offset = sizeof(u32) * io_ctl->num_pages;
 
-	tmp = kmap(io_ctl->pages[0]);
+	tmp = page_address(io_ctl->pages[0]);
 	tmp += index;
 	val = *tmp;
-	kunmap(io_ctl->pages[0]);
 
 	io_ctl_map_page(io_ctl, 0);
 	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
-- 
1.8.1



* [PATCH 4/4] Btrfs: allow block group cache writeout outside critical section in commit
  2015-04-13 19:52 [PATCH 0/4] btrfs: reduce block group cache writeout times during commit Chris Mason
                   ` (2 preceding siblings ...)
  2015-04-13 19:52 ` [PATCH 3/4] Btrfs: don't use highmem for free space cache pages Chris Mason
@ 2015-04-13 19:52 ` Chris Mason
  2015-04-22 16:09 ` [PATCH 0/4] btrfs: reduce block group cache writeout times during commit Lutz Vieweg
  4 siblings, 0 replies; 26+ messages in thread
From: Chris Mason @ 2015-04-13 19:52 UTC (permalink / raw)
  To: linux-btrfs

We loop through all of the dirty block groups during commit and write
the free space cache.  In order to make sure the cache is correct, we do
this while no other writers are allowed in the commit.

If a large number of block groups are dirty, this can introduce long
stalls during the final stages of the commit, which can block new procs
trying to change the filesystem.

This commit changes the block group cache writeout to take appropriate
locks and allow it to run earlier in the commit.  We'll still have to
redo some of the block groups, but it means we can get most of the work
out of the way without blocking the entire FS.
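
Condensed from the transaction.c and extent-tree.c hunks below, the shape
of the change in the commit path is roughly:

	int run_it = 0;

	/* in btrfs_commit_transaction(), before the commit critical section */
	mutex_lock(&root->fs_info->ro_block_group_mutex);
	if (!cur_trans->dirty_bg_run) {
		/* only one committer kicks off the early block group IO */
		run_it = 1;
		cur_trans->dirty_bg_run = 1;
	}
	mutex_unlock(&root->fs_info->ro_block_group_mutex);

	if (run_it)
		ret = btrfs_start_dirty_block_groups(trans, root);

	/*
	 * later, in commit_cowonly_roots() inside the critical section,
	 * only the groups that were redirtied or still have IO in flight
	 * need to be handled again
	 */
	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
		ret = btrfs_write_dirty_block_groups(trans, root);
		if (ret)
			return ret;
	}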

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h            |   8 ++
 fs/btrfs/disk-io.c          |   1 +
 fs/btrfs/extent-tree.c      | 241 +++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/free-space-cache.c |  69 +++++++++++--
 fs/btrfs/free-space-cache.h |   1 +
 fs/btrfs/inode-map.c        |   2 +-
 fs/btrfs/relocation.c       |   9 +-
 fs/btrfs/transaction.c      |  38 ++++++-
 fs/btrfs/transaction.h      |   9 ++
 9 files changed, 341 insertions(+), 37 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1df0d9d..83051fa 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1491,6 +1491,12 @@ struct btrfs_fs_info {
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
 
+	/*
+	 * this is taken to make sure we don't set block groups ro after
+	 * the free space cache has been allocated on them
+	 */
+	struct mutex ro_block_group_mutex;
+
 	/* this is used during read/modify/write to make sure
 	 * no two ios are trying to mod the same stripe at the same
 	 * time
@@ -3407,6 +3413,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 owner, u64 offset, int no_quota);
 
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 568cc4e..b5e3d5f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2572,6 +2572,7 @@ int open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
+	mutex_init(&fs_info->ro_block_group_mutex);
 	init_rwsem(&fs_info->commit_root_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 40c9513..02c2b29 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3298,7 +3298,7 @@ again:
 		if (ret)
 			goto out_put;
 
-		ret = btrfs_truncate_free_space_cache(root, trans, inode);
+		ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
 		if (ret)
 			goto out_put;
 	}
@@ -3382,20 +3382,156 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+/*
+ * transaction commit does final block group cache writeback during a
+ * critical section where nothing is allowed to change the FS.  This is
+ * required in order for the cache to actually match the block group,
+ * but can introduce a lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group
+ * cache IO.  There's a chance we'll have to redo some of it if the
+ * block group changes again during the commit, but it greatly reduces
+ * the commit latency by getting rid of the easy block groups while
+ * we're still allowing others to join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	int ret = 0;
 	int should_put;
-	struct btrfs_path *path;
-	LIST_HEAD(io);
+	struct btrfs_path *path = NULL;
+	LIST_HEAD(dirty);
+	struct list_head *io = &cur_trans->io_bgs;
 	int num_started = 0;
-	int num_waited = 0;
+	int loops = 0;
+
+	spin_lock(&cur_trans->dirty_bgs_lock);
+	if (!list_empty(&cur_trans->dirty_bgs)) {
+		list_splice_init(&cur_trans->dirty_bgs, &dirty);
+	}
+	spin_unlock(&cur_trans->dirty_bgs_lock);
 
-	if (list_empty(&cur_trans->dirty_bgs))
+again:
+	if (list_empty(&dirty)) {
+		btrfs_free_path(path);
 		return 0;
+	}
+
+	/*
+	 * make sure all the block groups on our dirty list actually
+	 * exist
+	 */
+	btrfs_create_pending_block_groups(trans, root);
+
+	if (!path) {
+		path = btrfs_alloc_path();
+		if (!path)
+			return -ENOMEM;
+	}
+
+	while (!list_empty(&dirty)) {
+		cache = list_first_entry(&dirty,
+					 struct btrfs_block_group_cache,
+					 dirty_list);
+
+		/*
+		 * cache_write_mutex is here only to save us from balance
+		 * deleting this block group while we are writing out the
+		 * cache
+		 */
+		mutex_lock(&trans->transaction->cache_write_mutex);
+
+		/*
+		 * this can happen if something re-dirties a block
+		 * group that is already under IO.  Just wait for it to
+		 * finish and then do it all again
+		 */
+		if (!list_empty(&cache->io_list)) {
+			list_del_init(&cache->io_list);
+			btrfs_wait_cache_io(root, trans, cache,
+					    &cache->io_ctl, path,
+					    cache->key.objectid);
+			btrfs_put_block_group(cache);
+		}
+
+
+		/*
+		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
+		 * if it should update the cache_state.  Don't delete
+		 * until after we wait.
+		 *
+		 * Since we're not running in the commit critical section
+		 * we need the dirty_bgs_lock to protect from update_block_group
+		 */
+		spin_lock(&cur_trans->dirty_bgs_lock);
+		list_del_init(&cache->dirty_list);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+
+		should_put = 1;
+
+		cache_save_setup(cache, trans, path);
+
+		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+			cache->io_ctl.inode = NULL;
+			ret = btrfs_write_out_cache(root, trans, cache, path);
+			if (ret == 0 && cache->io_ctl.inode) {
+				num_started++;
+				should_put = 0;
+
+				/*
+				 * the cache_write_mutex is protecting
+				 * the io_list
+				 */
+				list_add_tail(&cache->io_list, io);
+			} else {
+				/*
+				 * if we failed to write the cache, the
+				 * generation will be bad and life goes on
+				 */
+				ret = 0;
+			}
+		}
+		if (!ret)
+			ret = write_one_cache_group(trans, root, path, cache);
+		mutex_unlock(&trans->transaction->cache_write_mutex);
+
+		/* if it's not on the io list, we need to put the block group */
+		if (should_put)
+			btrfs_put_block_group(cache);
+
+		if (ret)
+			break;
+	}
+
+	/*
+	 * go through delayed refs for all the stuff we've just kicked off
+	 * and then loop back (just once)
+	 */
+	ret = btrfs_run_delayed_refs(trans, root, 0);
+	if (!ret && loops == 0) {
+		loops++;
+		spin_lock(&cur_trans->dirty_bgs_lock);
+		list_splice_init(&cur_trans->dirty_bgs, &dirty);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
+		goto again;
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	int ret = 0;
+	int should_put;
+	struct btrfs_path *path;
+	struct list_head *io = &cur_trans->io_bgs;
+	int num_started = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -3423,14 +3559,16 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 					    &cache->io_ctl, path,
 					    cache->key.objectid);
 			btrfs_put_block_group(cache);
-			num_waited++;
 		}
 
+		/*
+		 * don't remove from the dirty list until after we've waited
+		 * on any pending IO
+		 */
 		list_del_init(&cache->dirty_list);
 		should_put = 1;
 
-		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-			cache_save_setup(cache, trans, path);
+		cache_save_setup(cache, trans, path);
 
 		if (!ret)
 			ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
@@ -3441,7 +3579,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			if (ret == 0 && cache->io_ctl.inode) {
 				num_started++;
 				should_put = 0;
-				list_add_tail(&cache->io_list, &io);
+				list_add_tail(&cache->io_list, io);
 			} else {
 				/*
 				 * if we failed to write the cache, the
@@ -3458,11 +3596,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			btrfs_put_block_group(cache);
 	}
 
-	while (!list_empty(&io)) {
-		cache = list_first_entry(&io, struct btrfs_block_group_cache,
+	while (!list_empty(io)) {
+		cache = list_first_entry(io, struct btrfs_block_group_cache,
 					 io_list);
 		list_del_init(&cache->io_list);
-		num_waited++;
 		btrfs_wait_cache_io(root, trans, cache,
 				    &cache->io_ctl, path, cache->key.objectid);
 		btrfs_put_block_group(cache);
@@ -5459,15 +5596,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
 			cache_block_group(cache, 1);
 
-		spin_lock(&trans->transaction->dirty_bgs_lock);
-		if (list_empty(&cache->dirty_list)) {
-			list_add_tail(&cache->dirty_list,
-				      &trans->transaction->dirty_bgs);
-				trans->transaction->num_dirty_bgs++;
-			btrfs_get_block_group(cache);
-		}
-		spin_unlock(&trans->transaction->dirty_bgs_lock);
-
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
 
@@ -5516,6 +5644,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				spin_unlock(&info->unused_bgs_lock);
 			}
 		}
+
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+		if (list_empty(&cache->dirty_list)) {
+			list_add_tail(&cache->dirty_list,
+				      &trans->transaction->dirty_bgs);
+				trans->transaction->num_dirty_bgs++;
+			btrfs_get_block_group(cache);
+		}
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
+
 		btrfs_put_block_group(cache);
 		total -= num_bytes;
 		bytenr += num_bytes;
@@ -8602,10 +8740,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
 	BUG_ON(cache->ro);
 
+again:
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
+	/*
+	 * we're not allowed to set block groups readonly after the dirty
+	 * block groups cache has started writing.  If it already started,
+	 * back off and let this transaction commit
+	 */
+	mutex_lock(&root->fs_info->ro_block_group_mutex);
+	if (trans->transaction->dirty_bg_run) {
+		u64 transid = trans->transid;
+
+		mutex_unlock(&root->fs_info->ro_block_group_mutex);
+		btrfs_end_transaction(trans, root);
+
+		ret = btrfs_wait_for_commit(root, transid);
+		if (ret)
+			return ret;
+		goto again;
+	}
+
+
 	ret = set_block_group_ro(cache, 0);
 	if (!ret)
 		goto out;
@@ -8620,6 +8778,7 @@ out:
 		alloc_flags = update_block_group_flags(root, cache->flags);
 		check_system_chunk(trans, root, alloc_flags);
 	}
+	mutex_unlock(&root->fs_info->ro_block_group_mutex);
 
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -9425,7 +9584,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
+	/*
+	 * get the inode first so any iput calls done for the io_list
+	 * aren't the final iput (no unlinks allowed now)
+	 */
 	inode = lookup_free_space_inode(tree_root, block_group, path);
+
+	mutex_lock(&trans->transaction->cache_write_mutex);
+	/*
+	 * make sure our free space cache IO is done before removing the
+	 * free space inode
+	 */
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	if (!list_empty(&block_group->io_list)) {
+		list_del_init(&block_group->io_list);
+
+		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
+		btrfs_wait_cache_io(root, trans, block_group,
+				    &block_group->io_ctl, path,
+				    block_group->key.objectid);
+		btrfs_put_block_group(block_group);
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+	}
+
+	if (!list_empty(&block_group->dirty_list)) {
+		list_del_init(&block_group->dirty_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
+	mutex_unlock(&trans->transaction->cache_write_mutex);
+
 	if (!IS_ERR(inode)) {
 		ret = btrfs_orphan_add(trans, inode);
 		if (ret) {
@@ -9518,11 +9708,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	spin_lock(&trans->transaction->dirty_bgs_lock);
 	if (!list_empty(&block_group->dirty_list)) {
-		list_del_init(&block_group->dirty_list);
-		btrfs_put_block_group(block_group);
+		WARN_ON(1);
+	}
+	if (!list_empty(&block_group->io_list)) {
+		WARN_ON(1);
 	}
 	spin_unlock(&trans->transaction->dirty_bgs_lock);
-
 	btrfs_remove_free_space_cache(block_group);
 
 	spin_lock(&block_group->space_info->lock);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 83532a2..253cb74 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -226,9 +226,37 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
 
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
+				    struct btrfs_block_group_cache *block_group,
 				    struct inode *inode)
 {
 	int ret = 0;
+	struct btrfs_path *path = btrfs_alloc_path();
+
+	if (!path) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	if (block_group) {
+		mutex_lock(&trans->transaction->cache_write_mutex);
+		if (!list_empty(&block_group->io_list)) {
+			list_del_init(&block_group->io_list);
+
+			btrfs_wait_cache_io(root, trans, block_group,
+					    &block_group->io_ctl, path,
+					    block_group->key.objectid);
+			btrfs_put_block_group(block_group);
+		}
+
+		/*
+		 * now that we've truncated the cache away, it's no longer
+		 * setup or written
+		 */
+		spin_lock(&block_group->lock);
+		block_group->disk_cache_state = BTRFS_DC_CLEAR;
+		spin_unlock(&block_group->lock);
+	}
+	btrfs_free_path(path);
 
 	btrfs_i_size_write(inode, 0);
 	truncate_pagecache(inode, 0);
@@ -242,11 +270,17 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
 	if (ret) {
+		mutex_unlock(&trans->transaction->cache_write_mutex);
 		btrfs_abort_transaction(trans, root, ret);
 		return ret;
 	}
 
 	ret = btrfs_update_inode(trans, root, inode);
+
+	if (block_group)
+		mutex_unlock(&trans->transaction->cache_write_mutex);
+
+fail:
 	if (ret)
 		btrfs_abort_transaction(trans, root, ret);
 
@@ -876,6 +910,7 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 {
 	int ret;
 	struct btrfs_free_cluster *cluster = NULL;
+	struct btrfs_free_cluster *cluster_locked = NULL;
 	struct rb_node *node = rb_first(&ctl->free_space_offset);
 	struct btrfs_trim_range *trim_entry;
 
@@ -887,6 +922,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 	}
 
 	if (!node && cluster) {
+		cluster_locked = cluster;
+		spin_lock(&cluster_locked->lock);
 		node = rb_first(&cluster->root);
 		cluster = NULL;
 	}
@@ -910,9 +947,15 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 		node = rb_next(node);
 		if (!node && cluster) {
 			node = rb_first(&cluster->root);
+			cluster_locked = cluster;
+			spin_lock(&cluster_locked->lock);
 			cluster = NULL;
 		}
 	}
+	if (cluster_locked) {
+		spin_unlock(&cluster_locked->lock);
+		cluster_locked = NULL;
+	}
 
 	/*
 	 * Make sure we don't miss any range that was removed from our rbtree
@@ -930,6 +973,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 
 	return 0;
 fail:
+	if (cluster_locked)
+		spin_unlock(&cluster_locked->lock);
 	return -ENOSPC;
 }
 
@@ -1101,6 +1146,9 @@ int btrfs_wait_cache_io(struct btrfs_root *root,
 	int ret;
 	struct inode *inode = io_ctl->inode;
 
+	if (!inode)
+		return 0;
+
 	root = root->fs_info->tree_root;
 
 	/* Flush the dirty pages in the cache file. */
@@ -1127,11 +1175,16 @@ out:
 	btrfs_update_inode(trans, root, inode);
 
 	if (block_group) {
+		/* the dirty list is protected by the dirty_bgs_lock */
+		spin_lock(&trans->transaction->dirty_bgs_lock);
+
+		/* the disk_cache_state is protected by the block group lock */
 		spin_lock(&block_group->lock);
 
 		/*
 		 * only mark this as written if we didn't get put back on
-		 * the dirty list while waiting for IO.
+		 * the dirty list while waiting for IO.   Otherwise our
+		 * cache state won't be right, and we won't get written again
 		 */
 		if (!ret && list_empty(&block_group->dirty_list))
 			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
@@ -1139,6 +1192,7 @@ out:
 			block_group->disk_cache_state = BTRFS_DC_ERROR;
 
 		spin_unlock(&block_group->lock);
+		spin_unlock(&trans->transaction->dirty_bgs_lock);
 		io_ctl->inode = NULL;
 		iput(inode);
 	}
@@ -1207,9 +1261,11 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 	mutex_lock(&ctl->cache_writeout_mutex);
 	/* Write out the extent entries in the free space cache */
+	spin_lock(&ctl->tree_lock);
 	ret = write_cache_extent_entries(io_ctl, ctl,
 					 block_group, &entries, &bitmaps,
 					 &bitmap_list);
+	spin_unlock(&ctl->tree_lock);
 	if (ret) {
 		mutex_unlock(&ctl->cache_writeout_mutex);
 		goto out_nospc;
@@ -1219,6 +1275,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * Some spaces that are freed in the current transaction are pinned,
 	 * they will be added into free space cache after the transaction is
 	 * committed, we shouldn't lose them.
+	 *
+	 * If this changes while we are working we'll get added back to
+	 * the dirty list and redo it.  No locking needed
 	 */
 	ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
 	if (ret) {
@@ -1231,7 +1290,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * locked while doing it because a concurrent trim can be manipulating
 	 * or freeing the bitmap.
 	 */
+	spin_lock(&ctl->tree_lock);
 	ret = write_bitmap_entries(io_ctl, &bitmap_list);
+	spin_unlock(&ctl->tree_lock);
 	mutex_unlock(&ctl->cache_writeout_mutex);
 	if (ret)
 		goto out_nospc;
@@ -1307,12 +1368,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 		spin_unlock(&block_group->lock);
 		return 0;
 	}
-
-	if (block_group->delalloc_bytes) {
-		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
-		spin_unlock(&block_group->lock);
-		return 0;
-	}
 	spin_unlock(&block_group->lock);
 
 	inode = lookup_free_space_inode(root, block_group, path);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index c433986..a16a029 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -62,6 +62,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
 				       struct btrfs_block_rsv *rsv);
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
+				    struct btrfs_block_group_cache *block_group,
 				    struct inode *inode);
 int load_free_space_cache(struct btrfs_fs_info *fs_info,
 			  struct btrfs_block_group_cache *block_group);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 74faea3..f6a596d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -456,7 +456,7 @@ again:
 	}
 
 	if (i_size_read(inode) > 0) {
-		ret = btrfs_truncate_free_space_cache(root, trans, inode);
+		ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
 		if (ret) {
 			if (ret != -ENOSPC)
 				btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d830853..840a4eb 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3430,7 +3430,9 @@ static int block_use_full_backref(struct reloc_control *rc,
 }
 
 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
-				    struct inode *inode, u64 ino)
+				    struct btrfs_block_group_cache *block_group,
+				    struct inode *inode,
+				    u64 ino)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = fs_info->tree_root;
@@ -3463,7 +3465,7 @@ truncate:
 		goto out;
 	}
 
-	ret = btrfs_truncate_free_space_cache(root, trans, inode);
+	ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
 
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
@@ -3509,6 +3511,7 @@ static int find_data_references(struct reloc_control *rc,
 	 */
 	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
 		ret = delete_block_group_cache(rc->extent_root->fs_info,
+					       rc->block_group,
 					       NULL, ref_objectid);
 		if (ret != -ENOENT)
 			return ret;
@@ -4223,7 +4226,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	btrfs_free_path(path);
 
 	if (!IS_ERR(inode))
-		ret = delete_block_group_cache(fs_info, inode, 0);
+		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
 	else
 		ret = PTR_ERR(inode);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 234d606..5628e25 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -222,6 +222,7 @@ loop:
 	atomic_set(&cur_trans->use_count, 2);
 	cur_trans->have_free_bgs = 0;
 	cur_trans->start_time = get_seconds();
+	cur_trans->dirty_bg_run = 0;
 
 	cur_trans->delayed_refs.href_root = RB_ROOT;
 	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
@@ -251,6 +252,8 @@ loop:
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
 	INIT_LIST_HEAD(&cur_trans->pending_ordered);
 	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+	INIT_LIST_HEAD(&cur_trans->io_bgs);
+	mutex_init(&cur_trans->cache_write_mutex);
 	cur_trans->num_dirty_bgs = 0;
 	spin_lock_init(&cur_trans->dirty_bgs_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -1059,6 +1062,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
+	struct list_head *io_bgs = &trans->transaction->io_bgs;
 	struct list_head *next;
 	struct extent_buffer *eb;
 	int ret;
@@ -1112,7 +1116,7 @@ again:
 			return ret;
 	}
 
-	while (!list_empty(dirty_bgs)) {
+	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
 		ret = btrfs_write_dirty_block_groups(trans, root);
 		if (ret)
 			return ret;
@@ -1812,6 +1816,37 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
+	if (!cur_trans->dirty_bg_run) {
+		int run_it = 0;
+
+		/* this mutex is also taken before trying to set
+		 * block groups readonly.  We need to make sure
+		 * that nobody has set a block group readonly
+		 * after extents from that block group have been
+		 * allocated for cache files.  btrfs_set_block_group_ro
+		 * will wait for the transaction to commit if it
+		 * finds dirty_bg_run = 1
+		 *
+		 * The dirty_bg_run flag is also used to make sure only
+		 * one process starts all the block group IO.  It wouldn't
+		 * hurt to have more than one go through, but there's no
+		 * real advantage to it either.
+		 */
+		mutex_lock(&root->fs_info->ro_block_group_mutex);
+		if (!cur_trans->dirty_bg_run) {
+			run_it = 1;
+			cur_trans->dirty_bg_run = 1;
+		}
+		mutex_unlock(&root->fs_info->ro_block_group_mutex);
+
+		if (run_it)
+			ret = btrfs_start_dirty_block_groups(trans, root);
+	}
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
 	spin_lock(&root->fs_info->trans_lock);
 	list_splice(&trans->ordered, &cur_trans->pending_ordered);
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
@@ -2005,6 +2040,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	assert_qgroups_uptodate(trans);
 	ASSERT(list_empty(&cur_trans->dirty_bgs));
+	ASSERT(list_empty(&cur_trans->io_bgs));
 	update_super_roots(root);
 
 	btrfs_set_super_log_root(root->fs_info->super_copy, 0);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 4cb0ae2..0b24755 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -64,10 +64,19 @@ struct btrfs_transaction {
 	struct list_head pending_ordered;
 	struct list_head switch_commits;
 	struct list_head dirty_bgs;
+	struct list_head io_bgs;
 	u64 num_dirty_bgs;
+
+	/*
+	 * we need to make sure block group deletion doesn't race with
+	 * free space cache writeout.  This mutex keeps them from stomping
+	 * on each other
+	 */
+	struct mutex cache_write_mutex;
 	spinlock_t dirty_bgs_lock;
 	struct btrfs_delayed_ref_root delayed_refs;
 	int aborted;
+	int dirty_bg_run;
 };
 
 #define __TRANS_FREEZABLE	(1U << 0)
-- 
1.8.1



* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-13 19:52 [PATCH 0/4] btrfs: reduce block group cache writeout times during commit Chris Mason
                   ` (3 preceding siblings ...)
  2015-04-13 19:52 ` [PATCH 4/4] Btrfs: allow block group cache writeout outside critical section in commit Chris Mason
@ 2015-04-22 16:09 ` Lutz Vieweg
  2015-04-22 16:37   ` Holger Hoffstätte
  2015-06-26 17:12   ` Lutz Vieweg
  4 siblings, 2 replies; 26+ messages in thread
From: Lutz Vieweg @ 2015-04-22 16:09 UTC (permalink / raw)
  To: linux-btrfs

On 04/13/2015 09:52 PM, Chris Mason wrote:
> Large filesystems with lots of block groups can suffer long stalls during
> commit while we create and send down all of the block group caches.  The
> more blocks groups dirtied in a transaction, the longer these stalls can be.
> Some workloads average 10 seconds per commit, but see peak times much higher.

Since we see this problem very frequently on some shared development servers,
I will try to install this ASAP.

Meanwhile, can anybody already share success stories about removing these
lags with this patch?

Regards,

Lutz Vieweg



* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-22 16:09 ` [PATCH 0/4] btrfs: reduce block group cache writeout times during commit Lutz Vieweg
@ 2015-04-22 16:37   ` Holger Hoffstätte
  2015-04-22 16:55     ` Chris Mason
  2015-06-26 17:12   ` Lutz Vieweg
  1 sibling, 1 reply; 26+ messages in thread
From: Holger Hoffstätte @ 2015-04-22 16:37 UTC (permalink / raw)
  To: linux-btrfs

On Wed, 22 Apr 2015 18:09:18 +0200, Lutz Vieweg wrote:

> On 04/13/2015 09:52 PM, Chris Mason wrote:
>> Large filesystems with lots of block groups can suffer long stalls during
>> commit while we create and send down all of the block group caches.  The
>> more blocks groups dirtied in a transaction, the longer these stalls can be.
>> Some workloads average 10 seconds per commit, but see peak times much higher.
> 
> Since we see this problem very frequently on some shared development servers,
> I will try to install this ASAP.
> 
> Meanwhile, can anybody already tell success stories about successfully removing
> lags by this patch?

Works fine, but make sure to get the followup patch [1] as well while you're
at it. I've observed that my (bandwidth-throttled) backups now cause shorter,
nicely spaced-out blips of activity instead of longer ones when the writeback
kicks in.

-h

[1] https://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?h=integration-4.1&id=c1e31ffc317e4c28d242b1d961c9c6fe673c0377




* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-22 16:37   ` Holger Hoffstätte
@ 2015-04-22 16:55     ` Chris Mason
  2015-04-23 12:45       ` Filipe David Manana
  2015-04-23 16:34       ` Holger Hoffstätte
  0 siblings, 2 replies; 26+ messages in thread
From: Chris Mason @ 2015-04-22 16:55 UTC (permalink / raw)
  To: Holger Hoffstätte, linux-btrfs

On 04/22/2015 12:37 PM, Holger Hoffstätte wrote:
> On Wed, 22 Apr 2015 18:09:18 +0200, Lutz Vieweg wrote:
> 
>> On 04/13/2015 09:52 PM, Chris Mason wrote:
>>> Large filesystems with lots of block groups can suffer long stalls during
>>> commit while we create and send down all of the block group caches.  The
>>> more blocks groups dirtied in a transaction, the longer these stalls can be.
>>> Some workloads average 10 seconds per commit, but see peak times much higher.
>>
>> Since we see this problem very frequently on some shared development servers,
>> I will try to install this ASAP.
>>
>> Meanwhile, can anybody already tell success stories about successfully removing
>> lags by this patch?
> 
> Works fine, but make sure to get the followup patch [1] as well while you're
> at it. I've observed that my (bandwidth-throttled) backups now cause shorter,
> nicely spaced-out blips of activity instead of longer ones when the writeback
> kicks in.
> 
> -h
> 
> [1] https://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?h=integration-4.1&id=c1e31ffc317e4c28d242b1d961c9c6fe673c0377
> 

Great to hear.  I recommend just using my for-linus-4.1 branch, since it
has all the good things in one place.

-chris


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-22 16:55     ` Chris Mason
@ 2015-04-23 12:45       ` Filipe David Manana
  2015-04-23 12:52         ` Chris Mason
  2015-04-23 16:34       ` Holger Hoffstätte
  1 sibling, 1 reply; 26+ messages in thread
From: Filipe David Manana @ 2015-04-23 12:45 UTC (permalink / raw)
  To: Chris Mason; +Cc: Holger Hoffstätte, linux-btrfs

On Wed, Apr 22, 2015 at 5:55 PM, Chris Mason <clm@fb.com> wrote:
> On 04/22/2015 12:37 PM, Holger Hoffstätte wrote:
>> On Wed, 22 Apr 2015 18:09:18 +0200, Lutz Vieweg wrote:
>>
>>> On 04/13/2015 09:52 PM, Chris Mason wrote:
>>>> Large filesystems with lots of block groups can suffer long stalls during
>>>> commit while we create and send down all of the block group caches.  The
>>>> more blocks groups dirtied in a transaction, the longer these stalls can be.
>>>> Some workloads average 10 seconds per commit, but see peak times much higher.
>>>
>>> Since we see this problem very frequently on some shared development servers,
>>> I will try to install this ASAP.
>>>
>>> Meanwhile, can anybody already tell success stories about successfully removing
>>> lags by this patch?
>>
>> Works fine, but make sure to get the followup patch [1] as well while you're
>> at it. I've observed that my (bandwidth-throttled) backups now cause shorter,
>> nicely spaced-out blips of activity instead of longer ones when the writeback
>> kicks in.
>>
>> -h
>>
>> [1] https://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?h=integration-4.1&id=c1e31ffc317e4c28d242b1d961c9c6fe673c0377
>>
>
> Great to hear.  I recommend just using my for-linus-4.1 branch, since it
> has all the good things  in one place.

Trying the current integration-4.1 branch, I ran into the following
during xfstests/btrfs/049:

[ 1702.295711] run fstests btrfs/049 at 2015-04-23 13:25:22
[ 1703.704334] device-mapper: uevent: version 1.0.3
[ 1703.707590] device-mapper: ioctl: 4.30.0-ioctl (2014-12-22)
initialised: dm-devel@redhat.com
[ 1704.081385] BTRFS: device fsid b5fe6a65-1a70-4a74-9d27-ba7032df51f7
devid 1 transid 3 /dev/sdc
[ 1704.662570] BTRFS info (device dm-0): turning on discard
[ 1704.663981] BTRFS info (device dm-0): enabling inode map caching
[ 1704.665929] BTRFS info (device dm-0): disk space caching is enabled
[ 1704.667451] BTRFS: has skinny extents
[ 1704.692617] BTRFS: creating UUID tree
[ 1704.883424] ------------[ cut here ]------------
[ 1704.884487] WARNING: CPU: 2 PID: 3645 at
fs/btrfs/free-space-cache.c:1234
__btrfs_write_out_cache.isra.21+0x75/0x3a1 [btrfs]()
[ 1704.886519] Modules linked in: dm_flakey dm_mod crc32c_generic
btrfs xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd
grace fscache sunrpc loop fuse i2c_piix4 i2c_core psmouse acpi_cpufreq
processor thermal_sys parport_pc parport pcspkr serio_raw microcode
evdev button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom
virtio_scsi ata_generic floppy virtio_pci virtio_ring virtio e1000
ata_piix libata scsi_mod
[ 1704.897327] CPU: 2 PID: 3645 Comm: xfs_io Not tainted
4.0.0-rc5-btrfs-next-8+ #1
[ 1704.898875] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org
04/01/2014
[ 1704.902507]  0000000000000009 ffff8801be5b7b58 ffffffff8142bd70
ffff88043dd0f2d0
[ 1704.904381]  0000000000000000 ffff8801be5b7b98 ffffffff810463e6
ffff8801be5b7b88
[ 1704.906377]  ffffffffa03ad57b ffff8801be5b7c38 0000000000000000
ffff880231948c90
[ 1704.908786] Call Trace:
[ 1704.909428]  [<ffffffff8142bd70>] dump_stack+0x4c/0x65
[ 1704.911249]  [<ffffffff810463e6>] warn_slowpath_common+0xa1/0xbb
[ 1704.912700]  [<ffffffffa03ad57b>] ?
__btrfs_write_out_cache.isra.21+0x75/0x3a1 [btrfs]
[ 1704.914342]  [<ffffffff810464a3>] warn_slowpath_null+0x1a/0x1c
[ 1704.915383]  [<ffffffffa03ad57b>]
__btrfs_write_out_cache.isra.21+0x75/0x3a1 [btrfs]
[ 1704.916890]  [<ffffffffa03b00d8>] btrfs_write_out_ino_cache+0x52/0x97 [btrfs]
[ 1704.918173]  [<ffffffff814311fb>] ? _raw_spin_unlock+0x28/0x33
[ 1704.919208]  [<ffffffffa035fbe0>] ?
btrfs_free_reserved_data_space+0x84/0x8c [btrfs]
[ 1704.920706]  [<ffffffffa036b447>] btrfs_save_ino_cache+0x275/0x2dc [btrfs]
[ 1704.922486]  [<ffffffffa03ded17>] commit_fs_roots.isra.13+0xaa/0x137 [btrfs]
[ 1704.924211]  [<ffffffff8107d0da>] ? trace_hardirqs_on+0xd/0xf
[ 1704.925634]  [<ffffffffa037429f>] ?
btrfs_commit_transaction+0x4bb/0x9d3 [btrfs]
[ 1704.927459]  [<ffffffff814311fb>] ? _raw_spin_unlock+0x28/0x33
[ 1704.928709]  [<ffffffffa03742ae>]
btrfs_commit_transaction+0x4ca/0x9d3 [btrfs]
[ 1704.930205]  [<ffffffff8107d0da>] ? trace_hardirqs_on+0xd/0xf
[ 1704.931314]  [<ffffffffa038656c>] btrfs_sync_file+0x307/0x367 [btrfs]
[ 1704.932584]  [<ffffffff81178e69>] vfs_fsync_range+0x95/0xa4
[ 1704.933550]  [<ffffffff81432651>] ? retint_swapgs+0xe/0x44
[ 1704.934615]  [<ffffffff81178e94>] vfs_fsync+0x1c/0x1e
[ 1704.935590]  [<ffffffff81179010>] do_fsync+0x34/0x4e
[ 1704.936609]  [<ffffffff81179238>] SyS_fsync+0x10/0x14
[ 1704.937614]  [<ffffffff81431a72>] system_call_fastpath+0x12/0x17
[ 1704.938693] ---[ end trace 543759e9dc39d3b9 ]---
[ 1704.942149] BTRFS: assertion failed:
BTRFS_I(inode)->outstanding_extents >= num_extents, file:
fs/btrfs/extent-tree.c, line: 5266
[ 1704.944085] ------------[ cut here ]------------
[ 1704.945073] kernel BUG at fs/btrfs/ctree.h:4057!
[ 1704.945912] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC
[ 1704.946932] Modules linked in: dm_flakey dm_mod crc32c_generic
btrfs xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd
grace fscache sunrpc loop fuse i2c_piix4 i2c_core psmouse acpi_cpufreq
processor thermal_sys parport_pc parport pcspkr serio_raw microcode
evdev button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom
virtio_scsi ata_generic floppy virtio_pci virtio_ring virtio e1000
ata_piix libata scsi_mod
[ 1704.948065] CPU: 2 PID: 3645 Comm: xfs_io Tainted: G        W
4.0.0-rc5-btrfs-next-8+ #1
[ 1704.948065] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org
04/01/2014
[ 1704.948065] task: ffff8801c878c250 ti: ffff8801be5b4000 task.ti:
ffff8801be5b4000
[ 1704.948065] RIP: 0010:[<ffffffffa035a448>]  [<ffffffffa035a448>]
assfail.constprop.68+0x1e/0x20 [btrfs]
[ 1704.948065] RSP: 0018:ffff8801be5b7bc8  EFLAGS: 00010246
[ 1704.948065] RAX: 0000000000000075 RBX: 0000000000009000 RCX: ffffffff8107a4cc
[ 1704.948065] RDX: 0000000000009e85 RSI: ffffffff8143127f RDI: ffffffff8107d0da
[ 1704.948065] RBP: ffff8801be5b7bc8 R08: 0000000000000001 R09: 0000000000000000
[ 1704.948065] R10: 0000000000000000 R11: ffffffff8165a140 R12: ffff8802b3aed000
[ 1704.948065] R13: ffff8801e4f90f40 R14: 0000000000000000 R15: ffff880231948c90
[ 1704.948065] FS:  00007f3640959700(0000) GS:ffff88043dd00000(0000)
knlGS:0000000000000000
[ 1704.948065] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 1704.948065] CR2: 00007f3640961000 CR3: 00000002531e1000 CR4: 00000000000006e0
[ 1704.948065] Stack:
[ 1704.948065]  ffff8801be5b7bd8 ffffffffa035a487 ffff8801be5b7c28
ffffffffa0360851
[ 1704.948065]  ffff8801c66e3f60 ffff880231948948 ffff880231948980
ffff8802b3aed000
[ 1704.948065]  ffff880231948c90 ffff8801e4f90f40 ffff8801c66e3f60
0000000000009000
[ 1704.948065] Call Trace:
[ 1704.948065]  [<ffffffffa035a487>] drop_outstanding_extent+0x3d/0x6d [btrfs]
[ 1704.948065]  [<ffffffffa0360851>]
btrfs_delalloc_release_metadata+0x54/0xe6 [btrfs]
[ 1704.948065]  [<ffffffffa03b0108>] btrfs_write_out_ino_cache+0x82/0x97 [btrfs]
[ 1704.948065]  [<ffffffffa036b447>] btrfs_save_ino_cache+0x275/0x2dc [btrfs]
[ 1704.948065]  [<ffffffffa03ded17>] commit_fs_roots.isra.13+0xaa/0x137 [btrfs]
[ 1704.948065]  [<ffffffff8107d0da>] ? trace_hardirqs_on+0xd/0xf
[ 1704.948065]  [<ffffffffa037429f>] ?
btrfs_commit_transaction+0x4bb/0x9d3 [btrfs]
[ 1704.948065]  [<ffffffff814311fb>] ? _raw_spin_unlock+0x28/0x33
[ 1704.948065]  [<ffffffffa03742ae>]
btrfs_commit_transaction+0x4ca/0x9d3 [btrfs]
[ 1704.948065]  [<ffffffff8107d0da>] ? trace_hardirqs_on+0xd/0xf
[ 1704.948065]  [<ffffffffa038656c>] btrfs_sync_file+0x307/0x367 [btrfs]
[ 1704.948065]  [<ffffffff81178e69>] vfs_fsync_range+0x95/0xa4
[ 1704.948065]  [<ffffffff81432651>] ? retint_swapgs+0xe/0x44
[ 1704.948065]  [<ffffffff81178e94>] vfs_fsync+0x1c/0x1e
[ 1704.948065]  [<ffffffff81179010>] do_fsync+0x34/0x4e
[ 1704.948065]  [<ffffffff81179238>] SyS_fsync+0x10/0x14
[ 1704.948065]  [<ffffffff81431a72>] system_call_fastpath+0x12/0x17
[ 1704.948065] Code: 89 f0 5b 41 5c 41 5d 41 5e 41 5f 5d c3 55 89 f1
48 c7 c2 12 9c 3e a0 48 89 fe 31 c0 48 c7 c7 1e 9d 3e a0 48 89 e5 e8
f1 09 0d e1 <0f> 0b 0f 1f 44 00 00 48 81 c6 ff ff ff 07 55 48 c1 ee 1b
85 f6
[ 1704.948065] RIP  [<ffffffffa035a448>] assfail.constprop.68+0x1e/0x20 [btrfs]
[ 1704.948065]  RSP <ffff8801be5b7bc8>
[ 1705.018483] ---[ end trace 543759e9dc39d3ba ]---
[ 1705.020564] BUG: sleeping function called from invalid context at
kernel/locking/rwsem.c:41
[ 1705.022182] in_atomic(): 1, irqs_disabled(): 0, pid: 3645, name: xfs_io
[ 1705.023403] INFO: lockdep is turned off.
[ 1705.024248] CPU: 2 PID: 3645 Comm: xfs_io Tainted: G      D W
4.0.0-rc5-btrfs-next-8+ #1
[ 1705.026809] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org
04/01/2014
[ 1705.029167]  ffff8801c878c250 ffff8801be5b7848 ffffffff8142bd70
000000000000a1e3
[ 1705.031709]  0000000000000e3d ffff8801be5b7868 ffffffff8106542b
ffffffff817d0e77
[ 1705.033609]  0000000000000029 ffff8801be5b7898 ffffffff810654d0
00007ffffffff000
[ 1705.035596] Call Trace:
[ 1705.036273]  [<ffffffff8142bd70>] dump_stack+0x4c/0x65
[ 1705.037398]  [<ffffffff8106542b>] ___might_sleep+0x148/0x14d
[ 1705.038573]  [<ffffffff810654d0>] __might_sleep+0xa0/0xa8
[ 1705.039697]  [<ffffffff8142fd6b>] down_read+0x21/0x55
[ 1705.040836]  [<ffffffff81052c4d>] exit_signals+0x26/0x11a
[ 1705.042024]  [<ffffffff810606e9>] ? blocking_notifier_call_chain+0x14/0x16
[ 1705.044102]  [<ffffffff8104779c>] do_exit+0x128/0x9c4
[ 1705.045213]  [<ffffffff8107adff>] ? arch_local_irq_save+0x9/0xc
[ 1705.046470]  [<ffffffff8108c05b>] ? kmsg_dump+0xec/0xfc
[ 1705.047544]  [<ffffffff81005754>] oops_end+0xa6/0xae
[ 1705.048629]  [<ffffffff81005bcb>] die+0x5a/0x63
[ 1705.049516]  [<ffffffff81002aee>] do_trap+0x6b/0x124
[ 1705.050490]  [<ffffffff81002cbc>] do_error_trap+0xc6/0xd8
[ 1705.051587]  [<ffffffffa035a448>] ? assfail.constprop.68+0x1e/0x20 [btrfs]
[ 1705.052721]  [<ffffffff8107d0da>] ? trace_hardirqs_on+0xd/0xf
[ 1705.054805]  [<ffffffff810e4efb>] ? time_hardirqs_off+0x15/0x28
[ 1705.055914]  [<ffffffff8123720a>] ? trace_hardirqs_off_thunk+0x3a/0x3c
[ 1705.057244]  [<ffffffff8107b06a>] ? trace_hardirqs_off_caller+0x4c/0xb9
[ 1705.058692]  [<ffffffff8123720a>] ? trace_hardirqs_off_thunk+0x3a/0x3c
[ 1705.060732]  [<ffffffff810035e3>] do_invalid_op+0x20/0x22
[ 1705.061962]  [<ffffffff81433428>] invalid_op+0x18/0x20
[ 1705.063117]  [<ffffffff8107a4cc>] ? up+0x39/0x3e
[ 1705.064205]  [<ffffffff8143127f>] ? _raw_spin_unlock_irqrestore+0x3f/0x4d
[ 1705.065659]  [<ffffffff8107d0da>] ? trace_hardirqs_on+0xd/0xf
[ 1705.066621]  [<ffffffffa035a448>] ? assfail.constprop.68+0x1e/0x20 [btrfs]
[ 1705.067904]  [<ffffffffa035a448>] ? assfail.constprop.68+0x1e/0x20 [btrfs]
[ 1705.069456]  [<ffffffffa035a487>] drop_outstanding_extent+0x3d/0x6d [btrfs]
[ 1705.070862]  [<ffffffffa0360851>]
btrfs_delalloc_release_metadata+0x54/0xe6 [btrfs]
[ 1705.072372]  [<ffffffffa03b0108>] btrfs_write_out_ino_cache+0x82/0x97 [btrfs]
[ 1705.073821]  [<ffffffffa036b447>] btrfs_save_ino_cache+0x275/0x2dc [btrfs]
[ 1705.075150]  [<ffffffffa03ded17>] commit_fs_roots.isra.13+0xaa/0x137 [btrfs]
[ 1705.076593]  [<ffffffff8107d0da>] ? trace_hardirqs_on+0xd/0xf
[ 1705.077830]  [<ffffffffa037429f>] ?
btrfs_commit_transaction+0x4bb/0x9d3 [btrfs]
[ 1705.079381]  [<ffffffff814311fb>] ? _raw_spin_unlock+0x28/0x33
[ 1705.080588]  [<ffffffffa03742ae>]
btrfs_commit_transaction+0x4ca/0x9d3 [btrfs]
[ 1705.082148]  [<ffffffff8107d0da>] ? trace_hardirqs_on+0xd/0xf
[ 1705.083298]  [<ffffffffa038656c>] btrfs_sync_file+0x307/0x367 [btrfs]
[ 1705.084667]  [<ffffffff81178e69>] vfs_fsync_range+0x95/0xa4
[ 1705.086187]  [<ffffffff81432651>] ? retint_swapgs+0xe/0x44
[ 1705.087289]  [<ffffffff81178e94>] vfs_fsync+0x1c/0x1e
[ 1705.088389]  [<ffffffff81179010>] do_fsync+0x34/0x4e
[ 1705.089438]  [<ffffffff81179238>] SyS_fsync+0x10/0x14
[ 1705.092118]  [<ffffffff81431a72>] system_call_fastpath+0x12/0x17
[ 1705.093373] note: xfs_io[3645] exited with preempt_count 1
[ 1946.983579] kmemleak: 1 new suspected memory leaks (see
/sys/kernel/debug/kmemleak)
[ 2566.080608] kmemleak: 1 new suspected memory leaks (see
/sys/kernel/debug/kmemleak)
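
(For anyone trying to reproduce this: the report above comes from xfstests, so the
failing test can be rerun on its own roughly as below. This is only a sketch; the
device paths are placeholders, and the mount options are an assumption chosen to
match the inode_cache/discard messages in the log, not the actual test configuration.)

  # in an xfstests checkout; adjust devices and mount points to the local setup
  echo 'export FSTYP=btrfs'                            >  local.config
  echo 'export TEST_DEV=/dev/sdb'                      >> local.config
  echo 'export TEST_DIR=/mnt/test'                     >> local.config
  echo 'export SCRATCH_DEV=/dev/sdc'                   >> local.config
  echo 'export SCRATCH_MNT=/mnt/scratch'               >> local.config
  echo 'export MOUNT_OPTIONS="-o inode_cache,discard"' >> local.config
  ./check btrfs/049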


>
> -chris
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-23 12:45       ` Filipe David Manana
@ 2015-04-23 12:52         ` Chris Mason
  2015-04-23 12:56           ` Chris Mason
  2015-04-23 13:05           ` Filipe David Manana
  0 siblings, 2 replies; 26+ messages in thread
From: Chris Mason @ 2015-04-23 12:52 UTC (permalink / raw)
  To: fdmanana; +Cc: Holger Hoffstätte, linux-btrfs

On 04/23/2015 08:45 AM, Filipe David Manana wrote:
> On Wed, Apr 22, 2015 at 5:55 PM, Chris Mason <clm@fb.com> wrote:
>> On 04/22/2015 12:37 PM, Holger Hoffstätte wrote:
>>> On Wed, 22 Apr 2015 18:09:18 +0200, Lutz Vieweg wrote:
>>>
>>>> On 04/13/2015 09:52 PM, Chris Mason wrote:
>>>>> Large filesystems with lots of block groups can suffer long stalls during
>>>>> commit while we create and send down all of the block group caches.  The
>>>>> more blocks groups dirtied in a transaction, the longer these stalls can be.
>>>>> Some workloads average 10 seconds per commit, but see peak times much higher.
>>>>
>>>> Since we see this problem very frequently on some shared development servers,
>>>> I will try to install this ASAP.
>>>>
>>>> Meanwhile, can anybody already tell success stories about successfully removing
>>>> lags by this patch?
>>>
>>> Works fine, but make sure to get the followup patch [1] as well while you're
>>> at it. I've observed that my (bandwidth-throttled) backups now cause shorter,
>>> nicely spaced-out blips of activity instead of longer ones when the writeback
>>> kicks in.
>>>
>>> -h
>>>
>>> [1] https://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?h=integration-4.1&id=c1e31ffc317e4c28d242b1d961c9c6fe673c0377
>>>
>>
>> Great to hear.  I recommend just using my for-linus-4.1 branch, since it
>> has all the good things  in one place.
> 
> Trying the current integration-4.1 branch, I ran into the following
> during xfstests/btrfs/049:
> 

Ugh, I must not be waiting correctly in one of the inode cache writeout
sections.  But I've run 049 a whole bunch of times without triggering,
can you get this to happen consistently?

-chris


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-23 12:52         ` Chris Mason
@ 2015-04-23 12:56           ` Chris Mason
  2015-04-23 13:05           ` Filipe David Manana
  1 sibling, 0 replies; 26+ messages in thread
From: Chris Mason @ 2015-04-23 12:56 UTC (permalink / raw)
  To: fdmanana; +Cc: Holger Hoffstätte, linux-btrfs

On 04/23/2015 08:52 AM, Chris Mason wrote:
> On 04/23/2015 08:45 AM, Filipe David Manana wrote:
>> On Wed, Apr 22, 2015 at 5:55 PM, Chris Mason <clm@fb.com> wrote:
>>> On 04/22/2015 12:37 PM, Holger Hoffstätte wrote:
>>>> On Wed, 22 Apr 2015 18:09:18 +0200, Lutz Vieweg wrote:
>>>>
>>>>> On 04/13/2015 09:52 PM, Chris Mason wrote:
>>>>>> Large filesystems with lots of block groups can suffer long stalls during
>>>>>> commit while we create and send down all of the block group caches.  The
>>>>>> more blocks groups dirtied in a transaction, the longer these stalls can be.
>>>>>> Some workloads average 10 seconds per commit, but see peak times much higher.
>>>>>
>>>>> Since we see this problem very frequently on some shared development servers,
>>>>> I will try to install this ASAP.
>>>>>
>>>>> Meanwhile, can anybody already tell success stories about successfully removing
>>>>> lags by this patch?
>>>>
>>>> Works fine, but make sure to get the followup patch [1] as well while you're
>>>> at it. I've observed that my (bandwidth-throttled) backups now cause shorter,
>>>> nicely spaced-out blips of activity instead of longer ones when the writeback
>>>> kicks in.
>>>>
>>>> -h
>>>>
>>>> [1] https://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?h=integration-4.1&id=c1e31ffc317e4c28d242b1d961c9c6fe673c0377
>>>>
>>>
>>> Great to hear.  I recommend just using my for-linus-4.1 branch, since it
>>> has all the good things  in one place.
>>
>> Trying the current integration-4.1 branch, I ran into the following
>> during xfstests/btrfs/049:
>>
> 
> Ugh, I must not be waiting correctly in one of the inode cache writeout
> sections.  But I've run 049 a whole bunch of times without triggering,
> can you get this to happen consistently?
> 

Ok, I can actually trigger it with an fs_mark script.  Fixing.
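
(The script itself isn't shown in this thread; a minimal fs_mark run that hammers
small-file creation with fsync on an inode_cache mount would look roughly like the
lines below, where every path and parameter is an illustrative guess rather than the
actual reproducer.)

  mount -o inode_cache /dev/sdc /mnt/scratch
  fs_mark -d /mnt/scratch/fsmark -D 64 -t 8 -n 1024 -s 16384 -S 1 -L 8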

-chris



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-23 12:52         ` Chris Mason
  2015-04-23 12:56           ` Chris Mason
@ 2015-04-23 13:05           ` Filipe David Manana
  2015-04-23 15:17             ` Chris Mason
  1 sibling, 1 reply; 26+ messages in thread
From: Filipe David Manana @ 2015-04-23 13:05 UTC (permalink / raw)
  To: Chris Mason; +Cc: Holger Hoffstätte, linux-btrfs

On Thu, Apr 23, 2015 at 1:52 PM, Chris Mason <clm@fb.com> wrote:
> On 04/23/2015 08:45 AM, Filipe David Manana wrote:
>> On Wed, Apr 22, 2015 at 5:55 PM, Chris Mason <clm@fb.com> wrote:
>>> On 04/22/2015 12:37 PM, Holger Hoffstätte wrote:
>>>> On Wed, 22 Apr 2015 18:09:18 +0200, Lutz Vieweg wrote:
>>>>
>>>>> On 04/13/2015 09:52 PM, Chris Mason wrote:
>>>>>> Large filesystems with lots of block groups can suffer long stalls during
>>>>>> commit while we create and send down all of the block group caches.  The
>>>>>> more blocks groups dirtied in a transaction, the longer these stalls can be.
>>>>>> Some workloads average 10 seconds per commit, but see peak times much higher.
>>>>>
>>>>> Since we see this problem very frequently on some shared development servers,
>>>>> I will try to install this ASAP.
>>>>>
>>>>> Meanwhile, can anybody already tell success stories about successfully removing
>>>>> lags by this patch?
>>>>
>>>> Works fine, but make sure to get the followup patch [1] as well while you're
>>>> at it. I've observed that my (bandwidth-throttled) backups now cause shorter,
>>>> nicely spaced-out blips of activity instead of longer ones when the writeback
>>>> kicks in.
>>>>
>>>> -h
>>>>
>>>> [1] https://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?h=integration-4.1&id=c1e31ffc317e4c28d242b1d961c9c6fe673c0377
>>>>
>>>
>>> Great to hear.  I recommend just using my for-linus-4.1 branch, since it
>>> has all the good things  in one place.
>>
>> Trying the current integration-4.1 branch, I ran into the following
>> during xfstests/btrfs/049:
>>
>
> Ugh, I must not be waiting correctly in one of the inode cache writeout
> sections.  But I've run 049 a whole bunch of times without triggering,
> can you get this to happen consistently?

All the time so far.

>
> -chris
>



-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-23 13:05           ` Filipe David Manana
@ 2015-04-23 15:17             ` Chris Mason
  2015-04-23 15:48               ` Filipe David Manana
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Mason @ 2015-04-23 15:17 UTC (permalink / raw)
  To: Filipe David Manana; +Cc: Holger Hoffstätte, linux-btrfs

On Thu, Apr 23, 2015 at 02:05:48PM +0100, Filipe David Manana wrote:
> >> Trying the current integration-4.1 branch, I ran into the following
> >> during xfstests/btrfs/049:
> >>
> >
> > Ugh, I must not be waiting correctly in one of the inode cache writeout
> > sections.  But I've run 049 a whole bunch of times without triggering,
> > can you get this to happen consistently?
> 
> All the time so far.

I'm testing with this now:

commit 9f433238891b1b243c4f19d3f36eed913b270cbc
Author: Chris Mason <clm@fb.com>
Date:   Thu Apr 23 08:02:49 2015 -0700

    Btrfs: fix inode cache writeout
    
    The code to fix stalls during free space cache IO wasn't using
    the correct root when waiting on the IO for inode caches.  This
    is only a problem when the inode cache is enabled with
    
    mount -o inode_cache
    
    This fixes the inode cache writeout to preserve any error values and
    makes sure not to override the root when inode cache writeout is done.
    
    Reported-by: Filipe Manana <fdmanana@suse.com>
    Signed-off-by: Chris Mason <clm@fb.com>

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5a4f5d1..8cd797f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1149,7 +1149,8 @@ int btrfs_wait_cache_io(struct btrfs_root *root,
 	if (!inode)
 		return 0;
 
-	root = root->fs_info->tree_root;
+	if (block_group)
+		root = root->fs_info->tree_root;
 
 	/* Flush the dirty pages in the cache file. */
 	ret = flush_dirty_cache(inode);
@@ -3465,9 +3466,12 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
 		return 0;
 
+	memset(&io_ctl, 0, sizeof(io_ctl));
 	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
-				      trans, path, 0) ||
-		btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
+				      trans, path, 0);
+	if (!ret)
+		ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
+
 	if (ret) {
 		btrfs_delalloc_release_metadata(inode, inode->i_size);
 #ifdef DEBUG

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-23 15:17             ` Chris Mason
@ 2015-04-23 15:48               ` Filipe David Manana
  2015-04-23 19:43                 ` Filipe David Manana
  0 siblings, 1 reply; 26+ messages in thread
From: Filipe David Manana @ 2015-04-23 15:48 UTC (permalink / raw)
  To: Chris Mason, Filipe David Manana, Holger Hoffstätte, linux-btrfs

On Thu, Apr 23, 2015 at 4:17 PM, Chris Mason <clm@fb.com> wrote:
> On Thu, Apr 23, 2015 at 02:05:48PM +0100, Filipe David Manana wrote:
>> >> Trying the current integration-4.1 branch, I ran into the following
>> >> during xfstests/btrfs/049:
>> >>
>> >
>> > Ugh, I must not be waiting correctly in one of the inode cache writeout
>> > sections.  But I've run 049 a whole bunch of times without triggering,
>> > can you get this to happen consistently?
>>
>> All the time so far.
>
> I'm testing with this now:
>
> commit 9f433238891b1b243c4f19d3f36eed913b270cbc
> Author: Chris Mason <clm@fb.com>
> Date:   Thu Apr 23 08:02:49 2015 -0700
>
>     Btrfs: fix inode cache writeout
>
>     The code to fix stalls during free spache cache IO wasn't using
>     the correct root when waiting on the IO for inode caches.  This
>     is only a problem when the inode cache is enabled with
>
>     mount -o inode_cache
>
>     This fixes the inode cache writeout to preserve any error values and
>     makes sure not to override the root when inode cache writeout is done.
>
>     Reported-by: Filipe Manana <fdmanana@suse.com>
>     Signed-off-by: Chris Mason <clm@fb.com>

Thanks, btrfs/049 now passes with that patch applied.
Running the whole xfstests suite now.

>
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 5a4f5d1..8cd797f 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -1149,7 +1149,8 @@ int btrfs_wait_cache_io(struct btrfs_root *root,
>         if (!inode)
>                 return 0;
>
> -       root = root->fs_info->tree_root;
> +       if (block_group)
> +               root = root->fs_info->tree_root;
>
>         /* Flush the dirty pages in the cache file. */
>         ret = flush_dirty_cache(inode);
> @@ -3465,9 +3466,12 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
>         if (!btrfs_test_opt(root, INODE_MAP_CACHE))
>                 return 0;
>
> +       memset(&io_ctl, 0, sizeof(io_ctl));
>         ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
> -                                     trans, path, 0) ||
> -               btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
> +                                     trans, path, 0);
> +       if (!ret)
> +               ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
> +
>         if (ret) {
>                 btrfs_delalloc_release_metadata(inode, inode->i_size);
>  #ifdef DEBUG



-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-22 16:55     ` Chris Mason
  2015-04-23 12:45       ` Filipe David Manana
@ 2015-04-23 16:34       ` Holger Hoffstätte
  2015-04-23 17:57         ` Chris Mason
  1 sibling, 1 reply; 26+ messages in thread
From: Holger Hoffstätte @ 2015-04-23 16:34 UTC (permalink / raw)
  To: linux-btrfs

On Wed, 22 Apr 2015 12:55:26 -0400, Chris Mason wrote:

> Great to hear.  I recommend just using my for-linus-4.1 branch, since it
> has all the good things  in one place.

Even with the latest patch for xfstests/049 I reproducibly get:

[  405.052123] =============================================================================
[  405.052125] BUG btrfs_transaction (Not tainted): Objects remaining in btrfs_transaction on kmem_cache_close()
[  405.052126] -----------------------------------------------------------------------------
[  405.052127] Disabling lock debugging due to kernel taint
[  405.052129] INFO: Slab 0xffffea000fea1e00 objects=22 used=1 fp=0xffff8803fa879d88 flags=0x20000000004080
[  405.052131] CPU: 0 PID: 3625 Comm: rmmod Tainted: G    B           4.0.0 #1
[  405.052132] Hardware name: System manufacturer System Product Name/P8Z68-V LX, BIOS 4105 07/01/2013
[  405.052133]  ffff88040c03e300 ffff88040cd37d48 ffffffff8154c085 0000000000000007
[  405.052136]  ffffea000fea1e00 ffff88040cd37e18 ffffffff811557a0 0000000000000020
[  405.052137]  ffff88040cd37e28 ffff88040cd37dd8 656a624f00000202 616d657220737463
[  405.052139] Call Trace:
[  405.052145]  [<ffffffff8154c085>] dump_stack+0x45/0x57
[  405.052148]  [<ffffffff811557a0>] slab_err+0xa0/0xb0
[  405.052151]  [<ffffffff8110de00>] ? free_hot_cold_page_list+0x30/0xa0
[  405.052153]  [<ffffffff811561de>] ? __free_slab+0xce/0x1a0
[  405.052155]  [<ffffffff81157d7a>] ? __kmalloc+0x16a/0x1b0
[  405.052163]  [<ffffffff81159462>] ? free_partial+0xc2/0x230
[  405.052165]  [<ffffffff81159482>] free_partial+0xe2/0x230
[  405.052166]  [<ffffffff81156fd0>] ? deactivate_slab+0x580/0x580
[  405.052168]  [<ffffffff811596de>] __kmem_cache_shutdown+0x5e/0xb0
[  405.052170]  [<ffffffff811286b0>] kmem_cache_destroy+0x90/0x110
[  405.052192]  [<ffffffffa04683f1>] btrfs_destroy_cachep+0x41/0x80 [btrfs]
[  405.052197]  [<ffffffffa04d0466>] exit_btrfs_fs+0x9/0x5c [btrfs]
[  405.052199]  [<ffffffff810a69e1>] SyS_delete_module+0x171/0x240
[  405.052202]  [<ffffffff81002ccd>] ? do_notify_resume+0x6d/0x70
[  405.052205]  [<ffffffff81551272>] system_call_fastpath+0x12/0x17
[  405.052207] INFO: Object 0xffff8803fa878168 @offset=360
[  405.052208] kmem_cache_destroy btrfs_transaction: Slab cache still has objects
[  405.052209] CPU: 0 PID: 3625 Comm: rmmod Tainted: G    B           4.0.0 #1
[  405.052210] Hardware name: System manufacturer System Product Name/P8Z68-V LX, BIOS 4105 07/01/2013
[  405.052210]  ffff88040cd37eb8 ffff88040cd37ea8 ffffffff8154c085 0000000000000007
[  405.052212]  ffff88040c03e300 ffff88040cd37ee8 ffffffff81128721 ffff88040cd37eb8
[  405.052213]  ffff88040cd37eb8 0000000000000800 ffffffffa04e9a00 00007ffd13de7932
[  405.052215] Call Trace:
[  405.052217]  [<ffffffff8154c085>] dump_stack+0x45/0x57
[  405.052218]  [<ffffffff81128721>] kmem_cache_destroy+0x101/0x110
[  405.052225]  [<ffffffffa04683f1>] btrfs_destroy_cachep+0x41/0x80 [btrfs]
[  405.052229]  [<ffffffffa04d0466>] exit_btrfs_fs+0x9/0x5c [btrfs]
[  405.052230]  [<ffffffff810a69e1>] SyS_delete_module+0x171/0x240
[  405.052232]  [<ffffffff81002ccd>] ? do_notify_resume+0x6d/0x70
[  405.052234]  [<ffffffff81551272>] system_call_fastpath+0x12/0x17

After a clean unmount (with a fresh space_cache but no inode_cache) & rmmod.
This used to work. :)

-h


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-23 16:34       ` Holger Hoffstätte
@ 2015-04-23 17:57         ` Chris Mason
  0 siblings, 0 replies; 26+ messages in thread
From: Chris Mason @ 2015-04-23 17:57 UTC (permalink / raw)
  To: Holger Hoffstätte, linux-btrfs

On 04/23/2015 12:34 PM, Holger Hoffstätte wrote:
> kmem_cache_destroy btrfs_transaction: Slab cache still has objects

Josef has a fix for this, but the fix was causing other problems.  It's
not a new bug, but we'll get it fixed up.

-chris


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-23 15:48               ` Filipe David Manana
@ 2015-04-23 19:43                 ` Filipe David Manana
  2015-04-23 19:50                   ` Chris Mason
  0 siblings, 1 reply; 26+ messages in thread
From: Filipe David Manana @ 2015-04-23 19:43 UTC (permalink / raw)
  To: Chris Mason, Filipe David Manana, linux-btrfs

On Thu, Apr 23, 2015 at 4:48 PM, Filipe David Manana <fdmanana@gmail.com> wrote:
> On Thu, Apr 23, 2015 at 4:17 PM, Chris Mason <clm@fb.com> wrote:
>> On Thu, Apr 23, 2015 at 02:05:48PM +0100, Filipe David Manana wrote:
>>> >> Trying the current integration-4.1 branch, I ran into the following
>>> >> during xfstests/btrfs/049:
>>> >>
>>> >
>>> > Ugh, I must not be waiting correctly in one of the inode cache writeout
>>> > sections.  But I've run 049 a whole bunch of times without triggering,
>>> > can you get this to happen consistently?
>>>
>>> All the time so far.
>>
>> I'm testing with this now:
>>
>> commit 9f433238891b1b243c4f19d3f36eed913b270cbc
>> Author: Chris Mason <clm@fb.com>
>> Date:   Thu Apr 23 08:02:49 2015 -0700
>>
>>     Btrfs: fix inode cache writeout
>>
>>     The code to fix stalls during free spache cache IO wasn't using
>>     the correct root when waiting on the IO for inode caches.  This
>>     is only a problem when the inode cache is enabled with
>>
>>     mount -o inode_cache
>>
>>     This fixes the inode cache writeout to preserve any error values and
>>     makes sure not to override the root when inode cache writeout is done.
>>
>>     Reported-by: Filipe Manana <fdmanana@suse.com>
>>     Signed-off-by: Chris Mason <clm@fb.com>
>
> Thanks, btrfs/049 now passes with that patch applied.
> Running the whole xfstests suite now.

btrfs/066 also failed once during final fsck with:

_check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent
*** fsck.btrfs output ***
checking extents
checking free space cache
There is no free space entry for 21676032-21680128
There is no free space entry for 21676032-87031808
cache appears valid but isnt 20971520
Checking filesystem on /dev/sdc
UUID: f7785aa7-d5ba-479d-a211-7c31039dc9b1
found 11911316 bytes used err is -22
total csum bytes: 7656
total tree bytes: 454656
total fs tree bytes: 376832
total extent tree bytes: 36864
btree space waste bytes: 122959
file data blocks allocated: 42893312
 referenced 31158272

(it failed like that 1 out of 4 runs)


>
>>
>> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
>> index 5a4f5d1..8cd797f 100644
>> --- a/fs/btrfs/free-space-cache.c
>> +++ b/fs/btrfs/free-space-cache.c
>> @@ -1149,7 +1149,8 @@ int btrfs_wait_cache_io(struct btrfs_root *root,
>>         if (!inode)
>>                 return 0;
>>
>> -       root = root->fs_info->tree_root;
>> +       if (block_group)
>> +               root = root->fs_info->tree_root;
>>
>>         /* Flush the dirty pages in the cache file. */
>>         ret = flush_dirty_cache(inode);
>> @@ -3465,9 +3466,12 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
>>         if (!btrfs_test_opt(root, INODE_MAP_CACHE))
>>                 return 0;
>>
>> +       memset(&io_ctl, 0, sizeof(io_ctl));
>>         ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
>> -                                     trans, path, 0) ||
>> -               btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
>> +                                     trans, path, 0);
>> +       if (!ret)
>> +               ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
>> +
>>         if (ret) {
>>                 btrfs_delalloc_release_metadata(inode, inode->i_size);
>>  #ifdef DEBUG
>
>
>
> --
> Filipe David Manana,
>
> "Reasonable men adapt themselves to the world.
>  Unreasonable men adapt the world to themselves.
>  That's why all progress depends on unreasonable men."



-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-23 19:43                 ` Filipe David Manana
@ 2015-04-23 19:50                   ` Chris Mason
  2015-04-24  6:34                     ` Filipe David Manana
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Mason @ 2015-04-23 19:50 UTC (permalink / raw)
  To: fdmanana, linux-btrfs

On 04/23/2015 03:43 PM, Filipe David Manana wrote:
> On Thu, Apr 23, 2015 at 4:48 PM, Filipe David Manana <fdmanana@gmail.com> wrote:
>> On Thu, Apr 23, 2015 at 4:17 PM, Chris Mason <clm@fb.com> wrote:
>>> On Thu, Apr 23, 2015 at 02:05:48PM +0100, Filipe David Manana wrote:
>>>>>> Trying the current integration-4.1 branch, I ran into the following
>>>>>> during xfstests/btrfs/049:
>>>>>>
>>>>>
>>>>> Ugh, I must not be waiting correctly in one of the inode cache writeout
>>>>> sections.  But I've run 049 a whole bunch of times without triggering,
>>>>> can you get this to happen consistently?
>>>>
>>>> All the time so far.
>>>
>>> I'm testing with this now:
>>>
>>> commit 9f433238891b1b243c4f19d3f36eed913b270cbc
>>> Author: Chris Mason <clm@fb.com>
>>> Date:   Thu Apr 23 08:02:49 2015 -0700
>>>
>>>     Btrfs: fix inode cache writeout
>>>
>>>     The code to fix stalls during free spache cache IO wasn't using
>>>     the correct root when waiting on the IO for inode caches.  This
>>>     is only a problem when the inode cache is enabled with
>>>
>>>     mount -o inode_cache
>>>
>>>     This fixes the inode cache writeout to preserve any error values and
>>>     makes sure not to override the root when inode cache writeout is done.
>>>
>>>     Reported-by: Filipe Manana <fdmanana@suse.com>
>>>     Signed-off-by: Chris Mason <clm@fb.com>
>>
>> Thanks, btrfs/049 now passes with that patch applied.
>> Running the whole xfstests suite now.
> 
> btrfs/066 also failed once during final fsck with:
> 
> _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent
> *** fsck.btrfs output ***
> checking extents
> checking free space cache
> There is no free space entry for 21676032-21680128
> There is no free space entry for 21676032-87031808
> cache appears valid but isnt 20971520

Josef has a btrfs-progs patch for this.  The kernel will toss the cache.
There's a somewhat fundamental race in cache writeout that this patch
makes a little bigger, but it has always been there.

(compare what find_free_extent can do with no trans running vs the
actual cache writeback)
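
(If the stale cache is a nuisance in the meantime, it can also be dropped and rebuilt
explicitly with the long-standing clear_cache mount option; a one-off example, with
placeholder device and mount point:)

  # one-time mount option; the free space cache is discarded and rebuilt
  # as block groups get dirtied again
  mount -o clear_cache /dev/sdc /mnt/scratch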

-chris

> Checking filesystem on /dev/sdc
> UUID: f7785aa7-d5ba-479d-a211-7c31039dc9b1
> found 11911316 bytes used err is -22
> total csum bytes: 7656
> total tree bytes: 454656
> total fs tree bytes: 376832
> total extent tree bytes: 36864
> btree space waste bytes: 122959
> file data blocks allocated: 42893312
>  referenced 31158272
> 
> (it failed like that 1 out of 4 runs)
> 
> 
>>
>>>
>>> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
>>> index 5a4f5d1..8cd797f 100644
>>> --- a/fs/btrfs/free-space-cache.c
>>> +++ b/fs/btrfs/free-space-cache.c
>>> @@ -1149,7 +1149,8 @@ int btrfs_wait_cache_io(struct btrfs_root *root,
>>>         if (!inode)
>>>                 return 0;
>>>
>>> -       root = root->fs_info->tree_root;
>>> +       if (block_group)
>>> +               root = root->fs_info->tree_root;
>>>
>>>         /* Flush the dirty pages in the cache file. */
>>>         ret = flush_dirty_cache(inode);
>>> @@ -3465,9 +3466,12 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
>>>         if (!btrfs_test_opt(root, INODE_MAP_CACHE))
>>>                 return 0;
>>>
>>> +       memset(&io_ctl, 0, sizeof(io_ctl));
>>>         ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
>>> -                                     trans, path, 0) ||
>>> -               btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
>>> +                                     trans, path, 0);
>>> +       if (!ret)
>>> +               ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
>>> +
>>>         if (ret) {
>>>                 btrfs_delalloc_release_metadata(inode, inode->i_size);
>>>  #ifdef DEBUG
>>
>>
>>
>> --
>> Filipe David Manana,
>>
>> "Reasonable men adapt themselves to the world.
>>  Unreasonable men adapt the world to themselves.
>>  That's why all progress depends on unreasonable men."
> 
> 
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-23 19:50                   ` Chris Mason
@ 2015-04-24  6:34                     ` Filipe David Manana
  2015-04-24 13:00                       ` Chris Mason
  2015-04-24 13:33                       ` Chris Mason
  0 siblings, 2 replies; 26+ messages in thread
From: Filipe David Manana @ 2015-04-24  6:34 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-btrfs

On Thu, Apr 23, 2015 at 8:50 PM, Chris Mason <clm@fb.com> wrote:
> On 04/23/2015 03:43 PM, Filipe David Manana wrote:
>> On Thu, Apr 23, 2015 at 4:48 PM, Filipe David Manana <fdmanana@gmail.com> wrote:
>>> On Thu, Apr 23, 2015 at 4:17 PM, Chris Mason <clm@fb.com> wrote:
>>>> On Thu, Apr 23, 2015 at 02:05:48PM +0100, Filipe David Manana wrote:
>>>>>>> Trying the current integration-4.1 branch, I ran into the following
>>>>>>> during xfstests/btrfs/049:
>>>>>>>
>>>>>>
>>>>>> Ugh, I must not be waiting correctly in one of the inode cache writeout
>>>>>> sections.  But I've run 049 a whole bunch of times without triggering,
>>>>>> can you get this to happen consistently?
>>>>>
>>>>> All the time so far.
>>>>
>>>> I'm testing with this now:
>>>>
>>>> commit 9f433238891b1b243c4f19d3f36eed913b270cbc
>>>> Author: Chris Mason <clm@fb.com>
>>>> Date:   Thu Apr 23 08:02:49 2015 -0700
>>>>
>>>>     Btrfs: fix inode cache writeout
>>>>
>>>>     The code to fix stalls during free spache cache IO wasn't using
>>>>     the correct root when waiting on the IO for inode caches.  This
>>>>     is only a problem when the inode cache is enabled with
>>>>
>>>>     mount -o inode_cache
>>>>
>>>>     This fixes the inode cache writeout to preserve any error values and
>>>>     makes sure not to override the root when inode cache writeout is done.
>>>>
>>>>     Reported-by: Filipe Manana <fdmanana@suse.com>
>>>>     Signed-off-by: Chris Mason <clm@fb.com>
>>>
>>> Thanks, btrfs/049 now passes with that patch applied.
>>> Running the whole xfstests suite now.
>>
>> btrfs/066 also failed once during final fsck with:
>>
>> _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent
>> *** fsck.btrfs output ***
>> checking extents
>> checking free space cache
>> There is no free space entry for 21676032-21680128
>> There is no free space entry for 21676032-87031808
>> cache appears valid but isnt 20971520
>
> Josef has a btrfs-progs patch for this.  The kernel will toss the cache.
>  There's a somewhat fundamental race in cache writeout this patch makes
> a little bigger, but it has always been there.
>
> (compare what find_free_extent can do with no trans running vs the
> actual cache writeback)

There's also one list corruption I didn't get before, which happened
while running fsstress (btrfs/078), apparently due to some race:

[25590.799058] ------------[ cut here ]------------
[25590.800204] WARNING: CPU: 3 PID: 7280 at lib/list_debug.c:62
__list_del_entry+0x5a/0x98()
[25590.802101] list_del corruption. next->prev should be
ffff8801a0f74d50, but was a56b6b6b6b6b6b6b
[25590.804236] Modules linked in: btrfs dm_flakey dm_mod
crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs
lockd grace fscache sunrpc loop fuse i2c_piix4 i2c_core psmouse
serio_raw evdev parport_pc parport acpi_cpufreq processor button
pcspkr thermal_sys microcode ext4 crc16 jbd2 mbcache sd_mod sg sr_mod
cdrom virtio_scsi ata_generic virtio_pci virtio_ring ata_piix e1000
virtio libata floppy scsi_mod [last unloaded: btrfs]
[25590.818580] CPU: 3 PID: 7280 Comm: fsstress Tainted: G        W
  4.0.0-rc5-btrfs-next-9+ #1
[25590.820597] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org
04/01/2014
[25590.823458]  0000000000000009 ffff8803f031bc08 ffffffff8142fa46
ffffffff8108b6a2
[25590.825081]  ffff8803f031bc58 ffff8803f031bc48 ffffffff81045ea5
0000000000000011
[25590.826568]  ffffffff81245af7 ffff8801a0f74d50 ffff8801a0f74460
ffff880041710df0
[25590.828106] Call Trace:
[25590.828630]  [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[25590.829706]  [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[25590.830785]  [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[25590.831957]  [<ffffffff81245af7>] ? __list_del_entry+0x5a/0x98
[25590.867473]  [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[25590.868631]  [<ffffffffa04907e2>] ? btrfs_csum_data+0x16/0x18 [btrfs]
[25590.869524]  [<ffffffff81245af7>] __list_del_entry+0x5a/0x98
[25590.870918]  [<ffffffffa04cff5f>] write_bitmap_entries+0x99/0xbd [btrfs]
[25590.872377]  [<ffffffffa04d0858>] ?
__btrfs_write_out_cache.isra.21+0x20b/0x3a1 [btrfs]
[25590.874079]  [<ffffffffa04d0864>]
__btrfs_write_out_cache.isra.21+0x217/0x3a1 [btrfs]
[25590.875594]  [<ffffffffa04d1269>] ? btrfs_write_out_cache+0x41/0xdc [btrfs]
[25590.877032]  [<ffffffffa04d12bb>] btrfs_write_out_cache+0x93/0xdc [btrfs]
[25590.878406]  [<ffffffffa04888e7>] ?
btrfs_start_dirty_block_groups+0x156/0x29b [btrfs]
[25590.879859]  [<ffffffffa0488977>]
btrfs_start_dirty_block_groups+0x1e6/0x29b [btrfs]
[25590.881360]  [<ffffffffa04970f2>]
btrfs_commit_transaction+0x130/0x9c9 [btrfs]
[25590.882504]  [<ffffffffa0471748>] btrfs_sync_fs+0xe1/0x12d [btrfs]
[25590.883600]  [<ffffffff811569d3>] ? iterate_supers+0x60/0xc4
[25590.884671]  [<ffffffff8117acda>] ? vfs_fsync+0x1e/0x1e
[25590.885640]  [<ffffffff8117acfa>] sync_fs_one_sb+0x20/0x22
[25590.886619]  [<ffffffff811569e9>] iterate_supers+0x76/0xc4
[25590.887586]  [<ffffffff8117af58>] sys_sync+0x55/0x83
[25590.888534]  [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[25590.889694] ---[ end trace d1235b0201a01949 ]---
[25590.890573] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[25590.891870] Modules linked in: btrfs dm_flakey dm_mod
crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs
lockd grace fscache sunrpc loop fuse i2c_piix4 i2c_core psmouse
serio_raw evdev parport_pc parport acpi_cpufreq processor button
pcspkr thermal_sys microcode ext4 crc16 jbd2 mbcache sd_mod sg sr_mod
cdrom virtio_scsi ata_generic virtio_pci virtio_ring ata_piix e1000
virtio libata floppy scsi_mod [last unloaded: btrfs]
[25590.892522] CPU: 3 PID: 7280 Comm: fsstress Tainted: G        W
  4.0.0-rc5-btrfs-next-9+ #1
[25590.892522] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org
04/01/2014
[25590.892522] task: ffff88040f485bd0 ti: ffff8803f0318000 task.ti:
ffff8803f0318000
[25590.892522] RIP: 0010:[<ffffffffa04cff2c>]  [<ffffffffa04cff2c>]
write_bitmap_entries+0x66/0xbd [btrfs]
[25590.892522] RSP: 0018:ffff8803f031bcc8  EFLAGS: 00010246
[25590.892522] RAX: 0000000000000011 RBX: ffff8801a0f74460 RCX: 0000000000000400
[25590.892522] RDX: ffff8804062e8000 RSI: 6b6b6b6b6b6b6b6b RDI: ffff8804062e8000
[25590.892522] RBP: ffff8803f031bcf8 R08: 0000000000000001 R09: 0000000000000000
[25590.892522] R10: 0000000000000000 R11: ffffffff8165a180 R12: 6b6b6b6b6b6b6b6b
[25590.892522] R13: ffff880041710df0 R14: 6b6b6b6b6b6b6b6b R15: ffff8803f031bd38
[25590.892522] FS:  00007f6a2fd9f700(0000) GS:ffff88043dd80000(0000)
knlGS:0000000000000000
[25590.892522] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[25590.892522] CR2: 00007f6a280d9ad8 CR3: 00000003c65c5000 CR4: 00000000000006e0
[25590.892522] Stack:
[25590.892522]  ffffffffa04d0858 ffff880041710df0 ffff880041710c00
ffff88033ba59c90
[25590.892522]  0000000000000000 ffff8803efb6ff70 ffff8803f031bd78
ffffffffa04d0864
[25590.892522]  ffff88037cae7f40 ffff88033ba599d0 ffff880226cd4000
ffff8803efb6ff00
[25590.892522] Call Trace:
[25590.892522]  [<ffffffffa04d0858>] ?
__btrfs_write_out_cache.isra.21+0x20b/0x3a1 [btrfs]
[25590.892522]  [<ffffffffa04d0864>]
__btrfs_write_out_cache.isra.21+0x217/0x3a1 [btrfs]
[25590.892522]  [<ffffffffa04d1269>] ? btrfs_write_out_cache+0x41/0xdc [btrfs]
[25590.892522]  [<ffffffffa04d12bb>] btrfs_write_out_cache+0x93/0xdc [btrfs]
[25590.892522]  [<ffffffffa04888e7>] ?
btrfs_start_dirty_block_groups+0x156/0x29b [btrfs]
[25590.892522]  [<ffffffffa0488977>]
btrfs_start_dirty_block_groups+0x1e6/0x29b [btrfs]
[25590.892522]  [<ffffffffa04970f2>]
btrfs_commit_transaction+0x130/0x9c9 [btrfs]
[25590.892522]  [<ffffffffa0471748>] btrfs_sync_fs+0xe1/0x12d [btrfs]
[25590.892522]  [<ffffffff811569d3>] ? iterate_supers+0x60/0xc4
[25590.892522]  [<ffffffff8117acda>] ? vfs_fsync+0x1e/0x1e
[25590.892522]  [<ffffffff8117acfa>] sync_fs_one_sb+0x20/0x22
[25590.892522]  [<ffffffff811569e9>] iterate_supers+0x76/0xc4
[25590.892522]  [<ffffffff8117af58>] sys_sync+0x55/0x83
[25590.892522]  [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[25590.892522] Code: 4c 89 ef 8d 70 ff e8 a4 ec ff ff 41 8b 45 3c 41
39 45 38 7d 5c 31 f6 4c 89 ef e8 a9 f5 ff ff 49 8b 7d 00 4c 89 f6 b9
00 04 00 00 <f3> a5 4c 89 ef 41 8b 45 38 8d 70 ff e8 73 ec ff ff 41 8b
45 3c
[25590.892522] RIP  [<ffffffffa04cff2c>] write_bitmap_entries+0x66/0xbd [btrfs]
[25590.892522]  RSP <ffff8803f031bcc8>
[25590.951443] ---[ end trace d1235b0201a0194a ]---
[25590.958107] BUG: sleeping function called from invalid context at
kernel/locking/rwsem.c:41
[25590.959805] in_atomic(): 1, irqs_disabled(): 0, pid: 7280, name: fsstress
[25590.961219] INFO: lockdep is turned off.
[25590.962095] Preemption disabled at:[<ffffffffa04d0858>]
__btrfs_write_out_cache.isra.21+0x20b/0x3a1 [btrfs]
[25590.964190]
[25590.964675] CPU: 3 PID: 7280 Comm: fsstress Tainted: G      D W
  4.0.0-rc5-btrfs-next-9+ #1
[25590.966441] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org
04/01/2014
[25590.969050]  0000000000001c70 ffff8803f031ba28 ffffffff8142fa46
ffffffff8108b6a2
[25590.971014]  0000000000000000 ffff8803f031ba58 ffffffff8106674b
00007ffffffff000
[25590.973141]  ffffffff817d1e02 0000000000000029 0000000000000000
ffff8803f031ba88
[25590.975044] Call Trace:
[25590.975635]  [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[25590.976733]  [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[25590.977956]  [<ffffffff8106674b>] ___might_sleep+0x1b9/0x1c1
[25590.979252]  [<ffffffff810667f3>] __might_sleep+0xa0/0xa8
[25590.989557]  [<ffffffff81433ce9>] down_read+0x21/0x50
[25590.990930]  [<ffffffff810528fb>] exit_signals+0x26/0x11a
[25590.992152]  [<ffffffff810603d0>] ? blocking_notifier_call_chain+0x14/0x16
[25591.002069]  [<ffffffff8104727e>] do_exit+0x128/0x9cb
[25591.002719]  [<ffffffff8107b024>] ? arch_local_irq_save+0x9/0xc
[25591.003542]  [<ffffffff8108c36d>] ? kmsg_dump+0xd2/0xf8
[25591.004677]  [<ffffffff8108c38a>] ? kmsg_dump+0xef/0xf8
[25591.005763]  [<ffffffff8100594c>] oops_end+0xa6/0xae
[25591.006797]  [<ffffffff81005dd1>] die+0x5a/0x63
[25591.007760]  [<ffffffff81002dc0>] do_general_protection+0x97/0x142
[25591.009032]  [<ffffffff81437812>] general_protection+0x22/0x30
[25591.010245]  [<ffffffffa04cff2c>] ? write_bitmap_entries+0x66/0xbd [btrfs]
[25591.012880]  [<ffffffffa04cff5f>] ? write_bitmap_entries+0x99/0xbd [btrfs]
[25591.013686]  [<ffffffffa04d0858>] ?
__btrfs_write_out_cache.isra.21+0x20b/0x3a1 [btrfs]
[25591.014709]  [<ffffffffa04d0864>]
__btrfs_write_out_cache.isra.21+0x217/0x3a1 [btrfs]
[25591.016398]  [<ffffffffa04d1269>] ? btrfs_write_out_cache+0x41/0xdc [btrfs]
[25591.017207]  [<ffffffffa04d12bb>] btrfs_write_out_cache+0x93/0xdc [btrfs]
[25591.018002]  [<ffffffffa04888e7>] ?
btrfs_start_dirty_block_groups+0x156/0x29b [btrfs]
[25591.019000]  [<ffffffffa0488977>]
btrfs_start_dirty_block_groups+0x1e6/0x29b [btrfs]
[25591.019980]  [<ffffffffa04970f2>]
btrfs_commit_transaction+0x130/0x9c9 [btrfs]
[25591.020944]  [<ffffffffa0471748>] btrfs_sync_fs+0xe1/0x12d [btrfs]
[25591.021679]  [<ffffffff811569d3>] ? iterate_supers+0x60/0xc4
[25591.022693]  [<ffffffff8117acda>] ? vfs_fsync+0x1e/0x1e
[25591.023767]  [<ffffffff8117acfa>] sync_fs_one_sb+0x20/0x22
[25591.024931]  [<ffffffff811569e9>] iterate_supers+0x76/0xc4
[25591.026050]  [<ffffffff8117af58>] sys_sync+0x55/0x83
[25591.027084]  [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
(....)


>
> -chris
>
>> Checking filesystem on /dev/sdc
>> UUID: f7785aa7-d5ba-479d-a211-7c31039dc9b1
>> found 11911316 bytes used err is -22
>> total csum bytes: 7656
>> total tree bytes: 454656
>> total fs tree bytes: 376832
>> total extent tree bytes: 36864
>> btree space waste bytes: 122959
>> file data blocks allocated: 42893312
>>  referenced 31158272
>>
>> (it failed like that 1 out of 4 runs)
>>
>>
>>>
>>>>
>>>> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
>>>> index 5a4f5d1..8cd797f 100644
>>>> --- a/fs/btrfs/free-space-cache.c
>>>> +++ b/fs/btrfs/free-space-cache.c
>>>> @@ -1149,7 +1149,8 @@ int btrfs_wait_cache_io(struct btrfs_root *root,
>>>>         if (!inode)
>>>>                 return 0;
>>>>
>>>> -       root = root->fs_info->tree_root;
>>>> +       if (block_group)
>>>> +               root = root->fs_info->tree_root;
>>>>
>>>>         /* Flush the dirty pages in the cache file. */
>>>>         ret = flush_dirty_cache(inode);
>>>> @@ -3465,9 +3466,12 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
>>>>         if (!btrfs_test_opt(root, INODE_MAP_CACHE))
>>>>                 return 0;
>>>>
>>>> +       memset(&io_ctl, 0, sizeof(io_ctl));
>>>>         ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
>>>> -                                     trans, path, 0) ||
>>>> -               btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
>>>> +                                     trans, path, 0);
>>>> +       if (!ret)
>>>> +               ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
>>>> +
>>>>         if (ret) {
>>>>                 btrfs_delalloc_release_metadata(inode, inode->i_size);
>>>>  #ifdef DEBUG
>>>
>>>
>>>
>>> --
>>> Filipe David Manana,
>>>
>>> "Reasonable men adapt themselves to the world.
>>>  Unreasonable men adapt the world to themselves.
>>>  That's why all progress depends on unreasonable men."
>>
>>
>>
>



-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-24  6:34                     ` Filipe David Manana
@ 2015-04-24 13:00                       ` Chris Mason
  2015-04-24 13:43                         ` Filipe David Manana
  2015-04-24 13:33                       ` Chris Mason
  1 sibling, 1 reply; 26+ messages in thread
From: Chris Mason @ 2015-04-24 13:00 UTC (permalink / raw)
  To: fdmanana; +Cc: linux-btrfs

On 04/24/2015 02:34 AM, Filipe David Manana wrote:
> On Thu, Apr 23, 2015 at 8:50 PM, Chris Mason <clm@fb.com> wrote:
>> On 04/23/2015 03:43 PM, Filipe David Manana wrote:
>>> On Thu, Apr 23, 2015 at 4:48 PM, Filipe David Manana <fdmanana@gmail.com> wrote:
>>>> On Thu, Apr 23, 2015 at 4:17 PM, Chris Mason <clm@fb.com> wrote:
>>>>> On Thu, Apr 23, 2015 at 02:05:48PM +0100, Filipe David Manana wrote:
>>>>>>>> Trying the current integration-4.1 branch, I ran into the following
>>>>>>>> during xfstests/btrfs/049:
>>>>>>>>
>>>>>>>
>>>>>>> Ugh, I must not be waiting correctly in one of the inode cache writeout
>>>>>>> sections.  But I've run 049 a whole bunch of times without triggering,
>>>>>>> can you get this to happen consistently?
>>>>>>
>>>>>> All the time so far.
>>>>>
>>>>> I'm testing with this now:
>>>>>
>>>>> commit 9f433238891b1b243c4f19d3f36eed913b270cbc
>>>>> Author: Chris Mason <clm@fb.com>
>>>>> Date:   Thu Apr 23 08:02:49 2015 -0700
>>>>>
>>>>>     Btrfs: fix inode cache writeout
>>>>>
>>>>>     The code to fix stalls during free spache cache IO wasn't using
>>>>>     the correct root when waiting on the IO for inode caches.  This
>>>>>     is only a problem when the inode cache is enabled with
>>>>>
>>>>>     mount -o inode_cache
>>>>>
>>>>>     This fixes the inode cache writeout to preserve any error values and
>>>>>     makes sure not to override the root when inode cache writeout is done.
>>>>>
>>>>>     Reported-by: Filipe Manana <fdmanana@suse.com>
>>>>>     Signed-off-by: Chris Mason <clm@fb.com>
>>>>
>>>> Thanks, btrfs/049 now passes with that patch applied.
>>>> Running the whole xfstests suite now.
>>>
>>> btrfs/066 also failed once during final fsck with:
>>>
>>> _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent
>>> *** fsck.btrfs output ***
>>> checking extents
>>> checking free space cache
>>> There is no free space entry for 21676032-21680128
>>> There is no free space entry for 21676032-87031808
>>> cache appears valid but isnt 20971520
>>
>> Josef has a btrfs-progs patch for this.  The kernel will toss the cache.
>>  There's a somewhat fundamental race in cache writeout this patch makes
>> a little bigger, but it has always been there.
>>
>> (compare what find_free_extent can do with no trans running vs the
>> actual cache writeback)
> 
> There's also one list corruption I didn't get before and happened
> while running fsstress (btrfs/078), apparently due to some race:

Can you please bang on this and get a more reliable reproduction? I'll
take a look.
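
(A low-tech way to do that is to just loop the test from an xfstests checkout until
it trips, along these lines:)

  # stop at the first failing iteration
  while ./check btrfs/078; do :; done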

-chris


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-24  6:34                     ` Filipe David Manana
  2015-04-24 13:00                       ` Chris Mason
@ 2015-04-24 13:33                       ` Chris Mason
  1 sibling, 0 replies; 26+ messages in thread
From: Chris Mason @ 2015-04-24 13:33 UTC (permalink / raw)
  To: Filipe David Manana; +Cc: linux-btrfs

On Fri, Apr 24, 2015 at 07:34:43AM +0100, Filipe David Manana wrote:
> There's also one list corruption I didn't get before and happened
> while running fsstress (btrfs/078), apparently due to some race:
> 
> [25590.799058] ------------[ cut here ]------------
> [25590.800204] WARNING: CPU: 3 PID: 7280 at lib/list_debug.c:62
> __list_del_entry+0x5a/0x98()
> [25590.802101] list_del corruption. next->prev should be
> ffff8801a0f74d50, but was a56b6b6b6b6b6b6b
> [25590.804236] Modules linked in: btrfs dm_flakey dm_mod
> crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs
> lockd grace fscache sunrpc loop fuse i2c_piix4 i2c_core psmouse
> serio_raw evdev parport_pc parport acpi_cpufreq processor button
> pcspkr thermal_sys microcode ext4 crc16 jbd2 mbcache sd_mod sg sr_mod
> cdrom virtio_scsi ata_generic virtio_pci virtio_ring ata_piix e1000
> virtio libata floppy scsi_mod [last unloaded: btrfs]
> [25590.818580] CPU: 3 PID: 7280 Comm: fsstress Tainted: G        W
>   4.0.0-rc5-btrfs-next-9+ #1
> [25590.820597] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org
> 04/01/2014
> [25590.823458]  0000000000000009 ffff8803f031bc08 ffffffff8142fa46
> ffffffff8108b6a2
> [25590.825081]  ffff8803f031bc58 ffff8803f031bc48 ffffffff81045ea5
> 0000000000000011
> [25590.826568]  ffffffff81245af7 ffff8801a0f74d50 ffff8801a0f74460
> ffff880041710df0
> [25590.828106] Call Trace:
> [25590.828630]  [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
> [25590.829706]  [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
> [25590.830785]  [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
> [25590.831957]  [<ffffffff81245af7>] ? __list_del_entry+0x5a/0x98
> [25590.867473]  [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
> [25590.868631]  [<ffffffffa04907e2>] ? btrfs_csum_data+0x16/0x18 [btrfs]
> [25590.869524]  [<ffffffff81245af7>] __list_del_entry+0x5a/0x98
> [25590.870918]  [<ffffffffa04cff5f>] write_bitmap_entries+0x99/0xbd [btrfs]

Can you please check which source line write_bitmap_entries+0x99
corresponds to?

I'm hoping this will fix it:

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d773f22..5c7746f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1119,18 +1119,21 @@ static int flush_dirty_cache(struct inode *inode)
 }
 
 static void noinline_for_stack
-cleanup_write_cache_enospc(struct inode *inode,
+cleanup_write_cache_enospc(struct btrfs_free_space_ctl *ctl,
+			   struct inode *inode,
 			   struct btrfs_io_ctl *io_ctl,
 			   struct extent_state **cached_state,
 			   struct list_head *bitmap_list)
 {
 	struct list_head *pos, *n;
 
+	spin_lock(&ctl->tree_lock);
 	list_for_each_safe(pos, n, bitmap_list) {
 		struct btrfs_free_space *entry =
 			list_entry(pos, struct btrfs_free_space, list);
 		list_del_init(&entry->list);
 	}
+	spin_unlock(&ctl->tree_lock);
 	io_ctl_drop_pages(io_ctl);
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
 			     i_size_read(inode) - 1, cached_state,
@@ -1345,7 +1348,8 @@ out:
 	return ret;
 
 out_nospc:
-	cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
+	cleanup_write_cache_enospc(ctl, inode, io_ctl,
+				   &cached_state, &bitmap_list);
 
 	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
 		up_write(&block_group->data_rwsem);


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-24 13:00                       ` Chris Mason
@ 2015-04-24 13:43                         ` Filipe David Manana
  2015-04-24 13:55                           ` Chris Mason
  0 siblings, 1 reply; 26+ messages in thread
From: Filipe David Manana @ 2015-04-24 13:43 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-btrfs

On Fri, Apr 24, 2015 at 2:00 PM, Chris Mason <clm@fb.com> wrote:
> On 04/24/2015 02:34 AM, Filipe David Manana wrote:
>> On Thu, Apr 23, 2015 at 8:50 PM, Chris Mason <clm@fb.com> wrote:
>>> On 04/23/2015 03:43 PM, Filipe David Manana wrote:
>>>> On Thu, Apr 23, 2015 at 4:48 PM, Filipe David Manana <fdmanana@gmail.com> wrote:
>>>>> On Thu, Apr 23, 2015 at 4:17 PM, Chris Mason <clm@fb.com> wrote:
>>>>>> On Thu, Apr 23, 2015 at 02:05:48PM +0100, Filipe David Manana wrote:
>>>>>>>>> Trying the current integration-4.1 branch, I ran into the following
>>>>>>>>> during xfstests/btrfs/049:
>>>>>>>>>
>>>>>>>>
>>>>>>>> Ugh, I must not be waiting correctly in one of the inode cache writeout
>>>>>>>> sections.  But I've run 049 a whole bunch of times without triggering,
>>>>>>>> can you get this to happen consistently?
>>>>>>>
>>>>>>> All the time so far.
>>>>>>
>>>>>> I'm testing with this now:
>>>>>>
>>>>>> commit 9f433238891b1b243c4f19d3f36eed913b270cbc
>>>>>> Author: Chris Mason <clm@fb.com>
>>>>>> Date:   Thu Apr 23 08:02:49 2015 -0700
>>>>>>
>>>>>>     Btrfs: fix inode cache writeout
>>>>>>
>>>>>>     The code to fix stalls during free spache cache IO wasn't using
>>>>>>     the correct root when waiting on the IO for inode caches.  This
>>>>>>     is only a problem when the inode cache is enabled with
>>>>>>
>>>>>>     mount -o inode_cache
>>>>>>
>>>>>>     This fixes the inode cache writeout to preserve any error values and
>>>>>>     makes sure not to override the root when inode cache writeout is done.
>>>>>>
>>>>>>     Reported-by: Filipe Manana <fdmanana@suse.com>
>>>>>>     Signed-off-by: Chris Mason <clm@fb.com>
>>>>>
>>>>> Thanks, btrfs/049 now passes with that patch applied.
>>>>> Running the whole xfstests suite now.
>>>>
>>>> btrfs/066 also failed once during final fsck with:
>>>>
>>>> _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent
>>>> *** fsck.btrfs output ***
>>>> checking extents
>>>> checking free space cache
>>>> There is no free space entry for 21676032-21680128
>>>> There is no free space entry for 21676032-87031808
>>>> cache appears valid but isnt 20971520
>>>
>>> Josef has a btrfs-progs patch for this.  The kernel will toss the cache.
>>>  There's a somewhat fundamental race in cache writeout this patch makes
>>> a little bigger, but it has always been there.
>>>
>>> (compare what find_free_extent can do with no trans running vs the
>>> actual cache writeback)
>>
>> There's also one list corruption I didn't get before and happened
>> while running fsstress (btrfs/078), apparently due to some race:
>
> Can you please bang on this and get a more reliable reproduction? I'll
> take a look.

Not really that easy to get a more reliable reproducer - just run
fsstress with multiple processes - it already happened twice again
after I sent the previous mail.
From the quick look I had at this, this seems to be the change causing
the problem:

http://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?h=for-linus-4.1&id=1bbc621ef28462456131c035eaeb5567a1a2a2fe

Early in btrfs_commit_transaction(), btrfs_start_dirty_block_groups()
is called which ends up calling __btrfs_write_out_cache() for each
dirty block group, which collects all the bitmap entries from the bg's
space cache into a local list while holding the cache's ctl->tree_lock
(to serialize with concurrent allocation requests).

Then we unlock ctl->tree_lock, do other stuff and later acquire
ctl->tree_lock again and call write_bitmap_entries() to write the
bitmap entries we previously collected. However, while we were doing
the other stuff without holding that lock, allocation requests might
have happened right? - since when we call
btrfs_start_dirty_block_groups() in btrfs_commit_transaction() the
transaction state wasn't yet changed, allowing other tasks to join the
current transaction. If such other task allocates all the remaining
space from a bitmap entry we collected before (because it's still in
the space cache's rbtree), it ends up deleting it and freeing its
->bitmap member, which results in an invalid memory access (and the
warning on the list corruption) when we later call
write_bitmap_entries() in __btrfs_write_out_cache() - which is what
the second part of the trace I sent says:


[25590.890573] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
(...)
[25590.892522] CPU: 3 PID: 7280 Comm: fsstress Tainted: G        W
  4.0.0-rc5-btrfs-next-9+ #1
[25590.892522] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org
04/01/2014
[25590.892522] task: ffff88040f485bd0 ti: ffff8803f0318000 task.ti:
ffff8803f0318000
[25590.892522] RIP: 0010:[<ffffffffa04cff2c>]  [<ffffffffa04cff2c>]
write_bitmap_entries+0x66/0xbd [btrfs]
(...)
[25590.892522] Stack:
[25590.892522]  ffffffffa04d0858 ffff880041710df0 ffff880041710c00
ffff88033ba59c90
[25590.892522]  0000000000000000 ffff8803efb6ff70 ffff8803f031bd78
ffffffffa04d0864
[25590.892522]  ffff88037cae7f40 ffff88033ba599d0 ffff880226cd4000
ffff8803efb6ff00
[25590.892522] Call Trace:
[25590.892522]  [<ffffffffa04d0858>] ?
__btrfs_write_out_cache.isra.21+0x20b/0x3a1 [btrfs]
[25590.892522]  [<ffffffffa04d0864>]
__btrfs_write_out_cache.isra.21+0x217/0x3a1 [btrfs]
[25590.892522]  [<ffffffffa04d1269>] ? btrfs_write_out_cache+0x41/0xdc [btrfs]
[25590.892522]  [<ffffffffa04d12bb>] btrfs_write_out_cache+0x93/0xdc [btrfs]
[25590.892522]  [<ffffffffa04888e7>] ?
btrfs_start_dirty_block_groups+0x156/0x29b [btrfs]
[25590.892522]  [<ffffffffa0488977>]
btrfs_start_dirty_block_groups+0x1e6/0x29b [btrfs]
[25590.892522]  [<ffffffffa04970f2>]
btrfs_commit_transaction+0x130/0x9c9 [btrfs]

And write_bitmap_entries+0x66 corresponds to:

static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
(...)
memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
(...)
}

I.e. the ->bitmap field of an entry we pass to write_bitmap_entries().
So releasing the tree_lock after collecting the bitmaps and before
writing them out seems to be what leads to the race and the invalid
memory access. But I might be wrong, I haven't spent more than 5
minutes analyzing it.
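
To make that window concrete, here is a rough userspace sketch of the
locking pattern described above (illustrative only: the names are made
up, a plain linked list stands in for the rbtree, and a pthread mutex
stands in for ctl->tree_lock):

#include <pthread.h>
#include <stdio.h>

/* Illustrative stand-ins, not btrfs structures. */
struct fake_entry {
	const char *bitmap;
	struct fake_entry *tree_next;	/* link in the "free space tree" */
	struct fake_entry *list_next;	/* link in the private bitmap_list */
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static void write_out_cache(struct fake_entry *cache)
{
	struct fake_entry *bitmap_list = NULL;
	struct fake_entry *e;

	/* Phase 1: collect the bitmap entries into a private list. */
	pthread_mutex_lock(&tree_lock);
	for (e = cache; e; e = e->tree_next) {
		e->list_next = bitmap_list;
		bitmap_list = e;
	}
	pthread_mutex_unlock(&tree_lock);

	/*
	 * Race window: with the lock dropped and the transaction still
	 * open, another task can allocate the rest of a collected bitmap
	 * and free its entry, leaving a dangling pointer on bitmap_list.
	 */

	/* Phase 2: re-take the lock and write the collected bitmaps. */
	pthread_mutex_lock(&tree_lock);
	for (e = bitmap_list; e; e = e->list_next)
		printf("writing %s\n", e->bitmap);
	pthread_mutex_unlock(&tree_lock);
}

int main(void)
{
	struct fake_entry b = { "bitmap B", NULL, NULL };
	struct fake_entry a = { "bitmap A", &b, NULL };

	write_out_cache(&a);
	return 0;
}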


>
> -chris
>



-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-24 13:43                         ` Filipe David Manana
@ 2015-04-24 13:55                           ` Chris Mason
  2015-04-24 15:05                             ` Filipe David Manana
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Mason @ 2015-04-24 13:55 UTC (permalink / raw)
  To: fdmanana; +Cc: linux-btrfs

On 04/24/2015 09:43 AM, Filipe David Manana wrote:
> On Fri, Apr 24, 2015 at 2:00 PM, Chris Mason <clm@fb.com> wrote:

>> Can you please bang on this and get a more reliable reproduction? I'll
>> take a look.
> 
> Not really that easy to get a more reliable reproducer - just run
> fsstress with multiple processes - it already happened twice again
> after I sent the previous mail.
> From the quick look I had at this, this seems to be the change causing
> the problem:
> 
> http://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?h=for-linus-4.1&id=1bbc621ef28462456131c035eaeb5567a1a2a2fe
> 
> Early in btrfs_commit_transaction(), btrfs_start_dirty_block_groups()
> is called which ends up calling __btrfs_write_out_cache() for each
> dirty block group, which collects all the bitmap entries from the bg's
> space cache into a local list while holding the cache's ctl->tree_lock
> (to serialize with concurrent allocation requests).
> 
> Then we unlock ctl->tree_lock, do other stuff and later acquire
> ctl->tree_lock again and call write_bitmap_entries() to write the
> bitmap entries we previously collected. However, while we were doing
> the other stuff without holding that lock, allocation requests might
> have happened right? - since when we call
> btrfs_start_dirty_block_groups() in btrfs_commit_transaction() the
> transaction state wasn't yet changed, allowing other tasks to join the
> current transaction. If such other task allocates all the remaining
> space from a bitmap entry we collected before (because it's still in
> the space cache's rbtree), it ends up deleting it and freeing its
> ->bitmap member, which results in an invalid memory access (and the
> warning on the list corruption) when we later call
> write_bitmap_entries() in __btrfs_write_out_cache() - which is what
> the second part of the trace I sent says:

It's easy to hold the ctl->tree_lock from collection through writeout,
but everyone deleting items is using list_del_init, so it should be
fine to take the lock again and run through any items that are left.
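
The list_del_init() detail matters because a removed entry is pointed
back at itself, so a later walk of bitmap_list under the re-taken lock
simply never visits it.  Here's a minimal userspace sketch of that
behaviour (a simplified re-implementation for illustration, not the
real <linux/list.h>):

#include <stdio.h>

/* Simplified re-implementation of the kernel list helpers. */
struct list_head {
	struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *h)
{
	h->next = h;
	h->prev = h;
}

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del_init(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	INIT_LIST_HEAD(entry);	/* entry now points at itself */
}

int main(void)
{
	struct list_head bitmap_list, a, b, c, *pos;

	INIT_LIST_HEAD(&bitmap_list);
	list_add(&a, &bitmap_list);
	list_add(&b, &bitmap_list);
	list_add(&c, &bitmap_list);

	/* Some other path drops one entry with list_del_init()... */
	list_del_init(&b);

	/* ...and a later walk under the lock only sees what's left. */
	for (pos = bitmap_list.next; pos != &bitmap_list; pos = pos->next)
		printf("still on bitmap_list: %s\n",
		       pos == &a ? "a" : pos == &c ? "c" : "?");
	return 0;
}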

Here's a replacement incremental that'll cover both cases:


diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d773f22..657a8ec 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1119,18 +1119,21 @@ static int flush_dirty_cache(struct inode *inode)
 }

 static void noinline_for_stack
-cleanup_write_cache_enospc(struct inode *inode,
+cleanup_write_cache_enospc(struct btrfs_free_space_ctl *ctl,
+			   struct inode *inode,
 			   struct btrfs_io_ctl *io_ctl,
 			   struct extent_state **cached_state,
 			   struct list_head *bitmap_list)
 {
 	struct list_head *pos, *n;

+	spin_lock(&ctl->tree_lock);
 	list_for_each_safe(pos, n, bitmap_list) {
 		struct btrfs_free_space *entry =
 			list_entry(pos, struct btrfs_free_space, list);
 		list_del_init(&entry->list);
 	}
+	spin_unlock(&ctl->tree_lock);
 	io_ctl_drop_pages(io_ctl);
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
 			     i_size_read(inode) - 1, cached_state,
@@ -1266,8 +1269,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	ret = write_cache_extent_entries(io_ctl, ctl,
 					 block_group, &entries, &bitmaps,
 					 &bitmap_list);
-	spin_unlock(&ctl->tree_lock);
 	if (ret) {
+		spin_unlock(&ctl->tree_lock);
 		mutex_unlock(&ctl->cache_writeout_mutex);
 		goto out_nospc;
 	}
@@ -1282,6 +1285,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 */
 	ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
 	if (ret) {
+		spin_unlock(&ctl->tree_lock);
 		mutex_unlock(&ctl->cache_writeout_mutex);
 		goto out_nospc;
 	}
@@ -1291,7 +1295,6 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * locked while doing it because a concurrent trim can be manipulating
 	 * or freeing the bitmap.
 	 */
-	spin_lock(&ctl->tree_lock);
 	ret = write_bitmap_entries(io_ctl, &bitmap_list);
 	spin_unlock(&ctl->tree_lock);
 	mutex_unlock(&ctl->cache_writeout_mutex);
@@ -1345,7 +1348,8 @@ out:
 	return ret;

 out_nospc:
-	cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
+	cleanup_write_cache_enospc(ctl, inode, io_ctl,
+				   &cached_state, &bitmap_list);

 	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
 		up_write(&block_group->data_rwsem);

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-24 13:55                           ` Chris Mason
@ 2015-04-24 15:05                             ` Filipe David Manana
  2015-04-25 17:33                               ` Filipe David Manana
  0 siblings, 1 reply; 26+ messages in thread
From: Filipe David Manana @ 2015-04-24 15:05 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-btrfs

On Fri, Apr 24, 2015 at 2:55 PM, Chris Mason <clm@fb.com> wrote:
> On 04/24/2015 09:43 AM, Filipe David Manana wrote:
>> On Fri, Apr 24, 2015 at 2:00 PM, Chris Mason <clm@fb.com> wrote:
>
>>> Can you please bang on this and get a more reliable reproduction? I'll
>>> take a look.
>>
>> Not really that easy to get a more reliable reproducer - just run
>> fsstress with multiple processes - it already happened twice again
>> after I sent the previous mail.
>> From the quick look I had at this, this seems to be the change causing
>> the problem:
>>
>> http://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?h=for-linus-4.1&id=1bbc621ef28462456131c035eaeb5567a1a2a2fe
>>
>> Early in btrfs_commit_transaction(), btrfs_start_dirty_block_groups()
>> is called which ends up calling __btrfs_write_out_cache() for each
>> dirty block group, which collects all the bitmap entries from the bg's
>> space cache into a local list while holding the cache's ctl->tree_lock
>> (to serialize with concurrent allocation requests).
>>
>> Then we unlock ctl->tree_lock, do other stuff and later acquire
>> ctl->tree_lock again and call write_bitmap_entries() to write the
>> bitmap entries we previously collected. However, while we were doing
>> the other stuff without holding that lock, allocation requests might
>> have happened right? - since when we call
>> btrfs_start_dirty_block_groups() in btrfs_commit_transaction() the
>> transaction state wasn't yet changed, allowing other tasks to join the
>> current transaction. If such other task allocates all the remaining
>> space from a bitmap entry we collected before (because it's still in
>> the space cache's rbtree), it ends up deleting it and freeing its
>> ->bitmap member, which results in an invalid memory access (and the
>> warning on the list corruption) when we later call
>> write_bitmap_entries() in __btrfs_write_out_cache() - which is what
>> the second part of the trace I sent says:
>
> It's easy to hold the ctl->tree_lock from collection write out, but
> everyone deleting items is using list_del_init, so it should be fine to
> take the lock again and run through any items that are left.

Right, but free_bitmap() / unlink_free_space() free the bitmap entry
without deleting it from the list (nor do their callers), which should
be enough to cause such a list corruption report.
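
A tiny userspace sketch of that failure mode (made-up names, nothing
btrfs-specific; the only point is the dangling pointer left behind on
the private list):

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in, not a btrfs structure. */
struct fake_entry {
	unsigned long *bitmap;
	struct fake_entry *list_next;	/* link in the private bitmap_list */
};

int main(void)
{
	struct fake_entry *bitmap_list, *e;

	/* Entry collected earlier onto the private list under the lock. */
	e = calloc(1, sizeof(*e));
	e->bitmap = calloc(1, 4096);
	e->list_next = NULL;
	bitmap_list = e;

	/*
	 * A concurrent allocation path frees the entry and its bitmap
	 * (think free_bitmap()/unlink_free_space()) without taking it off
	 * bitmap_list first...
	 */
	free(e->bitmap);
	free(e);

	/*
	 * ...so bitmap_list still points at freed memory.  With slab
	 * debugging enabled the kernel poisons freed objects with 0x6b,
	 * which matches the a56b6b6b6b6b6b6b in the list_del warning.
	 * We only print the stale pointer value here; dereferencing it
	 * would be the same use-after-free.
	 */
	printf("bitmap_list still points at %p after the free\n",
	       (void *)bitmap_list);
	return 0;
}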

I'll try the patch and see if I can get at least one full successful
run of xfstests.
Thanks, Chris.

>
> Here's a replacement incremental that'll cover both cases:
>
>
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index d773f22..657a8ec 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -1119,18 +1119,21 @@ static int flush_dirty_cache(struct inode *inode)
>  }
>
>  static void noinline_for_stack
> -cleanup_write_cache_enospc(struct inode *inode,
> +cleanup_write_cache_enospc(struct btrfs_free_space_ctl *ctl,
> +                          struct inode *inode,
>                            struct btrfs_io_ctl *io_ctl,
>                            struct extent_state **cached_state,
>                            struct list_head *bitmap_list)
>  {
>         struct list_head *pos, *n;
>
> +       spin_lock(&ctl->tree_lock);
>         list_for_each_safe(pos, n, bitmap_list) {
>                 struct btrfs_free_space *entry =
>                         list_entry(pos, struct btrfs_free_space, list);
>                 list_del_init(&entry->list);
>         }
> +       spin_unlock(&ctl->tree_lock);
>         io_ctl_drop_pages(io_ctl);
>         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
>                              i_size_read(inode) - 1, cached_state,
> @@ -1266,8 +1269,8 @@ static int __btrfs_write_out_cache(struct
> btrfs_root *root, struct inode *inode,
>         ret = write_cache_extent_entries(io_ctl, ctl,
>                                          block_group, &entries, &bitmaps,
>                                          &bitmap_list);
> -       spin_unlock(&ctl->tree_lock);
>         if (ret) {
> +               spin_unlock(&ctl->tree_lock);
>                 mutex_unlock(&ctl->cache_writeout_mutex);
>                 goto out_nospc;
>         }
> @@ -1282,6 +1285,7 @@ static int __btrfs_write_out_cache(struct
> btrfs_root *root, struct inode *inode,
>          */
>         ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
>         if (ret) {
> +               spin_unlock(&ctl->tree_lock);
>                 mutex_unlock(&ctl->cache_writeout_mutex);
>                 goto out_nospc;
>         }
> @@ -1291,7 +1295,6 @@ static int __btrfs_write_out_cache(struct
> btrfs_root *root, struct inode *inode,
>          * locked while doing it because a concurrent trim can be manipulating
>          * or freeing the bitmap.
>          */
> -       spin_lock(&ctl->tree_lock);
>         ret = write_bitmap_entries(io_ctl, &bitmap_list);
>         spin_unlock(&ctl->tree_lock);
>         mutex_unlock(&ctl->cache_writeout_mutex);
> @@ -1345,7 +1348,8 @@ out:
>         return ret;
>
>  out_nospc:
> -       cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
> +       cleanup_write_cache_enospc(ctl, inode, io_ctl,
> +                                  &cached_state, &bitmap_list);
>
>         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
>                 up_write(&block_group->data_rwsem);



-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-24 15:05                             ` Filipe David Manana
@ 2015-04-25 17:33                               ` Filipe David Manana
  0 siblings, 0 replies; 26+ messages in thread
From: Filipe David Manana @ 2015-04-25 17:33 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-btrfs

On Fri, Apr 24, 2015 at 4:05 PM, Filipe David Manana <fdmanana@gmail.com> wrote:
> On Fri, Apr 24, 2015 at 2:55 PM, Chris Mason <clm@fb.com> wrote:
>> On 04/24/2015 09:43 AM, Filipe David Manana wrote:
>>> On Fri, Apr 24, 2015 at 2:00 PM, Chris Mason <clm@fb.com> wrote:
>>
>>>> Can you please bang on this and get a more reliable reproduction? I'll
>>>> take a look.
>>>
>>> Not really that easy to get a more reliable reproducer - just run
>>> fsstress with multiple processes - it already happened twice again
>>> after I sent the previous mail.
>>> From the quick look I had at this, this seems to be the change causing
>>> the problem:
>>>
>>> http://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?h=for-linus-4.1&id=1bbc621ef28462456131c035eaeb5567a1a2a2fe
>>>
>>> Early in btrfs_commit_transaction(), btrfs_start_dirty_block_groups()
>>> is called which ends up calling __btrfs_write_out_cache() for each
>>> dirty block group, which collects all the bitmap entries from the bg's
>>> space cache into a local list while holding the cache's ctl->tree_lock
>>> (to serialize with concurrent allocation requests).
>>>
>>> Then we unlock ctl->tree_lock, do other stuff and later acquire
>>> ctl->tree_lock again and call write_bitmap_entries() to write the
>>> bitmap entries we previously collected. However, while we were doing
>>> the other stuff without holding that lock, allocation requests might
>>> have happened right? - since when we call
>>> btrfs_start_dirty_block_groups() in btrfs_commit_transaction() the
>>> transaction state wasn't yet changed, allowing other tasks to join the
>>> current transaction. If such other task allocates all the remaining
>>> space from a bitmap entry we collected before (because it's still in
>>> the space cache's rbtree), it ends up deleting it and freeing its
>>> ->bitmap member, which results in an invalid memory access (and the
>>> warning on the list corruption) when we later call
>>> write_bitmap_entries() in __btrfs_write_out_cache() - which is what
>>> the second part of the trace I sent says:
>>
>> It's easy to hold the ctl->tree_lock from collection write out, but
>> everyone deleting items is using list_del_init, so it should be fine to
>> take the lock again and run through any items that are left.
>
> Right, but free_bitmap() / unlink_free_space() free the bitmap entry
> without deleting it from the list (nor its callers do it), which
> should be enough to cause such list corruption report.
>
> I'll try the patch and see if I can get at least one successful entire
> run of xfstests.
> Thanks Chris.

So with the updated version of that last patch, found at [1], this
problem no longer happens, as expected.
However, I found 2 other ones for which I've just sent fixes, plus
another one with invalid on-disk caches which I'm not sure is related
to the (not new) race you mentioned before when writing block group
caches. It happened once with generic/299 and fsck's report was:

checking extents
checking free space cache
Wanted bytes 2883584, found 1310720 for off 4929859584
Wanted bytes 468209664, found 1310720 for off 4929859584
cache appears valid but isnt 4324327424
Wanted bytes 262144, found 131072 for off 5403049984
Wanted bytes 1068761088, found 131072 for off 5403049984
cache appears valid but isnt 5398069248
Wanted bytes 262144, found 131072 for off 6472204288
Wanted bytes 1073348608, found 131072 for off 6472204288
cache appears valid but isnt 6471811072
Wanted bytes 786432, found 655360 for off 7545552896
Wanted bytes 1073741824, found 655360 for off 7545552896
cache appears valid but isnt 7545552896
Wanted bytes 1048576, found 131072 for off 8621916160
Wanted bytes 1071120384, found 131072 for off 8621916160
cache appears valid but isnt 8619294720
There is no free space entry for 9693298688-9695395840
There is no free space entry for 9693298688-10766778368
cache appears valid but isnt 9693036544
There is no free space entry for 10769268736-10769661952
There is no free space entry for 10769268736-11840520192
cache appears valid but isnt 10766778368
Checking filesystem on /dev/sdc
UUID: abf30a2f-f784-4829-9131-86e20f13a8cf
found 788994098 bytes used err is -22
total csum bytes: 2586348
total tree bytes: 14954496
total fs tree bytes: 4702208
total extent tree bytes: 3817472
btree space waste bytes: 5422437
file data blocks allocated: 2651303936
 referenced 2651303936

[1] http://git.kernel.org/cgit/linux/kernel/git/mason/linux-btrfs.git/commit/?h=for-linus-4.1&id=a3bdccc4e683f0ac69230707ed3fa20e7cf73a79

>
>>
>> Here's a replacement incremental that'll cover both cases:
>>
>>
>> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
>> index d773f22..657a8ec 100644
>> --- a/fs/btrfs/free-space-cache.c
>> +++ b/fs/btrfs/free-space-cache.c
>> @@ -1119,18 +1119,21 @@ static int flush_dirty_cache(struct inode *inode)
>>  }
>>
>>  static void noinline_for_stack
>> -cleanup_write_cache_enospc(struct inode *inode,
>> +cleanup_write_cache_enospc(struct btrfs_free_space_ctl *ctl,
>> +                          struct inode *inode,
>>                            struct btrfs_io_ctl *io_ctl,
>>                            struct extent_state **cached_state,
>>                            struct list_head *bitmap_list)
>>  {
>>         struct list_head *pos, *n;
>>
>> +       spin_lock(&ctl->tree_lock);
>>         list_for_each_safe(pos, n, bitmap_list) {
>>                 struct btrfs_free_space *entry =
>>                         list_entry(pos, struct btrfs_free_space, list);
>>                 list_del_init(&entry->list);
>>         }
>> +       spin_unlock(&ctl->tree_lock);
>>         io_ctl_drop_pages(io_ctl);
>>         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
>>                              i_size_read(inode) - 1, cached_state,
>> @@ -1266,8 +1269,8 @@ static int __btrfs_write_out_cache(struct
>> btrfs_root *root, struct inode *inode,
>>         ret = write_cache_extent_entries(io_ctl, ctl,
>>                                          block_group, &entries, &bitmaps,
>>                                          &bitmap_list);
>> -       spin_unlock(&ctl->tree_lock);
>>         if (ret) {
>> +               spin_unlock(&ctl->tree_lock);
>>                 mutex_unlock(&ctl->cache_writeout_mutex);
>>                 goto out_nospc;
>>         }
>> @@ -1282,6 +1285,7 @@ static int __btrfs_write_out_cache(struct
>> btrfs_root *root, struct inode *inode,
>>          */
>>         ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
>>         if (ret) {
>> +               spin_unlock(&ctl->tree_lock);
>>                 mutex_unlock(&ctl->cache_writeout_mutex);
>>                 goto out_nospc;
>>         }
>> @@ -1291,7 +1295,6 @@ static int __btrfs_write_out_cache(struct
>> btrfs_root *root, struct inode *inode,
>>          * locked while doing it because a concurrent trim can be manipulating
>>          * or freeing the bitmap.
>>          */
>> -       spin_lock(&ctl->tree_lock);
>>         ret = write_bitmap_entries(io_ctl, &bitmap_list);
>>         spin_unlock(&ctl->tree_lock);
>>         mutex_unlock(&ctl->cache_writeout_mutex);
>> @@ -1345,7 +1348,8 @@ out:
>>         return ret;
>>
>>  out_nospc:
>> -       cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
>> +       cleanup_write_cache_enospc(ctl, inode, io_ctl,
>> +                                  &cached_state, &bitmap_list);
>>
>>         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
>>                 up_write(&block_group->data_rwsem);
>
>
>
> --
> Filipe David Manana,
>
> "Reasonable men adapt themselves to the world.
>  Unreasonable men adapt the world to themselves.
>  That's why all progress depends on unreasonable men."



-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/4] btrfs: reduce block group cache writeout times during commit
  2015-04-22 16:09 ` [PATCH 0/4] btrfs: reduce block group cache writeout times during commit Lutz Vieweg
  2015-04-22 16:37   ` Holger Hoffstätte
@ 2015-06-26 17:12   ` Lutz Vieweg
  1 sibling, 0 replies; 26+ messages in thread
From: Lutz Vieweg @ 2015-06-26 17:12 UTC (permalink / raw)
  To: linux-btrfs

Testing the patch took much longer than I anticipated due to pre-4.1 kernels
being "too risky" for use on our servers, but now it's done and I can say:

This patch, as integrated in linux-4.1, has successfully removed the lags.

Thanks!

Regards,

Lutz Vieweg


On 04/22/2015 06:09 PM, Lutz Vieweg wrote:
> On 04/13/2015 09:52 PM, Chris Mason wrote:
>> Large filesystems with lots of block groups can suffer long stalls during
>> commit while we create and send down all of the block group caches.  The
>> more blocks groups dirtied in a transaction, the longer these stalls can be.
>> Some workloads average 10 seconds per commit, but see peak times much higher.
>
> Since we see this problem very frequently on some shared development servers,
> I will try to install this ASAP.
>
> Meanwhile, can anybody already tell success stories about successfully removing
> lags by this patch?
>
> Regards,
>
> Lutz Vieweg
>
> --


^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2015-06-26 17:12 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-13 19:52 [PATCH 0/4] btrfs: reduce block group cache writeout times during commit Chris Mason
2015-04-13 19:52 ` [PATCH 1/4] btrfs: move struct io_ctl into ctree.h and rename it Chris Mason
2015-04-13 19:52 ` [PATCH 2/4] Btrfs: two stage dirty block group writeout Chris Mason
2015-04-13 19:52 ` [PATCH 3/4] Btrfs: don't use highmem for free space cache pages Chris Mason
2015-04-13 19:52 ` [PATCH 4/4] Btrfs: allow block group cache writeout outside critical section in commit Chris Mason
2015-04-22 16:09 ` [PATCH 0/4] btrfs: reduce block group cache writeout times during commit Lutz Vieweg
2015-04-22 16:37   ` Holger Hoffstätte
2015-04-22 16:55     ` Chris Mason
2015-04-23 12:45       ` Filipe David Manana
2015-04-23 12:52         ` Chris Mason
2015-04-23 12:56           ` Chris Mason
2015-04-23 13:05           ` Filipe David Manana
2015-04-23 15:17             ` Chris Mason
2015-04-23 15:48               ` Filipe David Manana
2015-04-23 19:43                 ` Filipe David Manana
2015-04-23 19:50                   ` Chris Mason
2015-04-24  6:34                     ` Filipe David Manana
2015-04-24 13:00                       ` Chris Mason
2015-04-24 13:43                         ` Filipe David Manana
2015-04-24 13:55                           ` Chris Mason
2015-04-24 15:05                             ` Filipe David Manana
2015-04-25 17:33                               ` Filipe David Manana
2015-04-24 13:33                       ` Chris Mason
2015-04-23 16:34       ` Holger Hoffstätte
2015-04-23 17:57         ` Chris Mason
2015-06-26 17:12   ` Lutz Vieweg
