* [PATCH 0/7] Kill the btree inode
@ 2016-09-02 19:39 Josef Bacik
From: Josef Bacik @ 2016-09-02 19:39 UTC
  To: linux-btrfs, kernel-team

In order to provide a better way to do subpage blocksizes, we need to stop
allocating pages from a per-fs btree inode and instead allocate our own pages.
This work depends on three generic patches that I've sent previously:

remove mapping from balance_dirty_pages*()
writeback: allow for dirty metadata accounting
writeback: introduce super_operations->write_metadata

This is a pretty big change, but ultimately it makes extent_buffer reclaim much
cleaner and will make the sub-pagesize blocksize work significantly simpler.
I've been hammering on this for a few weeks now and it seems to be pretty solid.
Thanks,

Josef



* [PATCH 1/7] Btrfs: replace tree->mapping with tree->private_data
From: Josef Bacik @ 2016-09-02 19:40 UTC
  To: linux-btrfs, kernel-team

For extent_io trees we have carried the inode's address_space around in the
io tree in order to pull the inode back out when calling into the various
tree ops hooks.  This works fine while everything that has an extent_io_tree
has an inode, but we are going to remove the btree_inode, so we need to
change this.  Instead, carry a generic void * for private data that the tree
is initialized with, and have all the tree ops take that.  This has a lot of
cascading changes but should be relatively straightforward.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
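A standalone sketch of the pattern for reviewers: the tree carries an opaque
pointer, and every hook casts it back to whatever the tree was initialized
with.  This is simplified userspace C with stand-in types, not the kernel
code (in particular the real extent_io_tree_init() leaves tree->ops NULL;
the sketch wires the ops up so it runs):

#include <stdio.h>

struct extent_io_ops {
	void (*set_bit_hook)(void *private_data, unsigned *bits);
};

struct extent_io_tree {
	const struct extent_io_ops *ops;
	void *private_data;	/* an inode here; NULL for trees with no hooks */
};

struct inode { unsigned long ino; };

/* hook implementations cast the opaque pointer back themselves */
static void demo_set_bit_hook(void *private_data, unsigned *bits)
{
	struct inode *inode = private_data;

	printf("set bits 0x%x on ino %lu\n", *bits, inode->ino);
}

static const struct extent_io_ops demo_ops = {
	.set_bit_hook = demo_set_bit_hook,
};

static void extent_io_tree_init(struct extent_io_tree *tree, void *private_data)
{
	tree->ops = &demo_ops;
	tree->private_data = private_data;
}

int main(void)
{
	struct inode inode = { .ino = 257 };
	struct extent_io_tree tree;
	unsigned bits = 0x4;

	extent_io_tree_init(&tree, &inode);
	/* the generic code never dereferences private_data itself */
	if (tree.ops && tree.ops->set_bit_hook)
		tree.ops->set_bit_hook(tree.private_data, &bits);
	return 0;
}

Trees with no backing inode (dirty_log_pages, freed_extents,
processed_blocks) are simply initialized with NULL, which is why the generic
code checks tree->ops before calling any hook.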
 fs/btrfs/ctree.h                 |  1 +
 fs/btrfs/disk-io.c               | 50 ++++++++++++----------
 fs/btrfs/disk-io.h               |  6 +--
 fs/btrfs/extent_io.c             | 51 +++++++---------------
 fs/btrfs/extent_io.h             | 21 ++++++----
 fs/btrfs/inode.c                 | 91 ++++++++++++++++++++++++++++++----------
 fs/btrfs/relocation.c            |  3 +-
 fs/btrfs/tests/extent-io-tests.c |  2 +-
 fs/btrfs/transaction.c           |  2 +-
 9 files changed, 133 insertions(+), 94 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2fe8f89..bb50d7c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3122,6 +3122,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio,
 			 unsigned long bio_flags);
+void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4034ad6..c1d951a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -119,7 +119,8 @@ void btrfs_end_io_wq_exit(void)
  * just before they are sent down the IO stack.
  */
 struct async_submit_bio {
-	struct inode *inode;
+	void *private_data;
+	struct btrfs_fs_info *fs_info;
 	struct bio *bio;
 	struct list_head list;
 	extent_submit_bio_hook_t *submit_bio_start;
@@ -796,7 +797,7 @@ static void run_one_async_start(struct btrfs_work *work)
 	int ret;
 
 	async = container_of(work, struct  async_submit_bio, work);
-	ret = async->submit_bio_start(async->inode, async->bio,
+	ret = async->submit_bio_start(async->private_data, async->bio,
 				      async->mirror_num, async->bio_flags,
 				      async->bio_offset);
 	if (ret)
@@ -810,7 +811,7 @@ static void run_one_async_done(struct btrfs_work *work)
 	int limit;
 
 	async = container_of(work, struct  async_submit_bio, work);
-	fs_info = BTRFS_I(async->inode)->root->fs_info;
+	fs_info = async->fs_info;
 
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
@@ -829,7 +830,7 @@ static void run_one_async_done(struct btrfs_work *work)
 		return;
 	}
 
-	async->submit_bio_done(async->inode, async->bio, async->mirror_num,
+	async->submit_bio_done(async->private_data, async->bio, async->mirror_num,
 			       async->bio_flags, async->bio_offset);
 }
 
@@ -841,10 +842,9 @@ static void run_one_async_free(struct btrfs_work *work)
 	kfree(async);
 }
 
-int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
-			struct bio *bio, int mirror_num,
-			unsigned long bio_flags,
-			u64 bio_offset,
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+			int mirror_num, unsigned long bio_flags,
+			u64 bio_offset, void *private_data,
 			extent_submit_bio_hook_t *submit_bio_start,
 			extent_submit_bio_hook_t *submit_bio_done)
 {
@@ -854,7 +854,8 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	if (!async)
 		return -ENOMEM;
 
-	async->inode = inode;
+	async->private_data = private_data;
+	async->fs_info = fs_info;
 	async->bio = bio;
 	async->mirror_num = mirror_num;
 	async->submit_bio_start = submit_bio_start;
@@ -900,7 +901,7 @@ static int btree_csum_one_bio(struct bio *bio)
 	return ret;
 }
 
-static int __btree_submit_bio_start(struct inode *inode, struct bio *bio,
+static int __btree_submit_bio_start(void *private_data, struct bio *bio,
 				    int mirror_num, unsigned long bio_flags,
 				    u64 bio_offset)
 {
@@ -911,10 +912,11 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio,
 	return btree_csum_one_bio(bio);
 }
 
-static int __btree_submit_bio_done(struct inode *inode, struct bio *bio,
+static int __btree_submit_bio_done(void *private_data, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	struct inode *inode = private_data;
 	int ret;
 
 	/*
@@ -940,10 +942,11 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
 	return 1;
 }
 
-static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
+static int btree_submit_bio_hook(void *private_data, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	struct inode *inode = private_data;
 	int async = check_async_write(inode, bio_flags);
 	int ret;
 
@@ -968,8 +971,8 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
 		 * checksumming can happen in parallel across all CPUs
 		 */
 		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-					  inode, bio, mirror_num, 0,
-					  bio_offset,
+					  bio, mirror_num, 0,
+					  bio_offset, private_data,
 					  __btree_submit_bio_start,
 					  __btree_submit_bio_done);
 	}
@@ -1283,8 +1286,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
 	root->log_transid_committed = -1;
 	root->last_log_commit = 0;
 	if (!dummy)
-		extent_io_tree_init(&root->dirty_log_pages,
-				     fs_info->btree_inode->i_mapping);
+		extent_io_tree_init(&root->dirty_log_pages, NULL);
 
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
@@ -2261,7 +2263,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
 
 	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
 	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
-			     fs_info->btree_inode->i_mapping);
+			    fs_info->btree_inode);
 	BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
 	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
 
@@ -2666,10 +2668,8 @@ int open_ctree(struct super_block *sb,
 	fs_info->block_group_cache_tree = RB_ROOT;
 	fs_info->first_logical_byte = (u64)-1;
 
-	extent_io_tree_init(&fs_info->freed_extents[0],
-			     fs_info->btree_inode->i_mapping);
-	extent_io_tree_init(&fs_info->freed_extents[1],
-			     fs_info->btree_inode->i_mapping);
+	extent_io_tree_init(&fs_info->freed_extents[0], NULL);
+	extent_io_tree_init(&fs_info->freed_extents[1], NULL);
 	fs_info->pinned_extents = &fs_info->freed_extents[0];
 	fs_info->do_barriers = 1;
 
@@ -4510,10 +4510,18 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 	return 0;
 }
 
+static struct btrfs_fs_info *btree_fs_info(void *private_data)
+{
+	struct inode *inode = private_data;
+	return btrfs_sb(inode->i_sb);
+}
+
 static const struct extent_io_ops btree_extent_io_ops = {
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.readpage_io_failed_hook = btree_io_failed_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
 	/* note we're sharing with inode.c for the merge bio hook */
 	.merge_bio_hook = btrfs_merge_bio_hook,
+	.tree_fs_info = btree_fs_info,
+	.set_range_writeback = btrfs_set_range_writeback,
 };
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b3207a0e..591f078 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -122,9 +122,9 @@ u32 btrfs_csum_data(char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			enum btrfs_wq_endio_type metadata);
-int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
-			struct bio *bio, int mirror_num,
-			unsigned long bio_flags, u64 bio_offset,
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+			int mirror_num, unsigned long bio_flags,
+			u64 bio_offset, void *private_data,
 			extent_submit_bio_hook_t *submit_bio_start,
 			extent_submit_bio_hook_t *submit_bio_done);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 80d9283..515b78a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -87,19 +87,9 @@ void btrfs_leak_debug_check(void)
 static inline void __btrfs_debug_check_extent_io_range(const char *caller,
 		struct extent_io_tree *tree, u64 start, u64 end)
 {
-	struct inode *inode;
-	u64 isize;
-
-	if (!tree->mapping)
-		return;
-
-	inode = tree->mapping->host;
-	isize = i_size_read(inode);
-	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
-		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
-		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
-				caller, btrfs_ino(inode), isize, start, end);
-	}
+	if (tree->ops && tree->ops->check_extent_io_range)
+		tree->ops->check_extent_io_range(tree->private_data, caller,
+						 start, end);
 }
 #else
 #define btrfs_leak_debug_add(new, head)	do {} while (0)
@@ -154,9 +144,9 @@ static noinline void flush_write_bio(void *data);
 static inline struct btrfs_fs_info *
 tree_fs_info(struct extent_io_tree *tree)
 {
-	if (!tree->mapping)
-		return NULL;
-	return btrfs_sb(tree->mapping->host->i_sb);
+	if (tree->ops && tree->ops->tree_fs_info)
+		return tree->ops->tree_fs_info(tree->private_data);
+	return NULL;
 }
 
 int __init extent_io_init(void)
@@ -213,13 +203,13 @@ void extent_io_exit(void)
 }
 
 void extent_io_tree_init(struct extent_io_tree *tree,
-			 struct address_space *mapping)
+			 void *private_data)
 {
 	tree->state = RB_ROOT;
 	tree->ops = NULL;
 	tree->dirty_bytes = 0;
 	spin_lock_init(&tree->lock);
-	tree->mapping = mapping;
+	tree->private_data = private_data;
 }
 
 static struct extent_state *alloc_extent_state(gfp_t mask)
@@ -364,8 +354,7 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 		     struct extent_state *other)
 {
 	if (tree->ops && tree->ops->merge_extent_hook)
-		tree->ops->merge_extent_hook(tree->mapping->host, new,
-					     other);
+		tree->ops->merge_extent_hook(tree->private_data, new, other);
 }
 
 /*
@@ -416,14 +405,14 @@ static void set_state_cb(struct extent_io_tree *tree,
 			 struct extent_state *state, unsigned *bits)
 {
 	if (tree->ops && tree->ops->set_bit_hook)
-		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
+		tree->ops->set_bit_hook(tree->private_data, state, bits);
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
 			   struct extent_state *state, unsigned *bits)
 {
 	if (tree->ops && tree->ops->clear_bit_hook)
-		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
+		tree->ops->clear_bit_hook(tree->private_data, state, bits);
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
@@ -473,7 +462,7 @@ static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
 		     u64 split)
 {
 	if (tree->ops && tree->ops->split_extent_hook)
-		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
+		tree->ops->split_extent_hook(tree->private_data, orig, split);
 }
 
 /*
@@ -1398,17 +1387,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
  */
 static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
-	unsigned long index = start >> PAGE_SHIFT;
-	unsigned long end_index = end >> PAGE_SHIFT;
-	struct page *page;
-
-	while (index <= end_index) {
-		page = find_get_page(tree->mapping, index);
-		BUG_ON(!page); /* Pages should be in the extent_io_tree */
-		set_page_writeback(page);
-		put_page(page);
-		index++;
-	}
+	tree->ops->set_range_writeback(tree->private_data, start, end);
 }
 
 /* find the first state struct with 'bits' set after 'start', and
@@ -2418,7 +2397,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
 		 read_mode, failrec->this_mirror, failrec->in_validation);
 
-	ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror,
+	ret = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
 					 failrec->bio_flags, 0);
 	if (ret) {
 		free_io_failure(inode, failrec);
@@ -2739,7 +2718,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 	bio_get(bio);
 
 	if (tree->ops && tree->ops->submit_bio_hook)
-		ret = tree->ops->submit_bio_hook(page->mapping->host, bio,
+		ret = tree->ops->submit_bio_hook(tree->private_data, bio,
 					   mirror_num, bio_flags, start);
 	else
 		btrfsic_submit_bio(bio);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bc2729a..c3dd064 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -63,11 +63,11 @@ struct btrfs_root;
 struct btrfs_io_bio;
 struct io_failure_record;
 
-typedef	int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+typedef	int (extent_submit_bio_hook_t)(void *private_data, struct bio *bio,
 				       int mirror_num, unsigned long bio_flags,
 				       u64 bio_offset);
 struct extent_io_ops {
-	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+	int (*fill_delalloc)(void *private_data, struct page *locked_page,
 			     u64 start, u64 end, int *page_started,
 			     unsigned long *nr_written);
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
@@ -81,20 +81,24 @@ struct extent_io_ops {
 				    int mirror);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
-	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+	void (*set_bit_hook)(void *private_data, struct extent_state *state,
 			     unsigned *bits);
-	void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+	void (*clear_bit_hook)(void *private_data, struct extent_state *state,
 			       unsigned *bits);
-	void (*merge_extent_hook)(struct inode *inode,
+	void (*merge_extent_hook)(void *private_data,
 				  struct extent_state *new,
 				  struct extent_state *other);
-	void (*split_extent_hook)(struct inode *inode,
+	void (*split_extent_hook)(void *private_data,
 				  struct extent_state *orig, u64 split);
+	void (*check_extent_io_range)(void *private_data, const char *caller,
+				      u64 start, u64 end);
+	struct btrfs_fs_info *(*tree_fs_info)(void *private_data);
+	void (*set_range_writeback)(void *private_data, u64 start, u64 end);
 };
 
 struct extent_io_tree {
 	struct rb_root state;
-	struct address_space *mapping;
+	void *private_data;
 	u64 dirty_bytes;
 	int track_uptodate;
 	spinlock_t lock;
@@ -191,8 +195,7 @@ typedef struct extent_map *(get_extent_t)(struct inode *inode,
 					  u64 start, u64 len,
 					  int create);
 
-void extent_io_tree_init(struct extent_io_tree *tree,
-			 struct address_space *mapping);
+void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
 int try_release_extent_mapping(struct extent_map_tree *map,
 			       struct extent_io_tree *tree, struct page *page,
 			       gfp_t mask);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b0f421f..66b7e0d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1551,10 +1551,11 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
 /*
  * extent_io.c call back to do delayed allocation processing
  */
-static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+static int run_delalloc_range(void *private_data, struct page *locked_page,
 			      u64 start, u64 end, int *page_started,
 			      unsigned long *nr_written)
 {
+	struct inode *inode = private_data;
 	int ret;
 	int force_cow = need_force_cow(inode, start, end);
 
@@ -1576,9 +1577,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	return ret;
 }
 
-static void btrfs_split_extent_hook(struct inode *inode,
+static void btrfs_split_extent_hook(void *private_data,
 				    struct extent_state *orig, u64 split)
 {
+	struct inode *inode = private_data;
 	u64 size;
 
 	/* not delalloc, ignore it */
@@ -1616,10 +1618,11 @@ static void btrfs_split_extent_hook(struct inode *inode,
  * extents, such as when we are doing sequential writes, so we can properly
  * account for the metadata space we'll need.
  */
-static void btrfs_merge_extent_hook(struct inode *inode,
+static void btrfs_merge_extent_hook(void *private_data,
 				    struct extent_state *new,
 				    struct extent_state *other)
 {
+	struct inode *inode = private_data;
 	u64 new_size, old_size;
 	u64 num_extents;
 
@@ -1719,9 +1722,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
-static void btrfs_set_bit_hook(struct inode *inode,
+static void btrfs_set_bit_hook(void *private_data,
 			       struct extent_state *state, unsigned *bits)
 {
+	struct inode *inode = private_data;
 
 	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
 		WARN_ON(1);
@@ -1763,10 +1767,12 @@ static void btrfs_set_bit_hook(struct inode *inode,
 /*
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
-static void btrfs_clear_bit_hook(struct inode *inode,
+static void btrfs_clear_bit_hook(void *private_data,
 				 struct extent_state *state,
 				 unsigned *bits)
 {
+	struct inode *inode = private_data;
+
 	u64 len = state->end + 1 - state->start;
 	u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
 				    BTRFS_MAX_EXTENT_SIZE);
@@ -1863,10 +1869,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+static int __btrfs_submit_bio_start(void *private_data, struct bio *bio,
 				    int mirror_num, unsigned long bio_flags,
 				    u64 bio_offset)
 {
+	struct inode *inode = private_data;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
@@ -1883,10 +1890,11 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
+static int __btrfs_submit_bio_done(void *private_data, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags,
 			  u64 bio_offset)
 {
+	struct inode *inode = private_data;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
@@ -1902,10 +1910,11 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
  * extent_io.c submission hook. This does the right thing for csum calculation
  * on write, or reading the csums from the tree before a read
  */
-static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags,
-			  u64 bio_offset)
+static int btrfs_submit_bio_hook(void *private_data, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
 {
+	struct inode *inode = private_data;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
 	int ret = 0;
@@ -1939,10 +1948,10 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
 			goto mapit;
 		/* we're doing a write, do the async checksumming */
 		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-				   inode, bio, mirror_num,
-				   bio_flags, bio_offset,
-				   __btrfs_submit_bio_start,
-				   __btrfs_submit_bio_done);
+					  bio, mirror_num, bio_flags,
+					  bio_offset, inode,
+					  __btrfs_submit_bio_start,
+					  __btrfs_submit_bio_done);
 		goto out;
 	} else if (!skip_sum) {
 		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
@@ -8190,11 +8199,12 @@ static void btrfs_endio_direct_write(struct bio *bio)
 	bio_put(bio);
 }
 
-static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
+static int __btrfs_submit_bio_start_direct_io(void *private_data,
 				    struct bio *bio, int mirror_num,
 				    unsigned long bio_flags, u64 offset)
 {
 	int ret;
+	struct inode *inode = private_data;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
 	BUG_ON(ret); /* -ENOMEM */
@@ -8307,10 +8317,10 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 		goto map;
 
 	if (write && async_submit) {
-		ret = btrfs_wq_submit_bio(root->fs_info,
-				   inode, bio, 0, 0, file_offset,
-				   __btrfs_submit_bio_start_direct_io,
-				   __btrfs_submit_bio_done);
+		ret = btrfs_wq_submit_bio(root->fs_info, bio, 0, 0,
+					  file_offset, inode,
+					  __btrfs_submit_bio_start_direct_io,
+					  __btrfs_submit_bio_done);
 		goto err;
 	} else if (write) {
 		/*
@@ -9274,8 +9284,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 
 	inode = &ei->vfs_inode;
 	extent_map_tree_init(&ei->extent_tree);
-	extent_io_tree_init(&ei->io_tree, &inode->i_data);
-	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+	extent_io_tree_init(&ei->io_tree, inode);
+	extent_io_tree_init(&ei->io_failure_tree, inode);
 	ei->io_tree.track_uptodate = 1;
 	ei->io_failure_tree.track_uptodate = 1;
 	atomic_set(&ei->sync_writers, 0);
@@ -10515,6 +10525,42 @@ int btrfs_inode_check_errors(struct inode *inode)
 	return ret;
 }
 
+static struct btrfs_fs_info *iotree_fs_info(void *private_data)
+{
+	struct inode *inode = private_data;
+	return btrfs_sb(inode->i_sb);
+}
+
+static void btrfs_check_extent_io_range(void *private_data, const char *caller,
+					u64 start, u64 end)
+{
+	struct inode *inode = private_data;
+	u64 isize;
+
+	isize = i_size_read(inode);
+	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
+				caller, btrfs_ino(inode), isize, start, end);
+	}
+}
+
+void btrfs_set_range_writeback(void *private_data, u64 start, u64 end)
+{
+	struct inode *inode = private_data;
+	unsigned long index = start >> PAGE_SHIFT;
+	unsigned long end_index = end >> PAGE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(inode->i_mapping, index);
+		ASSERT(page); /* Pages should be in the extent_io_tree */
+		set_page_writeback(page);
+		put_page(page);
+		index++;
+	}
+}
+
 static const struct inode_operations btrfs_dir_inode_operations = {
 	.getattr	= btrfs_getattr,
 	.lookup		= btrfs_lookup,
@@ -10568,6 +10614,9 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
 	.clear_bit_hook = btrfs_clear_bit_hook,
 	.merge_extent_hook = btrfs_merge_extent_hook,
 	.split_extent_hook = btrfs_split_extent_hook,
+	.check_extent_io_range = btrfs_check_extent_io_range,
+	.tree_fs_info = iotree_fs_info,
+	.set_range_writeback = btrfs_set_range_writeback,
 };
 
 /*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 6e194a5..7fc6ea7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4203,8 +4203,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
 	INIT_LIST_HEAD(&rc->reloc_roots);
 	backref_cache_init(&rc->backref_cache);
 	mapping_tree_init(&rc->reloc_root_tree);
-	extent_io_tree_init(&rc->processed_blocks,
-			    fs_info->btree_inode->i_mapping);
+	extent_io_tree_init(&rc->processed_blocks, NULL);
 	return rc;
 }
 
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index d19ab03..1110b8c 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -87,7 +87,7 @@ static int test_find_delalloc(u32 sectorsize)
 		return -ENOMEM;
 	}
 
-	extent_io_tree_init(&tmp, &inode->i_data);
+	extent_io_tree_init(&tmp, inode);
 
 	/*
 	 * First go through and create and mark all of our pages dirty, we pin
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9cca0a7..9de37b5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -294,7 +294,7 @@ loop:
 	spin_lock_init(&cur_trans->dropped_roots_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
-			     fs_info->btree_inode->i_mapping);
+			     fs_info->btree_inode);
 	fs_info->generation++;
 	cur_trans->transid = fs_info->generation;
 	fs_info->running_transaction = cur_trans;
-- 
2.7.4



* [PATCH 2/7] btrfs: remove inode argument from repair_io_failure
From: Josef Bacik @ 2016-09-02 19:40 UTC
  To: linux-btrfs, kernel-team

Once we remove the btree_inode we won't have an inode to pass anymore, so
pass the fs_info directly, plus the inum, since those are all we used the
inode for when printing the repair message.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
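The call-site change in sketch form, condensed from the clean_io_failure()
hunk below (an excerpt, not a complete program):

/* before: the inode was only a vehicle for the fs_info and inode number */
repair_io_failure(inode, start, failrec->len, failrec->logical,
		  page, pg_offset, failrec->failed_mirror);

/* after: pass both explicitly; btree repairs just pass ino 0 */
repair_io_failure(fs_info, btrfs_ino(inode), start, failrec->len,
		  failrec->logical, page, pg_offset, failrec->failed_mirror);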
 fs/btrfs/extent_io.c | 14 +++++++-------
 fs/btrfs/extent_io.h |  6 +++---
 fs/btrfs/scrub.c     |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 515b78a..3132f1a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1979,10 +1979,10 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
  * currently, there can be no more than two copies of every data bit. thus,
  * exactly one rewrite is required.
  */
-int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
-		      struct page *page, unsigned int pg_offset, int mirror_num)
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+		      u64 length, u64 logical, struct page *page,
+		      unsigned int pg_offset, int mirror_num)
 {
-	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 	struct bio *bio;
 	struct btrfs_device *dev;
 	u64 map_length = 0;
@@ -2041,7 +2041,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
 
 	btrfs_info_rl_in_rcu(fs_info,
 		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
-				  btrfs_ino(inode), start,
+				  ino, start,
 				  rcu_str_deref(dev->name), sector);
 	btrfs_bio_counter_dec(fs_info);
 	bio_put(bio);
@@ -2061,7 +2061,7 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = eb->pages[i];
 
-		ret = repair_io_failure(root->fs_info->btree_inode, start,
+		ret = repair_io_failure(root->fs_info, 0, start,
 					PAGE_SIZE, start, p,
 					start - page_offset(p), mirror_num);
 		if (ret)
@@ -2119,8 +2119,8 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
 		num_copies = btrfs_num_copies(fs_info, failrec->logical,
 					      failrec->len);
 		if (num_copies > 1)  {
-			repair_io_failure(inode, start, failrec->len,
-					  failrec->logical, page,
+			repair_io_failure(fs_info, btrfs_ino(inode), start,
+					  failrec->len, failrec->logical, page,
 					  pg_offset, failrec->failed_mirror);
 		}
 	}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c3dd064..61adf16 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -426,9 +426,9 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
 
 struct btrfs_fs_info;
 
-int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
-		      struct page *page, unsigned int pg_offset,
-		      int mirror_num);
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+		      u64 length, u64 logical, struct page *page,
+		      unsigned int pg_offset, int mirror_num);
 int clean_io_failure(struct inode *inode, u64 start, struct page *page,
 		     unsigned int pg_offset);
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 1d195d2..4624283 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -732,7 +732,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 			ret = -EIO;
 			goto out;
 		}
-		ret = repair_io_failure(inode, offset, PAGE_SIZE,
+		ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
 					fixup->logical, page,
 					offset - page_offset(page),
 					fixup->mirror_num);
-- 
2.7.4



* [PATCH 3/7] Btrfs: add a flags field to btrfs_fs_info
From: Josef Bacik @ 2016-09-02 19:40 UTC
  To: linux-btrfs, kernel-team

We have a lot of random ints in btrfs_fs_info that can be put into flags.
This is mostly equivalent, with the exception of how we deal with quota
going on or off: instead of a pending state that the current quota_enabled
gets set to at commit time, we now set an ENABLING or DISABLING flag when
turning quota on or off and resolve it appropriately at commit.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
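The mechanical part of the conversion is int field -> bit in one unsigned
long via the usual bitops.  A standalone sketch of the quota enable/disable
handling (userspace stand-ins for the bitops; the kernel versions from
<linux/bitops.h> are atomic, these are not):

#include <stdio.h>

/* non-atomic stand-ins for the kernel's atomic RMW bitops */
#define set_bit(nr, addr)	(*(addr) |= 1UL << (nr))
#define clear_bit(nr, addr)	(*(addr) &= ~(1UL << (nr)))
#define test_bit(nr, addr)	(!!(*(addr) & (1UL << (nr))))

static int test_and_clear_bit(int nr, unsigned long *addr)
{
	int old = test_bit(nr, addr);

	clear_bit(nr, addr);
	return old;
}

#define BTRFS_FS_QUOTA_ENABLED		6
#define BTRFS_FS_QUOTA_ENABLING		7
#define BTRFS_FS_QUOTA_DISABLING	8

struct btrfs_fs_info { unsigned long flags; };

int main(void)
{
	struct btrfs_fs_info fs_info = { .flags = 0 };

	/* ioctl path: request an enable, nothing visible yet */
	set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info.flags);

	/* commit path (btrfs_run_qgroups): resolve the pending state */
	if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info.flags))
		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info.flags);
	if (test_and_clear_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info.flags))
		clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info.flags);

	printf("quota enabled: %d\n",
	       (int)test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info.flags));
	return 0;
}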
 fs/btrfs/btrfs_inode.h        | 11 ---------
 fs/btrfs/ctree.h              | 44 +++++++++++++++++++-----------------
 fs/btrfs/delayed-inode.c      |  3 ++-
 fs/btrfs/delayed-ref.c        |  9 +++++---
 fs/btrfs/disk-io.c            | 23 +++++++------------
 fs/btrfs/extent-tree.c        | 14 ++++++------
 fs/btrfs/extent_io.c          |  7 +++---
 fs/btrfs/free-space-tree.c    |  6 ++---
 fs/btrfs/inode.c              |  4 ++--
 fs/btrfs/qgroup.c             | 52 +++++++++++++++++++++++--------------------
 fs/btrfs/super.c              |  2 +-
 fs/btrfs/tests/qgroup-tests.c |  2 +-
 fs/btrfs/transaction.c        | 24 +++++++++-----------
 fs/btrfs/tree-log.c           |  4 ++--
 fs/btrfs/volumes.c            |  2 +-
 15 files changed, 98 insertions(+), 109 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4919aed..1a8fa46 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,17 +44,6 @@
 #define BTRFS_INODE_IN_DELALLOC_LIST		9
 #define BTRFS_INODE_READDIO_NEED_LOCK		10
 #define BTRFS_INODE_HAS_PROPS		        11
-/*
- * The following 3 bits are meant only for the btree inode.
- * When any of them is set, it means an error happened while writing an
- * extent buffer belonging to:
- * 1) a non-log btree
- * 2) a log btree and first log sub-transaction
- * 3) a log btree and second log sub-transaction
- */
-#define BTRFS_INODE_BTREE_ERR		        12
-#define BTRFS_INODE_BTREE_LOG1_ERR		13
-#define BTRFS_INODE_BTREE_LOG2_ERR		14
 
 /* in memory btrfs inode */
 struct btrfs_inode {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index bb50d7c..282a031 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,9 +675,25 @@ struct btrfs_device;
 struct btrfs_fs_devices;
 struct btrfs_balance_control;
 struct btrfs_delayed_root;
+
+#define BTRFS_FS_BARRIER			1
+#define BTRFS_FS_CLOSING_START			2
+#define BTRFS_FS_CLOSING_DONE			3
+#define BTRFS_FS_LOG_RECOVERING			4
+#define BTRFS_FS_OPEN				5
+#define BTRFS_FS_QUOTA_ENABLED			6
+#define BTRFS_FS_QUOTA_ENABLING			7
+#define BTRFS_FS_QUOTA_DISABLING		8
+#define BTRFS_FS_UPDATE_UUID_TREE_GEN		9
+#define BTRFS_FS_CREATING_FREE_SPACE_TREE	10
+#define BTRFS_FS_BTREE_ERR			11
+#define BTRFS_FS_LOG1_ERR			12
+#define BTRFS_FS_LOG2_ERR			13
+
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
 	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+	unsigned long flags;
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
@@ -906,10 +922,6 @@ struct btrfs_fs_info {
 	int thread_pool_size;
 
 	struct kobject *space_info_kobj;
-	int do_barriers;
-	int closing;
-	int log_root_recovering;
-	int open;
 
 	u64 total_pinned;
 
@@ -986,17 +998,6 @@ struct btrfs_fs_info {
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
 #endif
-	/*
-	 * quota information
-	 */
-	unsigned int quota_enabled:1;
-
-	/*
-	 * quota_enabled only changes state after a commit. This holds the
-	 * next state.
-	 */
-	unsigned int pending_quota_state:1;
-
 	/* is qgroup tracking in a consistent state? */
 	u64 qgroup_flags;
 
@@ -1059,7 +1060,6 @@ struct btrfs_fs_info {
 	wait_queue_head_t replace_wait;
 
 	struct semaphore uuid_tree_rescan_sem;
-	unsigned int update_uuid_tree_gen:1;
 
 	/* Used to reclaim the metadata space in the background. */
 	struct work_struct async_reclaim_work;
@@ -1077,8 +1077,6 @@ struct btrfs_fs_info {
 	 * and will be latter freed. Protected by fs_info->chunk_mutex.
 	 */
 	struct list_head pinned_chunks;
-
-	int creating_free_space_tree;
 };
 
 struct btrfs_subvolume_writers {
@@ -2863,10 +2861,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 {
 	/*
-	 * Get synced with close_ctree()
+	 * Do it this way so we only ever do one test_bit in the normal case.
 	 */
-	smp_mb();
-	return fs_info->closing;
+	if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
+		if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags))
+			return 2;
+		return 1;
+	}
+	return 0;
 }
 
 /*
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 3eeb9cd..982a168 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1874,7 +1874,8 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode)
 	 * leads to enospc problems.  This means we also can't do
 	 * delayed inode refs
 	 */
-	if (BTRFS_I(inode)->root->fs_info->log_root_recovering)
+	if (test_bit(BTRFS_FS_LOG_RECOVERING,
+		     &BTRFS_I(inode)->root->fs_info->flags))
 		return -EAGAIN;
 
 	delayed_node = btrfs_get_or_create_delayed_node(inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index b6d210e..1954465 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -773,7 +773,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	if (!head_ref)
 		goto free_ref;
 
-	if (fs_info->quota_enabled && is_fstree(ref_root)) {
+	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
+	    is_fstree(ref_root)) {
 		record = kmalloc(sizeof(*record), GFP_NOFS);
 		if (!record)
 			goto free_head_ref;
@@ -831,7 +832,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		return -ENOMEM;
 	}
 
-	if (fs_info->quota_enabled && is_fstree(ref_root)) {
+	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
+	    is_fstree(ref_root)) {
 		record = kmalloc(sizeof(*record), GFP_NOFS);
 		if (!record) {
 			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
@@ -870,7 +872,8 @@ int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_ref_head *ref_head;
 	int ret = 0;
 
-	if (!fs_info->quota_enabled || !is_fstree(ref_root))
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+	    !is_fstree(ref_root))
 		return 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c1d951a..ebd33ef 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1804,7 +1804,7 @@ static int cleaner_kthread(void *arg)
 		 * Do not do anything if we might cause open_ctree() to block
 		 * before we have finished mounting the filesystem.
 		 */
-		if (!root->fs_info->open)
+		if (!test_bit(BTRFS_FS_OPEN, &root->fs_info->flags))
 			goto sleep;
 
 		if (!mutex_trylock(&root->fs_info->cleaner_mutex))
@@ -2297,8 +2297,6 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
 	fs_info->qgroup_op_tree = RB_ROOT;
 	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
 	fs_info->qgroup_seq = 1;
-	fs_info->quota_enabled = 0;
-	fs_info->pending_quota_state = 0;
 	fs_info->qgroup_ulist = NULL;
 	mutex_init(&fs_info->qgroup_rescan_lock);
 }
@@ -2482,8 +2480,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
 	root = btrfs_read_tree_root(tree_root, &location);
 	if (!IS_ERR(root)) {
 		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-		fs_info->quota_enabled = 1;
-		fs_info->pending_quota_state = 1;
+		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 		fs_info->quota_root = root;
 	}
 
@@ -2671,8 +2668,7 @@ int open_ctree(struct super_block *sb,
 	extent_io_tree_init(&fs_info->freed_extents[0], NULL);
 	extent_io_tree_init(&fs_info->freed_extents[1], NULL);
 	fs_info->pinned_extents = &fs_info->freed_extents[0];
-	fs_info->do_barriers = 1;
-
+	set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
 
 	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
@@ -3160,10 +3156,9 @@ retry_root_backup:
 			return ret;
 		}
 	} else {
-		fs_info->update_uuid_tree_gen = 1;
+		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
 	}
-
-	fs_info->open = 1;
+	set_bit(BTRFS_FS_OPEN, &fs_info->flags);
 
 	/*
 	 * backuproot only affect mount behavior, and if open_ctree succeeded,
@@ -3847,8 +3842,7 @@ void close_ctree(struct btrfs_root *root)
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 
-	fs_info->closing = 1;
-	smp_mb();
+	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
 
 	/* wait for the qgroup rescan worker to stop */
 	btrfs_qgroup_wait_for_completion(fs_info);
@@ -3893,8 +3887,7 @@ void close_ctree(struct btrfs_root *root)
 	kthread_stop(fs_info->transaction_kthread);
 	kthread_stop(fs_info->cleaner_kthread);
 
-	fs_info->closing = 2;
-	smp_mb();
+	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
 
 	btrfs_free_qgroup_config(fs_info);
 
@@ -3919,7 +3912,7 @@ void close_ctree(struct btrfs_root *root)
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	btrfs_stop_all_workers(fs_info);
 
-	fs_info->open = 0;
+	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
 	free_root_pointers(fs_info, 1);
 
 	iput(fs_info->btree_inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 20f9fa6..03da2f6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2954,7 +2954,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	if (trans->aborted)
 		return 0;
 
-	if (root->fs_info->creating_free_space_tree)
+	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &root->fs_info->flags))
 		return 0;
 
 	if (root == root->fs_info->extent_root)
@@ -5196,7 +5196,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
 		 * which means we won't have fs_info->fs_root set, so don't do
 		 * the async reclaim as we will panic.
 		 */
-		if (!root->fs_info->log_root_recovering &&
+		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags) &&
 		    need_do_async_reclaim(space_info, root, used) &&
 		    !work_busy(&root->fs_info->async_reclaim_work)) {
 			trace_btrfs_trigger_flush(root->fs_info,
@@ -5800,7 +5800,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 	int ret;
 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 
-	if (root->fs_info->quota_enabled) {
+	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) {
 		/* One for parent inode, two for dir entries */
 		num_bytes = 3 * root->nodesize;
 		ret = btrfs_qgroup_reserve_meta(root, num_bytes);
@@ -5978,7 +5978,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	csum_bytes = BTRFS_I(inode)->csum_bytes;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	if (root->fs_info->quota_enabled) {
+	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) {
 		ret = btrfs_qgroup_reserve_meta(root,
 				nr_extents * root->nodesize);
 		if (ret)
@@ -8561,7 +8561,7 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
 	u64 bytenr, num_bytes;
 
 	/* We can be called directly from walk_up_proc() */
-	if (!root->fs_info->quota_enabled)
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
 		return 0;
 
 	for (i = 0; i < nr; i++) {
@@ -8669,7 +8669,7 @@ static int account_shared_subtree(struct btrfs_trans_handle *trans,
 	BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
 	BUG_ON(root_eb == NULL);
 
-	if (!root->fs_info->quota_enabled)
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
 		return 0;
 
 	if (!extent_buffer_uptodate(root_eb)) {
@@ -10799,7 +10799,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
-	if (!fs_info->open)
+	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
 		return;
 
 	spin_lock(&fs_info->unused_bgs_lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3132f1a..6036431 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3609,7 +3609,6 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
 static void set_btree_ioerr(struct page *page)
 {
 	struct extent_buffer *eb = (struct extent_buffer *)page->private;
-	struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
 
 	SetPageError(page);
 	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
@@ -3655,13 +3654,13 @@ static void set_btree_ioerr(struct page *page)
 	 */
 	switch (eb->log_index) {
 	case -1:
-		set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
+		set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
 		break;
 	case 0:
-		set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+		set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
 		break;
 	case 1:
-		set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+		set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
 		break;
 	default:
 		BUG(); /* unexpected, logic error */
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 87e7e3d..83ee63b 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1163,7 +1163,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	fs_info->creating_free_space_tree = 1;
+	set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
 	free_space_root = btrfs_create_tree(trans, fs_info,
 					    BTRFS_FREE_SPACE_TREE_OBJECTID);
 	if (IS_ERR(free_space_root)) {
@@ -1183,7 +1183,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
 	}
 
 	btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
-	fs_info->creating_free_space_tree = 0;
+	clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
 
 	ret = btrfs_commit_transaction(trans, tree_root);
 	if (ret)
@@ -1192,7 +1192,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
 	return 0;
 
 abort:
-	fs_info->creating_free_space_tree = 0;
+	clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
 	btrfs_abort_transaction(trans, ret);
 	btrfs_end_transaction(trans, tree_root);
 	return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 66b7e0d..0b97cb4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3932,7 +3932,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	 */
 	if (!btrfs_is_free_space_inode(inode)
 	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-	    && !root->fs_info->log_root_recovering) {
+	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
 		btrfs_update_root_times(trans, root);
 
 		ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -5202,7 +5202,7 @@ void btrfs_evict_inode(struct inode *inode)
 
 	btrfs_free_io_failure_record(inode, 0, (u64)-1);
 
-	if (root->fs_info->log_root_recovering) {
+	if (test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
 		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
 				 &BTRFS_I(inode)->runtime_flags));
 		goto no_delete;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 93ee1c1..31dd30d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -309,7 +309,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 	u64 flags = 0;
 	u64 rescan_progress = 0;
 
-	if (!fs_info->quota_enabled)
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
 		return 0;
 
 	fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
@@ -463,13 +463,11 @@ next2:
 	}
 out:
 	fs_info->qgroup_flags |= flags;
-	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
-		fs_info->quota_enabled = 0;
-		fs_info->pending_quota_state = 0;
-	} else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
-		   ret >= 0) {
+	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+		clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+	else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
+		 ret >= 0)
 		ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
-	}
 	btrfs_free_path(path);
 
 	if (ret < 0) {
@@ -847,7 +845,7 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
 	}
 	ret = 0;
 out:
-	root->fs_info->pending_quota_state = 0;
+	set_bit(BTRFS_FS_QUOTA_DISABLING, &root->fs_info->flags);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -868,7 +866,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (fs_info->quota_root) {
-		fs_info->pending_quota_state = 1;
+		set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags);
 		goto out;
 	}
 
@@ -964,7 +962,7 @@ out_add_root:
 	}
 	spin_lock(&fs_info->qgroup_lock);
 	fs_info->quota_root = quota_root;
-	fs_info->pending_quota_state = 1;
+	set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags);
 	spin_unlock(&fs_info->qgroup_lock);
 out_free_path:
 	btrfs_free_path(path);
@@ -993,8 +991,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (!fs_info->quota_root)
 		goto out;
-	fs_info->quota_enabled = 0;
-	fs_info->pending_quota_state = 0;
+	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+	set_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags);
 	btrfs_qgroup_wait_for_completion(fs_info);
 	spin_lock(&fs_info->qgroup_lock);
 	quota_root = fs_info->quota_root;
@@ -1684,7 +1682,7 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
 	if (old_roots)
 		nr_old_roots = old_roots->nnodes;
 
-	if (!fs_info->quota_enabled)
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
 		goto out_free;
 	BUG_ON(!fs_info->quota_root);
 
@@ -1804,10 +1802,14 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 	if (!quota_root)
 		goto out;
 
-	if (!fs_info->quota_enabled && fs_info->pending_quota_state)
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
+	    test_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags))
 		start_rescan_worker = 1;
 
-	fs_info->quota_enabled = fs_info->pending_quota_state;
+	if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags))
+		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+	if (test_and_clear_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags))
+		clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 
 	spin_lock(&fs_info->qgroup_lock);
 	while (!list_empty(&fs_info->dirty_qgroups)) {
@@ -1826,7 +1828,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 					BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 		spin_lock(&fs_info->qgroup_lock);
 	}
-	if (fs_info->quota_enabled)
+	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
 		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
 	else
 		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
@@ -1871,7 +1873,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 	u64 nums;
 
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
-	if (!fs_info->quota_enabled)
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
 		goto out;
 
 	if (!quota_root) {
@@ -2314,7 +2316,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 			err = PTR_ERR(trans);
 			break;
 		}
-		if (!fs_info->quota_enabled) {
+		if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
 			err = -EINTR;
 		} else {
 			err = qgroup_rescan_leaf(fs_info, path, trans);
@@ -2536,8 +2538,8 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
 	struct ulist_iterator uiter;
 	int ret;
 
-	if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
-	    len == 0)
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+	    !is_fstree(root->objectid) || len == 0)
 		return 0;
 
 	changeset.bytes_changed = 0;
@@ -2634,8 +2636,8 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
 {
 	int ret;
 
-	if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
-	    num_bytes == 0)
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+	    !is_fstree(root->objectid) || num_bytes == 0)
 		return 0;
 
 	BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
@@ -2650,7 +2652,8 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
 {
 	int reserved;
 
-	if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+	    !is_fstree(root->objectid))
 		return;
 
 	reserved = atomic_xchg(&root->qgroup_meta_rsv, 0);
@@ -2661,7 +2664,8 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
 
 void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
 {
-	if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+	    !is_fstree(root->objectid))
 		return;
 
 	BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 864ce33..b9d4c43 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1836,7 +1836,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		}
 		sb->s_flags &= ~MS_RDONLY;
 
-		fs_info->open = 1;
+		set_bit(BTRFS_FS_OPEN, &fs_info->flags);
 	}
 out:
 	wake_up_process(fs_info->transaction_kthread);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 4407fef..ca7cb5e 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -480,7 +480,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
 	 */
 	root->fs_info->tree_root = root;
 	root->fs_info->quota_root = root;
-	root->fs_info->quota_enabled = 1;
+	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 
 	/*
 	 * Can't use bytenr 0, some things freak out
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9de37b5..56bdb2f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -441,7 +441,7 @@ static void wait_current_trans(struct btrfs_root *root)
 
 static int may_wait_transaction(struct btrfs_root *root, int type)
 {
-	if (root->fs_info->log_root_recovering)
+	if (test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
 		return 0;
 
 	if (type == TRANS_USERSPACE)
@@ -993,7 +993,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
-	struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
 	bool errors = false;
 
 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
@@ -1025,17 +1024,17 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 		if ((mark & EXTENT_DIRTY) &&
-		    test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR,
-				       &btree_ino->runtime_flags))
+		    test_and_clear_bit(BTRFS_FS_LOG1_ERR,
+				       &root->fs_info->flags))
 			errors = true;
 
 		if ((mark & EXTENT_NEW) &&
-		    test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR,
-				       &btree_ino->runtime_flags))
+		    test_and_clear_bit(BTRFS_FS_LOG2_ERR,
+				       &root->fs_info->flags))
 			errors = true;
 	} else {
-		if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
-				       &btree_ino->runtime_flags))
+		if (test_and_clear_bit(BTRFS_FS_BTREE_ERR,
+				       &root->fs_info->flags))
 			errors = true;
 	}
 
@@ -1335,7 +1334,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
 	 * kick in anyway.
 	 */
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
-	if (!fs_info->quota_enabled) {
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
 		mutex_unlock(&fs_info->qgroup_ioctl_lock);
 		return 0;
 	}
@@ -1712,7 +1711,7 @@ static void update_super_roots(struct btrfs_root *root)
 	super->root_level = root_item->level;
 	if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
 		super->cache_generation = root_item->generation;
-	if (root->fs_info->update_uuid_tree_gen)
+	if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &root->fs_info->flags))
 		super->uuid_tree_generation = root_item->generation;
 }
 
@@ -1919,7 +1918,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_transaction *prev_trans = NULL;
-	struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
 	int ret;
 
 	/* Stop the commit early if ->aborted is set */
@@ -2213,8 +2211,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_update_commit_device_size(root->fs_info);
 	btrfs_update_commit_device_bytes_used(root, cur_trans);
 
-	clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
-	clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+	clear_bit(BTRFS_FS_LOG1_ERR, &root->fs_info->flags);
+	clear_bit(BTRFS_FS_LOG2_ERR, &root->fs_info->flags);
 
 	btrfs_trans_release_chunk_metadata(trans);
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d31a0c4..a25be18b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5492,7 +5492,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 	if (!path)
 		return -ENOMEM;
 
-	fs_info->log_root_recovering = 1;
+	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
 
 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
 	if (IS_ERR(trans)) {
@@ -5602,7 +5602,7 @@ again:
 
 	free_extent_buffer(log_root_tree->node);
 	log_root_tree->log_root = NULL;
-	fs_info->log_root_recovering = 0;
+	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
 	kfree(log_root_tree);
 
 	return 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bb0addc..89dc9c7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4210,7 +4210,7 @@ out:
 	if (ret)
 		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
 	else
-		fs_info->update_uuid_tree_gen = 1;
+		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
 	up(&fs_info->uuid_tree_rescan_sem);
 	return 0;
 }
-- 
2.7.4



* [PATCH 4/7] Btrfs: kill the start argument to read_extent_buffer_pages
From: Josef Bacik @ 2016-09-02 19:40 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

Nobody uses this; it makes no sense to do partial reads of extent buffers.
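
For illustration (taken straight from the disk-io.c hunk below), the only
change at the call sites is dropping the start argument:

	/* before */
	ret = read_extent_buffer_pages(io_tree, eb, start, WAIT_COMPLETE,
				       btree_get_extent, mirror_num);

	/* after */
	ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
				       btree_get_extent, mirror_num);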

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/disk-io.c   | 13 ++++++-------
 fs/btrfs/extent_io.c | 26 +++++++-------------------
 fs/btrfs/extent_io.h |  2 +-
 3 files changed, 14 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ebd33ef..9c42e53 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -443,7 +443,7 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
  */
 static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 					  struct extent_buffer *eb,
-					  u64 start, u64 parent_transid)
+					  u64 parent_transid)
 {
 	struct extent_io_tree *io_tree;
 	int failed = 0;
@@ -455,8 +455,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 	while (1) {
-		ret = read_extent_buffer_pages(io_tree, eb, start,
-					       WAIT_COMPLETE,
+		ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
 					       btree_get_extent, mirror_num);
 		if (!ret) {
 			if (!verify_parent_transid(io_tree, eb,
@@ -1098,7 +1097,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
 	if (IS_ERR(buf))
 		return;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, 0, WAIT_NONE, btree_get_extent, 0);
+				 buf, WAIT_NONE, btree_get_extent, 0);
 	free_extent_buffer(buf);
 }
 
@@ -1116,7 +1115,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
 
 	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
 
-	ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
 				       btree_get_extent, mirror_num);
 	if (ret) {
 		free_extent_buffer(buf);
@@ -1172,7 +1171,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	if (IS_ERR(buf))
 		return buf;
 
-	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+	ret = btree_read_extent_buffer_pages(root, buf, parent_transid);
 	if (ret) {
 		free_extent_buffer(buf);
 		return ERR_PTR(ret);
@@ -4035,7 +4034,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 {
 	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
-	return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+	return btree_read_extent_buffer_pages(root, buf, parent_transid);
 }
 
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6036431..045e525 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5157,11 +5157,10 @@ int extent_buffer_uptodate(struct extent_buffer *eb)
 }
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb, u64 start, int wait,
+			     struct extent_buffer *eb, int wait,
 			     get_extent_t *get_extent, int mirror_num)
 {
 	unsigned long i;
-	unsigned long start_i;
 	struct page *page;
 	int err;
 	int ret = 0;
@@ -5175,16 +5174,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
 
-	if (start) {
-		WARN_ON(start < eb->start);
-		start_i = (start >> PAGE_SHIFT) -
-			(eb->start >> PAGE_SHIFT);
-	} else {
-		start_i = 0;
-	}
-
 	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = start_i; i < num_pages; i++) {
+	for (i = 0; i < num_pages; i++) {
 		page = eb->pages[i];
 		if (wait == WAIT_NONE) {
 			if (!trylock_page(page))
@@ -5199,15 +5190,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 		}
 	}
 	if (all_uptodate) {
-		if (start_i == 0)
-			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 		goto unlock_exit;
 	}
 
 	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
 	eb->read_mirror = 0;
 	atomic_set(&eb->io_pages, num_reads);
-	for (i = start_i; i < num_pages; i++) {
+	for (i = 0; i < num_pages; i++) {
 		page = eb->pages[i];
 
 		if (!PageUptodate(page)) {
@@ -5248,7 +5238,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (ret || wait != WAIT_COMPLETE)
 		return ret;
 
-	for (i = start_i; i < num_pages; i++) {
+	for (i = 0; i < num_pages; i++) {
 		page = eb->pages[i];
 		wait_on_page_locked(page);
 		if (!PageUptodate(page))
@@ -5258,12 +5248,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	return ret;
 
 unlock_exit:
-	i = start_i;
 	while (locked_pages > 0) {
-		page = eb->pages[i];
-		i++;
-		unlock_page(page);
 		locked_pages--;
+		page = eb->pages[locked_pages];
+		unlock_page(page);
 	}
 	return ret;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 61adf16..0518e3e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -361,7 +361,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
 #define WAIT_COMPLETE	1
 #define WAIT_PAGE_LOCK	2
 int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb, u64 start, int wait,
+			     struct extent_buffer *eb, int wait,
 			     get_extent_t *get_extent, int mirror_num);
 void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
 
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 5/7] Btrfs: don't pass the inode through clean_io_failure
  2016-09-02 19:39 [PATCH 0/7] Kill the btree inode Josef Bacik
                   ` (3 preceding siblings ...)
  2016-09-02 19:40 ` [PATCH 4/7] Btrfs: kill the start argument to read_extent_buffer_pages Josef Bacik
@ 2016-09-02 19:40 ` Josef Bacik
  2016-09-02 19:40 ` [PATCH 6/7] Btrfs: kill the btree_inode Josef Bacik
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Josef Bacik @ 2016-09-02 19:40 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

Instead pass around the failure tree and the io tree.
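
With the inode no longer threaded through, a typical call site (mirroring
the inode.c hunks below) looks like this:

	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;

	clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
			 io_tree, start, page, btrfs_ino(inode), 0);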

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/extent_io.c | 49 +++++++++++++++++++++++++++----------------------
 fs/btrfs/extent_io.h | 10 +++++++---
 fs/btrfs/inode.c     | 24 ++++++++++++++++++------
 3 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 045e525..5dcdd3e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1946,11 +1946,12 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 		SetPageUptodate(page);
 }
 
-int free_io_failure(struct inode *inode, struct io_failure_record *rec)
+int free_io_failure(struct extent_io_tree *failure_tree,
+		    struct extent_io_tree *io_tree,
+		    struct io_failure_record *rec)
 {
 	int ret;
 	int err = 0;
-	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
 
 	set_state_failrec(failure_tree, rec->start, NULL);
 	ret = clear_extent_bits(failure_tree, rec->start,
@@ -1959,7 +1960,7 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
 	if (ret)
 		err = ret;
 
-	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+	ret = clear_extent_bits(io_tree, rec->start,
 				rec->start + rec->len - 1,
 				EXTENT_DAMAGED);
 	if (ret && !err)
@@ -2076,24 +2077,24 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
  */
-int clean_io_failure(struct inode *inode, u64 start, struct page *page,
-		     unsigned int pg_offset)
+int clean_io_failure(struct btrfs_fs_info *fs_info,
+		     struct extent_io_tree *failure_tree,
+		     struct extent_io_tree *io_tree, u64 start,
+		     struct page *page, u64 ino, unsigned int pg_offset)
 {
 	u64 private;
 	struct io_failure_record *failrec;
-	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 	struct extent_state *state;
 	int num_copies;
 	int ret;
 
 	private = 0;
-	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-				(u64)-1, 1, EXTENT_DIRTY, 0);
+	ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
+			       EXTENT_DIRTY, 0);
 	if (!ret)
 		return 0;
 
-	ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start,
-			&failrec);
+	ret = get_state_failrec(failure_tree, start, &failrec);
 	if (ret)
 		return 0;
 
@@ -2108,25 +2109,25 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
 	if (fs_info->sb->s_flags & MS_RDONLY)
 		goto out;
 
-	spin_lock(&BTRFS_I(inode)->io_tree.lock);
-	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+	spin_lock(&io_tree->lock);
+	state = find_first_extent_bit_state(io_tree,
 					    failrec->start,
 					    EXTENT_LOCKED);
-	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+	spin_unlock(&io_tree->lock);
 
 	if (state && state->start <= failrec->start &&
 	    state->end >= failrec->start + failrec->len - 1) {
 		num_copies = btrfs_num_copies(fs_info, failrec->logical,
 					      failrec->len);
 		if (num_copies > 1)  {
-			repair_io_failure(fs_info, btrfs_ino(inode), start,
-					  failrec->len, failrec->logical, page,
-					  pg_offset, failrec->failed_mirror);
+			repair_io_failure(fs_info, ino, start, failrec->len,
+					  failrec->logical, page, pg_offset,
+					  failrec->failed_mirror);
 		}
 	}
 
 out:
-	free_io_failure(inode, failrec);
+	free_io_failure(failure_tree, io_tree, failrec);
 
 	return 0;
 }
@@ -2362,6 +2363,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	struct io_failure_record *failrec;
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
 	struct bio *bio;
 	int read_mode;
 	int ret;
@@ -2374,7 +2376,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 
 	ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
 	if (!ret) {
-		free_io_failure(inode, failrec);
+		free_io_failure(failure_tree, tree, failrec);
 		return -EIO;
 	}
 
@@ -2389,7 +2391,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 				      (int)phy_offset, failed_bio->bi_end_io,
 				      NULL);
 	if (!bio) {
-		free_io_failure(inode, failrec);
+		free_io_failure(failure_tree, tree, failrec);
 		return -EIO;
 	}
 	bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
@@ -2400,7 +2402,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	ret = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
 					 failrec->bio_flags, 0);
 	if (ret) {
-		free_io_failure(inode, failrec);
+		free_io_failure(failure_tree, tree, failrec);
 		bio_put(bio);
 	}
 
@@ -2506,7 +2508,7 @@ static void end_bio_extent_readpage(struct bio *bio)
 	struct bio_vec *bvec;
 	int uptodate = !bio->bi_error;
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
-	struct extent_io_tree *tree;
+	struct extent_io_tree *tree, *failure_tree;
 	u64 offset = 0;
 	u64 start;
 	u64 end;
@@ -2525,6 +2527,7 @@ static void end_bio_extent_readpage(struct bio *bio)
 			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector,
 			 bio->bi_error, io_bio->mirror_num);
 		tree = &BTRFS_I(inode)->io_tree;
+		failure_tree = &BTRFS_I(inode)->io_failure_tree;
 
 		/* We always issue full-page reads, but if some block
 		 * in a page fails to read, blk_update_request() will
@@ -2556,7 +2559,9 @@ static void end_bio_extent_readpage(struct bio *bio)
 			if (ret)
 				uptodate = 0;
 			else
-				clean_io_failure(inode, start, page, 0);
+				clean_io_failure(BTRFS_I(inode)->root->fs_info,
+						 failure_tree, tree, start,
+						 page, btrfs_ino(inode), 0);
 		}
 
 		if (likely(uptodate))
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 0518e3e..c393c06 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -429,8 +429,10 @@ struct btrfs_fs_info;
 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 		      u64 length, u64 logical, struct page *page,
 		      unsigned int pg_offset, int mirror_num);
-int clean_io_failure(struct inode *inode, u64 start, struct page *page,
-		     unsigned int pg_offset);
+int clean_io_failure(struct btrfs_fs_info *fs_info,
+		     struct extent_io_tree *failure_tree,
+		     struct extent_io_tree *io_tree, u64 start,
+		     struct page *page, u64 ino, unsigned int pg_offset);
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 			 int mirror_num);
@@ -463,7 +465,9 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
 				    struct io_failure_record *failrec,
 				    struct page *page, int pg_offset, int icsum,
 				    bio_end_io_t *endio_func, void *data);
-int free_io_failure(struct inode *inode, struct io_failure_record *rec);
+int free_io_failure(struct extent_io_tree *failure_tree,
+		    struct extent_io_tree *io_tree,
+		    struct io_failure_record *rec);
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 noinline u64 find_lock_delalloc_range(struct inode *inode,
 				      struct extent_io_tree *tree,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0b97cb4..8ca791f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7870,6 +7870,8 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 			bio_end_io_t *repair_endio, void *repair_arg)
 {
 	struct io_failure_record *failrec;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
 	struct bio *bio;
 	int isector;
 	int read_mode;
@@ -7884,7 +7886,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 	ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
 					 failed_mirror);
 	if (!ret) {
-		free_io_failure(inode, failrec);
+		free_io_failure(failure_tree, io_tree, failrec);
 		return -EIO;
 	}
 
@@ -7900,7 +7902,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
 				pgoff, isector, repair_endio, repair_arg);
 	if (!bio) {
-		free_io_failure(inode, failrec);
+		free_io_failure(failure_tree, io_tree, failrec);
 		return -EIO;
 	}
 	bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
@@ -7911,7 +7913,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 
 	ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
 	if (ret) {
-		free_io_failure(inode, failrec);
+		free_io_failure(failure_tree, io_tree, failrec);
 		bio_put(bio);
 	}
 
@@ -7930,6 +7932,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
 	struct btrfs_retry_complete *done = bio->bi_private;
 	struct inode *inode;
 	struct bio_vec *bvec;
+	struct extent_io_tree *io_tree, *failure_tree;
 	int i;
 
 	if (bio->bi_error)
@@ -7937,11 +7940,15 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
 
 	ASSERT(bio->bi_vcnt == 1);
 	inode = bio->bi_io_vec->bv_page->mapping->host;
+	io_tree = &BTRFS_I(inode)->io_tree;
+	failure_tree = &BTRFS_I(inode)->io_failure_tree;
 	ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
 
 	done->uptodate = 1;
 	bio_for_each_segment_all(bvec, bio, i)
-		clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+		clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
+				 io_tree, done->start, bvec->bv_page,
+				 btrfs_ino(inode), 0);
 end:
 	complete(&done->done);
 	bio_put(bio);
@@ -8004,6 +8011,7 @@ static void btrfs_retry_endio(struct bio *bio)
 {
 	struct btrfs_retry_complete *done = bio->bi_private;
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	struct extent_io_tree *io_tree, *failure_tree;
 	struct inode *inode;
 	struct bio_vec *bvec;
 	u64 start;
@@ -8020,6 +8028,8 @@ static void btrfs_retry_endio(struct bio *bio)
 
 	ASSERT(bio->bi_vcnt == 1);
 	inode = bio->bi_io_vec->bv_page->mapping->host;
+	io_tree = &BTRFS_I(inode)->io_tree;
+	failure_tree = &BTRFS_I(inode)->io_failure_tree;
 	ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
 
 	bio_for_each_segment_all(bvec, bio, i) {
@@ -8027,8 +8037,10 @@ static void btrfs_retry_endio(struct bio *bio)
 					bvec->bv_page, bvec->bv_offset,
 					done->start, bvec->bv_len);
 		if (!ret)
-			clean_io_failure(done->inode, done->start,
-					bvec->bv_page, bvec->bv_offset);
+			clean_io_failure(BTRFS_I(inode)->root->fs_info,
+					 failure_tree, io_tree, done->start,
+					 bvec->bv_page, btrfs_ino(inode),
+					 bvec->bv_offset);
 		else
 			uptodate = 0;
 	}
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 6/7] Btrfs: kill the btree_inode
  2016-09-02 19:39 [PATCH 0/7] Kill the btree inode Josef Bacik
                   ` (4 preceding siblings ...)
  2016-09-02 19:40 ` [PATCH 5/7] Btrfs: don't pass the inode through clean_io_failure Josef Bacik
@ 2016-09-02 19:40 ` Josef Bacik
  2016-09-08  5:17   ` Chandan Rajendra
  2016-09-09 17:40   ` [PATCH 6/7][V2] " Josef Bacik
  2016-09-02 19:40 ` [PATCH 7/7] Btrfs: kill BUG_ON()'s in btrfs_mark_extent_written Josef Bacik
  2016-09-05 16:31 ` [PATCH 0/7] Kill the btree inode David Sterba
  7 siblings, 2 replies; 16+ messages in thread
From: Josef Bacik @ 2016-09-02 19:40 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

In order to more efficiently support sub-page blocksizes we need to stop
allocating pages from the pagecache for our metadata.  Instead switch to
using the account_metadata* counters to keep the system aware of how much
dirty metadata we have, and use the ->free_cached_objects super operation
to handle freeing up extent buffers.  This greatly simplifies how we deal
with extent buffers, since we no longer have to tie page cache reclamation
to extent buffer reclaim.  This will also allow us to simply kmalloc() our
data for sub-page blocksizes.
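
As a rough sketch of how the reclaim side can be wired up (not part of
this patch: btrfs_eb_isolate() is a hypothetical isolate callback, and the
account_metadata* counters come from the prerequisite generic patches),
the super_operations hooks would look something like:

	static long btrfs_nr_cached_objects(struct super_block *sb,
					    struct shrink_control *sc)
	{
		struct btrfs_fs_info *fs_info = btrfs_sb(sb);

		/* Clean extent buffers parked on the per-fs LRU. */
		return list_lru_count_node(&fs_info->eb_info->lru_list,
					   sc->nid);
	}

	static long btrfs_free_cached_objects(struct super_block *sb,
					      struct shrink_control *sc)
	{
		struct btrfs_fs_info *fs_info = btrfs_sb(sb);

		/* Drop clean, unreferenced buffers until nr_to_scan is spent. */
		return list_lru_walk_node(&fs_info->eb_info->lru_list, sc->nid,
					  btrfs_eb_isolate, NULL,
					  &sc->nr_to_scan);
	}

The point being that clean buffers sit on eb_info->lru_list and the VFS,
rather than page reclaim, decides when to shed them.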

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/btrfs_inode.h                 |   3 +-
 fs/btrfs/ctree.c                       |  10 +-
 fs/btrfs/ctree.h                       |  13 +-
 fs/btrfs/disk-io.c                     | 389 ++++----------
 fs/btrfs/extent_io.c                   | 913 ++++++++++++++++++---------------
 fs/btrfs/extent_io.h                   |  49 +-
 fs/btrfs/inode.c                       |   6 +-
 fs/btrfs/root-tree.c                   |   2 +-
 fs/btrfs/super.c                       |  29 +-
 fs/btrfs/tests/btrfs-tests.c           |  37 +-
 fs/btrfs/tests/extent-io-tests.c       |   4 +-
 fs/btrfs/tests/free-space-tree-tests.c |   4 +-
 fs/btrfs/tests/qgroup-tests.c          |   4 +-
 fs/btrfs/transaction.c                 |  11 +-
 14 files changed, 726 insertions(+), 748 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1a8fa46..ad7b185 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -229,10 +229,9 @@ static inline u64 btrfs_ino(struct inode *inode)
 	u64 ino = BTRFS_I(inode)->location.objectid;
 
 	/*
-	 * !ino: btree_inode
 	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
 	 */
-	if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
+	if (BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
 		ino = inode->i_ino;
 	return ino;
 }
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d1c56c9..b267053 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1373,8 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 
 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
 		BUG_ON(tm->slot != 0);
-		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
-						eb->len);
+		eb_rewin = alloc_dummy_extent_buffer(fs_info->eb_info,
+						     eb->start, eb->len);
 		if (!eb_rewin) {
 			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
@@ -1455,8 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 	} else if (old_root) {
 		btrfs_tree_read_unlock(eb_root);
 		free_extent_buffer(eb_root);
-		eb = alloc_dummy_extent_buffer(root->fs_info, logical,
-					root->nodesize);
+		eb = alloc_dummy_extent_buffer(root->fs_info->eb_info, logical,
+					       root->nodesize);
 	} else {
 		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
 		eb = btrfs_clone_extent_buffer(eb_root);
@@ -1772,7 +1772,7 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 	int err;
 
 	if (low > high) {
-		btrfs_err(eb->fs_info,
+		btrfs_err(eb->eb_info->fs_info,
 		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
 			  __func__, low, high, eb->start,
 			  btrfs_header_owner(eb), btrfs_header_level(eb));
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 282a031..ee6956c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -37,6 +37,7 @@
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/sizes.h>
+#include <linux/list_lru.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -675,6 +676,7 @@ struct btrfs_device;
 struct btrfs_fs_devices;
 struct btrfs_balance_control;
 struct btrfs_delayed_root;
+struct btrfs_eb_info;
 
 #define BTRFS_FS_BARRIER			1
 #define BTRFS_FS_CLOSING_START			2
@@ -797,7 +799,7 @@ struct btrfs_fs_info {
 	struct btrfs_super_block *super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
-	struct inode *btree_inode;
+	struct btrfs_eb_info *eb_info;
 	struct backing_dev_info bdi;
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
@@ -1042,10 +1044,6 @@ struct btrfs_fs_info {
 	/* readahead works cnt */
 	atomic_t reada_works_cnt;
 
-	/* Extent buffer radix tree */
-	spinlock_t buffer_lock;
-	struct radix_tree_root buffer_radix;
-
 	/* next backup root to be overwritten */
 	int backup_root_index;
 
@@ -2884,6 +2882,8 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
 
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+	list_lru_destroy(&fs_info->eb_info->lru_list);
+	kfree(fs_info->eb_info);
 	kfree(fs_info->balance_ctl);
 	kfree(fs_info->delayed_root);
 	kfree(fs_info->extent_root);
@@ -3121,9 +3121,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root,
 			     struct btrfs_root *parent_root,
 			     u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio,
-			 unsigned long bio_flags);
 void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9c42e53..03ac601 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -217,56 +217,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
 
 #endif
 
-/*
- * extents on the btree inode are pretty simple, there's one extent
- * that covers the entire device
- */
-static struct extent_map *btree_get_extent(struct inode *inode,
-		struct page *page, size_t pg_offset, u64 start, u64 len,
-		int create)
-{
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct extent_map *em;
-	int ret;
-
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, start, len);
-	if (em) {
-		em->bdev =
-			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-		read_unlock(&em_tree->lock);
-		goto out;
-	}
-	read_unlock(&em_tree->lock);
-
-	em = alloc_extent_map();
-	if (!em) {
-		em = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-	em->start = 0;
-	em->len = (u64)-1;
-	em->block_len = (u64)-1;
-	em->block_start = 0;
-	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-
-	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
-	if (ret == -EEXIST) {
-		free_extent_map(em);
-		em = lookup_extent_mapping(em_tree, start, len);
-		if (!em)
-			em = ERR_PTR(-EIO);
-	} else if (ret) {
-		free_extent_map(em);
-		em = ERR_PTR(ret);
-	}
-	write_unlock(&em_tree->lock);
-
-out:
-	return em;
-}
-
 u32 btrfs_csum_data(char *data, u32 seed, size_t len)
 {
 	return btrfs_crc32c(seed, data, len);
@@ -349,11 +299,11 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
  * detect blocks that either didn't get written at all or got written
  * in the wrong place.
  */
-static int verify_parent_transid(struct extent_io_tree *io_tree,
-				 struct extent_buffer *eb, u64 parent_transid,
+static int verify_parent_transid(struct extent_buffer *eb, u64 parent_transid,
 				 int atomic)
 {
 	struct extent_state *cached_state = NULL;
+	struct extent_io_tree *io_tree = &eb->eb_info->io_tree;
 	int ret;
 	bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
 
@@ -375,7 +325,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		ret = 0;
 		goto out;
 	}
-	btrfs_err_rl(eb->fs_info,
+	btrfs_err_rl(eb->eb_info->fs_info,
 		"parent transid verify failed on %llu wanted %llu found %llu",
 			eb->start,
 			parent_transid, btrfs_header_generation(eb));
@@ -445,7 +395,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 					  struct extent_buffer *eb,
 					  u64 parent_transid)
 {
-	struct extent_io_tree *io_tree;
 	int failed = 0;
 	int ret;
 	int num_copies = 0;
@@ -453,13 +402,10 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	int failed_mirror = 0;
 
 	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
-	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 	while (1) {
-		ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
-					       btree_get_extent, mirror_num);
+		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
 		if (!ret) {
-			if (!verify_parent_transid(io_tree, eb,
-						   parent_transid, 0))
+			if (!verify_parent_transid(eb, parent_transid, 0))
 				break;
 			else
 				ret = -EIO;
@@ -504,24 +450,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 
 static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
 {
-	u64 start = page_offset(page);
-	u64 found_start;
 	struct extent_buffer *eb;
 
 	eb = (struct extent_buffer *)page->private;
 	if (page != eb->pages[0])
 		return 0;
-
-	found_start = btrfs_header_bytenr(eb);
-	/*
-	 * Please do not consolidate these warnings into a single if.
-	 * It is useful to know what went wrong.
-	 */
-	if (WARN_ON(found_start != start))
-		return -EUCLEAN;
-	if (WARN_ON(!PageUptodate(page)))
-		return -EUCLEAN;
-
 	ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
 			btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
 
@@ -619,8 +552,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 	u64 found_start;
 	int found_level;
 	struct extent_buffer *eb;
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_root *root;
+	struct btrfs_fs_info *fs_info;
 	int ret = 0;
 	int reads_done;
 
@@ -633,6 +566,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 	 * in memory.  Make sure we have a ref for all this other checks
 	 */
 	extent_buffer_get(eb);
+	fs_info = eb->eb_info->fs_info;
+	root = fs_info->tree_root;
 
 	reads_done = atomic_dec_and_test(&eb->io_pages);
 	if (!reads_done)
@@ -693,11 +628,19 @@ err:
 		/*
 		 * our io error hook is going to dec the io pages
 		 * again, we have to make sure it has something
-		 * to decrement
+		 * to decrement.
+		 *
+		 * TODO: Kill this, we've re-arranged how this works now so we
+		 * don't need to do this io_pages dance.
 		 */
 		atomic_inc(&eb->io_pages);
 		clear_extent_buffer_uptodate(eb);
 	}
+	if (reads_done) {
+		clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+		smp_mb__after_atomic();
+		wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+	}
 	free_extent_buffer(eb);
 out:
 	return ret;
@@ -712,7 +655,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
 	eb->read_mirror = failed_mirror;
 	atomic_dec(&eb->io_pages);
 	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
-		btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
+		btree_readahead_hook(eb->eb_info->fs_info, eb, eb->start, -EIO);
 	return -EIO;	/* we fixed nothing */
 }
 
@@ -884,15 +827,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 	return 0;
 }
 
-static int btree_csum_one_bio(struct bio *bio)
+static int btree_csum_one_bio(struct btrfs_fs_info *fs_info, struct bio *bio)
 {
 	struct bio_vec *bvec;
-	struct btrfs_root *root;
 	int i, ret = 0;
 
 	bio_for_each_segment_all(bvec, bio, i) {
-		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
-		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
+		ret = csum_dirty_buffer(fs_info, bvec->bv_page);
 		if (ret)
 			break;
 	}
@@ -904,25 +845,27 @@ static int __btree_submit_bio_start(void *private_data, struct bio *bio,
 				    int mirror_num, unsigned long bio_flags,
 				    u64 bio_offset)
 {
+	struct btrfs_eb_info *eb_info = private_data;
 	/*
 	 * when we're called for a write, we're already in the async
 	 * submission context.  Just jump into btrfs_map_bio
 	 */
-	return btree_csum_one_bio(bio);
+	return btree_csum_one_bio(eb_info->fs_info, bio);
 }
 
 static int __btree_submit_bio_done(void *private_data, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
-	struct inode *inode = private_data;
+	struct btrfs_eb_info *eb_info = private_data;
+	struct btrfs_root *root = eb_info->fs_info->tree_root;
 	int ret;
 
 	/*
 	 * when we're called for a write, we're already in the async
 	 * submission context.  Just jump into btrfs_map_bio
 	 */
-	ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1);
+	ret = btrfs_map_bio(root, bio, mirror_num, 1);
 	if (ret) {
 		bio->bi_error = ret;
 		bio_endio(bio);
@@ -930,7 +873,7 @@ static int __btree_submit_bio_done(void *private_data, struct bio *bio,
 	return ret;
 }
 
-static int check_async_write(struct inode *inode, unsigned long bio_flags)
+static int check_async_write(unsigned long bio_flags)
 {
 	if (bio_flags & EXTENT_BIO_TREE_LOG)
 		return 0;
@@ -945,8 +888,9 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
-	struct inode *inode = private_data;
-	int async = check_async_write(inode, bio_flags);
+	struct btrfs_eb_info *eb_info = private_data;
+	struct btrfs_root *root = eb_info->fs_info->tree_root;
+	int async = check_async_write(bio_flags);
 	int ret;
 
 	if (bio_op(bio) != REQ_OP_WRITE) {
@@ -954,23 +898,22 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
 		 * called for a read, do the setup so that checksum validation
 		 * can happen in the async kernel threads
 		 */
-		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
-					  bio, BTRFS_WQ_ENDIO_METADATA);
+		ret = btrfs_bio_wq_end_io(eb_info->fs_info, bio,
+					  BTRFS_WQ_ENDIO_METADATA);
 		if (ret)
 			goto out_w_error;
-		ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
+		ret = btrfs_map_bio(root, bio, mirror_num, 0);
 	} else if (!async) {
-		ret = btree_csum_one_bio(bio);
+		ret = btree_csum_one_bio(eb_info->fs_info, bio);
 		if (ret)
 			goto out_w_error;
-		ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
+		ret = btrfs_map_bio(root, bio, mirror_num, 0);
 	} else {
 		/*
 		 * kthread helpers are used to submit writes so that
 		 * checksumming can happen in parallel across all CPUs
 		 */
-		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-					  bio, mirror_num, 0,
+		ret = btrfs_wq_submit_bio(eb_info->fs_info, bio, mirror_num, 0,
 					  bio_offset, private_data,
 					  __btree_submit_bio_start,
 					  __btree_submit_bio_done);
@@ -986,118 +929,14 @@ out_w_error:
 	return ret;
 }
 
-#ifdef CONFIG_MIGRATION
-static int btree_migratepage(struct address_space *mapping,
-			struct page *newpage, struct page *page,
-			enum migrate_mode mode)
-{
-	/*
-	 * we can't safely write a btree page from here,
-	 * we haven't done the locking hook
-	 */
-	if (PageDirty(page))
-		return -EAGAIN;
-	/*
-	 * Buffers may be managed in a filesystem specific way.
-	 * We must have no buffers or drop them.
-	 */
-	if (page_has_private(page) &&
-	    !try_to_release_page(page, GFP_KERNEL))
-		return -EAGAIN;
-	return migrate_page(mapping, newpage, page, mode);
-}
-#endif
-
-
-static int btree_writepages(struct address_space *mapping,
-			    struct writeback_control *wbc)
-{
-	struct btrfs_fs_info *fs_info;
-	int ret;
-
-	if (wbc->sync_mode == WB_SYNC_NONE) {
-
-		if (wbc->for_kupdate)
-			return 0;
-
-		fs_info = BTRFS_I(mapping->host)->root->fs_info;
-		/* this is a bit racy, but that's ok */
-		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
-					     BTRFS_DIRTY_METADATA_THRESH);
-		if (ret < 0)
-			return 0;
-	}
-	return btree_write_cache_pages(mapping, wbc);
-}
-
-static int btree_readpage(struct file *file, struct page *page)
-{
-	struct extent_io_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	return extent_read_full_page(tree, page, btree_get_extent, 0);
-}
-
-static int btree_releasepage(struct page *page, gfp_t gfp_flags)
-{
-	if (PageWriteback(page) || PageDirty(page))
-		return 0;
-
-	return try_release_extent_buffer(page);
-}
-
-static void btree_invalidatepage(struct page *page, unsigned int offset,
-				 unsigned int length)
-{
-	struct extent_io_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	extent_invalidatepage(tree, page, offset);
-	btree_releasepage(page, GFP_NOFS);
-	if (PagePrivate(page)) {
-		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
-			   "page private not zero on page %llu",
-			   (unsigned long long)page_offset(page));
-		ClearPagePrivate(page);
-		set_page_private(page, 0);
-		put_page(page);
-	}
-}
-
-static int btree_set_page_dirty(struct page *page)
-{
-#ifdef DEBUG
-	struct extent_buffer *eb;
-
-	BUG_ON(!PagePrivate(page));
-	eb = (struct extent_buffer *)page->private;
-	BUG_ON(!eb);
-	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-	BUG_ON(!atomic_read(&eb->refs));
-	btrfs_assert_tree_locked(eb);
-#endif
-	return __set_page_dirty_nobuffers(page);
-}
-
-static const struct address_space_operations btree_aops = {
-	.readpage	= btree_readpage,
-	.writepages	= btree_writepages,
-	.releasepage	= btree_releasepage,
-	.invalidatepage = btree_invalidatepage,
-#ifdef CONFIG_MIGRATION
-	.migratepage	= btree_migratepage,
-#endif
-	.set_page_dirty = btree_set_page_dirty,
-};
-
 void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
 {
 	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = root->fs_info->btree_inode;
 
 	buf = btrfs_find_create_tree_block(root, bytenr);
 	if (IS_ERR(buf))
 		return;
-	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, WAIT_NONE, btree_get_extent, 0);
+	read_extent_buffer_pages(buf, WAIT_NONE, 0);
 	free_extent_buffer(buf);
 }
 
@@ -1105,8 +944,6 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
 			 int mirror_num, struct extent_buffer **eb)
 {
 	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
 	int ret;
 
 	buf = btrfs_find_create_tree_block(root, bytenr);
@@ -1115,8 +952,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
 
 	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
 
-	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
-				       btree_get_extent, mirror_num);
+	ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
 	if (ret) {
 		free_extent_buffer(buf);
 		return ret;
@@ -1136,29 +972,29 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
 					    u64 bytenr)
 {
-	return find_extent_buffer(fs_info, bytenr);
+	return find_extent_buffer(fs_info->eb_info, bytenr);
 }
 
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						 u64 bytenr)
 {
 	if (btrfs_is_testing(root->fs_info))
-		return alloc_test_extent_buffer(root->fs_info, bytenr,
-				root->nodesize);
+		return alloc_test_extent_buffer(root->fs_info->eb_info, bytenr,
+						root->nodesize);
 	return alloc_extent_buffer(root->fs_info, bytenr);
 }
 
 
 int btrfs_write_tree_block(struct extent_buffer *buf)
 {
-	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
-					buf->start + buf->len - 1);
+	return btree_write_range(buf->eb_info->fs_info, buf->start,
+				 buf->start + buf->len - 1);
 }
 
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
-	return filemap_fdatawait_range(buf->pages[0]->mapping,
-				       buf->start, buf->start + buf->len - 1);
+	return btree_wait_range(buf->eb_info->fs_info, buf->start,
+				buf->start + buf->len - 1);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -1188,14 +1024,10 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
 	    fs_info->running_transaction->transid) {
 		btrfs_assert_tree_locked(buf);
 
-		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+		if (clear_extent_buffer_dirty(buf))
 			__percpu_counter_add(&fs_info->dirty_metadata_bytes,
 					     -buf->len,
 					     fs_info->dirty_metadata_batch);
-			/* ugh, clear_extent_buffer_dirty needs to lock the page */
-			btrfs_set_lock_blocking(buf);
-			clear_extent_buffer_dirty(buf);
-		}
 	}
 }
 
@@ -2247,33 +2079,20 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
 	init_waitqueue_head(&fs_info->balance_wait_q);
 }
 
-static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
-				   struct btrfs_root *tree_root)
+int btrfs_init_eb_info(struct btrfs_fs_info *fs_info)
 {
-	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
-	set_nlink(fs_info->btree_inode, 1);
-	/*
-	 * we set the i_size on the btree inode to the max possible int.
-	 * the real end of the address space is determined by all of
-	 * the devices in the system
-	 */
-	fs_info->btree_inode->i_size = OFFSET_MAX;
-	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
-
-	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
-	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
-			    fs_info->btree_inode);
-	BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
-	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
-
-	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
-
-	BTRFS_I(fs_info->btree_inode)->root = tree_root;
-	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
-	       sizeof(struct btrfs_key));
-	set_bit(BTRFS_INODE_DUMMY,
-		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
-	btrfs_insert_inode_hash(fs_info->btree_inode);
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
+
+	eb_info->fs_info = fs_info;
+	extent_io_tree_init(&eb_info->io_tree, eb_info);
+	eb_info->io_tree.track_uptodate = 0;
+	eb_info->io_tree.ops = &btree_extent_io_ops;
+	extent_io_tree_init(&eb_info->io_failure_tree, eb_info);
+	INIT_RADIX_TREE(&eb_info->buffer_radix, GFP_ATOMIC);
+	spin_lock_init(&eb_info->buffer_lock);
+	if (list_lru_init(&eb_info->lru_list))
+		return -ENOMEM;
+	return 0;
 }
 
 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2566,16 +2385,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_delalloc_bytes;
 	}
 
-	fs_info->btree_inode = new_inode(sb);
-	if (!fs_info->btree_inode) {
-		err = -ENOMEM;
-		goto fail_bio_counter;
-	}
-
-	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
-	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2590,7 +2400,6 @@ int open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->tree_mod_seq_lock);
 	spin_lock_init(&fs_info->super_lock);
 	spin_lock_init(&fs_info->qgroup_op_lock);
-	spin_lock_init(&fs_info->buffer_lock);
 	spin_lock_init(&fs_info->unused_bgs_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->unused_bg_unpin_mutex);
@@ -2643,7 +2452,7 @@ int open_ctree(struct super_block *sb,
 					GFP_KERNEL);
 	if (!fs_info->delayed_root) {
 		err = -ENOMEM;
-		goto fail_iput;
+		goto fail_alloc;
 	}
 	btrfs_init_delayed_root(fs_info->delayed_root);
 
@@ -2658,7 +2467,15 @@ int open_ctree(struct super_block *sb,
 	sb->s_blocksize_bits = blksize_bits(4096);
 	sb->s_bdi = &fs_info->bdi;
 
-	btrfs_init_btree_inode(fs_info, tree_root);
+	fs_info->eb_info = kzalloc(sizeof(struct btrfs_eb_info), GFP_KERNEL);
+	if (!fs_info->eb_info) {
+		err = -ENOMEM;
+		goto fail_alloc;
+	}
+	if (btrfs_init_eb_info(fs_info)) {
+		err = -ENOMEM;
+		goto fail_alloc;
+	}
 
 	spin_lock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree = RB_ROOT;
@@ -3085,6 +2902,14 @@ retry_root_backup:
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
+	/*
+	 * We need to make sure we are on the bdi's dirty list so we get
+	 * writeback requests for our fs properly.
+	 */
+	spin_lock(&fs_info->bdi.sb_list_lock);
+	list_add_tail(&fs_info->bdi.dirty_sb_list, &sb->s_bdi_list);
+	spin_unlock(&fs_info->bdi.sb_list_lock);
+
 	if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) &&
 	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
 		btrfs_info(fs_info, "creating free space tree");
@@ -3180,7 +3005,8 @@ fail_cleaner:
 	 * make sure we're done with the btree inode before we stop our
 	 * kthreads
 	 */
-	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+	btree_write_range(fs_info, 0, (u64)-1);
+	btree_wait_range(fs_info, 0, (u64)-1);
 
 fail_sysfs:
 	btrfs_sysfs_remove_mounted(fs_info);
@@ -3194,16 +3020,11 @@ fail_block_groups:
 
 fail_tree_roots:
 	free_root_pointers(fs_info, 1);
-	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
-
+	btrfs_invalidate_eb_info(fs_info->eb_info);
 fail_sb_buffer:
 	btrfs_stop_all_workers(fs_info);
 fail_alloc:
-fail_iput:
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
-
-	iput(fs_info->btree_inode);
-fail_bio_counter:
 	percpu_counter_destroy(&fs_info->bio_counter);
 fail_delalloc_bytes:
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
@@ -3908,14 +3729,11 @@ void close_ctree(struct btrfs_root *root)
 	 * we must make sure there is not any read request to
 	 * submit after we stopping all workers.
 	 */
-	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	btrfs_stop_all_workers(fs_info);
 
 	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
 	free_root_pointers(fs_info, 1);
 
-	iput(fs_info->btree_inode);
-
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY))
 		btrfsic_unmount(root, fs_info->fs_devices);
@@ -3924,6 +3742,8 @@ void close_ctree(struct btrfs_root *root)
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
+	btrfs_invalidate_eb_info(fs_info->eb_info);
+
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
 	percpu_counter_destroy(&fs_info->bio_counter);
@@ -3951,14 +3771,12 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
 			  int atomic)
 {
 	int ret;
-	struct inode *btree_inode = buf->pages[0]->mapping->host;
 
 	ret = extent_buffer_uptodate(buf);
 	if (!ret)
 		return ret;
 
-	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
-				    parent_transid, atomic);
+	ret = verify_parent_transid(buf, parent_transid, atomic);
 	if (ret == -EAGAIN)
 		return ret;
 	return !ret;
@@ -3979,7 +3797,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
 		return;
 #endif
-	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	root = buf->eb_info->fs_info->tree_root;
 	btrfs_assert_tree_locked(buf);
 	if (transid != root->fs_info->generation)
 		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -4015,10 +3833,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
 
 	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
 				     BTRFS_DIRTY_METADATA_THRESH);
-	if (ret > 0) {
+	if (ret > 0)
 		balance_dirty_pages_ratelimited(&root->fs_info->bdi,
 						root->fs_info->sb);
-	}
 }
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root)
@@ -4033,7 +3850,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
 
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root = buf->eb_info->fs_info->tree_root;
 	return btree_read_extent_buffer_pages(root, buf, parent_transid);
 }
 
@@ -4376,10 +4193,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 			if (!eb)
 				continue;
 			wait_on_extent_buffer_writeback(eb);
-
-			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
-					       &eb->bflags))
-				clear_extent_buffer_dirty(eb);
+			clear_extent_buffer_dirty(eb);
 			free_extent_buffer_stale(eb);
 		}
 	}
@@ -4504,16 +4318,37 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 
 static struct btrfs_fs_info *btree_fs_info(void *private_data)
 {
-	struct inode *inode = private_data;
-	return btrfs_sb(inode->i_sb);
+	struct btrfs_eb_info *eb_info = private_data;
+	return eb_info->fs_info;
+}
+
+static int btree_merge_bio_hook(struct page *page, unsigned long offset,
+				size_t size, struct bio *bio,
+				unsigned long bio_flags)
+{
+	struct extent_buffer *eb = (struct extent_buffer *)page->private;
+	struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
+	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+	u64 length = 0;
+	u64 map_length;
+	int ret;
+
+	length = bio->bi_iter.bi_size;
+	map_length = length;
+	ret = btrfs_map_block(fs_info, bio_op(bio), logical, &map_length,
+			      NULL, 0);
+	if (ret < 0)
+		return ret;
+	if (map_length < length + size)
+		return 1;
+	return 0;
 }
 
 static const struct extent_io_ops btree_extent_io_ops = {
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.readpage_io_failed_hook = btree_io_failed_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
-	/* note we're sharing with inode.c for the merge bio hook */
-	.merge_bio_hook = btrfs_merge_bio_hook,
+	.merge_bio_hook = btree_merge_bio_hook,
 	.tree_fs_info = btree_fs_info,
 	.set_range_writeback = btrfs_set_range_writeback,
 };
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5dcdd3e..5c18a49 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -75,8 +75,8 @@ void btrfs_leak_debug_check(void)
 	while (!list_empty(&buffers)) {
 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
 		printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
-		       "refs %d\n",
-		       eb->start, eb->len, atomic_read(&eb->refs));
+		       "bflags %lu refs %d\n",
+		       eb->start, eb->len, eb->bflags, atomic_read(&eb->refs));
 		list_del(&eb->leak_list);
 		kmem_cache_free(extent_buffer_cache, eb);
 	}
@@ -3538,7 +3538,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
 			  struct btrfs_fs_info *fs_info,
 			  struct extent_page_data *epd)
 {
-	unsigned long i, num_pages;
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
 	int flush = 0;
 	int ret = 0;
 
@@ -3585,37 +3585,42 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
 
 	btrfs_tree_unlock(eb);
 
-	if (!ret)
-		return ret;
-
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		struct page *p = eb->pages[i];
-
-		if (!trylock_page(p)) {
-			if (!flush) {
-				flush_write_bio(epd);
-				flush = 1;
-			}
-			lock_page(p);
-		}
+	/*
+	 * We cleared dirty on this buffer, we need to adjust the radix tags.
+	 * We do the actual page accounting in write_one_eb.
+	 */
+	if (ret) {
+		spin_lock_irq(&eb_info->buffer_lock);
+		radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
+				   PAGECACHE_TAG_WRITEBACK);
+		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
+				     PAGECACHE_TAG_DIRTY);
+		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
+				     PAGECACHE_TAG_TOWRITE);
+		spin_unlock_irq(&eb_info->buffer_lock);
 	}
-
 	return ret;
 }
 
 static void end_extent_buffer_writeback(struct extent_buffer *eb)
 {
-	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
-	smp_mb__after_atomic();
-	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+	if (test_and_clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+		struct btrfs_eb_info *eb_info = eb->eb_info;
+		unsigned long flags;
+
+		spin_lock_irqsave(&eb_info->buffer_lock, flags);
+		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
+				     PAGECACHE_TAG_WRITEBACK);
+		spin_unlock_irqrestore(&eb_info->buffer_lock, flags);
+		wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+	}
 }
 
 static void set_btree_ioerr(struct page *page)
 {
 	struct extent_buffer *eb = (struct extent_buffer *)page->private;
+	struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
 
-	SetPageError(page);
 	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
 		return;
 
@@ -3624,8 +3629,7 @@ static void set_btree_ioerr(struct page *page)
 	 * failed, increment the counter transaction->eb_write_errors.
 	 * We do this because while the transaction is running and before it's
 	 * committing (when we call filemap_fdata[write|wait]_range against
-	 * the btree inode), we might have
-	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
+	 * the btree inode), we might have write_metadata() called - if it
 	 * returns an error or an error happens during writeback, when we're
 	 * committing the transaction we wouldn't know about it, since the pages
 	 * can be no longer dirty nor marked anymore for writeback (if a
@@ -3659,13 +3663,13 @@ static void set_btree_ioerr(struct page *page)
 	 */
 	switch (eb->log_index) {
 	case -1:
-		set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
+		set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
 		break;
 	case 0:
-		set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
+		set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
 		break;
 	case 1:
-		set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
+		set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
 		break;
 	default:
 		BUG(); /* unexpected, logic error */
@@ -3686,16 +3690,13 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
 		done = atomic_dec_and_test(&eb->io_pages);
 
 		if (bio->bi_error ||
-		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
-			ClearPageUptodate(page);
+		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
 			set_btree_ioerr(page);
-		}
-
-		end_page_writeback(page);
 
+		account_metadata_end_writeback(page,
+					       &eb->eb_info->fs_info->bdi);
 		if (!done)
 			continue;
-
 		end_extent_buffer_writeback(eb);
 	}
 
@@ -3708,7 +3709,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 			struct extent_page_data *epd)
 {
 	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
-	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+	struct extent_io_tree *tree = &fs_info->eb_info->io_tree;
 	u64 offset = eb->start;
 	unsigned long i, num_pages;
 	unsigned long bio_flags = 0;
@@ -3724,8 +3725,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = eb->pages[i];
 
-		clear_page_dirty_for_io(p);
-		set_page_writeback(p);
 		ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
 					 p, offset >> 9, PAGE_SIZE, 0, bdev,
 					 &epd->bio, -1,
@@ -3734,34 +3733,90 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 		epd->bio_flags = bio_flags;
 		if (ret) {
 			set_btree_ioerr(p);
-			end_page_writeback(p);
 			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
 				end_extent_buffer_writeback(eb);
 			ret = -EIO;
 			break;
 		}
+		account_metadata_writeback(p, &fs_info->bdi);
 		offset += PAGE_SIZE;
 		update_nr_written(p, wbc, 1);
-		unlock_page(p);
 	}
 
-	if (unlikely(ret)) {
-		for (; i < num_pages; i++) {
-			struct page *p = eb->pages[i];
-			clear_page_dirty_for_io(p);
-			unlock_page(p);
+	return ret;
+}
+
+#define EB_TAG_BATCH 4096
+static void tag_ebs_for_writeback(struct btrfs_eb_info *eb_info, pgoff_t start,
+				  pgoff_t end)
+{
+	unsigned long tagged;
+
+	do {
+		spin_lock_irq(&eb_info->buffer_lock);
+		tagged = radix_tree_range_tag_if_tagged(&eb_info->buffer_radix,
+							&start, end,
+							EB_TAG_BATCH,
+							PAGECACHE_TAG_DIRTY,
+							PAGECACHE_TAG_TOWRITE);
+		spin_unlock_irq(&eb_info->buffer_lock);
+		cond_resched();
+	} while (tagged >= EB_TAG_BATCH && start);
+}
+
+static unsigned eb_lookup_tag(struct btrfs_eb_info *eb_info,
+			      struct extent_buffer **ebs, pgoff_t *index,
+			      int tag, unsigned nr)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+	unsigned ret = 0;
+
+	if (unlikely(!nr))
+		return 0;
+
+	rcu_read_lock();
+	radix_tree_for_each_tagged(slot, &eb_info->buffer_radix, &iter, *index,
+				   tag) {
+		struct extent_buffer *eb;
+repeat:
+		eb = radix_tree_deref_slot(slot);
+		if (unlikely(!eb))
+			continue;
+
+		if (radix_tree_exception(eb)) {
+			if (radix_tree_deref_retry(eb)) {
+				slot = radix_tree_iter_retry(&iter);
+				continue;
+			}
+			continue;
 		}
-	}
 
+		if (unlikely(!atomic_inc_not_zero(&eb->refs)))
+			continue;
+
+		if (unlikely(eb != *slot)) {
+			free_extent_buffer(eb);
+			goto repeat;
+		}
+
+		ebs[ret] = eb;
+		if (++ret == nr)
+			break;
+	}
+	rcu_read_unlock();
+	if (ret)
+		*index = (ebs[ret - 1]->start >> PAGE_SHIFT) + 1;
 	return ret;
 }
 
-int btree_write_cache_pages(struct address_space *mapping,
+#define EBVEC_SIZE 16
+static int btree_write_cache_pages(struct btrfs_fs_info *fs_info,
 				   struct writeback_control *wbc)
 {
-	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
-	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
-	struct extent_buffer *eb, *prev_eb = NULL;
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
+	struct extent_io_tree *tree = &eb_info->io_tree;
+	struct extent_buffer *eb;
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
@@ -3772,16 +3827,16 @@ int btree_write_cache_pages(struct address_space *mapping,
 	int ret = 0;
 	int done = 0;
 	int nr_to_write_done = 0;
-	struct pagevec pvec;
-	int nr_pages;
+	struct extent_buffer *ebs[EBVEC_SIZE];
+	int nr_ebs;
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
+	pgoff_t done_index = 0;
 	int scanned = 0;
 	int tag;
 
-	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
-		index = mapping->writeback_index; /* Start from prev offset */
+		index = eb_info->writeback_index; /* Start from prev offset */
 		end = -1;
 	} else {
 		index = wbc->range_start >> PAGE_SHIFT;
@@ -3794,53 +3849,27 @@ int btree_write_cache_pages(struct address_space *mapping,
 		tag = PAGECACHE_TAG_DIRTY;
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		tag_pages_for_writeback(mapping, index, end);
+		tag_ebs_for_writeback(fs_info->eb_info, index, end);
 	while (!done && !nr_to_write_done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+	       (nr_ebs = eb_lookup_tag(eb_info, ebs, &index, tag,
+			min(end - index, (pgoff_t)EBVEC_SIZE-1) + 1))) {
 		unsigned i;
 
 		scanned = 1;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			if (!PagePrivate(page))
-				continue;
-
-			if (!wbc->range_cyclic && page->index > end) {
-				done = 1;
-				break;
-			}
-
-			spin_lock(&mapping->private_lock);
-			if (!PagePrivate(page)) {
-				spin_unlock(&mapping->private_lock);
-				continue;
-			}
-
-			eb = (struct extent_buffer *)page->private;
-
-			/*
-			 * Shouldn't happen and normally this would be a BUG_ON
-			 * but no sense in crashing the users box for something
-			 * we can survive anyway.
-			 */
-			if (WARN_ON(!eb)) {
-				spin_unlock(&mapping->private_lock);
+		for (i = 0; i < nr_ebs; i++) {
+			eb = ebs[i];
+			if (done) {
+				free_extent_buffer(eb);
 				continue;
 			}
 
-			if (eb == prev_eb) {
-				spin_unlock(&mapping->private_lock);
+			if (!wbc->range_cyclic && eb->start > wbc->range_end) {
+				done = 1;
+				free_extent_buffer(eb);
 				continue;
 			}
 
-			ret = atomic_inc_not_zero(&eb->refs);
-			spin_unlock(&mapping->private_lock);
-			if (!ret)
-				continue;
-
-			prev_eb = eb;
+			done_index = eb_index(eb);
 			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
 			if (!ret) {
 				free_extent_buffer(eb);
@@ -3848,12 +3877,11 @@ retry:
 			}
 
 			ret = write_one_eb(eb, fs_info, wbc, &epd);
+			free_extent_buffer(eb);
 			if (ret) {
 				done = 1;
-				free_extent_buffer(eb);
-				break;
+				continue;
 			}
-			free_extent_buffer(eb);
 
 			/*
 			 * the filesystem may choose to bump up nr_to_write.
@@ -3862,7 +3890,6 @@ retry:
 			 */
 			nr_to_write_done = wbc->nr_to_write <= 0;
 		}
-		pagevec_release(&pvec);
 		cond_resched();
 	}
 	if (!scanned && !done) {
@@ -3874,10 +3901,77 @@ retry:
 		index = 0;
 		goto retry;
 	}
+	if (wbc->range_cyclic)
+		fs_info->eb_info->writeback_index = done_index;
 	flush_write_bio(&epd);
 	return ret;
 }
 
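+/* Callback for ->write_metadata: write back dirty extent buffers. */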
+void btrfs_write_ebs(struct super_block *sb, struct writeback_control *wbc)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	btree_write_cache_pages(fs_info, wbc);
+}
+
+static int __btree_write_range(struct btrfs_fs_info *fs_info, u64 start,
+			       u64 end, int sync_mode)
+{
+	struct writeback_control wbc = {
+		.sync_mode = sync_mode,
+		.nr_to_write = LONG_MAX,
+		.range_start = start,
+		.range_end = end,
+	};
+
+	return btree_write_cache_pages(fs_info, &wbc);
+}
+
+void btree_flush(struct btrfs_fs_info *fs_info)
+{
+	__btree_write_range(fs_info, 0, (u64)-1, WB_SYNC_NONE);
+}
+
+int btree_write_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
+{
+	return __btree_write_range(fs_info, start, end, WB_SYNC_ALL);
+}
+
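+/*
+ * Wait for writeback on all ebs in the range [start, end], returning -EIO
+ * if any of them saw a write error.
+ */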
+int btree_wait_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
+{
+	struct extent_buffer *ebs[EBVEC_SIZE];
+	pgoff_t index = start >> PAGE_SHIFT;
+	pgoff_t end_index = end >> PAGE_SHIFT;
+	unsigned nr_ebs;
+	int ret = 0;
+
+	if (end < start)
+		return ret;
+
+	while ((index <= end) &&
+	       (nr_ebs = eb_lookup_tag(fs_info->eb_info, ebs, &index,
+				       PAGECACHE_TAG_WRITEBACK,
+				       min(end_index - index,
+					   (pgoff_t)EBVEC_SIZE-1) + 1)) != 0) {
+		unsigned i;
+
+		for (i = 0; i < nr_ebs; i++) {
+			struct extent_buffer *eb = ebs[i];
+
+			if (eb->start > end) {
+				free_extent_buffer(eb);
+				continue;
+			}
+
+			wait_on_extent_buffer_writeback(eb);
+			if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+				ret = -EIO;
+			free_extent_buffer(eb);
+		}
+		cond_resched();
+	}
+	return ret;
+}
+
 /**
  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
@@ -4558,7 +4652,6 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
 {
 	unsigned long index;
 	struct page *page;
-	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
 
 	BUG_ON(extent_buffer_under_io(eb));
 
@@ -4566,39 +4659,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
 	if (index == 0)
 		return;
 
+	ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 	do {
 		index--;
 		page = eb->pages[index];
 		if (!page)
 			continue;
-		if (mapped)
-			spin_lock(&page->mapping->private_lock);
-		/*
-		 * We do this since we'll remove the pages after we've
-		 * removed the eb from the radix tree, so we could race
-		 * and have this page now attached to the new eb.  So
-		 * only clear page_private if it's still connected to
-		 * this eb.
-		 */
-		if (PagePrivate(page) &&
-		    page->private == (unsigned long)eb) {
-			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-			BUG_ON(PageDirty(page));
-			BUG_ON(PageWriteback(page));
-			/*
-			 * We need to make sure we haven't be attached
-			 * to a new eb.
-			 */
-			ClearPagePrivate(page);
-			set_page_private(page, 0);
-			/* One for the page private */
-			put_page(page);
-		}
+		ASSERT(PagePrivate(page));
+		ASSERT(page->private == (unsigned long)eb);
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
 
-		if (mapped)
-			spin_unlock(&page->mapping->private_lock);
+		/* Once for the page private. */
+		put_page(page);
 
-		/* One for when we allocated the page */
+		/* Once for the alloc_page. */
 		put_page(page);
 	} while (index != 0);
 }
@@ -4613,7 +4688,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 }
 
 static struct extent_buffer *
-__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
+__alloc_extent_buffer(struct btrfs_eb_info *eb_info, u64 start,
 		      unsigned long len)
 {
 	struct extent_buffer *eb = NULL;
@@ -4621,7 +4696,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
 	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
 	eb->start = start;
 	eb->len = len;
-	eb->fs_info = fs_info;
+	eb->eb_info = eb_info;
 	eb->bflags = 0;
 	rwlock_init(&eb->lock);
 	atomic_set(&eb->write_locks, 0);
@@ -4633,6 +4708,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
 	eb->lock_nested = 0;
 	init_waitqueue_head(&eb->write_lock_wq);
 	init_waitqueue_head(&eb->read_lock_wq);
+	INIT_LIST_HEAD(&eb->lru);
 
 	btrfs_leak_debug_add(&eb->leak_list, &buffers);
 
@@ -4657,7 +4733,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
 	struct extent_buffer *new;
 	unsigned long num_pages = num_extent_pages(src->start, src->len);
 
-	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
+	new = __alloc_extent_buffer(src->eb_info, src->start, src->len);
 	if (new == NULL)
 		return NULL;
 
@@ -4668,8 +4744,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
 			return NULL;
 		}
 		attach_extent_buffer_page(new, p);
-		WARN_ON(PageDirty(p));
-		SetPageUptodate(p);
 		new->pages[i] = p;
 	}
 
@@ -4680,8 +4754,8 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
 	return new;
 }
 
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-						  u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_eb_info *eb_info,
+						u64 start, unsigned long len)
 {
 	struct extent_buffer *eb;
 	unsigned long num_pages;
@@ -4689,7 +4763,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 
 	num_pages = num_extent_pages(start, len);
 
-	eb = __alloc_extent_buffer(fs_info, start, len);
+	eb = __alloc_extent_buffer(eb_info, start, len);
 	if (!eb)
 		return NULL;
 
@@ -4697,6 +4771,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 		eb->pages[i] = alloc_page(GFP_NOFS);
 		if (!eb->pages[i])
 			goto err;
+		attach_extent_buffer_page(eb, eb->pages[i]);
 	}
 	set_extent_buffer_uptodate(eb);
 	btrfs_set_header_nritems(eb, 0);
@@ -4704,30 +4779,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 
 	return eb;
 err:
-	for (; i > 0; i--)
-		__free_page(eb->pages[i - 1]);
-	__free_extent_buffer(eb);
+	btrfs_release_extent_buffer(eb);
 	return NULL;
 }
 
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-						u64 start, u32 nodesize)
-{
-	unsigned long len;
-
-	if (!fs_info) {
-		/*
-		 * Called only from tests that don't always have a fs_info
-		 * available
-		 */
-		len = nodesize;
-	} else {
-		len = fs_info->tree_root->nodesize;
-	}
-
-	return __alloc_dummy_extent_buffer(fs_info, start, len);
-}
-
 static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
 	int refs;
@@ -4777,13 +4832,13 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,
 	}
 }
 
-struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+struct extent_buffer *find_extent_buffer(struct btrfs_eb_info *eb_info,
 					 u64 start)
 {
 	struct extent_buffer *eb;
 
 	rcu_read_lock();
-	eb = radix_tree_lookup(&fs_info->buffer_radix,
+	eb = radix_tree_lookup(&eb_info->buffer_radix,
 			       start >> PAGE_SHIFT);
 	if (eb && atomic_inc_not_zero(&eb->refs)) {
 		rcu_read_unlock();
@@ -4815,30 +4870,30 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
 }
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
-					u64 start, u32 nodesize)
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_eb_info *eb_info,
+					       u64 start, u32 nodesize)
 {
 	struct extent_buffer *eb, *exists = NULL;
 	int ret;
 
-	eb = find_extent_buffer(fs_info, start);
+	eb = find_extent_buffer(eb_info, start);
 	if (eb)
 		return eb;
-	eb = alloc_dummy_extent_buffer(fs_info, start, nodesize);
+	eb = alloc_dummy_extent_buffer(eb_info, start, nodesize);
 	if (!eb)
 		return NULL;
-	eb->fs_info = fs_info;
+	eb->eb_info = eb_info;
 again:
 	ret = radix_tree_preload(GFP_NOFS);
 	if (ret)
 		goto free_eb;
-	spin_lock(&fs_info->buffer_lock);
-	ret = radix_tree_insert(&fs_info->buffer_radix,
+	spin_lock_irq(&eb_info->buffer_lock);
+	ret = radix_tree_insert(&eb_info->buffer_radix,
 				start >> PAGE_SHIFT, eb);
-	spin_unlock(&fs_info->buffer_lock);
+	spin_unlock_irq(&eb_info->buffer_lock);
 	radix_tree_preload_end();
 	if (ret == -EEXIST) {
-		exists = find_extent_buffer(fs_info, start);
+		exists = find_extent_buffer(eb_info, start);
 		if (exists)
 			goto free_eb;
 		else
@@ -4854,6 +4909,7 @@ again:
 	 * bump the ref count again.
 	 */
 	atomic_inc(&eb->refs);
+	set_extent_buffer_uptodate(eb);
 	return eb;
 free_eb:
 	btrfs_release_extent_buffer(eb);
@@ -4867,12 +4923,12 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	unsigned long len = fs_info->tree_root->nodesize;
 	unsigned long num_pages = num_extent_pages(start, len);
 	unsigned long i;
-	unsigned long index = start >> PAGE_SHIFT;
 	struct extent_buffer *eb;
 	struct extent_buffer *exists = NULL;
 	struct page *p;
-	struct address_space *mapping = fs_info->btree_inode->i_mapping;
-	int uptodate = 1;
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
+//	struct zone *last_zone = NULL;
+//	pg_data_t *last_pgdata = NULL;
 	int ret;
 
 	if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) {
@@ -4880,62 +4936,36 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 		return ERR_PTR(-EINVAL);
 	}
 
-	eb = find_extent_buffer(fs_info, start);
+	eb = find_extent_buffer(eb_info, start);
 	if (eb)
 		return eb;
 
-	eb = __alloc_extent_buffer(fs_info, start, len);
+	eb = __alloc_extent_buffer(eb_info, start, len);
 	if (!eb)
 		return ERR_PTR(-ENOMEM);
 
-	for (i = 0; i < num_pages; i++, index++) {
-		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
+	for (i = 0; i < num_pages; i++) {
+		p = alloc_page(GFP_NOFS|__GFP_NOFAIL);
 		if (!p) {
 			exists = ERR_PTR(-ENOMEM);
 			goto free_eb;
 		}
 
-		spin_lock(&mapping->private_lock);
-		if (PagePrivate(p)) {
-			/*
-			 * We could have already allocated an eb for this page
-			 * and attached one so lets see if we can get a ref on
-			 * the existing eb, and if we can we know it's good and
-			 * we can just return that one, else we know we can just
-			 * overwrite page->private.
-			 */
-			exists = (struct extent_buffer *)p->private;
-			if (atomic_inc_not_zero(&exists->refs)) {
-				spin_unlock(&mapping->private_lock);
-				unlock_page(p);
-				put_page(p);
-				mark_extent_buffer_accessed(exists, p);
-				goto free_eb;
-			}
-			exists = NULL;
-
-			/*
-			 * Do this so attach doesn't complain and we need to
-			 * drop the ref the old guy had.
-			 */
-			ClearPagePrivate(p);
-			WARN_ON(PageDirty(p));
-			put_page(p);
-		}
+		/*
+		 * If our pages span zones or numa nodes we have to do
+		 * dirty/writeback accounting per page, otherwise we can do it
+		 * in bulk and save us some looping.
+		 *
+		if (!last_zone)
+			last_zone = page_zone(p);
+		if (!last_pgdata)
+			last_pgdata = page_pgdata(p);
+		if (last_zone != page_zone(p) || last_pgdata != page_pgdata(p))
+			set_bit(EXTENT_BUFFER_MIXED_PAGES, &eb->bflags);
+		*/
 		attach_extent_buffer_page(eb, p);
-		spin_unlock(&mapping->private_lock);
-		WARN_ON(PageDirty(p));
 		eb->pages[i] = p;
-		if (!PageUptodate(p))
-			uptodate = 0;
-
-		/*
-		 * see below about how we avoid a nasty race with release page
-		 * and why we unlock later
-		 */
 	}
-	if (uptodate)
-		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 again:
 	ret = radix_tree_preload(GFP_NOFS);
 	if (ret) {
@@ -4943,13 +4973,13 @@ again:
 		goto free_eb;
 	}
 
-	spin_lock(&fs_info->buffer_lock);
-	ret = radix_tree_insert(&fs_info->buffer_radix,
+	spin_lock_irq(&eb_info->buffer_lock);
+	ret = radix_tree_insert(&eb_info->buffer_radix,
 				start >> PAGE_SHIFT, eb);
-	spin_unlock(&fs_info->buffer_lock);
+	spin_unlock_irq(&eb_info->buffer_lock);
 	radix_tree_preload_end();
 	if (ret == -EEXIST) {
-		exists = find_extent_buffer(fs_info, start);
+		exists = find_extent_buffer(eb_info, start);
 		if (exists)
 			goto free_eb;
 		else
@@ -4959,31 +4989,10 @@ again:
 	check_buffer_tree_ref(eb);
 	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
 
-	/*
-	 * there is a race where release page may have
-	 * tried to find this extent buffer in the radix
-	 * but failed.  It will tell the VM it is safe to
-	 * reclaim the, and it will clear the page private bit.
-	 * We must make sure to set the page private bit properly
-	 * after the extent buffer is in the radix tree so
-	 * it doesn't get lost
-	 */
-	SetPageChecked(eb->pages[0]);
-	for (i = 1; i < num_pages; i++) {
-		p = eb->pages[i];
-		ClearPageChecked(p);
-		unlock_page(p);
-	}
-	unlock_page(eb->pages[0]);
 	return eb;
 
 free_eb:
 	WARN_ON(!atomic_dec_and_test(&eb->refs));
-	for (i = 0; i < num_pages; i++) {
-		if (eb->pages[i])
-			unlock_page(eb->pages[i]);
-	}
-
 	btrfs_release_extent_buffer(eb);
 	return exists;
 }
@@ -4999,17 +5008,19 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
 /* Expects to have eb->eb_lock already held */
 static int release_extent_buffer(struct extent_buffer *eb)
 {
+	struct btrfs_eb_info *eb_info = eb->eb_info;
+
 	WARN_ON(atomic_read(&eb->refs) == 0);
 	if (atomic_dec_and_test(&eb->refs)) {
+		if (eb_info)
+			list_lru_del(&eb_info->lru_list, &eb->lru);
 		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
-			struct btrfs_fs_info *fs_info = eb->fs_info;
-
 			spin_unlock(&eb->refs_lock);
 
-			spin_lock(&fs_info->buffer_lock);
-			radix_tree_delete(&fs_info->buffer_radix,
-					  eb->start >> PAGE_SHIFT);
-			spin_unlock(&fs_info->buffer_lock);
+			spin_lock_irq(&eb_info->buffer_lock);
+			radix_tree_delete(&eb_info->buffer_radix,
+					  eb_index(eb));
+			spin_unlock_irq(&eb_info->buffer_lock);
 		} else {
 			spin_unlock(&eb->refs_lock);
 		}
@@ -5024,6 +5035,8 @@ static int release_extent_buffer(struct extent_buffer *eb)
 #endif
 		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
 		return 1;
+	} else if (eb_info && atomic_read(&eb->refs) == 1) {
+		list_lru_add(&eb_info->lru_list, &eb->lru);
 	}
 	spin_unlock(&eb->refs_lock);
 
@@ -5057,10 +5070,6 @@ void free_extent_buffer(struct extent_buffer *eb)
 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
 		atomic_dec(&eb->refs);
 
-	/*
-	 * I know this is terrible, but it's temporary until we stop tracking
-	 * the uptodate bits and such for the extent buffers.
-	 */
 	release_extent_buffer(eb);
 }
 
@@ -5078,82 +5087,163 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
 	release_extent_buffer(eb);
 }
 
-void clear_extent_buffer_dirty(struct extent_buffer *eb)
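+/* Callback for ->nr_cached_objects: report how many ebs sit on our LRU. */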
+long btrfs_nr_ebs(struct super_block *sb, struct shrink_control *sc)
 {
-	unsigned long i;
-	unsigned long num_pages;
-	struct page *page;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
 
-	num_pages = num_extent_pages(eb->start, eb->len);
+	return list_lru_shrink_count(&eb_info->lru_list, sc);
+}
 
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		if (!PageDirty(page))
-			continue;
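+/*
+ * Decide whether an eb on the LRU can be reclaimed: skip ebs we race with,
+ * rotate ebs that are under IO or still in use, and move reclaimable ebs
+ * onto the caller's freeable list, taking over their TREE_REF reference.
+ */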
+static enum lru_status eb_lru_isolate(struct list_head *item,
+				      struct list_lru_one *lru,
+				      spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = (struct list_head *)arg;
+	struct extent_buffer *eb = container_of(item, struct extent_buffer,
+						lru);
+	enum lru_status ret;
+	int refs;
 
-		lock_page(page);
-		WARN_ON(!PagePrivate(page));
+	if (!spin_trylock(&eb->refs_lock))
+		return LRU_SKIP;
 
-		clear_page_dirty_for_io(page);
-		spin_lock_irq(&page->mapping->tree_lock);
-		if (!PageDirty(page)) {
-			radix_tree_tag_clear(&page->mapping->page_tree,
-						page_index(page),
-						PAGECACHE_TAG_DIRTY);
-		}
-		spin_unlock_irq(&page->mapping->tree_lock);
-		ClearPageError(page);
-		unlock_page(page);
+	if (extent_buffer_under_io(eb)) {
+		ret = LRU_ROTATE;
+		goto out;
+	}
+
+	refs = atomic_read(&eb->refs);
+	/* We can race with somebody freeing us, just skip if this happens. */
+	if (refs == 0) {
+		ret = LRU_SKIP;
+		goto out;
+	}
+
+	/* Eb is in use, don't kill it. */
+	if (refs > 1) {
+		ret = LRU_ROTATE;
+		goto out;
+	}
+
+	/*
+	 * If we don't clear the TREE_REF flag then this eb is going to
+	 * disappear soon anyway.  Otherwise we become responsible for dropping
+	 * the last ref on this eb and we know it'll survive until we call
+	 * dispose_list.
+	 */
+	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+		ret = LRU_SKIP;
+		goto out;
+	}
+	list_lru_isolate_move(lru, &eb->lru, freeable);
+	ret = LRU_REMOVED;
+out:
+	spin_unlock(&eb->refs_lock);
+	return ret;
+}
+
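+/* Free the ebs that eb_lru_isolate moved onto the freeable list. */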
+static void dispose_list(struct list_head *list)
+{
+	struct extent_buffer *eb;
+
+	while (!list_empty(list)) {
+		eb = list_first_entry(list, struct extent_buffer, lru);
+
+		spin_lock(&eb->refs_lock);
+		list_del_init(&eb->lru);
+		spin_unlock(&eb->refs_lock);
+		free_extent_buffer(eb);
+		cond_resched();
 	}
+}
+
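+/* Callback for ->free_cached_objects: shrink the eb LRU under pressure. */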
+long btrfs_free_ebs(struct super_block *sb, struct shrink_control *sc)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
+	LIST_HEAD(freeable);
+	long freed;
+
+	freed = list_lru_shrink_walk(&eb_info->lru_list, sc, eb_lru_isolate,
+				     &freeable);
+	dispose_list(&freeable);
+	return freed;
+}
+
+#define MAX_EVICT_COUNT 1024
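+/*
+ * Evict every eb on the LRU and wait for RCU readers; used when tearing
+ * down the eb_info.
+ */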
+void btrfs_invalidate_eb_info(struct btrfs_eb_info *eb_info)
+{
+	LIST_HEAD(freeable);
+	unsigned long count;
+
+	/*
+	 * Evict in batches so we don't lock up the system trying to evict
+	 * memory.
+	 */
+	do {
+		count = list_lru_walk(&eb_info->lru_list, eb_lru_isolate,
+				      &freeable, MAX_EVICT_COUNT);
+		cond_resched();
+	} while (count);
+	dispose_list(&freeable);
+	synchronize_rcu();
+}
+
+int clear_extent_buffer_dirty(struct extent_buffer *eb)
+{
+	struct btrfs_eb_info *eb_info = eb->eb_info;
+	unsigned long i;
+	unsigned long num_pages;
+
+	if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
+		return 0;
+
+	spin_lock_irq(&eb_info->buffer_lock);
+	radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
+			     PAGECACHE_TAG_DIRTY);
+	spin_unlock_irq(&eb_info->buffer_lock);
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++)
+		account_metadata_cleaned(eb->pages[i], &eb_info->fs_info->bdi);
 	WARN_ON(atomic_read(&eb->refs) == 0);
+	return 1;
 }
 
 int set_extent_buffer_dirty(struct extent_buffer *eb)
 {
+	struct btrfs_eb_info *eb_info = eb->eb_info;
 	unsigned long i;
 	unsigned long num_pages;
 	int was_dirty = 0;
 
 	check_buffer_tree_ref(eb);
 
-	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
-
-	num_pages = num_extent_pages(eb->start, eb->len);
 	WARN_ON(atomic_read(&eb->refs) == 0);
 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+	if (test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
+		return 1;
 
+	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++)
-		set_page_dirty(eb->pages[i]);
+		account_metadata_dirtied(eb->pages[i],
+					 &eb->eb_info->fs_info->bdi);
+	spin_lock_irq(&eb_info->buffer_lock);
+	radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
+			   PAGECACHE_TAG_DIRTY);
+	spin_unlock_irq(&eb_info->buffer_lock);
 	return was_dirty;
 }
 
 void clear_extent_buffer_uptodate(struct extent_buffer *eb)
 {
-	unsigned long i;
-	struct page *page;
-	unsigned long num_pages;
-
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		if (page)
-			ClearPageUptodate(page);
-	}
 }
 
 void set_extent_buffer_uptodate(struct extent_buffer *eb)
 {
-	unsigned long i;
-	struct page *page;
-	unsigned long num_pages;
-
 	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		SetPageUptodate(page);
-	}
 }
 
 int extent_buffer_uptodate(struct extent_buffer *eb)
@@ -5161,103 +5251,166 @@ int extent_buffer_uptodate(struct extent_buffer *eb)
 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 }
 
-int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb, int wait,
-			     get_extent_t *get_extent, int mirror_num)
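+/*
+ * Read completion for eb pages: validate each page via the
+ * readpage_end_io_hook and unlock the eb's range in the io_tree once all
+ * segments are processed.
+ */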
+static void end_bio_extent_buffer_readpage(struct bio *bio)
 {
+	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	struct extent_io_tree *tree = NULL;
+	struct bio_vec *bvec;
+	u64 unlock_start = 0, unlock_len = 0;
+	int mirror_num = io_bio->mirror_num;
+	int uptodate = !bio->bi_error;
+	int i, ret;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+		struct btrfs_eb_info *eb_info;
+		struct extent_buffer *eb;
+
+		eb = (struct extent_buffer *)page->private;
+		if (WARN_ON(!eb))
+			continue;
+
+		eb_info = eb->eb_info;
+		if (!tree)
+			tree = &eb_info->io_tree;
+		if (uptodate) {
+			/*
+			 * btree_readpage_end_io_hook doesn't care about
+			 * start/end so just pass 0.  We'll kill this later.
+			 */
+			ret = tree->ops->readpage_end_io_hook(io_bio, 0,
+							      page, 0, 0,
+							      mirror_num);
+			if (ret) {
+				uptodate = 0;
+			} else {
+				u64 start = eb->start;
+				int c, num_pages;
+
+				num_pages = num_extent_pages(eb->start,
+							     eb->len);
+				for (c = 0; c < num_pages; c++) {
+					if (eb->pages[c] == page)
+						break;
+					start += PAGE_SIZE;
+				}
+				clean_io_failure(eb_info->fs_info,
+						 &eb_info->io_failure_tree,
+						 tree, start, page, 0, 0);
+			}
+		}
+		/*
+		 * We never fix anything in btree_io_failed_hook.
+		 *
+		 * TODO: rework the io failed hook to not assume we can fix
+		 * anything.
+		 */
+		if (!uptodate)
+			tree->ops->readpage_io_failed_hook(page, mirror_num);
+
+		if (unlock_start == 0) {
+			unlock_start = eb->start;
+			unlock_len = PAGE_SIZE;
+		} else {
+			unlock_len += PAGE_SIZE;
+		}
+	}
+
+	if (unlock_start)
+		unlock_extent(tree, unlock_start,
+			      unlock_start + unlock_len - 1);
+	if (io_bio->end_io)
+		io_bio->end_io(io_bio, bio->bi_error);
+	bio_put(bio);
+}
+
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
+			     int mirror_num)
+{
+	struct btrfs_eb_info *eb_info = eb->eb_info;
+	struct extent_io_tree *io_tree = &eb_info->io_tree;
+	struct block_device *bdev = eb_info->fs_info->fs_devices->latest_bdev;
+	struct bio *bio = NULL;
+	u64 offset = eb->start;
+	u64 unlock_start = 0, unlock_len = 0;
 	unsigned long i;
 	struct page *page;
 	int err;
 	int ret = 0;
-	int locked_pages = 0;
-	int all_uptodate = 1;
 	unsigned long num_pages;
-	unsigned long num_reads = 0;
-	struct bio *bio = NULL;
-	unsigned long bio_flags = 0;
 
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
 
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		if (wait == WAIT_NONE) {
-			if (!trylock_page(page))
-				goto unlock_exit;
-		} else {
-			lock_page(page);
-		}
-		locked_pages++;
-		if (!PageUptodate(page)) {
-			num_reads++;
-			all_uptodate = 0;
-		}
-	}
-	if (all_uptodate) {
-		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-		goto unlock_exit;
+	if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) {
+		if (wait != WAIT_COMPLETE)
+			return 0;
+		wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
+			       TASK_UNINTERRUPTIBLE);
+		if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+			ret = -EIO;
+		return ret;
 	}
 
+	lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+	num_pages = num_extent_pages(eb->start, eb->len);
 	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
 	eb->read_mirror = 0;
-	atomic_set(&eb->io_pages, num_reads);
+	atomic_set(&eb->io_pages, num_pages);
 	for (i = 0; i < num_pages; i++) {
 		page = eb->pages[i];
-
-		if (!PageUptodate(page)) {
-			if (ret) {
-				atomic_dec(&eb->io_pages);
-				unlock_page(page);
-				continue;
+		if (ret) {
+			unlock_len += PAGE_SIZE;
+			if (atomic_dec_and_test(&eb->io_pages)) {
+				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+				smp_mb__after_atomic();
+				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
 			}
+			continue;
+		}
 
-			ClearPageError(page);
-			err = __extent_read_full_page(tree, page,
-						      get_extent, &bio,
-						      mirror_num, &bio_flags,
-						      REQ_META);
-			if (err) {
-				ret = err;
-				/*
-				 * We use &bio in above __extent_read_full_page,
-				 * so we ensure that if it returns error, the
-				 * current page fails to add itself to bio and
-				 * it's been unlocked.
-				 *
-				 * We must dec io_pages by ourselves.
-				 */
-				atomic_dec(&eb->io_pages);
+		err = submit_extent_page(REQ_OP_READ, REQ_META, io_tree, NULL,
+					 page, offset >> 9, PAGE_SIZE, 0, bdev,
+					 &bio, -1,
+					 end_bio_extent_buffer_readpage,
+					 mirror_num, 0, 0, false);
+		if (err) {
+			ret = err;
+			/*
+			 * We use &bio in above submit_extent_page
+			 * so we ensure that if it returns error, the
+			 * current page fails to add itself to bio and
+			 * it's been unlocked.
+			 *
+			 * We must dec io_pages by ourselves.
+			 */
+			if (atomic_dec_and_test(&eb->io_pages)) {
+				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+				smp_mb__after_atomic();
+				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
 			}
-		} else {
-			unlock_page(page);
+			unlock_start = eb->start;
+			unlock_len = PAGE_SIZE;
 		}
+		offset += PAGE_SIZE;
 	}
 
 	if (bio) {
-		err = submit_one_bio(bio, mirror_num, bio_flags);
+		err = submit_one_bio(bio, mirror_num, 0);
 		if (err)
 			return err;
 	}
 
+	if (ret && unlock_start)
+		unlock_extent(io_tree, unlock_start,
+			      unlock_start + unlock_len - 1);
 	if (ret || wait != WAIT_COMPLETE)
 		return ret;
 
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		wait_on_page_locked(page);
-		if (!PageUptodate(page))
-			ret = -EIO;
-	}
-
-	return ret;
-
-unlock_exit:
-	while (locked_pages > 0) {
-		locked_pages--;
-		page = eb->pages[locked_pages];
-		unlock_page(page);
-	}
+	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
+		       TASK_UNINTERRUPTIBLE);
+	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+		ret = -EIO;
 	return ret;
 }
 
@@ -5426,7 +5579,6 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 
 	while (len > 0) {
 		page = eb->pages[i];
-		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_SIZE - offset);
 		kaddr = page_address(page);
@@ -5456,7 +5608,6 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 
 	while (len > 0) {
 		page = eb->pages[i];
-		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_SIZE - offset);
 		kaddr = page_address(page);
@@ -5487,7 +5638,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 
 	while (len > 0) {
 		page = dst->pages[i];
-		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, (unsigned long)(PAGE_SIZE - offset));
 
@@ -5562,7 +5712,6 @@ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
 
 	eb_bitmap_offset(eb, start, nr, &i, &offset);
 	page = eb->pages[i];
-	WARN_ON(!PageUptodate(page));
 	kaddr = page_address(page);
 	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
 }
@@ -5587,7 +5736,6 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
 
 	eb_bitmap_offset(eb, start, pos, &i, &offset);
 	page = eb->pages[i];
-	WARN_ON(!PageUptodate(page));
 	kaddr = page_address(page);
 
 	while (len >= bits_to_set) {
@@ -5598,7 +5746,6 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
 		if (++offset >= PAGE_SIZE && len > 0) {
 			offset = 0;
 			page = eb->pages[++i];
-			WARN_ON(!PageUptodate(page));
 			kaddr = page_address(page);
 		}
 	}
@@ -5629,7 +5776,6 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
 
 	eb_bitmap_offset(eb, start, pos, &i, &offset);
 	page = eb->pages[i];
-	WARN_ON(!PageUptodate(page));
 	kaddr = page_address(page);
 
 	while (len >= bits_to_clear) {
@@ -5640,7 +5786,6 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
 		if (++offset >= PAGE_SIZE && len > 0) {
 			offset = 0;
 			page = eb->pages[++i];
-			WARN_ON(!PageUptodate(page));
 			kaddr = page_address(page);
 		}
 	}
@@ -5689,13 +5834,13 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		btrfs_err(dst->fs_info,
+		btrfs_err(dst->eb_info->fs_info,
 			"memmove bogus src_offset %lu move "
 		       "len %lu dst len %lu", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		btrfs_err(dst->fs_info,
+		btrfs_err(dst->eb_info->fs_info,
 			"memmove bogus dst_offset %lu move "
 		       "len %lu dst len %lu", dst_offset, len, dst->len);
 		BUG_ON(1);
@@ -5737,12 +5882,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
+		btrfs_err(dst->eb_info->fs_info, "memmove bogus src_offset %lu move "
 		       "len %lu len %lu", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
+		btrfs_err(dst->eb_info->fs_info, "memmove bogus dst_offset %lu move "
 		       "len %lu len %lu", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
@@ -5770,45 +5915,3 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 		len -= cur;
 	}
 }
-
-int try_release_extent_buffer(struct page *page)
-{
-	struct extent_buffer *eb;
-
-	/*
-	 * We need to make sure nobody is attaching this page to an eb right
-	 * now.
-	 */
-	spin_lock(&page->mapping->private_lock);
-	if (!PagePrivate(page)) {
-		spin_unlock(&page->mapping->private_lock);
-		return 1;
-	}
-
-	eb = (struct extent_buffer *)page->private;
-	BUG_ON(!eb);
-
-	/*
-	 * This is a little awful but should be ok, we need to make sure that
-	 * the eb doesn't disappear out from under us while we're looking at
-	 * this page.
-	 */
-	spin_lock(&eb->refs_lock);
-	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
-		spin_unlock(&eb->refs_lock);
-		spin_unlock(&page->mapping->private_lock);
-		return 0;
-	}
-	spin_unlock(&page->mapping->private_lock);
-
-	/*
-	 * If tree ref isn't set then we know the ref on this eb is a real ref,
-	 * so just return, this page will likely be freed soon anyway.
-	 */
-	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
-		spin_unlock(&eb->refs_lock);
-		return 0;
-	}
-
-	return release_extent_buffer(eb);
-}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c393c06..06b6f14 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -43,6 +43,8 @@
 #define EXTENT_BUFFER_DUMMY 9
 #define EXTENT_BUFFER_IN_TREE 10
 #define EXTENT_BUFFER_WRITE_ERR 11    /* write IO error */
+#define EXTENT_BUFFER_MIXED_PAGES 12	/* the pages span multiple zones or numa nodes. */
+#define EXTENT_BUFFER_READING 13 /* currently reading this eb. */
 
 /* these are flags for extent_clear_unlock_delalloc */
 #define PAGE_UNLOCK		(1 << 0)
@@ -122,13 +124,25 @@ struct extent_state {
 #endif
 };
 
+struct btrfs_eb_info {
+	struct btrfs_fs_info *fs_info;
+	struct extent_io_tree io_tree;
+	struct extent_io_tree io_failure_tree;
+
+	/* Extent buffer radix tree */
+	spinlock_t buffer_lock;
+	struct radix_tree_root buffer_radix;
+	struct list_lru lru_list;
+	pgoff_t writeback_index;
+};
+
 #define INLINE_EXTENT_BUFFER_PAGES 16
 #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
 struct extent_buffer {
 	u64 start;
 	unsigned long len;
 	unsigned long bflags;
-	struct btrfs_fs_info *fs_info;
+	struct btrfs_eb_info *eb_info;
 	spinlock_t refs_lock;
 	atomic_t refs;
 	atomic_t io_pages;
@@ -163,6 +177,7 @@ struct extent_buffer {
 #ifdef CONFIG_BTRFS_DEBUG
 	struct list_head leak_list;
 #endif
+	struct list_head lru;
 };
 
 /*
@@ -336,8 +351,6 @@ int extent_writepages(struct extent_io_tree *tree,
 		      struct address_space *mapping,
 		      get_extent_t *get_extent,
 		      struct writeback_control *wbc);
-int btree_write_cache_pages(struct address_space *mapping,
-			    struct writeback_control *wbc);
 int extent_readpages(struct extent_io_tree *tree,
 		     struct address_space *mapping,
 		     struct list_head *pages, unsigned nr_pages,
@@ -348,21 +361,18 @@ void set_page_extent_mapped(struct page *page);
 
 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 					  u64 start);
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-						  u64 start, unsigned long len);
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-						u64 start, u32 nodesize);
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_eb_info *eb_info,
+						u64 start, unsigned long len);
 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
-struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+struct extent_buffer *find_extent_buffer(struct btrfs_eb_info *eb_info,
 					 u64 start);
 void free_extent_buffer(struct extent_buffer *eb);
 void free_extent_buffer_stale(struct extent_buffer *eb);
 #define WAIT_NONE	0
 #define WAIT_COMPLETE	1
 #define WAIT_PAGE_LOCK	2
-int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb, int wait,
-			     get_extent_t *get_extent, int mirror_num);
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
+			     int mirror_num);
 void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
 
 static inline unsigned long num_extent_pages(u64 start, u64 len)
@@ -376,6 +386,11 @@ static inline void extent_buffer_get(struct extent_buffer *eb)
 	atomic_inc(&eb->refs);
 }
 
+static inline unsigned long eb_index(struct extent_buffer *eb)
+{
+	return eb->start >> PAGE_SHIFT;
+}
+
 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 			  unsigned long start,
 			  unsigned long len);
@@ -402,7 +417,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
 			      unsigned long pos, unsigned long len);
 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
 				unsigned long pos, unsigned long len);
-void clear_extent_buffer_dirty(struct extent_buffer *eb);
+int clear_extent_buffer_dirty(struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_buffer *eb);
 void set_extent_buffer_uptodate(struct extent_buffer *eb);
 void clear_extent_buffer_uptodate(struct extent_buffer *eb);
@@ -436,6 +451,14 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 			 int mirror_num);
+void btree_flush(struct btrfs_fs_info *fs_info);
+int btree_write_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
+int btree_wait_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
+long btrfs_free_ebs(struct super_block *sb, struct shrink_control *sc);
+long btrfs_nr_ebs(struct super_block *sb, struct shrink_control *sc);
+void btrfs_write_ebs(struct super_block *sb, struct writeback_control *wbc);
+void btrfs_invalidate_eb_info(struct btrfs_eb_info *eb_info);
+int btrfs_init_eb_info(struct btrfs_fs_info *fs_info);
 
 /*
  * When IO fails, either with EIO or csum verification fails, we
@@ -474,6 +497,6 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
 				      struct page *locked_page, u64 *start,
 				      u64 *end, u64 max_bytes);
 #endif
-struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_eb_info *eb_info,
 					       u64 start, u32 nodesize);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8ca791f..3440b52 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1837,9 +1837,9 @@ static void btrfs_clear_bit_hook(void *private_data,
  * return 0 if page can be merged to bio
  * return error otherwise
  */
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio,
-			 unsigned long bio_flags)
+static int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+				size_t size, struct bio *bio,
+				unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7fd7e18..614fdb8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -45,7 +45,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
 	if (!need_reset && btrfs_root_generation(item)
 		!= btrfs_root_generation_v2(item)) {
 		if (btrfs_root_generation_v2(item) != 0) {
-			btrfs_warn(eb->fs_info,
+			btrfs_warn(eb->eb_info->fs_info,
 					"mismatching "
 					"generation and generation_v2 "
 					"found in root item. This root "
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b9d4c43..73a1d8d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1180,7 +1180,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	trace_btrfs_sync_fs(fs_info, wait);
 
 	if (!wait) {
-		filemap_flush(fs_info->btree_inode->i_mapping);
+		btree_flush(fs_info);
 		return 0;
 	}
 
@@ -2287,18 +2287,21 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 }
 
 static const struct super_operations btrfs_super_ops = {
-	.drop_inode	= btrfs_drop_inode,
-	.evict_inode	= btrfs_evict_inode,
-	.put_super	= btrfs_put_super,
-	.sync_fs	= btrfs_sync_fs,
-	.show_options	= btrfs_show_options,
-	.show_devname	= btrfs_show_devname,
-	.write_inode	= btrfs_write_inode,
-	.alloc_inode	= btrfs_alloc_inode,
-	.destroy_inode	= btrfs_destroy_inode,
-	.statfs		= btrfs_statfs,
-	.remount_fs	= btrfs_remount,
-	.freeze_fs	= btrfs_freeze,
+	.drop_inode		= btrfs_drop_inode,
+	.evict_inode		= btrfs_evict_inode,
+	.put_super		= btrfs_put_super,
+	.sync_fs		= btrfs_sync_fs,
+	.show_options		= btrfs_show_options,
+	.show_devname		= btrfs_show_devname,
+	.write_inode		= btrfs_write_inode,
+	.alloc_inode		= btrfs_alloc_inode,
+	.destroy_inode		= btrfs_destroy_inode,
+	.statfs			= btrfs_statfs,
+	.remount_fs		= btrfs_remount,
+	.freeze_fs		= btrfs_freeze,
+	.nr_cached_objects	= btrfs_nr_ebs,
+	.free_cached_objects	= btrfs_free_ebs,
+	.write_metadata		= btrfs_write_ebs,
 };
 
 static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index bf62ad9..dca90d6 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -100,14 +100,32 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
 		return NULL;
 	}
 
+	fs_info->eb_info = kzalloc(sizeof(struct btrfs_eb_info),
+				   GFP_KERNEL);
+	if (!fs_info->eb_info) {
+		kfree(fs_info->fs_devices);
+		kfree(fs_info->super_copy);
+		kfree(fs_info);
+		return NULL;
+	}
+
+	if (btrfs_init_eb_info(fs_info)) {
+		kfree(fs_info->eb_info);
+		kfree(fs_info->fs_devices);
+		kfree(fs_info->super_copy);
+		kfree(fs_info);
+		return NULL;
+	}
+
 	if (init_srcu_struct(&fs_info->subvol_srcu)) {
+		list_lru_destroy(&fs_info->eb_info->lru_list);
+		kfree(fs_info->eb_info);
 		kfree(fs_info->fs_devices);
 		kfree(fs_info->super_copy);
 		kfree(fs_info);
 		return NULL;
 	}
 
-	spin_lock_init(&fs_info->buffer_lock);
 	spin_lock_init(&fs_info->qgroup_lock);
 	spin_lock_init(&fs_info->qgroup_op_lock);
 	spin_lock_init(&fs_info->super_lock);
@@ -123,7 +141,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
 	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
-	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
 	extent_io_tree_init(&fs_info->freed_extents[0], NULL);
 	extent_io_tree_init(&fs_info->freed_extents[1], NULL);
@@ -137,6 +154,7 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
 
 void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 {
+	struct btrfs_eb_info *eb_info;
 	struct radix_tree_iter iter;
 	void **slot;
 
@@ -147,13 +165,14 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 			      &fs_info->fs_state)))
 		return;
 
+	eb_info = fs_info->eb_info;
 	test_mnt->mnt_sb->s_fs_info = NULL;
 
-	spin_lock(&fs_info->buffer_lock);
-	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+	spin_lock(&eb_info->buffer_lock);
+	radix_tree_for_each_slot(slot, &eb_info->buffer_radix, &iter, 0) {
 		struct extent_buffer *eb;
 
-		eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+		eb = radix_tree_deref_slot_protected(slot, &eb_info->buffer_lock);
 		if (!eb)
 			continue;
 		/* Shouldn't happen but that kind of thinking creates CVE's */
@@ -162,15 +181,17 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 				slot = radix_tree_iter_retry(&iter);
 			continue;
 		}
-		spin_unlock(&fs_info->buffer_lock);
+		spin_unlock(&eb_info->buffer_lock);
 		free_extent_buffer_stale(eb);
-		spin_lock(&fs_info->buffer_lock);
+		spin_lock(&eb_info->buffer_lock);
 	}
-	spin_unlock(&fs_info->buffer_lock);
+	spin_unlock(&eb_info->buffer_lock);
 
 	btrfs_free_qgroup_config(fs_info);
 	btrfs_free_fs_roots(fs_info);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
+	list_lru_destroy(&eb_info->lru_list);
+	kfree(fs_info->eb_info);
 	kfree(fs_info->super_copy);
 	kfree(fs_info->fs_devices);
 	kfree(fs_info);
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 1110b8c..45524f1 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -388,7 +388,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
 		return -ENOMEM;
 	}
 
-	eb = __alloc_dummy_extent_buffer(NULL, 0, len);
+	eb = alloc_dummy_extent_buffer(NULL, 0, len);
 	if (!eb) {
 		test_msg("Couldn't allocate test extent buffer\n");
 		kfree(bitmap);
@@ -401,7 +401,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
 
 	/* Do it over again with an extent buffer which isn't page-aligned. */
 	free_extent_buffer(eb);
-	eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
+	eb = alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
 	if (!eb) {
 		test_msg("Couldn't allocate test extent buffer\n");
 		kfree(bitmap);
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 7508d3b..3bf5df1 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -469,8 +469,8 @@ static int run_test(test_func_t test_func, int bitmaps,
 	root->fs_info->free_space_root = root;
 	root->fs_info->tree_root = root;
 
-	root->node = alloc_test_extent_buffer(root->fs_info,
-		nodesize, nodesize);
+	root->node = alloc_test_extent_buffer(fs_info->eb_info, nodesize,
+					      nodesize);
 	if (!root->node) {
 		test_msg("Couldn't allocate dummy buffer\n");
 		ret = -ENOMEM;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ca7cb5e..bb277ee 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -486,8 +486,8 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
 	 * Can't use bytenr 0, some things freak out
 	 * *cough*backref walking code*cough*
 	 */
-	root->node = alloc_test_extent_buffer(root->fs_info, nodesize,
-					nodesize);
+	root->node = alloc_test_extent_buffer(fs_info->eb_info, nodesize,
+					      nodesize);
 	if (!root->node) {
 		test_msg("Couldn't allocate dummy buffer\n");
 		ret = -ENOMEM;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 56bdb2f..a54691f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -293,8 +293,7 @@ loop:
 	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
 	spin_lock_init(&cur_trans->dropped_roots_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
-	extent_io_tree_init(&cur_trans->dirty_pages,
-			     fs_info->btree_inode);
+	extent_io_tree_init(&cur_trans->dirty_pages, NULL);
 	fs_info->generation++;
 	cur_trans->transid = fs_info->generation;
 	fs_info->running_transaction = cur_trans;
@@ -935,7 +934,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 {
 	int err = 0;
 	int werr = 0;
-	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
 	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
@@ -965,11 +963,11 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 			wait_writeback = true;
 		}
 		if (!err)
-			err = filemap_fdatawrite_range(mapping, start, end);
+			err = btree_write_range(root->fs_info, start, end);
 		if (err)
 			werr = err;
 		else if (wait_writeback)
-			werr = filemap_fdatawait_range(mapping, start, end);
+			werr = btree_wait_range(root->fs_info, start, end);
 		free_extent_state(cached_state);
 		cached_state = NULL;
 		cond_resched();
@@ -989,7 +987,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 {
 	int err = 0;
 	int werr = 0;
-	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
 	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
@@ -1011,7 +1008,7 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 		if (err == -ENOMEM)
 			err = 0;
 		if (!err)
-			err = filemap_fdatawait_range(mapping, start, end);
+			err = btree_wait_range(root->fs_info, start, end);
 		if (err)
 			werr = err;
 		free_extent_state(cached_state);
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 7/7] Btrfs: kill BUG_ON()'s in btrfs_mark_extent_written
  2016-09-02 19:39 [PATCH 0/7] Kill the btree inode Josef Bacik
                   ` (5 preceding siblings ...)
  2016-09-02 19:40 ` [PATCH 6/7] Btrfs: kill the btree_inode Josef Bacik
@ 2016-09-02 19:40 ` Josef Bacik
  2016-09-08 17:07   ` David Sterba
  2016-09-05 16:31 ` [PATCH 0/7] Kill the btree inode David Sterba
  7 siblings, 1 reply; 16+ messages in thread
From: Josef Bacik @ 2016-09-02 19:40 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

No reason to BUG_ON() in here; fs corruption could easily cause these things
to happen.
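
The conversion is mechanical: every check that used to BUG_ON() now aborts
the transaction and bails out instead, along these lines (an illustrative
sketch of the pattern, not an exact hunk):

	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}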

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/file.c | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f060b08..85bf035 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1110,13 +1110,25 @@ again:
 
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-	BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
+	if (key.objectid != ino ||
+	    key.type != BTRFS_EXTENT_DATA_KEY) {
+		ret = -EINVAL;
+		btrfs_abort_transaction(trans, ret);
+		goto out;
+	}
 	fi = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
-	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
-	       BTRFS_FILE_EXTENT_PREALLOC);
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
+		ret = -EINVAL;
+		btrfs_abort_transaction(trans, ret);
+		goto out;
+	}
 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
-	BUG_ON(key.offset > start || extent_end < end);
+	if (key.offset > start || extent_end < end) {
+		ret = -EINVAL;
+		btrfs_abort_transaction(trans, ret);
+		goto out;
+	}
 
 	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
@@ -1213,12 +1225,19 @@ again:
 		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
 					   root->root_key.objectid,
 					   ino, orig_offset);
-		BUG_ON(ret); /* -ENOMEM */
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			goto out;
+		}
 
 		if (split == start) {
 			key.offset = start;
 		} else {
-			BUG_ON(start != key.offset);
+			if (start != key.offset) {
+				ret = -EINVAL;
+				btrfs_abort_transaction(trans, ret);
+				goto out;
+			}
 			path->slots[0]--;
 			extent_end = end;
 		}
@@ -1240,7 +1259,10 @@ again:
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					0, root->root_key.objectid,
 					ino, orig_offset);
-		BUG_ON(ret); /* -ENOMEM */
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			goto out;
+		}
 	}
 	other_start = 0;
 	other_end = start;
@@ -1257,7 +1279,10 @@ again:
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					0, root->root_key.objectid,
 					ino, orig_offset);
-		BUG_ON(ret); /* -ENOMEM */
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			goto out;
+		}
 	}
 	if (del_nr == 0) {
 		fi = btrfs_item_ptr(leaf, path->slots[0],
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/7] Kill the btree inode
  2016-09-02 19:39 [PATCH 0/7] Kill the btree inode Josef Bacik
                   ` (6 preceding siblings ...)
  2016-09-02 19:40 ` [PATCH 7/7] Btrfs: kill BUG_ON()'s in btrfs_mark_extent_written Josef Bacik
@ 2016-09-05 16:31 ` David Sterba
  2016-09-06 13:03   ` Josef Bacik
  7 siblings, 1 reply; 16+ messages in thread
From: David Sterba @ 2016-09-05 16:31 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Fri, Sep 02, 2016 at 03:39:59PM -0400, Josef Bacik wrote:
> In order to provide a better way to do subpage blocksizes we need to stop
> allocating pages from a per fs btree inode and instead allocate our own pages.
> This work depends on 3 generic patches that I've sent previously
> 
> remove mapping from balance_dirty_pages*()
> writeback: allow for dirty metadata accounting
> writeback: introduce super_operations->write_metadata

What's the status of the patches? I don't see them in current
linux-next. I can put them into our for-next for the time being so we can
test the btree_inode removal.

> This is a pretty big change but ultimately makes extent_buffer reclaim much
> cleaner and will make the sub-pagesize blocksize work significantly cleaner.
> I've been hammering on this for a few weeks now and seems to be pretty solid.

The preparatory patches are ok. The core patch is quite large; although
many of the changes are simple transformations to the new eb_info, the
remaining ones are still quite big to review. But I don't see a way to
split it, it's basically an all-or-nothing change. Overall it looks ok
so I'll put it in for-next.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/7] Kill the btree inode
  2016-09-05 16:31 ` [PATCH 0/7] Kill the btree inode David Sterba
@ 2016-09-06 13:03   ` Josef Bacik
  0 siblings, 0 replies; 16+ messages in thread
From: Josef Bacik @ 2016-09-06 13:03 UTC (permalink / raw)
  To: dsterba, linux-btrfs, kernel-team

On 09/05/2016 12:31 PM, David Sterba wrote:
> On Fri, Sep 02, 2016 at 03:39:59PM -0400, Josef Bacik wrote:
>> In order to provide a better way to do subpage blocksizes we need to stop
>> allocating pages from a per fs btree inode and instead allocate our own pages.
>> This work depends on 3 generic patches that I've sent previously
>>
>> remove mapping from balance_dirty_pages*()
>> writeback: allow for dirty metadata accounting
>> writeback: introduce super_operations->write_metadata
>
> What's the status of the patches? I don't see them in current
> linux-next. I can put them to our for-next for the time being so we can
> test the btree_inode removal.

Still waiting on reviews.  The first pass had no big objections; I just need
to get people to look at the next pass.

>
>> This is a pretty big change but ultimately makes extent_buffer reclaim much
>> cleaner and will make the sub-pagesize blocksize work significantly cleaner.
>> I've been hammering on this for a few weeks now and seems to be pretty solid.
>
> The preparatory patches are ok. The core patch is quite large, although
> there are simple transformations to the new eb_info, the remaining
> changes are still quite big for a review. But I don't see a way how to
> split it, it's basically an all-or-nothing change. Overall it looks ok
> so I'll put it to for-next.
>

Yeah, I broke out what I could, but unfortunately ripping out the btree_inode
means we have to make a bunch of other changes all at once; otherwise we
wouldn't be able to bisect across it without things breaking horribly.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 6/7] Btrfs: kill the btree_inode
  2016-09-02 19:40 ` [PATCH 6/7] Btrfs: kill the btree_inode Josef Bacik
@ 2016-09-08  5:17   ` Chandan Rajendra
  2016-09-08 14:12     ` Josef Bacik
  2016-09-09 17:40   ` [PATCH 6/7][V2] " Josef Bacik
  1 sibling, 1 reply; 16+ messages in thread
From: Chandan Rajendra @ 2016-09-08  5:17 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Friday, September 02, 2016 03:40:05 PM Josef Bacik wrote:

Please find my comments inlined below,

> In order to more efficiently support sub-page blocksizes we need to stop
> allocating pages from pagecache for our metadata.  Instead switch to using the
> account_metadata* counters for making sure we are keeping the system aware of
> how much dirty metadata we have, and use the ->free_cached_objects super
> operation in order to handle freeing up extent buffers.  This greatly simplifies
> how we deal with extent buffers as now we no longer have to tie the page cache
> reclamation stuff to the extent buffer stuff.  This will also allow us to
> simply kmalloc() our data for sub-page blocksizes.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
>  fs/btrfs/btrfs_inode.h                 |   3 +-
>  fs/btrfs/ctree.c                       |  10 +-
>  fs/btrfs/ctree.h                       |  13 +-
>  fs/btrfs/disk-io.c                     | 389 ++++----------
>  fs/btrfs/extent_io.c                   | 913 ++++++++++++++++++---------------
>  fs/btrfs/extent_io.h                   |  49 +-
>  fs/btrfs/inode.c                       |   6 +-
>  fs/btrfs/root-tree.c                   |   2 +-
>  fs/btrfs/super.c                       |  29 +-
>  fs/btrfs/tests/btrfs-tests.c           |  37 +-
>  fs/btrfs/tests/extent-io-tests.c       |   4 +-
>  fs/btrfs/tests/free-space-tree-tests.c |   4 +-
>  fs/btrfs/tests/qgroup-tests.c          |   4 +-
>  fs/btrfs/transaction.c                 |  11 +-
>  14 files changed, 726 insertions(+), 748 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 1a8fa46..ad7b185 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -229,10 +229,9 @@ static inline u64 btrfs_ino(struct inode *inode)
>  	u64 ino = BTRFS_I(inode)->location.objectid;
> 
>  	/*
> -	 * !ino: btree_inode
>  	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
>  	 */
> -	if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
> +	if (BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
>  		ino = inode->i_ino;
>  	return ino;
>  }
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index d1c56c9..b267053 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -1373,8 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
> 
>  	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
>  		BUG_ON(tm->slot != 0);
> -		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
> -						eb->len);
> +		eb_rewin = alloc_dummy_extent_buffer(fs_info->eb_info,
> +						     eb->start, eb->len);
>  		if (!eb_rewin) {
>  			btrfs_tree_read_unlock_blocking(eb);
>  			free_extent_buffer(eb);
> @@ -1455,8 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
>  	} else if (old_root) {
>  		btrfs_tree_read_unlock(eb_root);
>  		free_extent_buffer(eb_root);
> -		eb = alloc_dummy_extent_buffer(root->fs_info, logical,
> -					root->nodesize);
> +		eb = alloc_dummy_extent_buffer(root->fs_info->eb_info, logical,
> +					       root->nodesize);
>  	} else {
>  		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
>  		eb = btrfs_clone_extent_buffer(eb_root);
> @@ -1772,7 +1772,7 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
>  	int err;
> 
>  	if (low > high) {
> -		btrfs_err(eb->fs_info,
> +		btrfs_err(eb->eb_info->fs_info,
>  		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
>  			  __func__, low, high, eb->start,
>  			  btrfs_header_owner(eb), btrfs_header_level(eb));
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 282a031..ee6956c 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -37,6 +37,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/security.h>
>  #include <linux/sizes.h>
> +#include <linux/list_lru.h>
>  #include "extent_io.h"
>  #include "extent_map.h"
>  #include "async-thread.h"
> @@ -675,6 +676,7 @@ struct btrfs_device;
>  struct btrfs_fs_devices;
>  struct btrfs_balance_control;
>  struct btrfs_delayed_root;
> +struct btrfs_eb_info;
> 
>  #define BTRFS_FS_BARRIER			1
>  #define BTRFS_FS_CLOSING_START			2
> @@ -797,7 +799,7 @@ struct btrfs_fs_info {
>  	struct btrfs_super_block *super_for_commit;
>  	struct block_device *__bdev;
>  	struct super_block *sb;
> -	struct inode *btree_inode;
> +	struct btrfs_eb_info *eb_info;
>  	struct backing_dev_info bdi;
>  	struct mutex tree_log_mutex;
>  	struct mutex transaction_kthread_mutex;
> @@ -1042,10 +1044,6 @@ struct btrfs_fs_info {
>  	/* readahead works cnt */
>  	atomic_t reada_works_cnt;
> 
> -	/* Extent buffer radix tree */
> -	spinlock_t buffer_lock;
> -	struct radix_tree_root buffer_radix;
> -
>  	/* next backup root to be overwritten */
>  	int backup_root_index;
> 
> @@ -2884,6 +2882,8 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
> 
>  static inline void free_fs_info(struct btrfs_fs_info *fs_info)
>  {
> +	list_lru_destroy(&fs_info->eb_info->lru_list);
> +	kfree(fs_info->eb_info);
>  	kfree(fs_info->balance_ctl);
>  	kfree(fs_info->delayed_root);
>  	kfree(fs_info->extent_root);
> @@ -3121,9 +3121,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
>  			     struct btrfs_root *new_root,
>  			     struct btrfs_root *parent_root,
>  			     u64 new_dirid);
> -int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
> -			 size_t size, struct bio *bio,
> -			 unsigned long bio_flags);
>  void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
>  int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
>  int btrfs_readpage(struct file *file, struct page *page);
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 9c42e53..03ac601 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -217,56 +217,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
> 
>  #endif
> 
> -/*
> - * extents on the btree inode are pretty simple, there's one extent
> - * that covers the entire device
> - */
> -static struct extent_map *btree_get_extent(struct inode *inode,
> -		struct page *page, size_t pg_offset, u64 start, u64 len,
> -		int create)
> -{
> -	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
> -	struct extent_map *em;
> -	int ret;
> -
> -	read_lock(&em_tree->lock);
> -	em = lookup_extent_mapping(em_tree, start, len);
> -	if (em) {
> -		em->bdev =
> -			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
> -		read_unlock(&em_tree->lock);
> -		goto out;
> -	}
> -	read_unlock(&em_tree->lock);
> -
> -	em = alloc_extent_map();
> -	if (!em) {
> -		em = ERR_PTR(-ENOMEM);
> -		goto out;
> -	}
> -	em->start = 0;
> -	em->len = (u64)-1;
> -	em->block_len = (u64)-1;
> -	em->block_start = 0;
> -	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
> -
> -	write_lock(&em_tree->lock);
> -	ret = add_extent_mapping(em_tree, em, 0);
> -	if (ret == -EEXIST) {
> -		free_extent_map(em);
> -		em = lookup_extent_mapping(em_tree, start, len);
> -		if (!em)
> -			em = ERR_PTR(-EIO);
> -	} else if (ret) {
> -		free_extent_map(em);
> -		em = ERR_PTR(ret);
> -	}
> -	write_unlock(&em_tree->lock);
> -
> -out:
> -	return em;
> -}
> -
>  u32 btrfs_csum_data(char *data, u32 seed, size_t len)
>  {
>  	return btrfs_crc32c(seed, data, len);
> @@ -349,11 +299,11 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
>   * detect blocks that either didn't get written at all or got written
>   * in the wrong place.
>   */
> -static int verify_parent_transid(struct extent_io_tree *io_tree,
> -				 struct extent_buffer *eb, u64 parent_transid,
> +static int verify_parent_transid(struct extent_buffer *eb, u64 parent_transid,
>  				 int atomic)
>  {
>  	struct extent_state *cached_state = NULL;
> +	struct extent_io_tree *io_tree = &eb->eb_info->io_tree;
>  	int ret;
>  	bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
> 
> @@ -375,7 +325,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
>  		ret = 0;
>  		goto out;
>  	}
> -	btrfs_err_rl(eb->fs_info,
> +	btrfs_err_rl(eb->eb_info->fs_info,
>  		"parent transid verify failed on %llu wanted %llu found %llu",
>  			eb->start,
>  			parent_transid, btrfs_header_generation(eb));
> @@ -445,7 +395,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
>  					  struct extent_buffer *eb,
>  					  u64 parent_transid)
>  {
> -	struct extent_io_tree *io_tree;
>  	int failed = 0;
>  	int ret;
>  	int num_copies = 0;
> @@ -453,13 +402,10 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
>  	int failed_mirror = 0;
> 
>  	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
> -	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
>  	while (1) {
> -		ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
> -					       btree_get_extent, mirror_num);
> +		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
>  		if (!ret) {
> -			if (!verify_parent_transid(io_tree, eb,
> -						   parent_transid, 0))
> +			if (!verify_parent_transid(eb, parent_transid, 0))
>  				break;
>  			else
>  				ret = -EIO;
> @@ -504,24 +450,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
> 
>  static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
>  {
> -	u64 start = page_offset(page);
> -	u64 found_start;
>  	struct extent_buffer *eb;
> 
>  	eb = (struct extent_buffer *)page->private;
>  	if (page != eb->pages[0])
>  		return 0;
> -
> -	found_start = btrfs_header_bytenr(eb);
> -	/*
> -	 * Please do not consolidate these warnings into a single if.
> -	 * It is useful to know what went wrong.
> -	 */
> -	if (WARN_ON(found_start != start))
> -		return -EUCLEAN;
> -	if (WARN_ON(!PageUptodate(page)))
> -		return -EUCLEAN;
> -
>  	ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
>  			btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
> 
> @@ -619,8 +552,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
>  	u64 found_start;
>  	int found_level;
>  	struct extent_buffer *eb;
> -	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
> -	struct btrfs_fs_info *fs_info = root->fs_info;
> +	struct btrfs_root *root;
> +	struct btrfs_fs_info *fs_info;
>  	int ret = 0;
>  	int reads_done;
> 
> @@ -633,6 +566,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
>  	 * in memory.  Make sure we have a ref for all this other checks
>  	 */
>  	extent_buffer_get(eb);
> +	fs_info = eb->eb_info->fs_info;
> +	root = fs_info->tree_root;
> 
>  	reads_done = atomic_dec_and_test(&eb->io_pages);
>  	if (!reads_done)
> @@ -693,11 +628,19 @@ err:
>  		/*
>  		 * our io error hook is going to dec the io pages
>  		 * again, we have to make sure it has something
> -		 * to decrement
> +		 * to decrement.
> +		 *
> +		 * TODO: Kill this, we've re-arranged how this works now so we
> +		 * don't need to do this io_pages dance.
>  		 */
>  		atomic_inc(&eb->io_pages);
>  		clear_extent_buffer_uptodate(eb);
>  	}
> +	if (reads_done) {
> +		clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
> +		smp_mb__after_atomic();
> +		wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
> +	}
>  	free_extent_buffer(eb);
>  out:
>  	return ret;
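
(With EXTENT_BUFFER_READING cleared and the waiters woken here, I'm
assuming the WAIT_COMPLETE side of the reworked read_extent_buffer_pages()
now sleeps on this bit instead of on page locks, i.e. something like:

	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
		       TASK_UNINTERRUPTIBLE);

That part of extent_io.c isn't in this hunk, so treat the above as a
guess.)
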
> @@ -712,7 +655,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
>  	eb->read_mirror = failed_mirror;
>  	atomic_dec(&eb->io_pages);
>  	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
> -		btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
> +		btree_readahead_hook(eb->eb_info->fs_info, eb, eb->start, -EIO);
>  	return -EIO;	/* we fixed nothing */
>  }
> 
> @@ -884,15 +827,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
>  	return 0;
>  }
> 
> -static int btree_csum_one_bio(struct bio *bio)
> +static int btree_csum_one_bio(struct btrfs_fs_info *fs_info, struct bio *bio)
>  {
>  	struct bio_vec *bvec;
> -	struct btrfs_root *root;
>  	int i, ret = 0;
> 
>  	bio_for_each_segment_all(bvec, bio, i) {
> -		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
> -		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
> +		ret = csum_dirty_buffer(fs_info, bvec->bv_page);
>  		if (ret)
>  			break;
>  	}
> @@ -904,25 +845,27 @@ static int __btree_submit_bio_start(void *private_data, struct bio *bio,
>  				    int mirror_num, unsigned long bio_flags,
>  				    u64 bio_offset)
>  {
> +	struct btrfs_eb_info *eb_info = private_data;
>  	/*
>  	 * when we're called for a write, we're already in the async
>  	 * submission context.  Just jump into btrfs_map_bio
>  	 */
> -	return btree_csum_one_bio(bio);
> +	return btree_csum_one_bio(eb_info->fs_info, bio);
>  }
> 
>  static int __btree_submit_bio_done(void *private_data, struct bio *bio,
>  				 int mirror_num, unsigned long bio_flags,
>  				 u64 bio_offset)
>  {
> -	struct inode *inode = private_data;
> +	struct btrfs_eb_info *eb_info = private_data;
> +	struct btrfs_root *root = eb_info->fs_info->tree_root;
>  	int ret;
> 
>  	/*
>  	 * when we're called for a write, we're already in the async
>  	 * submission context.  Just jump into btrfs_map_bio
>  	 */
> -	ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1);
> +	ret = btrfs_map_bio(root, bio, mirror_num, 1);
>  	if (ret) {
>  		bio->bi_error = ret;
>  		bio_endio(bio);
> @@ -930,7 +873,7 @@ static int __btree_submit_bio_done(void *private_data, struct bio *bio,
>  	return ret;
>  }
> 
> -static int check_async_write(struct inode *inode, unsigned long bio_flags)
> +static int check_async_write(unsigned long bio_flags)
>  {
>  	if (bio_flags & EXTENT_BIO_TREE_LOG)
>  		return 0;
> @@ -945,8 +888,9 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
>  				 int mirror_num, unsigned long bio_flags,
>  				 u64 bio_offset)
>  {
> -	struct inode *inode = private_data;
> -	int async = check_async_write(inode, bio_flags);
> +	struct btrfs_eb_info *eb_info = private_data;
> +	struct btrfs_root *root = eb_info->fs_info->tree_root;
> +	int async = check_async_write(bio_flags);
>  	int ret;
> 
>  	if (bio_op(bio) != REQ_OP_WRITE) {
> @@ -954,23 +898,22 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
>  		 * called for a read, do the setup so that checksum validation
>  		 * can happen in the async kernel threads
>  		 */
> -		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
> -					  bio, BTRFS_WQ_ENDIO_METADATA);
> +		ret = btrfs_bio_wq_end_io(eb_info->fs_info, bio,
> +					  BTRFS_WQ_ENDIO_METADATA);
>  		if (ret)
>  			goto out_w_error;
> -		ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
> +		ret = btrfs_map_bio(root, bio, mirror_num, 0);
>  	} else if (!async) {
> -		ret = btree_csum_one_bio(bio);
> +		ret = btree_csum_one_bio(eb_info->fs_info, bio);
>  		if (ret)
>  			goto out_w_error;
> -		ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
> +		ret = btrfs_map_bio(root, bio, mirror_num, 0);
>  	} else {
>  		/*
>  		 * kthread helpers are used to submit writes so that
>  		 * checksumming can happen in parallel across all CPUs
>  		 */
> -		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
> -					  bio, mirror_num, 0,
> +		ret = btrfs_wq_submit_bio(eb_info->fs_info, bio, mirror_num, 0,
>  					  bio_offset, private_data,
>  					  __btree_submit_bio_start,
>  					  __btree_submit_bio_done);
> @@ -986,118 +929,14 @@ out_w_error:
>  	return ret;
>  }
> 
> -#ifdef CONFIG_MIGRATION
> -static int btree_migratepage(struct address_space *mapping,
> -			struct page *newpage, struct page *page,
> -			enum migrate_mode mode)
> -{
> -	/*
> -	 * we can't safely write a btree page from here,
> -	 * we haven't done the locking hook
> -	 */
> -	if (PageDirty(page))
> -		return -EAGAIN;
> -	/*
> -	 * Buffers may be managed in a filesystem specific way.
> -	 * We must have no buffers or drop them.
> -	 */
> -	if (page_has_private(page) &&
> -	    !try_to_release_page(page, GFP_KERNEL))
> -		return -EAGAIN;
> -	return migrate_page(mapping, newpage, page, mode);
> -}
> -#endif
> -
> -
> -static int btree_writepages(struct address_space *mapping,
> -			    struct writeback_control *wbc)
> -{
> -	struct btrfs_fs_info *fs_info;
> -	int ret;
> -
> -	if (wbc->sync_mode == WB_SYNC_NONE) {
> -
> -		if (wbc->for_kupdate)
> -			return 0;
> -
> -		fs_info = BTRFS_I(mapping->host)->root->fs_info;
> -		/* this is a bit racy, but that's ok */
> -		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
> -					     BTRFS_DIRTY_METADATA_THRESH);
> -		if (ret < 0)
> -			return 0;
> -	}
> -	return btree_write_cache_pages(mapping, wbc);
> -}
> -
> -static int btree_readpage(struct file *file, struct page *page)
> -{
> -	struct extent_io_tree *tree;
> -	tree = &BTRFS_I(page->mapping->host)->io_tree;
> -	return extent_read_full_page(tree, page, btree_get_extent, 0);
> -}
> -
> -static int btree_releasepage(struct page *page, gfp_t gfp_flags)
> -{
> -	if (PageWriteback(page) || PageDirty(page))
> -		return 0;
> -
> -	return try_release_extent_buffer(page);
> -}
> -
> -static void btree_invalidatepage(struct page *page, unsigned int offset,
> -				 unsigned int length)
> -{
> -	struct extent_io_tree *tree;
> -	tree = &BTRFS_I(page->mapping->host)->io_tree;
> -	extent_invalidatepage(tree, page, offset);
> -	btree_releasepage(page, GFP_NOFS);
> -	if (PagePrivate(page)) {
> -		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
> -			   "page private not zero on page %llu",
> -			   (unsigned long long)page_offset(page));
> -		ClearPagePrivate(page);
> -		set_page_private(page, 0);
> -		put_page(page);
> -	}
> -}
> -
> -static int btree_set_page_dirty(struct page *page)
> -{
> -#ifdef DEBUG
> -	struct extent_buffer *eb;
> -
> -	BUG_ON(!PagePrivate(page));
> -	eb = (struct extent_buffer *)page->private;
> -	BUG_ON(!eb);
> -	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
> -	BUG_ON(!atomic_read(&eb->refs));
> -	btrfs_assert_tree_locked(eb);
> -#endif
> -	return __set_page_dirty_nobuffers(page);
> -}
> -
> -static const struct address_space_operations btree_aops = {
> -	.readpage	= btree_readpage,
> -	.writepages	= btree_writepages,
> -	.releasepage	= btree_releasepage,
> -	.invalidatepage = btree_invalidatepage,
> -#ifdef CONFIG_MIGRATION
> -	.migratepage	= btree_migratepage,
> -#endif
> -	.set_page_dirty = btree_set_page_dirty,
> -};
> -
>  void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
>  {
>  	struct extent_buffer *buf = NULL;
> -	struct inode *btree_inode = root->fs_info->btree_inode;
> 
>  	buf = btrfs_find_create_tree_block(root, bytenr);
>  	if (IS_ERR(buf))
>  		return;
> -	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
> -				 buf, WAIT_NONE, btree_get_extent, 0);
> +	read_extent_buffer_pages(buf, WAIT_NONE, 0);
>  	free_extent_buffer(buf);
>  }
> 
> @@ -1105,8 +944,6 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
>  			 int mirror_num, struct extent_buffer **eb)
>  {
>  	struct extent_buffer *buf = NULL;
> -	struct inode *btree_inode = root->fs_info->btree_inode;
> -	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
>  	int ret;
> 
>  	buf = btrfs_find_create_tree_block(root, bytenr);
> @@ -1115,8 +952,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
> 
>  	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
> 
> -	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
> -				       btree_get_extent, mirror_num);
> +	ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
>  	if (ret) {
>  		free_extent_buffer(buf);
>  		return ret;
> @@ -1136,29 +972,29 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
>  struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
>  					    u64 bytenr)
>  {
> -	return find_extent_buffer(fs_info, bytenr);
> +	return find_extent_buffer(fs_info->eb_info, bytenr);
>  }
> 
>  struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
>  						 u64 bytenr)
>  {
>  	if (btrfs_is_testing(root->fs_info))
> -		return alloc_test_extent_buffer(root->fs_info, bytenr,
> -				root->nodesize);
> +		return alloc_test_extent_buffer(root->fs_info->eb_info, bytenr,
> +						root->nodesize);
>  	return alloc_extent_buffer(root->fs_info, bytenr);
>  }
> 
> 
>  int btrfs_write_tree_block(struct extent_buffer *buf)
>  {
> -	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
> -					buf->start + buf->len - 1);
> +	return btree_write_range(buf->eb_info->fs_info, buf->start,
> +				 buf->start + buf->len - 1);
>  }
> 
>  int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
>  {
> -	return filemap_fdatawait_range(buf->pages[0]->mapping,
> -				       buf->start, buf->start + buf->len - 1);
> +	return btree_wait_range(buf->eb_info->fs_info, buf->start,
> +				buf->start + buf->len - 1);
>  }
> 
>  struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
> @@ -1188,14 +1024,10 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
>  	    fs_info->running_transaction->transid) {
>  		btrfs_assert_tree_locked(buf);
> 
> -		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
> +		if (clear_extent_buffer_dirty(buf))
>  			__percpu_counter_add(&fs_info->dirty_metadata_bytes,
>  					     -buf->len,
>  					     fs_info->dirty_metadata_batch);
> -			/* ugh, clear_extent_buffer_dirty needs to lock the page */
> -			btrfs_set_lock_blocking(buf);
> -			clear_extent_buffer_dirty(buf);
> -		}
>  	}
>  }
> 
> @@ -2247,33 +2079,20 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
>  	init_waitqueue_head(&fs_info->balance_wait_q);
>  }
> 
> -static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
> -				   struct btrfs_root *tree_root)
> +int btrfs_init_eb_info(struct btrfs_fs_info *fs_info)
>  {
> -	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
> -	set_nlink(fs_info->btree_inode, 1);
> -	/*
> -	 * we set the i_size on the btree inode to the max possible int.
> -	 * the real end of the address space is determined by all of
> -	 * the devices in the system
> -	 */
> -	fs_info->btree_inode->i_size = OFFSET_MAX;
> -	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
> -
> -	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
> -	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
> -			    fs_info->btree_inode);
> -	BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
> -	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
> -
> -	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
> -
> -	BTRFS_I(fs_info->btree_inode)->root = tree_root;
> -	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
> -	       sizeof(struct btrfs_key));
> -	set_bit(BTRFS_INODE_DUMMY,
> -		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
> -	btrfs_insert_inode_hash(fs_info->btree_inode);
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +
> +	eb_info->fs_info = fs_info;
> +	extent_io_tree_init(&eb_info->io_tree, eb_info);
> +	eb_info->io_tree.track_uptodate = 0;
> +	eb_info->io_tree.ops = &btree_extent_io_ops;
> +	extent_io_tree_init(&eb_info->io_failure_tree, eb_info);
> +	INIT_RADIX_TREE(&eb_info->buffer_radix, GFP_ATOMIC);
> +	spin_lock_init(&eb_info->buffer_lock);
> +	if (list_lru_init(&eb_info->lru_list))
> +		return -ENOMEM;
> +	return 0;
>  }
> 
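For readers following along: btrfs_eb_info itself is defined later in the
patch (extent_io.h), but from this initializer and the users below it has
to look roughly like the following; field order and exact types are my
inference, not copied from the patch:

	struct btrfs_eb_info {
		struct btrfs_fs_info *fs_info;
		/* io/failure trees that used to hang off the btree inode */
		struct extent_io_tree io_tree;
		struct extent_io_tree io_failure_tree;
		/* eb radix tree, tagged like mapping->page_tree */
		spinlock_t buffer_lock;
		struct radix_tree_root buffer_radix;
		/* clean ebs, reclaimed via ->free_cached_objects */
		struct list_lru lru_list;
		/* cyclic writeback cursor, like mapping->writeback_index */
		pgoff_t writeback_index;
	};
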
>  static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
> @@ -2566,16 +2385,7 @@ int open_ctree(struct super_block *sb,
>  		goto fail_delalloc_bytes;
>  	}
> 
> -	fs_info->btree_inode = new_inode(sb);
> -	if (!fs_info->btree_inode) {
> -		err = -ENOMEM;
> -		goto fail_bio_counter;
> -	}
> -
> -	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
> -
>  	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
> -	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
>  	INIT_LIST_HEAD(&fs_info->trans_list);
>  	INIT_LIST_HEAD(&fs_info->dead_roots);
>  	INIT_LIST_HEAD(&fs_info->delayed_iputs);
> @@ -2590,7 +2400,6 @@ int open_ctree(struct super_block *sb,
>  	spin_lock_init(&fs_info->tree_mod_seq_lock);
>  	spin_lock_init(&fs_info->super_lock);
>  	spin_lock_init(&fs_info->qgroup_op_lock);
> -	spin_lock_init(&fs_info->buffer_lock);
>  	spin_lock_init(&fs_info->unused_bgs_lock);
>  	rwlock_init(&fs_info->tree_mod_log_lock);
>  	mutex_init(&fs_info->unused_bg_unpin_mutex);
> @@ -2643,7 +2452,7 @@ int open_ctree(struct super_block *sb,
>  					GFP_KERNEL);
>  	if (!fs_info->delayed_root) {
>  		err = -ENOMEM;
> -		goto fail_iput;
> +		goto fail_alloc;
>  	}
>  	btrfs_init_delayed_root(fs_info->delayed_root);
> 
> @@ -2658,7 +2467,15 @@ int open_ctree(struct super_block *sb,
>  	sb->s_blocksize_bits = blksize_bits(4096);
>  	sb->s_bdi = &fs_info->bdi;
> 
> -	btrfs_init_btree_inode(fs_info, tree_root);
> +	fs_info->eb_info = kzalloc(sizeof(struct btrfs_eb_info), GFP_KERNEL);
> +	if (!fs_info->eb_info) {
> +		err = -ENOMEM;
> +		goto fail_alloc;
> +	}
> +	if (btrfs_init_eb_info(fs_info)) {
> +		err = -ENOMEM;
> +		goto fail_alloc;
> +	}
> 
>  	spin_lock_init(&fs_info->block_group_cache_lock);
>  	fs_info->block_group_cache_tree = RB_ROOT;
> @@ -3085,6 +2902,14 @@ retry_root_backup:
>  	if (sb->s_flags & MS_RDONLY)
>  		return 0;
> 
> +	/*
> +	 * We need to make sure we are on the bdi's dirty list so we get
> +	 * writeback requests for our fs properly.
> +	 */
> +	spin_lock(&fs_info->bdi.sb_list_lock);
> +	list_add_tail(&fs_info->bdi.dirty_sb_list, &sb->s_bdi_list);
> +	spin_unlock(&fs_info->bdi.sb_list_lock);
> +
>  	if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) &&
>  	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
>  		btrfs_info(fs_info, "creating free space tree");
> @@ -3180,7 +3005,8 @@ fail_cleaner:
>  	 * make sure we're done with the btree inode before we stop our
>  	 * kthreads
>  	 */
> -	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
> +	btree_write_range(fs_info, 0, (u64)-1);
> +	btree_wait_range(fs_info, 0, (u64)-1);
> 
>  fail_sysfs:
>  	btrfs_sysfs_remove_mounted(fs_info);
> @@ -3194,16 +3020,11 @@ fail_block_groups:
> 
>  fail_tree_roots:
>  	free_root_pointers(fs_info, 1);
> -	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
> -
> +	btrfs_invalidate_eb_info(fs_info->eb_info);
>  fail_sb_buffer:
>  	btrfs_stop_all_workers(fs_info);
>  fail_alloc:
> -fail_iput:
>  	btrfs_mapping_tree_free(&fs_info->mapping_tree);
> -
> -	iput(fs_info->btree_inode);
> -fail_bio_counter:
>  	percpu_counter_destroy(&fs_info->bio_counter);
>  fail_delalloc_bytes:
>  	percpu_counter_destroy(&fs_info->delalloc_bytes);
> @@ -3908,14 +3729,11 @@ void close_ctree(struct btrfs_root *root)
>  	 * we must make sure there is not any read request to
>  	 * submit after we stopping all workers.
>  	 */
> -	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
>  	btrfs_stop_all_workers(fs_info);
> 
>  	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
>  	free_root_pointers(fs_info, 1);
> 
> -	iput(fs_info->btree_inode);
> -
>  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
>  	if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY))
>  		btrfsic_unmount(root, fs_info->fs_devices);
> @@ -3924,6 +3742,8 @@ void close_ctree(struct btrfs_root *root)
>  	btrfs_close_devices(fs_info->fs_devices);
>  	btrfs_mapping_tree_free(&fs_info->mapping_tree);
> 
> +	btrfs_invalidate_eb_info(fs_info->eb_info);
> +
>  	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
>  	percpu_counter_destroy(&fs_info->delalloc_bytes);
>  	percpu_counter_destroy(&fs_info->bio_counter);
> @@ -3951,14 +3771,12 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
>  			  int atomic)
>  {
>  	int ret;
> -	struct inode *btree_inode = buf->pages[0]->mapping->host;
> 
>  	ret = extent_buffer_uptodate(buf);
>  	if (!ret)
>  		return ret;
> 
> -	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
> -				    parent_transid, atomic);
> +	ret = verify_parent_transid(buf, parent_transid, atomic);
>  	if (ret == -EAGAIN)
>  		return ret;
>  	return !ret;
> @@ -3979,7 +3797,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
>  	if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
>  		return;
>  #endif
> -	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
> +	root = buf->eb_info->fs_info->tree_root;
>  	btrfs_assert_tree_locked(buf);
>  	if (transid != root->fs_info->generation)
>  		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
> @@ -4015,10 +3833,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
> 
>  	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
>  				     BTRFS_DIRTY_METADATA_THRESH);
> -	if (ret > 0) {
> +	if (ret > 0)
>  		balance_dirty_pages_ratelimited(&root->fs_info->bdi,
>  						root->fs_info->sb);
> -	}
>  }
> 
>  void btrfs_btree_balance_dirty(struct btrfs_root *root)
> @@ -4033,7 +3850,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
> 
>  int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
>  {
> -	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
> +	struct btrfs_root *root = buf->eb_info->fs_info->tree_root;
>  	return btree_read_extent_buffer_pages(root, buf, parent_transid);
>  }
> 
> @@ -4376,10 +4193,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
>  			if (!eb)
>  				continue;
>  			wait_on_extent_buffer_writeback(eb);
> -
> -			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
> -					       &eb->bflags))
> -				clear_extent_buffer_dirty(eb);
> +			clear_extent_buffer_dirty(eb);
>  			free_extent_buffer_stale(eb);
>  		}
>  	}
> @@ -4504,16 +4318,37 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
> 
>  static struct btrfs_fs_info *btree_fs_info(void *private_data)
>  {
> -	struct inode *inode = private_data;
> -	return btrfs_sb(inode->i_sb);
> +	struct btrfs_eb_info *eb_info = private_data;
> +	return eb_info->fs_info;
> +}
> +
> +static int btree_merge_bio_hook(struct page *page, unsigned long offset,
> +				size_t size, struct bio *bio,
> +				unsigned long bio_flags)
> +{
> +	struct extent_buffer *eb = (struct extent_buffer *)page->private;
> +	struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
> +	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
> +	u64 length = 0;
> +	u64 map_length;
> +	int ret;
> +
> +	length = bio->bi_iter.bi_size;
> +	map_length = length;
> +	ret = btrfs_map_block(fs_info, bio_op(bio), logical, &map_length,
> +			      NULL, 0);
> +	if (ret < 0)
> +		return ret;
> +	if (map_length < length + size)
> +		return 1;
> +	return 0;
>  }
> 
>  static const struct extent_io_ops btree_extent_io_ops = {
>  	.readpage_end_io_hook = btree_readpage_end_io_hook,
>  	.readpage_io_failed_hook = btree_io_failed_hook,
>  	.submit_bio_hook = btree_submit_bio_hook,
> -	/* note we're sharing with inode.c for the merge bio hook */
> -	.merge_bio_hook = btrfs_merge_bio_hook,
> +	.merge_bio_hook = btree_merge_bio_hook,
>  	.tree_fs_info = btree_fs_info,
>  	.set_range_writeback = btrfs_set_range_writeback,
>  };
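
On btree_merge_bio_hook(): good to stop sharing the inode.c helper.  For
anyone new to this hook, my recollection of the submit_extent_page()
contract (treat as a sketch from memory) is that a non-zero return forces
the caller to submit the current bio and open a new one:

	if (tree->ops->merge_bio_hook(page, offset, page_size, bio, bio_flags) ||
	    bio_add_page(bio, page, page_size, offset) < page_size) {
		/* can't merge: flush what we have and start a new bio */
		submit_one_bio(bio, mirror_num, bio_flags);
		bio = NULL;
	}

so returning 1 when map_length < length + size correctly refuses to let a
bio grow across a chunk stripe boundary.  Tiny nit: 'length' is
initialized to 0 and then immediately assigned.
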
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 5dcdd3e..5c18a49 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -75,8 +75,8 @@ void btrfs_leak_debug_check(void)
>  	while (!list_empty(&buffers)) {
>  		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
>  		printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
> -		       "refs %d\n",
> -		       eb->start, eb->len, atomic_read(&eb->refs));
> +		       "bflags %lu refs %d\n",
> +		       eb->start, eb->len, eb->bflags, atomic_read(&eb->refs));
>  		list_del(&eb->leak_list);
>  		kmem_cache_free(extent_buffer_cache, eb);
>  	}
> @@ -3538,7 +3538,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
>  			  struct btrfs_fs_info *fs_info,
>  			  struct extent_page_data *epd)
>  {
> -	unsigned long i, num_pages;
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
>  	int flush = 0;
>  	int ret = 0;
> 
> @@ -3585,37 +3585,42 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
> 
>  	btrfs_tree_unlock(eb);
> 
> -	if (!ret)
> -		return ret;
> -
> -	num_pages = num_extent_pages(eb->start, eb->len);
> -	for (i = 0; i < num_pages; i++) {
> -		struct page *p = eb->pages[i];
> -
> -		if (!trylock_page(p)) {
> -			if (!flush) {
> -				flush_write_bio(epd);
> -				flush = 1;
> -			}
> -			lock_page(p);
> -		}
> +	/*
> +	 * We cleared dirty on this buffer, we need to adjust the radix tags.
> +	 * We do the actual page accounting in write_one_eb.
> +	 */
> +	if (ret) {
> +		spin_lock_irq(&eb_info->buffer_lock);
> +		radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
> +				   PAGECACHE_TAG_WRITEBACK);
> +		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +				     PAGECACHE_TAG_DIRTY);
> +		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +				     PAGECACHE_TAG_TOWRITE);
> +		spin_unlock_irq(&eb_info->buffer_lock);
>  	}
> -
>  	return ret;
>  }
> 
>  static void end_extent_buffer_writeback(struct extent_buffer *eb)
>  {
> -	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
> -	smp_mb__after_atomic();
> -	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
> +	if (test_and_clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
> +		struct btrfs_eb_info *eb_info = eb->eb_info;
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&eb_info->buffer_lock, flags);
> +		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +				     PAGECACHE_TAG_WRITEBACK);
> +		spin_unlock_irqrestore(&eb_info->buffer_lock, flags);
> +		wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
> +	}
>  }
> 
>  static void set_btree_ioerr(struct page *page)
>  {
>  	struct extent_buffer *eb = (struct extent_buffer *)page->private;
> +	struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
> 
> -	SetPageError(page);
>  	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
>  		return;
> 
> @@ -3624,8 +3629,7 @@ static void set_btree_ioerr(struct page *page)
>  	 * failed, increment the counter transaction->eb_write_errors.
>  	 * We do this because while the transaction is running and before it's
>  	 * committing (when we call filemap_fdata[write|wait]_range against
> -	 * the btree inode), we might have
> -	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
> +	 * the btree inode), we might have write_metadata() called - if it
>  	 * returns an error or an error happens during writeback, when we're
>  	 * committing the transaction we wouldn't know about it, since the pages
>  	 * can be no longer dirty nor marked anymore for writeback (if a
> @@ -3659,13 +3663,13 @@ static void set_btree_ioerr(struct page *page)
>  	 */
>  	switch (eb->log_index) {
>  	case -1:
> -		set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
> +		set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
>  		break;
>  	case 0:
> -		set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
> +		set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
>  		break;
>  	case 1:
> -		set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
> +		set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
>  		break;
>  	default:
>  		BUG(); /* unexpected, logic error */
> @@ -3686,16 +3690,13 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
>  		done = atomic_dec_and_test(&eb->io_pages);
> 
>  		if (bio->bi_error ||
> -		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
> -			ClearPageUptodate(page);
> +		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
>  			set_btree_ioerr(page);
> -		}
> -
> -		end_page_writeback(page);
> 
> +		account_metadata_end_writeback(page,
> +					       &eb->eb_info->fs_info->bdi);
>  		if (!done)
>  			continue;
> -
>  		end_extent_buffer_writeback(eb);
>  	}
> 
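To check my understanding of the accounting here: the account_metadata_*()
helpers come from the prerequisite dirty-metadata accounting patches, and
they stand in for the per-page state the old code kept, roughly:

	account_metadata_dirtied(page, bdi);        /* ~ set_page_dirty() accounting */
	account_metadata_cleaned(page, bdi);        /* ~ clear_page_dirty_for_io() */
	account_metadata_writeback(page, bdi);      /* ~ set_page_writeback() */
	account_metadata_end_writeback(page, bdi);  /* ~ end_page_writeback() */

That mapping is my reading of the dependent series, so correct me if it's
off.
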
> @@ -3708,7 +3709,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>  			struct extent_page_data *epd)
>  {
>  	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
> -	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
> +	struct extent_io_tree *tree = &fs_info->eb_info->io_tree;
>  	u64 offset = eb->start;
>  	unsigned long i, num_pages;
>  	unsigned long bio_flags = 0;
> @@ -3724,8 +3725,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>  	for (i = 0; i < num_pages; i++) {
>  		struct page *p = eb->pages[i];
> 
> -		clear_page_dirty_for_io(p);
> -		set_page_writeback(p);
>  		ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
>  					 p, offset >> 9, PAGE_SIZE, 0, bdev,
>  					 &epd->bio, -1,
> @@ -3734,34 +3733,90 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>  		epd->bio_flags = bio_flags;
>  		if (ret) {
>  			set_btree_ioerr(p);
> -			end_page_writeback(p);
>  			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
>  				end_extent_buffer_writeback(eb);
>  			ret = -EIO;
>  			break;
>  		}
> +		account_metadata_writeback(p, &fs_info->bdi);
>  		offset += PAGE_SIZE;
>  		update_nr_written(p, wbc, 1);
> -		unlock_page(p);
>  	}
> 
> -	if (unlikely(ret)) {
> -		for (; i < num_pages; i++) {
> -			struct page *p = eb->pages[i];
> -			clear_page_dirty_for_io(p);
> -			unlock_page(p);
> +	return ret;
> +}
> +
> +#define EB_TAG_BATCH 4096
> +static void tag_ebs_for_writeback(struct btrfs_eb_info *eb_info, pgoff_t start,
> +				  pgoff_t end)
> +{
> +	unsigned long tagged;
> +
> +	do {
> +		spin_lock_irq(&eb_info->buffer_lock);
> +		tagged = radix_tree_range_tag_if_tagged(&eb_info->buffer_radix,
> +							&start, end,
> +							EB_TAG_BATCH,
> +							PAGECACHE_TAG_DIRTY,
> +							PAGECACHE_TAG_TOWRITE);
> +		spin_unlock_irq(&eb_info->buffer_lock);
> +		cond_resched();
> +	} while (tagged >= EB_TAG_BATCH && start);
> +}
> +
> +static unsigned eb_lookup_tag(struct btrfs_eb_info *eb_info,
> +			      struct extent_buffer **ebs, pgoff_t *index,
> +			      int tag, unsigned nr)
> +{
> +	struct radix_tree_iter iter;
> +	void **slot;
> +	unsigned ret = 0;
> +
> +	if (unlikely(!nr))
> +		return 0;
> +
> +	rcu_read_lock();
> +	radix_tree_for_each_tagged(slot, &eb_info->buffer_radix, &iter, *index,
> +				   tag) {
> +		struct extent_buffer *eb;
> +repeat:
> +		eb = radix_tree_deref_slot(slot);
> +		if (unlikely(!eb))
> +			continue;
> +
> +		if (radix_tree_exception(eb)) {
> +			if (radix_tree_deref_retry(eb)) {
> +				slot = radix_tree_iter_retry(&iter);
> +				continue;
> +			}
> +			continue;
>  		}
> -	}
> 
> +		if (unlikely(!atomic_inc_not_zero(&eb->refs)))
> +			continue;
> +
> +		if (unlikely(eb != *slot)) {
> +			free_extent_buffer(eb);
> +			goto repeat;
> +		}
> +
> +		ebs[ret] = eb;
> +		if (++ret == nr)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	if (ret)
> +		*index = (ebs[ret - 1]->start >> PAGE_SHIFT) + 1;
>  	return ret;
>  }
> 
> -int btree_write_cache_pages(struct address_space *mapping,
> +#define EBVEC_SIZE 16
> +static int btree_write_cache_pages(struct btrfs_fs_info *fs_info,
>  				   struct writeback_control *wbc)
>  {
> -	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
> -	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
> -	struct extent_buffer *eb, *prev_eb = NULL;
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +	struct extent_io_tree *tree = &eb_info->io_tree;
> +	struct extent_buffer *eb;
>  	struct extent_page_data epd = {
>  		.bio = NULL,
>  		.tree = tree,
> @@ -3772,16 +3827,16 @@ int btree_write_cache_pages(struct address_space *mapping,
>  	int ret = 0;
>  	int done = 0;
>  	int nr_to_write_done = 0;
> -	struct pagevec pvec;
> -	int nr_pages;
> +	struct extent_buffer *ebs[EBVEC_SIZE];
> +	int nr_ebs;
>  	pgoff_t index;
>  	pgoff_t end;		/* Inclusive */
> +	pgoff_t done_index = 0;
>  	int scanned = 0;
>  	int tag;
> 
> -	pagevec_init(&pvec, 0);
>  	if (wbc->range_cyclic) {
> -		index = mapping->writeback_index; /* Start from prev offset */
> +		index = eb_info->writeback_index; /* Start from prev offset */
>  		end = -1;
>  	} else {
>  		index = wbc->range_start >> PAGE_SHIFT;
> @@ -3794,53 +3849,27 @@ int btree_write_cache_pages(struct address_space *mapping,
>  		tag = PAGECACHE_TAG_DIRTY;
>  retry:
>  	if (wbc->sync_mode == WB_SYNC_ALL)
> -		tag_pages_for_writeback(mapping, index, end);
> +		tag_ebs_for_writeback(fs_info->eb_info, index, end);
>  	while (!done && !nr_to_write_done && (index <= end) &&
> -	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
> +	       (nr_ebs = eb_lookup_tag(eb_info, ebs, &index, tag,
> +			min(end - index, (pgoff_t)EBVEC_SIZE-1) + 1))) {
>  		unsigned i;
> 
>  		scanned = 1;
> -		for (i = 0; i < nr_pages; i++) {
> -			struct page *page = pvec.pages[i];
> -
> -			if (!PagePrivate(page))
> -				continue;
> -
> -			if (!wbc->range_cyclic && page->index > end) {
> -				done = 1;
> -				break;
> -			}
> -
> -			spin_lock(&mapping->private_lock);
> -			if (!PagePrivate(page)) {
> -				spin_unlock(&mapping->private_lock);
> -				continue;
> -			}
> -
> -			eb = (struct extent_buffer *)page->private;
> -
> -			/*
> -			 * Shouldn't happen and normally this would be a BUG_ON
> -			 * but no sense in crashing the users box for something
> -			 * we can survive anyway.
> -			 */
> -			if (WARN_ON(!eb)) {
> -				spin_unlock(&mapping->private_lock);
> +		for (i = 0; i < nr_ebs; i++) {
> +			eb = ebs[i];
> +			if (done) {
> +				free_extent_buffer(eb);
>  				continue;
>  			}
> 
> -			if (eb == prev_eb) {
> -				spin_unlock(&mapping->private_lock);
> +			if (!wbc->range_cyclic && eb->start > wbc->range_end) {
> +				done = 1;
> +				free_extent_buffer(eb);
>  				continue;
>  			}
> 
> -			ret = atomic_inc_not_zero(&eb->refs);
> -			spin_unlock(&mapping->private_lock);
> -			if (!ret)
> -				continue;
> -
> -			prev_eb = eb;
> +			done_index = eb_index(eb);
>  			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
>  			if (!ret) {
>  				free_extent_buffer(eb);
> @@ -3848,12 +3877,11 @@ retry:
>  			}
> 
>  			ret = write_one_eb(eb, fs_info, wbc, &epd);
> +			free_extent_buffer(eb);
>  			if (ret) {
>  				done = 1;
> -				free_extent_buffer(eb);
> -				break;
> +				continue;
>  			}
> -			free_extent_buffer(eb);
> 
>  			/*
>  			 * the filesystem may choose to bump up nr_to_write.
> @@ -3862,7 +3890,6 @@ retry:
>  			 */
>  			nr_to_write_done = wbc->nr_to_write <= 0;
>  		}
> -		pagevec_release(&pvec);
>  		cond_resched();
>  	}
>  	if (!scanned && !done) {
> @@ -3874,10 +3901,77 @@ retry:
>  		index = 0;
>  		goto retry;
>  	}
> +	if (wbc->range_cyclic)
> +		fs_info->eb_info->writeback_index = done_index;
>  	flush_write_bio(&epd);
>  	return ret;
>  }
> 
> +void btrfs_write_ebs(struct super_block *sb, struct writeback_control *wbc)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> +	btree_write_cache_pages(fs_info, wbc);
> +}
> +
> +static int __btree_write_range(struct btrfs_fs_info *fs_info, u64 start,
> +			       u64 end, int sync_mode)
> +{
> +	struct writeback_control wbc = {
> +		.sync_mode = sync_mode,
> +		.nr_to_write = LONG_MAX,
> +		.range_start = start,
> +		.range_end = end,
> +	};
> +
> +	return btree_write_cache_pages(fs_info, &wbc);
> +}
> +
> +void btree_flush(struct btrfs_fs_info *fs_info)
> +{
> +	__btree_write_range(fs_info, 0, (u64)-1, WB_SYNC_NONE);
> +}
> +
> +int btree_write_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
> +{
> +	return __btree_write_range(fs_info, start, end, WB_SYNC_ALL);
> +}
> +
> +int btree_wait_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
> +{
> +	struct extent_buffer *ebs[EBVEC_SIZE];
> +	pgoff_t index = start >> PAGE_SHIFT;
> +	pgoff_t end_index = end >> PAGE_SHIFT;
> +	unsigned nr_ebs;
> +	int ret = 0;
> +
> +	if (end < start)
> +		return ret;
> +
> +	while ((index <= end) &&
> +	       (nr_ebs = eb_lookup_tag(fs_info->eb_info, ebs, &index,
> +				       PAGECACHE_TAG_WRITEBACK,
> +				       min(end_index - index,
> +					   (pgoff_t)EBVEC_SIZE-1) + 1)) != 0) {
> +		unsigned i;
> +
> +		for (i = 0; i < nr_ebs; i++) {
> +			struct extent_buffer *eb = ebs[i];
> +
> +			if (eb->start > end) {
> +				free_extent_buffer(eb);
> +				continue;
> +			}
> +
> +			wait_on_extent_buffer_writeback(eb);
> +			if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
> +				ret = -EIO;
> +			free_extent_buffer(eb);
> +		}
> +		cond_resched();
> +	}
> +	return ret;
> +}
> +
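
The write/wait pair reads nicely; for reference, the calling pattern that
replaces filemap_fdatawrite_range()/filemap_fdatawait_range() on the btree
inode is the one already used in the open_ctree() error path above:

	btree_write_range(fs_info, 0, (u64)-1);
	btree_wait_range(fs_info, 0, (u64)-1);

(callers that care about errors should check both return values).  One nit
in btree_wait_range(): the loop condition compares the page index against
the byte offset 'end'; I think 'end_index' was intended there.
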
>  /**
>   * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
>   * @mapping: address space structure to write
> @@ -4558,7 +4652,6 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
>  {
>  	unsigned long index;
>  	struct page *page;
> -	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
> 
>  	BUG_ON(extent_buffer_under_io(eb));
> 
> @@ -4566,39 +4659,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
>  	if (index == 0)
>  		return;
> 
> +	ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
>  	do {
>  		index--;
>  		page = eb->pages[index];
>  		if (!page)
>  			continue;
> -		if (mapped)
> -			spin_lock(&page->mapping->private_lock);
> -		/*
> -		 * We do this since we'll remove the pages after we've
> -		 * removed the eb from the radix tree, so we could race
> -		 * and have this page now attached to the new eb.  So
> -		 * only clear page_private if it's still connected to
> -		 * this eb.
> -		 */
> -		if (PagePrivate(page) &&
> -		    page->private == (unsigned long)eb) {
> -			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
> -			BUG_ON(PageDirty(page));
> -			BUG_ON(PageWriteback(page));
> -			/*
> -			 * We need to make sure we haven't be attached
> -			 * to a new eb.
> -			 */
> -			ClearPagePrivate(page);
> -			set_page_private(page, 0);
> -			/* One for the page private */
> -			put_page(page);
> -		}
> +		ASSERT(PagePrivate(page));
> +		ASSERT(page->private == (unsigned long)eb);
> +		ClearPagePrivate(page);
> +		set_page_private(page, 0);
> 
> -		if (mapped)
> -			spin_unlock(&page->mapping->private_lock);
> +		/* Once for the page private. */
> +		put_page(page);
> 
> -		/* One for when we allocated the page */
> +		/* Once for the alloc_page. */
>  		put_page(page);
>  	} while (index != 0);
>  }
> @@ -4613,7 +4688,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
>  }
> 
>  static struct extent_buffer *
> -__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
> +__alloc_extent_buffer(struct btrfs_eb_info *eb_info, u64 start,
>  		      unsigned long len)
>  {
>  	struct extent_buffer *eb = NULL;
> @@ -4621,7 +4696,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
>  	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
>  	eb->start = start;
>  	eb->len = len;
> -	eb->fs_info = fs_info;
> +	eb->eb_info = eb_info;
>  	eb->bflags = 0;
>  	rwlock_init(&eb->lock);
>  	atomic_set(&eb->write_locks, 0);
> @@ -4633,6 +4708,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
>  	eb->lock_nested = 0;
>  	init_waitqueue_head(&eb->write_lock_wq);
>  	init_waitqueue_head(&eb->read_lock_wq);
> +	INIT_LIST_HEAD(&eb->lru);
> 
>  	btrfs_leak_debug_add(&eb->leak_list, &buffers);
> 
> @@ -4657,7 +4733,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>  	struct extent_buffer *new;
>  	unsigned long num_pages = num_extent_pages(src->start, src->len);
> 
> -	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
> +	new = __alloc_extent_buffer(src->eb_info, src->start, src->len);
>  	if (new == NULL)
>  		return NULL;
> 
> @@ -4668,8 +4744,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>  			return NULL;
>  		}
>  		attach_extent_buffer_page(new, p);
> -		WARN_ON(PageDirty(p));
> -		SetPageUptodate(p);
>  		new->pages[i] = p;
>  	}
> 
> @@ -4680,8 +4754,8 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>  	return new;
>  }
> 
> -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> -						  u64 start, unsigned long len)
> +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_eb_info *eb_info,
> +						u64 start, unsigned long len)
>  {
>  	struct extent_buffer *eb;
>  	unsigned long num_pages;
> @@ -4689,7 +4763,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> 
>  	num_pages = num_extent_pages(start, len);
> 
> -	eb = __alloc_extent_buffer(fs_info, start, len);
> +	eb = __alloc_extent_buffer(eb_info, start, len);
>  	if (!eb)
>  		return NULL;
> 
> @@ -4697,6 +4771,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>  		eb->pages[i] = alloc_page(GFP_NOFS);
>  		if (!eb->pages[i])
>  			goto err;
> +		attach_extent_buffer_page(eb, eb->pages[i]);
>  	}
>  	set_extent_buffer_uptodate(eb);
>  	btrfs_set_header_nritems(eb, 0);
> @@ -4704,30 +4779,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> 
>  	return eb;
>  err:
> -	for (; i > 0; i--)
> -		__free_page(eb->pages[i - 1]);
> -	__free_extent_buffer(eb);
> +	btrfs_release_extent_buffer(eb);
>  	return NULL;
>  }
> 
> -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> -						u64 start, u32 nodesize)
> -{
> -	unsigned long len;
> -
> -	if (!fs_info) {
> -		/*
> -		 * Called only from tests that don't always have a fs_info
> -		 * available
> -		 */
> -		len = nodesize;
> -	} else {
> -		len = fs_info->tree_root->nodesize;
> -	}
> -
> -	return __alloc_dummy_extent_buffer(fs_info, start, len);
> -}
> -
>  static void check_buffer_tree_ref(struct extent_buffer *eb)
>  {
>  	int refs;
> @@ -4777,13 +4832,13 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,
>  	}
>  }
> 
> -struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
> +struct extent_buffer *find_extent_buffer(struct btrfs_eb_info *eb_info,
>  					 u64 start)
>  {
>  	struct extent_buffer *eb;
> 
>  	rcu_read_lock();
> -	eb = radix_tree_lookup(&fs_info->buffer_radix,
> +	eb = radix_tree_lookup(&eb_info->buffer_radix,
>  			       start >> PAGE_SHIFT);
>  	if (eb && atomic_inc_not_zero(&eb->refs)) {
>  		rcu_read_unlock();
> @@ -4815,30 +4870,30 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
>  }
> 
>  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
> -struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
> -					u64 start, u32 nodesize)
> +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_eb_info *eb_info,
> +					       u64 start, u32 nodesize)
>  {
>  	struct extent_buffer *eb, *exists = NULL;
>  	int ret;
> 
> -	eb = find_extent_buffer(fs_info, start);
> +	eb = find_extent_buffer(eb_info, start);
>  	if (eb)
>  		return eb;
> -	eb = alloc_dummy_extent_buffer(fs_info, start, nodesize);
> +	eb = alloc_dummy_extent_buffer(eb_info, start, nodesize);
>  	if (!eb)
>  		return NULL;
> -	eb->fs_info = fs_info;
> +	eb->eb_info = eb_info;
>  again:
>  	ret = radix_tree_preload(GFP_NOFS);
>  	if (ret)
>  		goto free_eb;
> -	spin_lock(&fs_info->buffer_lock);
> -	ret = radix_tree_insert(&fs_info->buffer_radix,
> +	spin_lock_irq(&eb_info->buffer_lock);
> +	ret = radix_tree_insert(&eb_info->buffer_radix,
>  				start >> PAGE_SHIFT, eb);
> -	spin_unlock(&fs_info->buffer_lock);
> +	spin_unlock_irq(&eb_info->buffer_lock);
>  	radix_tree_preload_end();
>  	if (ret == -EEXIST) {
> -		exists = find_extent_buffer(fs_info, start);
> +		exists = find_extent_buffer(eb_info, start);
>  		if (exists)
>  			goto free_eb;
>  		else
> @@ -4854,6 +4909,7 @@ again:
>  	 * bump the ref count again.
>  	 */
>  	atomic_inc(&eb->refs);
> +	set_extent_buffer_uptodate(eb);
>  	return eb;
>  free_eb:
>  	btrfs_release_extent_buffer(eb);
> @@ -4867,12 +4923,12 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  	unsigned long len = fs_info->tree_root->nodesize;
>  	unsigned long num_pages = num_extent_pages(start, len);
>  	unsigned long i;
> -	unsigned long index = start >> PAGE_SHIFT;
>  	struct extent_buffer *eb;
>  	struct extent_buffer *exists = NULL;
>  	struct page *p;
> -	struct address_space *mapping = fs_info->btree_inode->i_mapping;
> -	int uptodate = 1;
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +//	struct zone *last_zone = NULL;
> +//	struct pg_data_t *last_pgdata = NULL;
>  	int ret;
> 
>  	if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) {
> @@ -4880,62 +4936,36 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  		return ERR_PTR(-EINVAL);
>  	}
> 
> -	eb = find_extent_buffer(fs_info, start);
> +	eb = find_extent_buffer(eb_info, start);
>  	if (eb)
>  		return eb;
> 
> -	eb = __alloc_extent_buffer(fs_info, start, len);
> +	eb = __alloc_extent_buffer(eb_info, start, len);
>  	if (!eb)
>  		return ERR_PTR(-ENOMEM);
> 
> -	for (i = 0; i < num_pages; i++, index++) {
> -		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
> +	for (i = 0; i < num_pages; i++) {
> +		p = alloc_page(GFP_NOFS|__GFP_NOFAIL);
>  		if (!p) {
>  			exists = ERR_PTR(-ENOMEM);
>  			goto free_eb;
>  		}
> 
> -		spin_lock(&mapping->private_lock);
> -		if (PagePrivate(p)) {
> -			/*
> -			 * We could have already allocated an eb for this page
> -			 * and attached one so lets see if we can get a ref on
> -			 * the existing eb, and if we can we know it's good and
> -			 * we can just return that one, else we know we can just
> -			 * overwrite page->private.
> -			 */
> -			exists = (struct extent_buffer *)p->private;
> -			if (atomic_inc_not_zero(&exists->refs)) {
> -				spin_unlock(&mapping->private_lock);
> -				unlock_page(p);
> -				put_page(p);
> -				mark_extent_buffer_accessed(exists, p);
> -				goto free_eb;
> -			}
> -			exists = NULL;
> -
> -			/*
> -			 * Do this so attach doesn't complain and we need to
> -			 * drop the ref the old guy had.
> -			 */
> -			ClearPagePrivate(p);
> -			WARN_ON(PageDirty(p));
> -			put_page(p);
> -		}
> +		/*
> +		 * If our pages span zones or numa nodes we have to do
> +		 * dirty/writeback accounting per page, otherwise we can do it
> +		 * in bulk and save us some looping.
> +		 *
> +		if (!last_zone)
> +			last_zone = page_zone(p);
> +		if (!last_pgdata)
> +			last_pgdata = page_pgdata(p);
> +		if (last_zone != page_zone(p) || last_pgdata != page_pgdata(p))
> +			set_bit(EXTENT_BUFFER_MIXED_PAGES, &eb->bflags);
> +		*/
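
Nit on the block above: the zone/node bulk-accounting idea lives inside
the comment, and the two declarations near the top of the function are
commented out with '//'.  If this is staying as a TODO, an #if 0 block
would be easier to grep and would keep the code compilable when
re-enabled, e.g.:

#if 0	/* TODO: bulk dirty/writeback accounting when pages share a zone/node */
		if (!last_zone)
			last_zone = page_zone(p);
		if (!last_pgdata)
			last_pgdata = page_pgdata(p);
		if (last_zone != page_zone(p) || last_pgdata != page_pgdata(p))
			set_bit(EXTENT_BUFFER_MIXED_PAGES, &eb->bflags);
#endif

Also, 'struct pg_data_t' in the commented-out declaration looks wrong
(pg_data_t is already a typedef), and I couldn't find page_pgdata() --
did you mean page_pgdat()?
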
>  		attach_extent_buffer_page(eb, p);
> -		spin_unlock(&mapping->private_lock);
> -		WARN_ON(PageDirty(p));
>  		eb->pages[i] = p;
> -		if (!PageUptodate(p))
> -			uptodate = 0;
> -
> -		/*
> -		 * see below about how we avoid a nasty race with release page
> -		 * and why we unlock later
> -		 */
>  	}
> -	if (uptodate)
> -		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
>  again:
>  	ret = radix_tree_preload(GFP_NOFS);
>  	if (ret) {
> @@ -4943,13 +4973,13 @@ again:
>  		goto free_eb;
>  	}
> 
> -	spin_lock(&fs_info->buffer_lock);
> -	ret = radix_tree_insert(&fs_info->buffer_radix,
> +	spin_lock_irq(&eb_info->buffer_lock);
> +	ret = radix_tree_insert(&eb_info->buffer_radix,
>  				start >> PAGE_SHIFT, eb);
> -	spin_unlock(&fs_info->buffer_lock);
> +	spin_unlock_irq(&eb_info->buffer_lock);
>  	radix_tree_preload_end();
>  	if (ret == -EEXIST) {
> -		exists = find_extent_buffer(fs_info, start);
> +		exists = find_extent_buffer(eb_info, start);
>  		if (exists)
>  			goto free_eb;
>  		else
> @@ -4959,31 +4989,10 @@ again:
>  	check_buffer_tree_ref(eb);
>  	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
> 
> -	/*
> -	 * there is a race where release page may have
> -	 * tried to find this extent buffer in the radix
> -	 * but failed.  It will tell the VM it is safe to
> -	 * reclaim the, and it will clear the page private bit.
> -	 * We must make sure to set the page private bit properly
> -	 * after the extent buffer is in the radix tree so
> -	 * it doesn't get lost
> -	 */
> -	SetPageChecked(eb->pages[0]);
> -	for (i = 1; i < num_pages; i++) {
> -		p = eb->pages[i];
> -		ClearPageChecked(p);
> -		unlock_page(p);
> -	}
> -	unlock_page(eb->pages[0]);
>  	return eb;
> 
>  free_eb:
>  	WARN_ON(!atomic_dec_and_test(&eb->refs));
> -	for (i = 0; i < num_pages; i++) {
> -		if (eb->pages[i])
> -			unlock_page(eb->pages[i]);
> -	}
> -
>  	btrfs_release_extent_buffer(eb);
>  	return exists;
>  }
> @@ -4999,17 +5008,19 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
>  /* Expects to have eb->eb_lock already held */
>  static int release_extent_buffer(struct extent_buffer *eb)
>  {
> +	struct btrfs_eb_info *eb_info = eb->eb_info;
> +
>  	WARN_ON(atomic_read(&eb->refs) == 0);
>  	if (atomic_dec_and_test(&eb->refs)) {
> +		if (eb_info)
> +			list_lru_del(&eb_info->lru_list, &eb->lru);
>  		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
> -			struct btrfs_fs_info *fs_info = eb->fs_info;
> -
>  			spin_unlock(&eb->refs_lock);
> 
> -			spin_lock(&fs_info->buffer_lock);
> -			radix_tree_delete(&fs_info->buffer_radix,
> -					  eb->start >> PAGE_SHIFT);
> -			spin_unlock(&fs_info->buffer_lock);
> +			spin_lock_irq(&eb_info->buffer_lock);
> +			radix_tree_delete(&eb_info->buffer_radix,
> +					  eb_index(eb));
> +			spin_unlock_irq(&eb_info->buffer_lock);
>  		} else {
>  			spin_unlock(&eb->refs_lock);
>  		}
> @@ -5024,6 +5035,8 @@ static int release_extent_buffer(struct extent_buffer *eb)
>  #endif
>  		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
>  		return 1;
> +	} else if (eb_info && atomic_read(&eb->refs) == 1) {
> +		list_lru_add(&eb_info->lru_list, &eb->lru);
>  	}
>  	spin_unlock(&eb->refs_lock);
> 
> @@ -5057,10 +5070,6 @@ void free_extent_buffer(struct extent_buffer *eb)
>  	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
>  		atomic_dec(&eb->refs);
> 
> -	/*
> -	 * I know this is terrible, but it's temporary until we stop tracking
> -	 * the uptodate bits and such for the extent buffers.
> -	 */
>  	release_extent_buffer(eb);
>  }
> 
> @@ -5078,82 +5087,163 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
>  	release_extent_buffer(eb);
>  }
> 
> -void clear_extent_buffer_dirty(struct extent_buffer *eb)
> +long btrfs_nr_ebs(struct super_block *sb, struct shrink_control *sc)
>  {
> -	unsigned long i;
> -	unsigned long num_pages;
> -	struct page *page;
> +	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
> 
> -	num_pages = num_extent_pages(eb->start, eb->len);
> +	return list_lru_shrink_count(&eb_info->lru_list, sc);
> +}
> 
> -	for (i = 0; i < num_pages; i++) {
> -		page = eb->pages[i];
> -		if (!PageDirty(page))
> -			continue;
> +static enum lru_status eb_lru_isolate(struct list_head *item,
> +				      struct list_lru_one *lru,
> +				      spinlock_t *lru_lock, void *arg)
> +{
> +	struct list_head *freeable = (struct list_head *)arg;
> +	struct extent_buffer *eb = container_of(item, struct extent_buffer,
> +						lru);
> +	enum lru_status ret;
> +	int refs;
> 
> -		lock_page(page);
> -		WARN_ON(!PagePrivate(page));
> +	if (!spin_trylock(&eb->refs_lock))
> +		return LRU_SKIP;
> 
> -		clear_page_dirty_for_io(page);
> -		spin_lock_irq(&page->mapping->tree_lock);
> -		if (!PageDirty(page)) {
> -			radix_tree_tag_clear(&page->mapping->page_tree,
> -						page_index(page),
> -						PAGECACHE_TAG_DIRTY);
> -		}
> -		spin_unlock_irq(&page->mapping->tree_lock);
> -		ClearPageError(page);
> -		unlock_page(page);
> +	if (extent_buffer_under_io(eb)) {
> +		ret = LRU_ROTATE;
> +		goto out;
> +	}
> +
> +	refs = atomic_read(&eb->refs);
> +	/* We can race with somebody freeing us, just skip if this happens. */
> +	if (refs == 0) {
> +		ret = LRU_SKIP;
> +		goto out;
> +	}
> +
> +	/* Eb is in use, don't kill it. */
> +	if (refs > 1) {
> +		ret = LRU_ROTATE;
> +		goto out;
> +	}
> +
> +	/*
> +	 * If we don't clear the TREE_REF flag then this eb is going to
> +	 * disappear soon anyway.  Otherwise we become responsible for dropping
> +	 * the last ref on this eb and we know it'll survive until we call
> +	 * dispose_list.
> +	 */
> +	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
> +		ret = LRU_SKIP;
> +		goto out;
> +	}
> +	list_lru_isolate_move(lru, &eb->lru, freeable);
> +	ret = LRU_REMOVED;
> +out:
> +	spin_unlock(&eb->refs_lock);
> +	return ret;
> +}
> +
> +static void dispose_list(struct list_head *list)
> +{
> +	struct extent_buffer *eb;
> +
> +	while (!list_empty(list)) {
> +		eb = list_first_entry(list, struct extent_buffer, lru);
> +
> +		spin_lock(&eb->refs_lock);
> +		list_del_init(&eb->lru);
> +		spin_unlock(&eb->refs_lock);
> +		free_extent_buffer(eb);
> +		cond_resched();
>  	}
> +}
> +
> +long btrfs_free_ebs(struct super_block *sb, struct shrink_control *sc)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +	LIST_HEAD(freeable);
> +	long freed;
> +
> +	freed = list_lru_shrink_walk(&eb_info->lru_list, sc, eb_lru_isolate,
> +				     &freeable);
> +	dispose_list(&freeable);
> +	return freed;
> +}
> +
> +#define MAX_EVICT_COUNT 1024
> +void btrfs_invalidate_eb_info(struct btrfs_eb_info *eb_info)
> +{
> +	LIST_HEAD(freeable);
> +	unsigned long count;
> +
> +	/*
> +	 * Evict in batches so we don't lock up the system trying to evict
> +	 * memory.
> +	 */
> +	do {
> +		count = list_lru_walk(&eb_info->lru_list, eb_lru_isolate,
> +				      &freeable, MAX_EVICT_COUNT);
> +		cond_resched();
> +	} while (count);
> +	dispose_list(&freeable);
> +	synchronize_rcu();
> +}
> +
> +int clear_extent_buffer_dirty(struct extent_buffer *eb)
> +{
> +	struct btrfs_eb_info *eb_info = eb->eb_info;
> +	unsigned long i;
> +	unsigned long num_pages;
> +
> +	if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
> +		return 0;
> +
> +	spin_lock_irq(&eb_info->buffer_lock);
> +	radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +			     PAGECACHE_TAG_DIRTY);
> +	spin_unlock_irq(&eb_info->buffer_lock);
> +
> +	num_pages = num_extent_pages(eb->start, eb->len);
> +	for (i = 0; i < num_pages; i++)
> +		account_metadata_cleaned(eb->pages[i], &eb_info->fs_info->bdi);
>  	WARN_ON(atomic_read(&eb->refs) == 0);
> +	return 1;
>  }
> 
>  int set_extent_buffer_dirty(struct extent_buffer *eb)
>  {
> +	struct btrfs_eb_info *eb_info = eb->eb_info;
>  	unsigned long i;
>  	unsigned long num_pages;
>  	int was_dirty = 0;
> 
>  	check_buffer_tree_ref(eb);
> 
> -	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
> -
> -	num_pages = num_extent_pages(eb->start, eb->len);
>  	WARN_ON(atomic_read(&eb->refs) == 0);
>  	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
> +	if (test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
> +		return 1;
> 
> +	num_pages = num_extent_pages(eb->start, eb->len);
>  	for (i = 0; i < num_pages; i++)
> -		set_page_dirty(eb->pages[i]);
> +		account_metadata_dirtied(eb->pages[i],
> +					 &eb->eb_info->fs_info->bdi);
> +	spin_lock_irq(&eb_info->buffer_lock);
> +	radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
> +			   PAGECACHE_TAG_DIRTY);
> +	spin_unlock_irq(&eb_info->buffer_lock);
>  	return was_dirty;
>  }
> 
>  void clear_extent_buffer_uptodate(struct extent_buffer *eb)
>  {
> -	unsigned long i;
> -	struct page *page;
> -	unsigned long num_pages;
> -
>  	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> -	num_pages = num_extent_pages(eb->start, eb->len);
> -	for (i = 0; i < num_pages; i++) {
> -		page = eb->pages[i];
> -		if (page)
> -			ClearPageUptodate(page);
> -	}
>  }
> 
>  void set_extent_buffer_uptodate(struct extent_buffer *eb)
>  {
> -	unsigned long i;
> -	struct page *page;
> -	unsigned long num_pages;
> -
>  	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> -	num_pages = num_extent_pages(eb->start, eb->len);
> -	for (i = 0; i < num_pages; i++) {
> -		page = eb->pages[i];
> -		SetPageUptodate(page);
> -	}
>  }
> 
>  int extent_buffer_uptodate(struct extent_buffer *eb)
> @@ -5161,103 +5251,166 @@ int extent_buffer_uptodate(struct extent_buffer *eb)
>  	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
>  }
> 
> -int read_extent_buffer_pages(struct extent_io_tree *tree,
> -			     struct extent_buffer *eb, int wait,
> -			     get_extent_t *get_extent, int mirror_num)
> +static void end_bio_extent_buffer_readpage(struct bio *bio)
>  {
> +	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
> +	struct extent_io_tree *tree = NULL;
> +	struct bio_vec *bvec;
> +	u64 unlock_start = 0, unlock_len = 0;
> +	int mirror_num = io_bio->mirror_num;
> +	int uptodate = !bio->bi_error;
> +	int i, ret;
> +
> +	bio_for_each_segment_all(bvec, bio, i) {
> +		struct page *page = bvec->bv_page;
> +		struct btrfs_eb_info *eb_info;
> +		struct extent_buffer *eb;
> +
> +		eb = (struct extent_buffer *)page->private;
> +		if (WARN_ON(!eb))
> +			continue;
> +
> +		eb_info = eb->eb_info;
> +		if (!tree)
> +			tree = &eb_info->io_tree;
> +		if (uptodate) {
> +			/*
> +			 * btree_readpage_end_io_hook doesn't care about
> +			 * start/end so just pass 0.  We'll kill this later.
> +			 */
> +			ret = tree->ops->readpage_end_io_hook(io_bio, 0,
> +							      page, 0, 0,
> +							      mirror_num);
> +			if (ret) {
> +				uptodate = 0;
> +			} else {
> +				u64 start = eb->start;
> +				int c, num_pages;
> +
> +				num_pages = num_extent_pages(eb->start,
> +							     eb->len);
> +				for (c = 0; c < num_pages; c++) {
> +					if (eb->pages[c] == page)
> +						break;
> +					start += PAGE_SIZE;
> +				}
> +				clean_io_failure(eb_info->fs_info,
> +						 &eb_info->io_failure_tree,
> +						 tree, start, page, 0, 0);
> +			}
> +		}
> +		/*
> +		 * We never fix anything in btree_io_failed_hook.
> +		 *
> +		 * TODO: rework the io failed hook to not assume we can fix
> +		 * anything.
> +		 */
> +		if (!uptodate)
> +			tree->ops->readpage_io_failed_hook(page, mirror_num);
> +
> +		if (unlock_start == 0) {
> +			unlock_start = eb->start;
> +			unlock_len = PAGE_SIZE;
> +		} else {
> +			unlock_len += PAGE_SIZE;
> +		}
> +	}
> +
> +	if (unlock_start)
> +		unlock_extent(tree, unlock_start,
> +			      unlock_start + unlock_len - 1);
> +	if (io_bio->end_io)
> +		io_bio->end_io(io_bio, bio->bi_error);
> +	bio_put(bio);
> +}
> +
> +int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
> +			     int mirror_num)
> +{
> +	struct btrfs_eb_info *eb_info = eb->eb_info;
> +	struct extent_io_tree *io_tree = &eb_info->io_tree;
> +	struct block_device *bdev = eb_info->fs_info->fs_devices->latest_bdev;
> +	struct bio *bio = NULL;
> +	u64 offset = eb->start;
> +	u64 unlock_start = 0, unlock_len = 0;
>  	unsigned long i;
>  	struct page *page;
>  	int err;
>  	int ret = 0;
> -	int locked_pages = 0;
> -	int all_uptodate = 1;
>  	unsigned long num_pages;
> -	unsigned long num_reads = 0;
> -	struct bio *bio = NULL;
> -	unsigned long bio_flags = 0;
> 
>  	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
>  		return 0;
> 
> -	num_pages = num_extent_pages(eb->start, eb->len);
> -	for (i = 0; i < num_pages; i++) {
> -		page = eb->pages[i];
> -		if (wait == WAIT_NONE) {
> -			if (!trylock_page(page))
> -				goto unlock_exit;
> -		} else {
> -			lock_page(page);
> -		}
> -		locked_pages++;
> -		if (!PageUptodate(page)) {
> -			num_reads++;
> -			all_uptodate = 0;
> -		}
> -	}
> -	if (all_uptodate) {
> -		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> -		goto unlock_exit;
> +	if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) {
> +		if (wait != WAIT_COMPLETE)
> +			return 0;
> +		wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
> +			       TASK_UNINTERRUPTIBLE);
> +		if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
> +			ret = -EIO;
> +		return ret;
>  	}
> 
> +	lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
> +	num_pages = num_extent_pages(eb->start, eb->len);
>  	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
>  	eb->read_mirror = 0;
> -	atomic_set(&eb->io_pages, num_reads);
> +	atomic_set(&eb->io_pages, num_pages);
>  	for (i = 0; i < num_pages; i++) {
>  		page = eb->pages[i];
> -
> -		if (!PageUptodate(page)) {
> -			if (ret) {
> -				atomic_dec(&eb->io_pages);
> -				unlock_page(page);
> -				continue;
> +		if (ret) {
> +			unlock_len += PAGE_SIZE;
> +			if (atomic_dec_and_test(&eb->io_pages)) {
> +				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
> +				smp_mb__after_atomic();
> +				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
>  			}
> +			continue;
> +		}
> 
> -			ClearPageError(page);
> -			err = __extent_read_full_page(tree, page,
> -						      get_extent, &bio,
> -						      mirror_num, &bio_flags,
> -						      REQ_META);
> -			if (err) {
> -				ret = err;
> -				/*
> -				 * We use &bio in above __extent_read_full_page,
> -				 * so we ensure that if it returns error, the
> -				 * current page fails to add itself to bio and
> -				 * it's been unlocked.
> -				 *
> -				 * We must dec io_pages by ourselves.
> -				 */
> -				atomic_dec(&eb->io_pages);
> +		err = submit_extent_page(REQ_OP_READ, REQ_META, io_tree, NULL,
> +					 page, offset >> 9, PAGE_SIZE, 0, bdev,
> +					 &bio, -1,
> +					 end_bio_extent_buffer_readpage,
> +					 mirror_num, 0, 0, false);
> +		if (err) {
> +			ret = err;
> +			/*
> +			 * We passed &bio to submit_extent_page() above,
> +			 * so if it returns an error the current page
> +			 * failed to be added to the bio and has already
> +			 * been unlocked.
> +			 *
> +			 * We must dec io_pages ourselves.
> +			 */
> +			if (atomic_dec_and_test(&eb->io_pages)) {
> +				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
> +				smp_mb__after_atomic();
> +				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
>  			}
> -		} else {
> -			unlock_page(page);
> +			unlock_start = eb->start;

Josef, IMHO "unlock_start" should have been set to "offset". Let's say we
have 4 pages making up a metadata block and the first page was successfully
added to a bio. Assume that adding the second page to the bio results in
submit_extent_page() returning an error. In this scenario,
end_bio_extent_buffer_readpage() owns the responsibility of unlocking the
first 4k range in the io tree. However, with "unlock_start" set to
"eb->start", read_extent_buffer_pages() would unlock that first 4k range a
second time while leaving the last 4k of the buffer's range locked.
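
For illustration, here is the error branch with that one-line change
applied (a sketch only, reusing the variable names from the hunk above;
untested):

		err = submit_extent_page(REQ_OP_READ, REQ_META, io_tree, NULL,
					 page, offset >> 9, PAGE_SIZE, 0, bdev,
					 &bio, -1,
					 end_bio_extent_buffer_readpage,
					 mirror_num, 0, 0, false);
		if (err) {
			ret = err;
			if (atomic_dec_and_test(&eb->io_pages)) {
				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
				smp_mb__after_atomic();
				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
			}
			/*
			 * Pages that were already added to the bio get
			 * unlocked by end_bio_extent_buffer_readpage(); we
			 * only own the io tree range from the failed page
			 * onward, and "offset" still points at the failed
			 * page here since it is only incremented below.
			 */
			unlock_start = offset;
			unlock_len = PAGE_SIZE;
		}
		offset += PAGE_SIZE;

That way the end_io handler unlocks the range for the pages that made it
into the bio, and read_extent_buffer_pages() unlocks from the failed page
through the end of the eb, so every part of the range is unlocked exactly
once.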

> +			unlock_len = PAGE_SIZE;
>  		}
> +		offset += PAGE_SIZE;
>  	}
> 
>  	if (bio) {
> -		err = submit_one_bio(bio, mirror_num, bio_flags);
> +		err = submit_one_bio(bio, mirror_num, 0);
>  		if (err)
>  			return err;
>  	}
> 
> +	if (ret && unlock_start)
> +		unlock_extent(io_tree, unlock_start,
> +			      unlock_start + unlock_len - 1);
>  	if (ret || wait != WAIT_COMPLETE)
>  		return ret;
> 
> -	for (i = 0; i < num_pages; i++) {
> -		page = eb->pages[i];
> -		wait_on_page_locked(page);
> -		if (!PageUptodate(page))
> -			ret = -EIO;
> -	}
> -
> -	return ret;
> -
> -unlock_exit:
> -	while (locked_pages > 0) {
> -		locked_pages--;
> -		page = eb->pages[locked_pages];
> -		unlock_page(page);
> -	}
> +	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
> +		       TASK_UNINTERRUPTIBLE);
> +	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
> +		ret = -EIO;
>  	return ret;
>  }
> 

-- 
chandan


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 6/7] Btrfs: kill the btree_inode
  2016-09-08  5:17   ` Chandan Rajendra
@ 2016-09-08 14:12     ` Josef Bacik
  0 siblings, 0 replies; 16+ messages in thread
From: Josef Bacik @ 2016-09-08 14:12 UTC (permalink / raw)
  To: Chandan Rajendra; +Cc: linux-btrfs, kernel-team

On 09/08/2016 01:17 AM, Chandan Rajendra wrote:
> On Friday, September 02, 2016 03:40:05 PM Josef Bacik wrote:
>
> Please find my comment inlined below,
>
>> In order to more efficiently support sub-page blocksizes we need to stop
>> allocating pages from pagecache for our metadata.  Instead switch to using the
>> account_metadata* counters for making sure we are keeping the system aware of
>> how much dirty metadata we have, and use the ->free_cached_objects super
>> operation in order to handle freeing up extent buffers.  This greatly simplifies
>> how we deal with extent buffers as now we no longer have to tie the page cache
>> reclamation stuff to the extent buffer stuff.  This will also allow us to
>> simply kmalloc() our data for sub-page blocksizes.
>>
>> Signed-off-by: Josef Bacik <jbacik@fb.com>
>> ---
>>  fs/btrfs/btrfs_inode.h                 |   3 +-
>>  fs/btrfs/ctree.c                       |  10 +-
>>  fs/btrfs/ctree.h                       |  13 +-
>>  fs/btrfs/disk-io.c                     | 389 ++++----------
>>  fs/btrfs/extent_io.c                   | 913 ++++++++++++++++++---------------
>>  fs/btrfs/extent_io.h                   |  49 +-
>>  fs/btrfs/inode.c                       |   6 +-
>>  fs/btrfs/root-tree.c                   |   2 +-
>>  fs/btrfs/super.c                       |  29 +-
>>  fs/btrfs/tests/btrfs-tests.c           |  37 +-
>>  fs/btrfs/tests/extent-io-tests.c       |   4 +-
>>  fs/btrfs/tests/free-space-tree-tests.c |   4 +-
>>  fs/btrfs/tests/qgroup-tests.c          |   4 +-
>>  fs/btrfs/transaction.c                 |  11 +-
>>  14 files changed, 726 insertions(+), 748 deletions(-)
>>
>> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
>> index 1a8fa46..ad7b185 100644
>> --- a/fs/btrfs/btrfs_inode.h
>> +++ b/fs/btrfs/btrfs_inode.h
>> @@ -229,10 +229,9 @@ static inline u64 btrfs_ino(struct inode *inode)
>>  	u64 ino = BTRFS_I(inode)->location.objectid;
>>
>>  	/*
>> -	 * !ino: btree_inode
>>  	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
>>  	 */
>> -	if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
>> +	if (BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
>>  		ino = inode->i_ino;
>>  	return ino;
>>  }
>> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
>> index d1c56c9..b267053 100644
>> --- a/fs/btrfs/ctree.c
>> +++ b/fs/btrfs/ctree.c
>> @@ -1373,8 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
>>
>>  	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
>>  		BUG_ON(tm->slot != 0);
>> -		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
>> -						eb->len);
>> +		eb_rewin = alloc_dummy_extent_buffer(fs_info->eb_info,
>> +						     eb->start, eb->len);
>>  		if (!eb_rewin) {
>>  			btrfs_tree_read_unlock_blocking(eb);
>>  			free_extent_buffer(eb);
>> @@ -1455,8 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
>>  	} else if (old_root) {
>>  		btrfs_tree_read_unlock(eb_root);
>>  		free_extent_buffer(eb_root);
>> -		eb = alloc_dummy_extent_buffer(root->fs_info, logical,
>> -					root->nodesize);
>> +		eb = alloc_dummy_extent_buffer(root->fs_info->eb_info, logical,
>> +					       root->nodesize);
>>  	} else {
>>  		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
>>  		eb = btrfs_clone_extent_buffer(eb_root);
>> @@ -1772,7 +1772,7 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
>>  	int err;
>>
>>  	if (low > high) {
>> -		btrfs_err(eb->fs_info,
>> +		btrfs_err(eb->eb_info->fs_info,
>>  		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
>>  			  __func__, low, high, eb->start,
>>  			  btrfs_header_owner(eb), btrfs_header_level(eb));
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 282a031..ee6956c 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -37,6 +37,7 @@
>>  #include <linux/workqueue.h>
>>  #include <linux/security.h>
>>  #include <linux/sizes.h>
>> +#include <linux/list_lru.h>
>>  #include "extent_io.h"
>>  #include "extent_map.h"
>>  #include "async-thread.h"
>> @@ -675,6 +676,7 @@ struct btrfs_device;
>>  struct btrfs_fs_devices;
>>  struct btrfs_balance_control;
>>  struct btrfs_delayed_root;
>> +struct btrfs_eb_info;
>>
>>  #define BTRFS_FS_BARRIER			1
>>  #define BTRFS_FS_CLOSING_START			2
>> @@ -797,7 +799,7 @@ struct btrfs_fs_info {
>>  	struct btrfs_super_block *super_for_commit;
>>  	struct block_device *__bdev;
>>  	struct super_block *sb;
>> -	struct inode *btree_inode;
>> +	struct btrfs_eb_info *eb_info;
>>  	struct backing_dev_info bdi;
>>  	struct mutex tree_log_mutex;
>>  	struct mutex transaction_kthread_mutex;
>> @@ -1042,10 +1044,6 @@ struct btrfs_fs_info {
>>  	/* readahead works cnt */
>>  	atomic_t reada_works_cnt;
>>
>> -	/* Extent buffer radix tree */
>> -	spinlock_t buffer_lock;
>> -	struct radix_tree_root buffer_radix;
>> -
>>  	/* next backup root to be overwritten */
>>  	int backup_root_index;
>>
>> @@ -2884,6 +2882,8 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
>>
>>  static inline void free_fs_info(struct btrfs_fs_info *fs_info)
>>  {
>> +	list_lru_destroy(&fs_info->eb_info->lru_list);
>> +	kfree(fs_info->eb_info);
>>  	kfree(fs_info->balance_ctl);
>>  	kfree(fs_info->delayed_root);
>>  	kfree(fs_info->extent_root);
>> @@ -3121,9 +3121,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
>>  			     struct btrfs_root *new_root,
>>  			     struct btrfs_root *parent_root,
>>  			     u64 new_dirid);
>> -int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
>> -			 size_t size, struct bio *bio,
>> -			 unsigned long bio_flags);
>>  void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
>>  int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
>>  int btrfs_readpage(struct file *file, struct page *page);
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 9c42e53..03ac601 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -217,56 +217,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
>>
>>  #endif
>>
>> -/*
>> - * extents on the btree inode are pretty simple, there's one extent
>> - * that covers the entire device
>> - */
>> -static struct extent_map *btree_get_extent(struct inode *inode,
>> -		struct page *page, size_t pg_offset, u64 start, u64 len,
>> -		int create)
>> -{
>> -	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
>> -	struct extent_map *em;
>> -	int ret;
>> -
>> -	read_lock(&em_tree->lock);
>> -	em = lookup_extent_mapping(em_tree, start, len);
>> -	if (em) {
>> -		em->bdev =
>> -			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
>> -		read_unlock(&em_tree->lock);
>> -		goto out;
>> -	}
>> -	read_unlock(&em_tree->lock);
>> -
>> -	em = alloc_extent_map();
>> -	if (!em) {
>> -		em = ERR_PTR(-ENOMEM);
>> -		goto out;
>> -	}
>> -	em->start = 0;
>> -	em->len = (u64)-1;
>> -	em->block_len = (u64)-1;
>> -	em->block_start = 0;
>> -	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
>> -
>> -	write_lock(&em_tree->lock);
>> -	ret = add_extent_mapping(em_tree, em, 0);
>> -	if (ret == -EEXIST) {
>> -		free_extent_map(em);
>> -		em = lookup_extent_mapping(em_tree, start, len);
>> -		if (!em)
>> -			em = ERR_PTR(-EIO);
>> -	} else if (ret) {
>> -		free_extent_map(em);
>> -		em = ERR_PTR(ret);
>> -	}
>> -	write_unlock(&em_tree->lock);
>> -
>> -out:
>> -	return em;
>> -}
>> -
>>  u32 btrfs_csum_data(char *data, u32 seed, size_t len)
>>  {
>>  	return btrfs_crc32c(seed, data, len);
>> @@ -349,11 +299,11 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
>>   * detect blocks that either didn't get written at all or got written
>>   * in the wrong place.
>>   */
>> -static int verify_parent_transid(struct extent_io_tree *io_tree,
>> -				 struct extent_buffer *eb, u64 parent_transid,
>> +static int verify_parent_transid(struct extent_buffer *eb, u64 parent_transid,
>>  				 int atomic)
>>  {
>>  	struct extent_state *cached_state = NULL;
>> +	struct extent_io_tree *io_tree = &eb->eb_info->io_tree;
>>  	int ret;
>>  	bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
>>
>> @@ -375,7 +325,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
>>  		ret = 0;
>>  		goto out;
>>  	}
>> -	btrfs_err_rl(eb->fs_info,
>> +	btrfs_err_rl(eb->eb_info->fs_info,
>>  		"parent transid verify failed on %llu wanted %llu found %llu",
>>  			eb->start,
>>  			parent_transid, btrfs_header_generation(eb));
>> @@ -445,7 +395,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
>>  					  struct extent_buffer *eb,
>>  					  u64 parent_transid)
>>  {
>> -	struct extent_io_tree *io_tree;
>>  	int failed = 0;
>>  	int ret;
>>  	int num_copies = 0;
>> @@ -453,13 +402,10 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
>>  	int failed_mirror = 0;
>>
>>  	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
>> -	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
>>  	while (1) {
>> -		ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
>> -					       btree_get_extent, mirror_num);
>> +		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
>>  		if (!ret) {
>> -			if (!verify_parent_transid(io_tree, eb,
>> -						   parent_transid, 0))
>> +			if (!verify_parent_transid(eb, parent_transid, 0))
>>  				break;
>>  			else
>>  				ret = -EIO;
>> @@ -504,24 +450,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
>>
>>  static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
>>  {
>> -	u64 start = page_offset(page);
>> -	u64 found_start;
>>  	struct extent_buffer *eb;
>>
>>  	eb = (struct extent_buffer *)page->private;
>>  	if (page != eb->pages[0])
>>  		return 0;
>> -
>> -	found_start = btrfs_header_bytenr(eb);
>> -	/*
>> -	 * Please do not consolidate these warnings into a single if.
>> -	 * It is useful to know what went wrong.
>> -	 */
>> -	if (WARN_ON(found_start != start))
>> -		return -EUCLEAN;
>> -	if (WARN_ON(!PageUptodate(page)))
>> -		return -EUCLEAN;
>> -
>>  	ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
>>  			btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
>>
>> @@ -619,8 +552,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
>>  	u64 found_start;
>>  	int found_level;
>>  	struct extent_buffer *eb;
>> -	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
>> -	struct btrfs_fs_info *fs_info = root->fs_info;
>> +	struct btrfs_root *root;
>> +	struct btrfs_fs_info *fs_info;
>>  	int ret = 0;
>>  	int reads_done;
>>
>> @@ -633,6 +566,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
>>  	 * in memory.  Make sure we have a ref for all this other checks
>>  	 */
>>  	extent_buffer_get(eb);
>> +	fs_info = eb->eb_info->fs_info;
>> +	root = fs_info->tree_root;
>>
>>  	reads_done = atomic_dec_and_test(&eb->io_pages);
>>  	if (!reads_done)
>> @@ -693,11 +628,19 @@ err:
>>  		/*
>>  		 * our io error hook is going to dec the io pages
>>  		 * again, we have to make sure it has something
>> -		 * to decrement
>> +		 * to decrement.
>> +		 *
>> +		 * TODO: Kill this, we've re-arranged how this works now so we
>> +		 * don't need to do this io_pages dance.
>>  		 */
>>  		atomic_inc(&eb->io_pages);
>>  		clear_extent_buffer_uptodate(eb);
>>  	}
>> +	if (reads_done) {
>> +		clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
>> +		smp_mb__after_atomic();
>> +		wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
>> +	}
>>  	free_extent_buffer(eb);
>>  out:
>>  	return ret;
>> @@ -712,7 +655,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
>>  	eb->read_mirror = failed_mirror;
>>  	atomic_dec(&eb->io_pages);
>>  	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
>> -		btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
>> +		btree_readahead_hook(eb->eb_info->fs_info, eb, eb->start, -EIO);
>>  	return -EIO;	/* we fixed nothing */
>>  }
>>
>> @@ -884,15 +827,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
>>  	return 0;
>>  }
>>
>> -static int btree_csum_one_bio(struct bio *bio)
>> +static int btree_csum_one_bio(struct btrfs_fs_info *fs_info, struct bio *bio)
>>  {
>>  	struct bio_vec *bvec;
>> -	struct btrfs_root *root;
>>  	int i, ret = 0;
>>
>>  	bio_for_each_segment_all(bvec, bio, i) {
>> -		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
>> -		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
>> +		ret = csum_dirty_buffer(fs_info, bvec->bv_page);
>>  		if (ret)
>>  			break;
>>  	}
>> @@ -904,25 +845,27 @@ static int __btree_submit_bio_start(void *private_data, struct bio *bio,
>>  				    int mirror_num, unsigned long bio_flags,
>>  				    u64 bio_offset)
>>  {
>> +	struct btrfs_eb_info *eb_info = private_data;
>>  	/*
>>  	 * when we're called for a write, we're already in the async
>>  	 * submission context.  Just jump into btrfs_map_bio
>>  	 */
>> -	return btree_csum_one_bio(bio);
>> +	return btree_csum_one_bio(eb_info->fs_info, bio);
>>  }
>>
>>  static int __btree_submit_bio_done(void *private_data, struct bio *bio,
>>  				 int mirror_num, unsigned long bio_flags,
>>  				 u64 bio_offset)
>>  {
>> -	struct inode *inode = private_data;
>> +	struct btrfs_eb_info *eb_info = private_data;
>> +	struct btrfs_root *root = eb_info->fs_info->tree_root;
>>  	int ret;
>>
>>  	/*
>>  	 * when we're called for a write, we're already in the async
>>  	 * submission context.  Just jump into btrfs_map_bio
>>  	 */
>> -	ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1);
>> +	ret = btrfs_map_bio(root, bio, mirror_num, 1);
>>  	if (ret) {
>>  		bio->bi_error = ret;
>>  		bio_endio(bio);
>> @@ -930,7 +873,7 @@ static int __btree_submit_bio_done(void *private_data, struct bio *bio,
>>  	return ret;
>>  }
>>
>> -static int check_async_write(struct inode *inode, unsigned long bio_flags)
>> +static int check_async_write(unsigned long bio_flags)
>>  {
>>  	if (bio_flags & EXTENT_BIO_TREE_LOG)
>>  		return 0;
>> @@ -945,8 +888,9 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
>>  				 int mirror_num, unsigned long bio_flags,
>>  				 u64 bio_offset)
>>  {
>> -	struct inode *inode = private_data;
>> -	int async = check_async_write(inode, bio_flags);
>> +	struct btrfs_eb_info *eb_info = private_data;
>> +	struct btrfs_root *root = eb_info->fs_info->tree_root;
>> +	int async = check_async_write(bio_flags);
>>  	int ret;
>>
>>  	if (bio_op(bio) != REQ_OP_WRITE) {
>> @@ -954,23 +898,22 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
>>  		 * called for a read, do the setup so that checksum validation
>>  		 * can happen in the async kernel threads
>>  		 */
>> -		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
>> -					  bio, BTRFS_WQ_ENDIO_METADATA);
>> +		ret = btrfs_bio_wq_end_io(eb_info->fs_info, bio,
>> +					  BTRFS_WQ_ENDIO_METADATA);
>>  		if (ret)
>>  			goto out_w_error;
>> -		ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
>> +		ret = btrfs_map_bio(root, bio, mirror_num, 0);
>>  	} else if (!async) {
>> -		ret = btree_csum_one_bio(bio);
>> +		ret = btree_csum_one_bio(eb_info->fs_info, bio);
>>  		if (ret)
>>  			goto out_w_error;
>> -		ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
>> +		ret = btrfs_map_bio(root, bio, mirror_num, 0);
>>  	} else {
>>  		/*
>>  		 * kthread helpers are used to submit writes so that
>>  		 * checksumming can happen in parallel across all CPUs
>>  		 */
>> -		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
>> -					  bio, mirror_num, 0,
>> +		ret = btrfs_wq_submit_bio(eb_info->fs_info, bio, mirror_num, 0,
>>  					  bio_offset, private_data,
>>  					  __btree_submit_bio_start,
>>  					  __btree_submit_bio_done);
>> @@ -986,118 +929,14 @@ out_w_error:
>>  	return ret;
>>  }
>>
>> -#ifdef CONFIG_MIGRATION
>> -static int btree_migratepage(struct address_space *mapping,
>> -			struct page *newpage, struct page *page,
>> -			enum migrate_mode mode)
>> -{
>> -	/*
>> -	 * we can't safely write a btree page from here,
>> -	 * we haven't done the locking hook
>> -	 */
>> -	if (PageDirty(page))
>> -		return -EAGAIN;
>> -	/*
>> -	 * Buffers may be managed in a filesystem specific way.
>> -	 * We must have no buffers or drop them.
>> -	 */
>> -	if (page_has_private(page) &&
>> -	    !try_to_release_page(page, GFP_KERNEL))
>> -		return -EAGAIN;
>> -	return migrate_page(mapping, newpage, page, mode);
>> -}
>> -#endif
>> -
>> -
>> -static int btree_writepages(struct address_space *mapping,
>> -			    struct writeback_control *wbc)
>> -{
>> -	struct btrfs_fs_info *fs_info;
>> -	int ret;
>> -
>> -	if (wbc->sync_mode == WB_SYNC_NONE) {
>> -
>> -		if (wbc->for_kupdate)
>> -			return 0;
>> -
>> -		fs_info = BTRFS_I(mapping->host)->root->fs_info;
>> -		/* this is a bit racy, but that's ok */
>> -		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
>> -					     BTRFS_DIRTY_METADATA_THRESH);
>> -		if (ret < 0)
>> -			return 0;
>> -	}
>> -	return btree_write_cache_pages(mapping, wbc);
>> -}
>> -
>> -static int btree_readpage(struct file *file, struct page *page)
>> -{
>> -	struct extent_io_tree *tree;
>> -	tree = &BTRFS_I(page->mapping->host)->io_tree;
>> -	return extent_read_full_page(tree, page, btree_get_extent, 0);
>> -}
>> -
>> -static int btree_releasepage(struct page *page, gfp_t gfp_flags)
>> -{
>> -	if (PageWriteback(page) || PageDirty(page))
>> -		return 0;
>> -
>> -	return try_release_extent_buffer(page);
>> -}
>> -
>> -static void btree_invalidatepage(struct page *page, unsigned int offset,
>> -				 unsigned int length)
>> -{
>> -	struct extent_io_tree *tree;
>> -	tree = &BTRFS_I(page->mapping->host)->io_tree;
>> -	extent_invalidatepage(tree, page, offset);
>> -	btree_releasepage(page, GFP_NOFS);
>> -	if (PagePrivate(page)) {
>> -		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
>> -			   "page private not zero on page %llu",
>> -			   (unsigned long long)page_offset(page));
>> -		ClearPagePrivate(page);
>> -		set_page_private(page, 0);
>> -		put_page(page);
>> -	}
>> -}
>> -
>> -static int btree_set_page_dirty(struct page *page)
>> -{
>> -#ifdef DEBUG
>> -	struct extent_buffer *eb;
>> -
>> -	BUG_ON(!PagePrivate(page));
>> -	eb = (struct extent_buffer *)page->private;
>> -	BUG_ON(!eb);
>> -	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
>> -	BUG_ON(!atomic_read(&eb->refs));
>> -	btrfs_assert_tree_locked(eb);
>> -#endif
>> -	return __set_page_dirty_nobuffers(page);
>> -}
>> -
>> -static const struct address_space_operations btree_aops = {
>> -	.readpage	= btree_readpage,
>> -	.writepages	= btree_writepages,
>> -	.releasepage	= btree_releasepage,
>> -	.invalidatepage = btree_invalidatepage,
>> -#ifdef CONFIG_MIGRATION
>> -	.migratepage	= btree_migratepage,
>> -#endif
>> -	.set_page_dirty = btree_set_page_dirty,
>> -};
>> -
>>  void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
>>  {
>>  	struct extent_buffer *buf = NULL;
>> -	struct inode *btree_inode = root->fs_info->btree_inode;
>>
>>  	buf = btrfs_find_create_tree_block(root, bytenr);
>>  	if (IS_ERR(buf))
>>  		return;
>> -	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
>> -				 buf, WAIT_NONE, btree_get_extent, 0);
>> +	read_extent_buffer_pages(buf, WAIT_NONE, 0);
>>  	free_extent_buffer(buf);
>>  }
>>
>> @@ -1105,8 +944,6 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
>>  			 int mirror_num, struct extent_buffer **eb)
>>  {
>>  	struct extent_buffer *buf = NULL;
>> -	struct inode *btree_inode = root->fs_info->btree_inode;
>> -	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
>>  	int ret;
>>
>>  	buf = btrfs_find_create_tree_block(root, bytenr);
>> @@ -1115,8 +952,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
>>
>>  	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
>>
>> -	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
>> -				       btree_get_extent, mirror_num);
>> +	ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
>>  	if (ret) {
>>  		free_extent_buffer(buf);
>>  		return ret;
>> @@ -1136,29 +972,29 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
>>  struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
>>  					    u64 bytenr)
>>  {
>> -	return find_extent_buffer(fs_info, bytenr);
>> +	return find_extent_buffer(fs_info->eb_info, bytenr);
>>  }
>>
>>  struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
>>  						 u64 bytenr)
>>  {
>>  	if (btrfs_is_testing(root->fs_info))
>> -		return alloc_test_extent_buffer(root->fs_info, bytenr,
>> -				root->nodesize);
>> +		return alloc_test_extent_buffer(root->fs_info->eb_info, bytenr,
>> +						root->nodesize);
>>  	return alloc_extent_buffer(root->fs_info, bytenr);
>>  }
>>
>>
>>  int btrfs_write_tree_block(struct extent_buffer *buf)
>>  {
>> -	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
>> -					buf->start + buf->len - 1);
>> +	return btree_write_range(buf->eb_info->fs_info, buf->start,
>> +				 buf->start + buf->len - 1);
>>  }
>>
>>  int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
>>  {
>> -	return filemap_fdatawait_range(buf->pages[0]->mapping,
>> -				       buf->start, buf->start + buf->len - 1);
>> +	return btree_wait_range(buf->eb_info->fs_info, buf->start,
>> +				buf->start + buf->len - 1);
>>  }
>>
>>  struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
>> @@ -1188,14 +1024,10 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
>>  	    fs_info->running_transaction->transid) {
>>  		btrfs_assert_tree_locked(buf);
>>
>> -		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
>> +		if (clear_extent_buffer_dirty(buf))
>>  			__percpu_counter_add(&fs_info->dirty_metadata_bytes,
>>  					     -buf->len,
>>  					     fs_info->dirty_metadata_batch);
>> -			/* ugh, clear_extent_buffer_dirty needs to lock the page */
>> -			btrfs_set_lock_blocking(buf);
>> -			clear_extent_buffer_dirty(buf);
>> -		}
>>  	}
>>  }
>>
>> @@ -2247,33 +2079,20 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
>>  	init_waitqueue_head(&fs_info->balance_wait_q);
>>  }
>>
>> -static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
>> -				   struct btrfs_root *tree_root)
>> +int btrfs_init_eb_info(struct btrfs_fs_info *fs_info)
>>  {
>> -	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
>> -	set_nlink(fs_info->btree_inode, 1);
>> -	/*
>> -	 * we set the i_size on the btree inode to the max possible int.
>> -	 * the real end of the address space is determined by all of
>> -	 * the devices in the system
>> -	 */
>> -	fs_info->btree_inode->i_size = OFFSET_MAX;
>> -	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
>> -
>> -	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
>> -	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
>> -			    fs_info->btree_inode);
>> -	BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
>> -	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
>> -
>> -	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
>> -
>> -	BTRFS_I(fs_info->btree_inode)->root = tree_root;
>> -	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
>> -	       sizeof(struct btrfs_key));
>> -	set_bit(BTRFS_INODE_DUMMY,
>> -		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
>> -	btrfs_insert_inode_hash(fs_info->btree_inode);
>> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
>> +
>> +	eb_info->fs_info = fs_info;
>> +	extent_io_tree_init(&eb_info->io_tree, eb_info);
>> +	eb_info->io_tree.track_uptodate = 0;
>> +	eb_info->io_tree.ops = &btree_extent_io_ops;
>> +	extent_io_tree_init(&eb_info->io_failure_tree, eb_info);
>> +	INIT_RADIX_TREE(&eb_info->buffer_radix, GFP_ATOMIC);
>> +	spin_lock_init(&eb_info->buffer_lock);
>> +	if (list_lru_init(&eb_info->lru_list))
>> +		return -ENOMEM;
>> +	return 0;
>>  }
>>
>>  static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
>> @@ -2566,16 +2385,7 @@ int open_ctree(struct super_block *sb,
>>  		goto fail_delalloc_bytes;
>>  	}
>>
>> -	fs_info->btree_inode = new_inode(sb);
>> -	if (!fs_info->btree_inode) {
>> -		err = -ENOMEM;
>> -		goto fail_bio_counter;
>> -	}
>> -
>> -	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
>> -
>>  	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
>> -	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
>>  	INIT_LIST_HEAD(&fs_info->trans_list);
>>  	INIT_LIST_HEAD(&fs_info->dead_roots);
>>  	INIT_LIST_HEAD(&fs_info->delayed_iputs);
>> @@ -2590,7 +2400,6 @@ int open_ctree(struct super_block *sb,
>>  	spin_lock_init(&fs_info->tree_mod_seq_lock);
>>  	spin_lock_init(&fs_info->super_lock);
>>  	spin_lock_init(&fs_info->qgroup_op_lock);
>> -	spin_lock_init(&fs_info->buffer_lock);
>>  	spin_lock_init(&fs_info->unused_bgs_lock);
>>  	rwlock_init(&fs_info->tree_mod_log_lock);
>>  	mutex_init(&fs_info->unused_bg_unpin_mutex);
>> @@ -2643,7 +2452,7 @@ int open_ctree(struct super_block *sb,
>>  					GFP_KERNEL);
>>  	if (!fs_info->delayed_root) {
>>  		err = -ENOMEM;
>> -		goto fail_iput;
>> +		goto fail_alloc;
>>  	}
>>  	btrfs_init_delayed_root(fs_info->delayed_root);
>>
>> @@ -2658,7 +2467,15 @@ int open_ctree(struct super_block *sb,
>>  	sb->s_blocksize_bits = blksize_bits(4096);
>>  	sb->s_bdi = &fs_info->bdi;
>>
>> -	btrfs_init_btree_inode(fs_info, tree_root);
>> +	fs_info->eb_info = kzalloc(sizeof(struct btrfs_eb_info), GFP_KERNEL);
>> +	if (!fs_info->eb_info) {
>> +		err = -ENOMEM;
>> +		goto fail_alloc;
>> +	}
>> +	if (btrfs_init_eb_info(fs_info)) {
>> +		err = -ENOMEM;
>> +		goto fail_alloc;
>> +	}
>>
>>  	spin_lock_init(&fs_info->block_group_cache_lock);
>>  	fs_info->block_group_cache_tree = RB_ROOT;
>> @@ -3085,6 +2902,14 @@ retry_root_backup:
>>  	if (sb->s_flags & MS_RDONLY)
>>  		return 0;
>>
>> +	/*
>> +	 * We need to make sure we are on the bdi's dirty list so we get
>> +	 * writeback requests for our fs properly.
>> +	 */
>> +	spin_lock(&fs_info->bdi.sb_list_lock);
>> +	list_add_tail(&fs_info->bdi.dirty_sb_list, &sb->s_bdi_list);
>> +	spin_unlock(&fs_info->bdi.sb_list_lock);
>> +
>>  	if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) &&
>>  	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
>>  		btrfs_info(fs_info, "creating free space tree");
>> @@ -3180,7 +3005,8 @@ fail_cleaner:
>>  	 * make sure we're done with the btree inode before we stop our
>>  	 * kthreads
>>  	 */
>> -	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
>> +	btree_write_range(fs_info, 0, (u64)-1);
>> +	btree_wait_range(fs_info, 0, (u64)-1);
>>
>>  fail_sysfs:
>>  	btrfs_sysfs_remove_mounted(fs_info);
>> @@ -3194,16 +3020,11 @@ fail_block_groups:
>>
>>  fail_tree_roots:
>>  	free_root_pointers(fs_info, 1);
>> -	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
>> -
>> +	btrfs_invalidate_eb_info(fs_info->eb_info);
>>  fail_sb_buffer:
>>  	btrfs_stop_all_workers(fs_info);
>>  fail_alloc:
>> -fail_iput:
>>  	btrfs_mapping_tree_free(&fs_info->mapping_tree);
>> -
>> -	iput(fs_info->btree_inode);
>> -fail_bio_counter:
>>  	percpu_counter_destroy(&fs_info->bio_counter);
>>  fail_delalloc_bytes:
>>  	percpu_counter_destroy(&fs_info->delalloc_bytes);
>> @@ -3908,14 +3729,11 @@ void close_ctree(struct btrfs_root *root)
>>  	 * we must make sure there are no read requests to
>>  	 * submit after we stop all workers.
>>  	 */
>> -	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
>>  	btrfs_stop_all_workers(fs_info);
>>
>>  	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
>>  	free_root_pointers(fs_info, 1);
>>
>> -	iput(fs_info->btree_inode);
>> -
>>  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
>>  	if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY))
>>  		btrfsic_unmount(root, fs_info->fs_devices);
>> @@ -3924,6 +3742,8 @@ void close_ctree(struct btrfs_root *root)
>>  	btrfs_close_devices(fs_info->fs_devices);
>>  	btrfs_mapping_tree_free(&fs_info->mapping_tree);
>>
>> +	btrfs_invalidate_eb_info(fs_info->eb_info);
>> +
>>  	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
>>  	percpu_counter_destroy(&fs_info->delalloc_bytes);
>>  	percpu_counter_destroy(&fs_info->bio_counter);
>> @@ -3951,14 +3771,12 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
>>  			  int atomic)
>>  {
>>  	int ret;
>> -	struct inode *btree_inode = buf->pages[0]->mapping->host;
>>
>>  	ret = extent_buffer_uptodate(buf);
>>  	if (!ret)
>>  		return ret;
>>
>> -	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
>> -				    parent_transid, atomic);
>> +	ret = verify_parent_transid(buf, parent_transid, atomic);
>>  	if (ret == -EAGAIN)
>>  		return ret;
>>  	return !ret;
>> @@ -3979,7 +3797,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
>>  	if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
>>  		return;
>>  #endif
>> -	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
>> +	root = buf->eb_info->fs_info->tree_root;
>>  	btrfs_assert_tree_locked(buf);
>>  	if (transid != root->fs_info->generation)
>>  		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
>> @@ -4015,10 +3833,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
>>
>>  	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
>>  				     BTRFS_DIRTY_METADATA_THRESH);
>> -	if (ret > 0) {
>> +	if (ret > 0)
>>  		balance_dirty_pages_ratelimited(&root->fs_info->bdi,
>>  						root->fs_info->sb);
>> -	}
>>  }
>>
>>  void btrfs_btree_balance_dirty(struct btrfs_root *root)
>> @@ -4033,7 +3850,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
>>
>>  int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
>>  {
>> -	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
>> +	struct btrfs_root *root = buf->eb_info->fs_info->tree_root;
>>  	return btree_read_extent_buffer_pages(root, buf, parent_transid);
>>  }
>>
>> @@ -4376,10 +4193,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
>>  			if (!eb)
>>  				continue;
>>  			wait_on_extent_buffer_writeback(eb);
>> -
>> -			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
>> -					       &eb->bflags))
>> -				clear_extent_buffer_dirty(eb);
>> +			clear_extent_buffer_dirty(eb);
>>  			free_extent_buffer_stale(eb);
>>  		}
>>  	}
>> @@ -4504,16 +4318,37 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
>>
>>  static struct btrfs_fs_info *btree_fs_info(void *private_data)
>>  {
>> -	struct inode *inode = private_data;
>> -	return btrfs_sb(inode->i_sb);
>> +	struct btrfs_eb_info *eb_info = private_data;
>> +	return eb_info->fs_info;
>> +}
>> +
>> +static int btree_merge_bio_hook(struct page *page, unsigned long offset,
>> +				size_t size, struct bio *bio,
>> +				unsigned long bio_flags)
>> +{
>> +	struct extent_buffer *eb = (struct extent_buffer *)page->private;
>> +	struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
>> +	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
>> +	u64 length = 0;
>> +	u64 map_length;
>> +	int ret;
>> +
>> +	length = bio->bi_iter.bi_size;
>> +	map_length = length;
>> +	ret = btrfs_map_block(fs_info, bio_op(bio), logical, &map_length,
>> +			      NULL, 0);
>> +	if (ret < 0)
>> +		return ret;
>> +	if (map_length < length + size)
>> +		return 1;
>> +	return 0;
>>  }
>>
>>  static const struct extent_io_ops btree_extent_io_ops = {
>>  	.readpage_end_io_hook = btree_readpage_end_io_hook,
>>  	.readpage_io_failed_hook = btree_io_failed_hook,
>>  	.submit_bio_hook = btree_submit_bio_hook,
>> -	/* note we're sharing with inode.c for the merge bio hook */
>> -	.merge_bio_hook = btrfs_merge_bio_hook,
>> +	.merge_bio_hook = btree_merge_bio_hook,
>>  	.tree_fs_info = btree_fs_info,
>>  	.set_range_writeback = btrfs_set_range_writeback,
>>  };
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index 5dcdd3e..5c18a49 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -75,8 +75,8 @@ void btrfs_leak_debug_check(void)
>>  	while (!list_empty(&buffers)) {
>>  		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
>>  		printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
>> -		       "refs %d\n",
>> -		       eb->start, eb->len, atomic_read(&eb->refs));
>> +		       "bflags %lu refs %d\n",
>> +		       eb->start, eb->len, eb->bflags, atomic_read(&eb->refs));
>>  		list_del(&eb->leak_list);
>>  		kmem_cache_free(extent_buffer_cache, eb);
>>  	}
>> @@ -3538,7 +3538,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
>>  			  struct btrfs_fs_info *fs_info,
>>  			  struct extent_page_data *epd)
>>  {
>> -	unsigned long i, num_pages;
>> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
>>  	int flush = 0;
>>  	int ret = 0;
>>
>> @@ -3585,37 +3585,42 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
>>
>>  	btrfs_tree_unlock(eb);
>>
>> -	if (!ret)
>> -		return ret;
>> -
>> -	num_pages = num_extent_pages(eb->start, eb->len);
>> -	for (i = 0; i < num_pages; i++) {
>> -		struct page *p = eb->pages[i];
>> -
>> -		if (!trylock_page(p)) {
>> -			if (!flush) {
>> -				flush_write_bio(epd);
>> -				flush = 1;
>> -			}
>> -			lock_page(p);
>> -		}
>> +	/*
>> +	 * We cleared dirty on this buffer, so we need to adjust the radix tags.
>> +	 * We do the actual page accounting in write_one_eb.
>> +	 */
>> +	if (ret) {
>> +		spin_lock_irq(&eb_info->buffer_lock);
>> +		radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
>> +				   PAGECACHE_TAG_WRITEBACK);
>> +		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
>> +				     PAGECACHE_TAG_DIRTY);
>> +		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
>> +				     PAGECACHE_TAG_TOWRITE);
>> +		spin_unlock_irq(&eb_info->buffer_lock);
>>  	}
>> -
>>  	return ret;
>>  }
>>
>>  static void end_extent_buffer_writeback(struct extent_buffer *eb)
>>  {
>> -	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
>> -	smp_mb__after_atomic();
>> -	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
>> +	if (test_and_clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
>> +		struct btrfs_eb_info *eb_info = eb->eb_info;
>> +		unsigned long flags;
>> +
>> +		spin_lock_irqsave(&eb_info->buffer_lock, flags);
>> +		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
>> +				     PAGECACHE_TAG_WRITEBACK);
>> +		spin_unlock_irqrestore(&eb_info->buffer_lock, flags);
>> +		wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
>> +	}
>>  }
>>
>>  static void set_btree_ioerr(struct page *page)
>>  {
>>  	struct extent_buffer *eb = (struct extent_buffer *)page->private;
>> +	struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
>>
>> -	SetPageError(page);
>>  	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
>>  		return;
>>
>> @@ -3624,8 +3629,7 @@ static void set_btree_ioerr(struct page *page)
>>  	 * failed, increment the counter transaction->eb_write_errors.
>>  	 * We do this because while the transaction is running and before it's
>>  	 * committing (when we call filemap_fdata[write|wait]_range against
>> -	 * the btree inode), we might have
>> -	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
>> +	 * the btree inode), we might have write_metadata() called - if it
>>  	 * returns an error or an error happens during writeback, when we're
>>  	 * committing the transaction we wouldn't know about it, since the pages
>>  	 * can be no longer dirty nor marked anymore for writeback (if a
>> @@ -3659,13 +3663,13 @@ static void set_btree_ioerr(struct page *page)
>>  	 */
>>  	switch (eb->log_index) {
>>  	case -1:
>> -		set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
>> +		set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
>>  		break;
>>  	case 0:
>> -		set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
>> +		set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
>>  		break;
>>  	case 1:
>> -		set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
>> +		set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
>>  		break;
>>  	default:
>>  		BUG(); /* unexpected, logic error */
>> @@ -3686,16 +3690,13 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
>>  		done = atomic_dec_and_test(&eb->io_pages);
>>
>>  		if (bio->bi_error ||
>> -		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
>> -			ClearPageUptodate(page);
>> +		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
>>  			set_btree_ioerr(page);
>> -		}
>> -
>> -		end_page_writeback(page);
>>
>> +		account_metadata_end_writeback(page,
>> +					       &eb->eb_info->fs_info->bdi);
>>  		if (!done)
>>  			continue;
>> -
>>  		end_extent_buffer_writeback(eb);
>>  	}
>>
>> @@ -3708,7 +3709,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>>  			struct extent_page_data *epd)
>>  {
>>  	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
>> -	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
>> +	struct extent_io_tree *tree = &fs_info->eb_info->io_tree;
>>  	u64 offset = eb->start;
>>  	unsigned long i, num_pages;
>>  	unsigned long bio_flags = 0;
>> @@ -3724,8 +3725,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>>  	for (i = 0; i < num_pages; i++) {
>>  		struct page *p = eb->pages[i];
>>
>> -		clear_page_dirty_for_io(p);
>> -		set_page_writeback(p);
>>  		ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
>>  					 p, offset >> 9, PAGE_SIZE, 0, bdev,
>>  					 &epd->bio, -1,
>> @@ -3734,34 +3733,90 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>>  		epd->bio_flags = bio_flags;
>>  		if (ret) {
>>  			set_btree_ioerr(p);
>> -			end_page_writeback(p);
>>  			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
>>  				end_extent_buffer_writeback(eb);
>>  			ret = -EIO;
>>  			break;
>>  		}
>> +		account_metadata_writeback(p, &fs_info->bdi);
>>  		offset += PAGE_SIZE;
>>  		update_nr_written(p, wbc, 1);
>> -		unlock_page(p);
>>  	}
>>
>> -	if (unlikely(ret)) {
>> -		for (; i < num_pages; i++) {
>> -			struct page *p = eb->pages[i];
>> -			clear_page_dirty_for_io(p);
>> -			unlock_page(p);
>> +	return ret;
>> +}
>> +
>> +#define EB_TAG_BATCH 4096
>> +static void tag_ebs_for_writeback(struct btrfs_eb_info *eb_info, pgoff_t start,
>> +				  pgoff_t end)
>> +{
>> +	unsigned long tagged;
>> +
>> +	do {
>> +		spin_lock_irq(&eb_info->buffer_lock);
>> +		tagged = radix_tree_range_tag_if_tagged(&eb_info->buffer_radix,
>> +							&start, end,
>> +							EB_TAG_BATCH,
>> +							PAGECACHE_TAG_DIRTY,
>> +							PAGECACHE_TAG_TOWRITE);
>> +		spin_unlock_irq(&eb_info->buffer_lock);
>> +		cond_resched();
>> +	} while (tagged >= EB_TAG_BATCH && start);
>> +}
>> +
>> +static unsigned eb_lookup_tag(struct btrfs_eb_info *eb_info,
>> +			      struct extent_buffer **ebs, pgoff_t *index,
>> +			      int tag, unsigned nr)
>> +{
>> +	struct radix_tree_iter iter;
>> +	void **slot;
>> +	unsigned ret = 0;
>> +
>> +	if (unlikely(!nr))
>> +		return 0;
>> +
>> +	rcu_read_lock();
>> +	radix_tree_for_each_tagged(slot, &eb_info->buffer_radix, &iter, *index,
>> +				   tag) {
>> +		struct extent_buffer *eb;
>> +repeat:
>> +		eb = radix_tree_deref_slot(slot);
>> +		if (unlikely(!eb))
>> +			continue;
>> +
>> +		if (radix_tree_exception(eb)) {
>> +			if (radix_tree_deref_retry(eb)) {
>> +				slot = radix_tree_iter_retry(&iter);
>> +				continue;
>> +			}
>> +			continue;
>>  		}
>> -	}
>>
>> +		if (unlikely(!atomic_inc_not_zero(&eb->refs)))
>> +			continue;
>> +
>> +		if (unlikely(eb != *slot)) {
>> +			free_extent_buffer(eb);
>> +			goto repeat;
>> +		}
>> +
>> +		ebs[ret] = eb;
>> +		if (++ret == nr)
>> +			break;
>> +	}
>> +	rcu_read_unlock();
>> +	if (ret)
>> +		*index = (ebs[ret - 1]->start >> PAGE_SHIFT) + 1;
>>  	return ret;
>>  }
>>
>> -int btree_write_cache_pages(struct address_space *mapping,
>> +#define EBVEC_SIZE 16
>> +static int btree_write_cache_pages(struct btrfs_fs_info *fs_info,
>>  				   struct writeback_control *wbc)
>>  {
>> -	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
>> -	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
>> -	struct extent_buffer *eb, *prev_eb = NULL;
>> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
>> +	struct extent_io_tree *tree = &eb_info->io_tree;
>> +	struct extent_buffer *eb;
>>  	struct extent_page_data epd = {
>>  		.bio = NULL,
>>  		.tree = tree,
>> @@ -3772,16 +3827,16 @@ int btree_write_cache_pages(struct address_space *mapping,
>>  	int ret = 0;
>>  	int done = 0;
>>  	int nr_to_write_done = 0;
>> -	struct pagevec pvec;
>> -	int nr_pages;
>> +	struct extent_buffer *ebs[EBVEC_SIZE];
>> +	int nr_ebs;
>>  	pgoff_t index;
>>  	pgoff_t end;		/* Inclusive */
>> +	pgoff_t done_index = 0;
>>  	int scanned = 0;
>>  	int tag;
>>
>> -	pagevec_init(&pvec, 0);
>>  	if (wbc->range_cyclic) {
>> -		index = mapping->writeback_index; /* Start from prev offset */
>> +		index = eb_info->writeback_index; /* Start from prev offset */
>>  		end = -1;
>>  	} else {
>>  		index = wbc->range_start >> PAGE_SHIFT;
>> @@ -3794,53 +3849,27 @@ int btree_write_cache_pages(struct address_space *mapping,
>>  		tag = PAGECACHE_TAG_DIRTY;
>>  retry:
>>  	if (wbc->sync_mode == WB_SYNC_ALL)
>> -		tag_pages_for_writeback(mapping, index, end);
>> +		tag_ebs_for_writeback(fs_info->eb_info, index, end);
>>  	while (!done && !nr_to_write_done && (index <= end) &&
>> -	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
>> -			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
>> +	       (nr_ebs = eb_lookup_tag(eb_info, ebs, &index, tag,
>> +			min(end - index, (pgoff_t)EBVEC_SIZE-1) + 1))) {
>>  		unsigned i;
>>
>>  		scanned = 1;
>> -		for (i = 0; i < nr_pages; i++) {
>> -			struct page *page = pvec.pages[i];
>> -
>> -			if (!PagePrivate(page))
>> -				continue;
>> -
>> -			if (!wbc->range_cyclic && page->index > end) {
>> -				done = 1;
>> -				break;
>> -			}
>> -
>> -			spin_lock(&mapping->private_lock);
>> -			if (!PagePrivate(page)) {
>> -				spin_unlock(&mapping->private_lock);
>> -				continue;
>> -			}
>> -
>> -			eb = (struct extent_buffer *)page->private;
>> -
>> -			/*
>> -			 * Shouldn't happen and normally this would be a BUG_ON
>> -			 * but no sense in crashing the users box for something
>> -			 * we can survive anyway.
>> -			 */
>> -			if (WARN_ON(!eb)) {
>> -				spin_unlock(&mapping->private_lock);
>> +		for (i = 0; i < nr_ebs; i++) {
>> +			eb = ebs[i];
>> +			if (done) {
>> +				free_extent_buffer(eb);
>>  				continue;
>>  			}
>>
>> -			if (eb == prev_eb) {
>> -				spin_unlock(&mapping->private_lock);
>> +			if (!wbc->range_cyclic && eb->start > wbc->range_end) {
>> +				done = 1;
>> +				free_extent_buffer(eb);
>>  				continue;
>>  			}
>>
>> -			ret = atomic_inc_not_zero(&eb->refs);
>> -			spin_unlock(&mapping->private_lock);
>> -			if (!ret)
>> -				continue;
>> -
>> -			prev_eb = eb;
>> +			done_index = eb_index(eb);
>>  			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
>>  			if (!ret) {
>>  				free_extent_buffer(eb);
>> @@ -3848,12 +3877,11 @@ retry:
>>  			}
>>
>>  			ret = write_one_eb(eb, fs_info, wbc, &epd);
>> +			free_extent_buffer(eb);
>>  			if (ret) {
>>  				done = 1;
>> -				free_extent_buffer(eb);
>> -				break;
>> +				continue;
>>  			}
>> -			free_extent_buffer(eb);
>>
>>  			/*
>>  			 * the filesystem may choose to bump up nr_to_write.
>> @@ -3862,7 +3890,6 @@ retry:
>>  			 */
>>  			nr_to_write_done = wbc->nr_to_write <= 0;
>>  		}
>> -		pagevec_release(&pvec);
>>  		cond_resched();
>>  	}
>>  	if (!scanned && !done) {
>> @@ -3874,10 +3901,77 @@ retry:
>>  		index = 0;
>>  		goto retry;
>>  	}
>> +	if (wbc->range_cyclic)
>> +		fs_info->eb_info->writeback_index = done_index;
>>  	flush_write_bio(&epd);
>>  	return ret;
>>  }
>>
>> +void btrfs_write_ebs(struct super_block *sb, struct writeback_control *wbc)
>> +{
>> +	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>> +	btree_write_cache_pages(fs_info, wbc);
>> +}
>> +
>> +static int __btree_write_range(struct btrfs_fs_info *fs_info, u64 start,
>> +			       u64 end, int sync_mode)
>> +{
>> +	struct writeback_control wbc = {
>> +		.sync_mode = sync_mode,
>> +		.nr_to_write = LONG_MAX,
>> +		.range_start = start,
>> +		.range_end = end,
>> +	};
>> +
>> +	return btree_write_cache_pages(fs_info, &wbc);
>> +}
>> +
>> +void btree_flush(struct btrfs_fs_info *fs_info)
>> +{
>> +	__btree_write_range(fs_info, 0, (u64)-1, WB_SYNC_NONE);
>> +}
>> +
>> +int btree_write_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
>> +{
>> +	return __btree_write_range(fs_info, start, end, WB_SYNC_ALL);
>> +}
>> +
>> +int btree_wait_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
>> +{
>> +	struct extent_buffer *ebs[EBVEC_SIZE];
>> +	pgoff_t index = start >> PAGE_SHIFT;
>> +	pgoff_t end_index = end >> PAGE_SHIFT;
>> +	unsigned nr_ebs;
>> +	int ret = 0;
>> +
>> +	if (end < start)
>> +		return ret;
>> +
>> +	while ((index <= end) &&
>> +	       (nr_ebs = eb_lookup_tag(fs_info->eb_info, ebs, &index,
>> +				       PAGECACHE_TAG_WRITEBACK,
>> +				       min(end_index - index,
>> +					   (pgoff_t)EBVEC_SIZE-1) + 1)) != 0) {
>> +		unsigned i;
>> +
>> +		for (i = 0; i < nr_ebs; i++) {
>> +			struct extent_buffer *eb = ebs[i];
>> +
>> +			if (eb->start > end) {
>> +				free_extent_buffer(eb);
>> +				continue;
>> +			}
>> +
>> +			wait_on_extent_buffer_writeback(eb);
>> +			if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
>> +				ret = -EIO;
>> +			free_extent_buffer(eb);
>> +		}
>> +		cond_resched();
>> +	}
>> +	return ret;
>> +}
>> +
>>  /**
>>   * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
>>   * @mapping: address space structure to write
>> @@ -4558,7 +4652,6 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
>>  {
>>  	unsigned long index;
>>  	struct page *page;
>> -	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
>>
>>  	BUG_ON(extent_buffer_under_io(eb));
>>
>> @@ -4566,39 +4659,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
>>  	if (index == 0)
>>  		return;
>>
>> +	ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
>>  	do {
>>  		index--;
>>  		page = eb->pages[index];
>>  		if (!page)
>>  			continue;
>> -		if (mapped)
>> -			spin_lock(&page->mapping->private_lock);
>> -		/*
>> -		 * We do this since we'll remove the pages after we've
>> -		 * removed the eb from the radix tree, so we could race
>> -		 * and have this page now attached to the new eb.  So
>> -		 * only clear page_private if it's still connected to
>> -		 * this eb.
>> -		 */
>> -		if (PagePrivate(page) &&
>> -		    page->private == (unsigned long)eb) {
>> -			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
>> -			BUG_ON(PageDirty(page));
>> -			BUG_ON(PageWriteback(page));
>> -			/*
>> -			 * We need to make sure we haven't be attached
>> -			 * to a new eb.
>> -			 */
>> -			ClearPagePrivate(page);
>> -			set_page_private(page, 0);
>> -			/* One for the page private */
>> -			put_page(page);
>> -		}
>> +		ASSERT(PagePrivate(page));
>> +		ASSERT(page->private == (unsigned long)eb);
>> +		ClearPagePrivate(page);
>> +		set_page_private(page, 0);
>>
>> -		if (mapped)
>> -			spin_unlock(&page->mapping->private_lock);
>> +		/* Once for the page private. */
>> +		put_page(page);
>>
>> -		/* One for when we allocated the page */
>> +		/* Once for the alloc_page. */
>>  		put_page(page);
>>  	} while (index != 0);
>>  }
>> @@ -4613,7 +4688,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
>>  }
>>
>>  static struct extent_buffer *
>> -__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
>> +__alloc_extent_buffer(struct btrfs_eb_info *eb_info, u64 start,
>>  		      unsigned long len)
>>  {
>>  	struct extent_buffer *eb = NULL;
>> @@ -4621,7 +4696,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
>>  	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
>>  	eb->start = start;
>>  	eb->len = len;
>> -	eb->fs_info = fs_info;
>> +	eb->eb_info = eb_info;
>>  	eb->bflags = 0;
>>  	rwlock_init(&eb->lock);
>>  	atomic_set(&eb->write_locks, 0);
>> @@ -4633,6 +4708,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
>>  	eb->lock_nested = 0;
>>  	init_waitqueue_head(&eb->write_lock_wq);
>>  	init_waitqueue_head(&eb->read_lock_wq);
>> +	INIT_LIST_HEAD(&eb->lru);
>>
>>  	btrfs_leak_debug_add(&eb->leak_list, &buffers);
>>
>> @@ -4657,7 +4733,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>>  	struct extent_buffer *new;
>>  	unsigned long num_pages = num_extent_pages(src->start, src->len);
>>
>> -	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
>> +	new = __alloc_extent_buffer(src->eb_info, src->start, src->len);
>>  	if (new == NULL)
>>  		return NULL;
>>
>> @@ -4668,8 +4744,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>>  			return NULL;
>>  		}
>>  		attach_extent_buffer_page(new, p);
>> -		WARN_ON(PageDirty(p));
>> -		SetPageUptodate(p);
>>  		new->pages[i] = p;
>>  	}
>>
>> @@ -4680,8 +4754,8 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>>  	return new;
>>  }
>>
>> -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>> -						  u64 start, unsigned long len)
>> +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_eb_info *eb_info,
>> +						u64 start, unsigned long len)
>>  {
>>  	struct extent_buffer *eb;
>>  	unsigned long num_pages;
>> @@ -4689,7 +4763,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>>
>>  	num_pages = num_extent_pages(start, len);
>>
>> -	eb = __alloc_extent_buffer(fs_info, start, len);
>> +	eb = __alloc_extent_buffer(eb_info, start, len);
>>  	if (!eb)
>>  		return NULL;
>>
>> @@ -4697,6 +4771,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>>  		eb->pages[i] = alloc_page(GFP_NOFS);
>>  		if (!eb->pages[i])
>>  			goto err;
>> +		attach_extent_buffer_page(eb, eb->pages[i]);
>>  	}
>>  	set_extent_buffer_uptodate(eb);
>>  	btrfs_set_header_nritems(eb, 0);
>> @@ -4704,30 +4779,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>>
>>  	return eb;
>>  err:
>> -	for (; i > 0; i--)
>> -		__free_page(eb->pages[i - 1]);
>> -	__free_extent_buffer(eb);
>> +	btrfs_release_extent_buffer(eb);
>>  	return NULL;
>>  }
>>
>> -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>> -						u64 start, u32 nodesize)
>> -{
>> -	unsigned long len;
>> -
>> -	if (!fs_info) {
>> -		/*
>> -		 * Called only from tests that don't always have a fs_info
>> -		 * available
>> -		 */
>> -		len = nodesize;
>> -	} else {
>> -		len = fs_info->tree_root->nodesize;
>> -	}
>> -
>> -	return __alloc_dummy_extent_buffer(fs_info, start, len);
>> -}
>> -
>>  static void check_buffer_tree_ref(struct extent_buffer *eb)
>>  {
>>  	int refs;
>> @@ -4777,13 +4832,13 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,
>>  	}
>>  }
>>
>> -struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
>> +struct extent_buffer *find_extent_buffer(struct btrfs_eb_info *eb_info,
>>  					 u64 start)
>>  {
>>  	struct extent_buffer *eb;
>>
>>  	rcu_read_lock();
>> -	eb = radix_tree_lookup(&fs_info->buffer_radix,
>> +	eb = radix_tree_lookup(&eb_info->buffer_radix,
>>  			       start >> PAGE_SHIFT);
>>  	if (eb && atomic_inc_not_zero(&eb->refs)) {
>>  		rcu_read_unlock();
>> @@ -4815,30 +4870,30 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
>>  }
>>
>>  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
>> -struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
>> -					u64 start, u32 nodesize)
>> +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_eb_info *eb_info,
>> +					       u64 start, u32 nodesize)
>>  {
>>  	struct extent_buffer *eb, *exists = NULL;
>>  	int ret;
>>
>> -	eb = find_extent_buffer(fs_info, start);
>> +	eb = find_extent_buffer(eb_info, start);
>>  	if (eb)
>>  		return eb;
>> -	eb = alloc_dummy_extent_buffer(fs_info, start, nodesize);
>> +	eb = alloc_dummy_extent_buffer(eb_info, start, nodesize);
>>  	if (!eb)
>>  		return NULL;
>> -	eb->fs_info = fs_info;
>> +	eb->eb_info = eb_info;
>>  again:
>>  	ret = radix_tree_preload(GFP_NOFS);
>>  	if (ret)
>>  		goto free_eb;
>> -	spin_lock(&fs_info->buffer_lock);
>> -	ret = radix_tree_insert(&fs_info->buffer_radix,
>> +	spin_lock_irq(&eb_info->buffer_lock);
>> +	ret = radix_tree_insert(&eb_info->buffer_radix,
>>  				start >> PAGE_SHIFT, eb);
>> -	spin_unlock(&fs_info->buffer_lock);
>> +	spin_unlock_irq(&eb_info->buffer_lock);
>>  	radix_tree_preload_end();
>>  	if (ret == -EEXIST) {
>> -		exists = find_extent_buffer(fs_info, start);
>> +		exists = find_extent_buffer(eb_info, start);
>>  		if (exists)
>>  			goto free_eb;
>>  		else
>> @@ -4854,6 +4909,7 @@ again:
>>  	 * bump the ref count again.
>>  	 */
>>  	atomic_inc(&eb->refs);
>> +	set_extent_buffer_uptodate(eb);
>>  	return eb;
>>  free_eb:
>>  	btrfs_release_extent_buffer(eb);
>> @@ -4867,12 +4923,12 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>  	unsigned long len = fs_info->tree_root->nodesize;
>>  	unsigned long num_pages = num_extent_pages(start, len);
>>  	unsigned long i;
>> -	unsigned long index = start >> PAGE_SHIFT;
>>  	struct extent_buffer *eb;
>>  	struct extent_buffer *exists = NULL;
>>  	struct page *p;
>> -	struct address_space *mapping = fs_info->btree_inode->i_mapping;
>> -	int uptodate = 1;
>> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
>> +//	struct zone *last_zone = NULL;
>> +//	struct pg_data_t *last_pgdata = NULL;
>>  	int ret;
>>
>>  	if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) {
>> @@ -4880,62 +4936,36 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>  		return ERR_PTR(-EINVAL);
>>  	}
>>
>> -	eb = find_extent_buffer(fs_info, start);
>> +	eb = find_extent_buffer(eb_info, start);
>>  	if (eb)
>>  		return eb;
>>
>> -	eb = __alloc_extent_buffer(fs_info, start, len);
>> +	eb = __alloc_extent_buffer(eb_info, start, len);
>>  	if (!eb)
>>  		return ERR_PTR(-ENOMEM);
>>
>> -	for (i = 0; i < num_pages; i++, index++) {
>> -		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
>> +	for (i = 0; i < num_pages; i++) {
>> +		p = alloc_page(GFP_NOFS|__GFP_NOFAIL);
>>  		if (!p) {
>>  			exists = ERR_PTR(-ENOMEM);
>>  			goto free_eb;
>>  		}
>>
>> -		spin_lock(&mapping->private_lock);
>> -		if (PagePrivate(p)) {
>> -			/*
>> -			 * We could have already allocated an eb for this page
>> -			 * and attached one so lets see if we can get a ref on
>> -			 * the existing eb, and if we can we know it's good and
>> -			 * we can just return that one, else we know we can just
>> -			 * overwrite page->private.
>> -			 */
>> -			exists = (struct extent_buffer *)p->private;
>> -			if (atomic_inc_not_zero(&exists->refs)) {
>> -				spin_unlock(&mapping->private_lock);
>> -				unlock_page(p);
>> -				put_page(p);
>> -				mark_extent_buffer_accessed(exists, p);
>> -				goto free_eb;
>> -			}
>> -			exists = NULL;
>> -
>> -			/*
>> -			 * Do this so attach doesn't complain and we need to
>> -			 * drop the ref the old guy had.
>> -			 */
>> -			ClearPagePrivate(p);
>> -			WARN_ON(PageDirty(p));
>> -			put_page(p);
>> -		}
>> +		/*
>> +		 * If our pages span zones or numa nodes we have to do
>> +		 * dirty/writeback accounting per page, otherwise we can do it
>> +		 * in bulk and save us some looping.
>> +		 *
>> +		if (!last_zone)
>> +			last_zone = page_zone(p);
>> +		if (!last_pgdata)
>> +			last_pgdata = page_pgdata(p);
>> +		if (last_zone != page_zone(p) || last_pgdata != page_pgdata(p))
>> +			set_bit(EXTENT_BUFFER_MIXED_PAGES, &eb->bflags);
>> +		*/
>>  		attach_extent_buffer_page(eb, p);
>> -		spin_unlock(&mapping->private_lock);
>> -		WARN_ON(PageDirty(p));
>>  		eb->pages[i] = p;
>> -		if (!PageUptodate(p))
>> -			uptodate = 0;
>> -
>> -		/*
>> -		 * see below about how we avoid a nasty race with release page
>> -		 * and why we unlock later
>> -		 */
>>  	}
>> -	if (uptodate)
>> -		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
>>  again:
>>  	ret = radix_tree_preload(GFP_NOFS);
>>  	if (ret) {
>> @@ -4943,13 +4973,13 @@ again:
>>  		goto free_eb;
>>  	}
>>
>> -	spin_lock(&fs_info->buffer_lock);
>> -	ret = radix_tree_insert(&fs_info->buffer_radix,
>> +	spin_lock_irq(&eb_info->buffer_lock);
>> +	ret = radix_tree_insert(&eb_info->buffer_radix,
>>  				start >> PAGE_SHIFT, eb);
>> -	spin_unlock(&fs_info->buffer_lock);
>> +	spin_unlock_irq(&eb_info->buffer_lock);
>>  	radix_tree_preload_end();
>>  	if (ret == -EEXIST) {
>> -		exists = find_extent_buffer(fs_info, start);
>> +		exists = find_extent_buffer(eb_info, start);
>>  		if (exists)
>>  			goto free_eb;
>>  		else
>> @@ -4959,31 +4989,10 @@ again:
>>  	check_buffer_tree_ref(eb);
>>  	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
>>
>> -	/*
>> -	 * there is a race where release page may have
>> -	 * tried to find this extent buffer in the radix
>> -	 * but failed.  It will tell the VM it is safe to
>> -	 * reclaim the, and it will clear the page private bit.
>> -	 * We must make sure to set the page private bit properly
>> -	 * after the extent buffer is in the radix tree so
>> -	 * it doesn't get lost
>> -	 */
>> -	SetPageChecked(eb->pages[0]);
>> -	for (i = 1; i < num_pages; i++) {
>> -		p = eb->pages[i];
>> -		ClearPageChecked(p);
>> -		unlock_page(p);
>> -	}
>> -	unlock_page(eb->pages[0]);
>>  	return eb;
>>
>>  free_eb:
>>  	WARN_ON(!atomic_dec_and_test(&eb->refs));
>> -	for (i = 0; i < num_pages; i++) {
>> -		if (eb->pages[i])
>> -			unlock_page(eb->pages[i]);
>> -	}
>> -
>>  	btrfs_release_extent_buffer(eb);
>>  	return exists;
>>  }
>> @@ -4999,17 +5008,19 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
>>  /* Expects to have eb->eb_lock already held */
>>  static int release_extent_buffer(struct extent_buffer *eb)
>>  {
>> +	struct btrfs_eb_info *eb_info = eb->eb_info;
>> +
>>  	WARN_ON(atomic_read(&eb->refs) == 0);
>>  	if (atomic_dec_and_test(&eb->refs)) {
>> +		if (eb_info)
>> +			list_lru_del(&eb_info->lru_list, &eb->lru);
>>  		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
>> -			struct btrfs_fs_info *fs_info = eb->fs_info;
>> -
>>  			spin_unlock(&eb->refs_lock);
>>
>> -			spin_lock(&fs_info->buffer_lock);
>> -			radix_tree_delete(&fs_info->buffer_radix,
>> -					  eb->start >> PAGE_SHIFT);
>> -			spin_unlock(&fs_info->buffer_lock);
>> +			spin_lock_irq(&eb_info->buffer_lock);
>> +			radix_tree_delete(&eb_info->buffer_radix,
>> +					  eb_index(eb));
>> +			spin_unlock_irq(&eb_info->buffer_lock);
>>  		} else {
>>  			spin_unlock(&eb->refs_lock);
>>  		}
>> @@ -5024,6 +5035,8 @@ static int release_extent_buffer(struct extent_buffer *eb)
>>  #endif
>>  		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
>>  		return 1;
>> +	} else if (eb_info && atomic_read(&eb->refs) == 1) {
>> +		list_lru_add(&eb_info->lru_list, &eb->lru);
>>  	}
>>  	spin_unlock(&eb->refs_lock);
>>
>> @@ -5057,10 +5070,6 @@ void free_extent_buffer(struct extent_buffer *eb)
>>  	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
>>  		atomic_dec(&eb->refs);
>>
>> -	/*
>> -	 * I know this is terrible, but it's temporary until we stop tracking
>> -	 * the uptodate bits and such for the extent buffers.
>> -	 */
>>  	release_extent_buffer(eb);
>>  }
>>
>> @@ -5078,82 +5087,163 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
>>  	release_extent_buffer(eb);
>>  }
>>
>> -void clear_extent_buffer_dirty(struct extent_buffer *eb)
>> +long btrfs_nr_ebs(struct super_block *sb, struct shrink_control *sc)
>>  {
>> -	unsigned long i;
>> -	unsigned long num_pages;
>> -	struct page *page;
>> +	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
>>
>> -	num_pages = num_extent_pages(eb->start, eb->len);
>> +	return list_lru_shrink_count(&eb_info->lru_list, sc);
>> +}
>>
>> -	for (i = 0; i < num_pages; i++) {
>> -		page = eb->pages[i];
>> -		if (!PageDirty(page))
>> -			continue;
>> +static enum lru_status eb_lru_isolate(struct list_head *item,
>> +				      struct list_lru_one *lru,
>> +				      spinlock_t *lru_lock, void *arg)
>> +{
>> +	struct list_head *freeable = (struct list_head *)arg;
>> +	struct extent_buffer *eb = container_of(item, struct extent_buffer,
>> +						lru);
>> +	enum lru_status ret;
>> +	int refs;
>>
>> -		lock_page(page);
>> -		WARN_ON(!PagePrivate(page));
>> +	if (!spin_trylock(&eb->refs_lock))
>> +		return LRU_SKIP;
>>
>> -		clear_page_dirty_for_io(page);
>> -		spin_lock_irq(&page->mapping->tree_lock);
>> -		if (!PageDirty(page)) {
>> -			radix_tree_tag_clear(&page->mapping->page_tree,
>> -						page_index(page),
>> -						PAGECACHE_TAG_DIRTY);
>> -		}
>> -		spin_unlock_irq(&page->mapping->tree_lock);
>> -		ClearPageError(page);
>> -		unlock_page(page);
>> +	if (extent_buffer_under_io(eb)) {
>> +		ret = LRU_ROTATE;
>> +		goto out;
>> +	}
>> +
>> +	refs = atomic_read(&eb->refs);
>> +	/* We can race with somebody freeing us, just skip if this happens. */
>> +	if (refs == 0) {
>> +		ret = LRU_SKIP;
>> +		goto out;
>> +	}
>> +
>> +	/* Eb is in use, don't kill it. */
>> +	if (refs > 1) {
>> +		ret = LRU_ROTATE;
>> +		goto out;
>> +	}
>> +
>> +	/*
>> +	 * If we don't clear the TREE_REF flag then this eb is going to
>> +	 * disappear soon anyway.  Otherwise we become responsible for dropping
>> +	 * the last ref on this eb and we know it'll survive until we call
>> +	 * dispose_list.
>> +	 */
>> +	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
>> +		ret = LRU_SKIP;
>> +		goto out;
>> +	}
>> +	list_lru_isolate_move(lru, &eb->lru, freeable);
>> +	ret = LRU_REMOVED;
>> +out:
>> +	spin_unlock(&eb->refs_lock);
>> +	return ret;
>> +}
>> +
>> +static void dispose_list(struct list_head *list)
>> +{
>> +	struct extent_buffer *eb;
>> +
>> +	while (!list_empty(list)) {
>> +		eb = list_first_entry(list, struct extent_buffer, lru);
>> +
>> +		spin_lock(&eb->refs_lock);
>> +		list_del_init(&eb->lru);
>> +		spin_unlock(&eb->refs_lock);
>> +		free_extent_buffer(eb);
>> +		cond_resched();
>>  	}
>> +}
>> +
>> +long btrfs_free_ebs(struct super_block *sb, struct shrink_control *sc)
>> +{
>> +	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
>> +	LIST_HEAD(freeable);
>> +	long freed;
>> +
>> +	freed = list_lru_shrink_walk(&eb_info->lru_list, sc, eb_lru_isolate,
>> +				     &freeable);
>> +	dispose_list(&freeable);
>> +	return freed;
>> +}
>> +
>> +#define MAX_EVICT_COUNT 1024
>> +void btrfs_invalidate_eb_info(struct btrfs_eb_info *eb_info)
>> +{
>> +	LIST_HEAD(freeable);
>> +	unsigned long count;
>> +
>> +	/*
>> +	 * Evict in batches so we don't lockup the system trying to evict
>> +	 * memory.
>> +	 */
>> +	do {
>> +		count = list_lru_walk(&eb_info->lru_list, eb_lru_isolate,
>> +				      &freeable, MAX_EVICT_COUNT);
>> +		cond_resched();
>> +	} while (count);
>> +	dispose_list(&freeable);
>> +	synchronize_rcu();
>> +}
>> +
>> +int clear_extent_buffer_dirty(struct extent_buffer *eb)
>> +{
>> +	struct btrfs_eb_info *eb_info = eb->eb_info;
>> +	unsigned long i;
>> +	unsigned long num_pages;
>> +
>> +	if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
>> +		return 0;
>> +
>> +	spin_lock_irq(&eb_info->buffer_lock);
>> +	radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
>> +			     PAGECACHE_TAG_DIRTY);
>> +	spin_unlock_irq(&eb_info->buffer_lock);
>> +
>> +	num_pages = num_extent_pages(eb->start, eb->len);
>> +	for (i = 0; i < num_pages; i++)
>> +		account_metadata_cleaned(eb->pages[i], &eb_info->fs_info->bdi);
>>  	WARN_ON(atomic_read(&eb->refs) == 0);
>> +	return 1;
>>  }
>>
>>  int set_extent_buffer_dirty(struct extent_buffer *eb)
>>  {
>> +	struct btrfs_eb_info *eb_info = eb->eb_info;
>>  	unsigned long i;
>>  	unsigned long num_pages;
>>  	int was_dirty = 0;
>>
>>  	check_buffer_tree_ref(eb);
>>
>> -	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
>> -
>> -	num_pages = num_extent_pages(eb->start, eb->len);
>>  	WARN_ON(atomic_read(&eb->refs) == 0);
>>  	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
>> +	if (test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
>> +		return 1;
>>
>> +	num_pages = num_extent_pages(eb->start, eb->len);
>>  	for (i = 0; i < num_pages; i++)
>> -		set_page_dirty(eb->pages[i]);
>> +		account_metadata_dirtied(eb->pages[i],
>> +					 &eb->eb_info->fs_info->bdi);
>> +	spin_lock_irq(&eb_info->buffer_lock);
>> +	radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
>> +			   PAGECACHE_TAG_DIRTY);
>> +	spin_unlock_irq(&eb_info->buffer_lock);
>>  	return was_dirty;
>>  }
>>
>>  void clear_extent_buffer_uptodate(struct extent_buffer *eb)
>>  {
>> -	unsigned long i;
>> -	struct page *page;
>> -	unsigned long num_pages;
>> -
>>  	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
>> -	num_pages = num_extent_pages(eb->start, eb->len);
>> -	for (i = 0; i < num_pages; i++) {
>> -		page = eb->pages[i];
>> -		if (page)
>> -			ClearPageUptodate(page);
>> -	}
>>  }
>>
>>  void set_extent_buffer_uptodate(struct extent_buffer *eb)
>>  {
>> -	unsigned long i;
>> -	struct page *page;
>> -	unsigned long num_pages;
>> -
>>  	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
>> -	num_pages = num_extent_pages(eb->start, eb->len);
>> -	for (i = 0; i < num_pages; i++) {
>> -		page = eb->pages[i];
>> -		SetPageUptodate(page);
>> -	}
>>  }
>>
>>  int extent_buffer_uptodate(struct extent_buffer *eb)
>> @@ -5161,103 +5251,166 @@ int extent_buffer_uptodate(struct extent_buffer *eb)
>>  	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
>>  }
>>
>> -int read_extent_buffer_pages(struct extent_io_tree *tree,
>> -			     struct extent_buffer *eb, int wait,
>> -			     get_extent_t *get_extent, int mirror_num)
>> +static void end_bio_extent_buffer_readpage(struct bio *bio)
>>  {
>> +	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
>> +	struct extent_io_tree *tree = NULL;
>> +	struct bio_vec *bvec;
>> +	u64 unlock_start = 0, unlock_len = 0;
>> +	int mirror_num = io_bio->mirror_num;
>> +	int uptodate = !bio->bi_error;
>> +	int i, ret;
>> +
>> +	bio_for_each_segment_all(bvec, bio, i) {
>> +		struct page *page = bvec->bv_page;
>> +		struct btrfs_eb_info *eb_info;
>> +		struct extent_buffer *eb;
>> +
>> +		eb = (struct extent_buffer *)page->private;
>> +		if (WARN_ON(!eb))
>> +			continue;
>> +
>> +		eb_info = eb->eb_info;
>> +		if (!tree)
>> +			tree = &eb_info->io_tree;
>> +		if (uptodate) {
>> +			/*
>> +			 * btree_readpage_end_io_hook doesn't care about
>> +			 * start/end so just pass 0.  We'll kill this later.
>> +			 */
>> +			ret = tree->ops->readpage_end_io_hook(io_bio, 0,
>> +							      page, 0, 0,
>> +							      mirror_num);
>> +			if (ret) {
>> +				uptodate = 0;
>> +			} else {
>> +				u64 start = eb->start;
>> +				int c, num_pages;
>> +
>> +				num_pages = num_extent_pages(eb->start,
>> +							     eb->len);
>> +				for (c = 0; c < num_pages; c++) {
>> +					if (eb->pages[c] == page)
>> +						break;
>> +					start += PAGE_SIZE;
>> +				}
>> +				clean_io_failure(eb_info->fs_info,
>> +						 &eb_info->io_failure_tree,
>> +						 tree, start, page, 0, 0);
>> +			}
>> +		}
>> +		/*
>> +		 * We never fix anything in btree_io_failed_hook.
>> +		 *
>> +		 * TODO: rework the io failed hook to not assume we can fix
>> +		 * anything.
>> +		 */
>> +		if (!uptodate)
>> +			tree->ops->readpage_io_failed_hook(page, mirror_num);
>> +
>> +		if (unlock_start == 0) {
>> +			unlock_start = eb->start;
>> +			unlock_len = PAGE_SIZE;
>> +		} else {
>> +			unlock_len += PAGE_SIZE;
>> +		}
>> +	}
>> +
>> +	if (unlock_start)
>> +		unlock_extent(tree, unlock_start,
>> +			      unlock_start + unlock_len - 1);
>> +	if (io_bio->end_io)
>> +		io_bio->end_io(io_bio, bio->bi_error);
>> +	bio_put(bio);
>> +}
>> +
>> +int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
>> +			     int mirror_num)
>> +{
>> +	struct btrfs_eb_info *eb_info = eb->eb_info;
>> +	struct extent_io_tree *io_tree = &eb_info->io_tree;
>> +	struct block_device *bdev = eb_info->fs_info->fs_devices->latest_bdev;
>> +	struct bio *bio = NULL;
>> +	u64 offset = eb->start;
>> +	u64 unlock_start = 0, unlock_len = 0;
>>  	unsigned long i;
>>  	struct page *page;
>>  	int err;
>>  	int ret = 0;
>> -	int locked_pages = 0;
>> -	int all_uptodate = 1;
>>  	unsigned long num_pages;
>> -	unsigned long num_reads = 0;
>> -	struct bio *bio = NULL;
>> -	unsigned long bio_flags = 0;
>>
>>  	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
>>  		return 0;
>>
>> -	num_pages = num_extent_pages(eb->start, eb->len);
>> -	for (i = 0; i < num_pages; i++) {
>> -		page = eb->pages[i];
>> -		if (wait == WAIT_NONE) {
>> -			if (!trylock_page(page))
>> -				goto unlock_exit;
>> -		} else {
>> -			lock_page(page);
>> -		}
>> -		locked_pages++;
>> -		if (!PageUptodate(page)) {
>> -			num_reads++;
>> -			all_uptodate = 0;
>> -		}
>> -	}
>> -	if (all_uptodate) {
>> -		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
>> -		goto unlock_exit;
>> +	if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) {
>> +		if (wait != WAIT_COMPLETE)
>> +			return 0;
>> +		wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
>> +			       TASK_UNINTERRUPTIBLE);
>> +		if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
>> +			ret = -EIO;
>> +		return ret;
>>  	}
>>
>> +	lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
>> +	num_pages = num_extent_pages(eb->start, eb->len);
>>  	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
>>  	eb->read_mirror = 0;
>> -	atomic_set(&eb->io_pages, num_reads);
>> +	atomic_set(&eb->io_pages, num_pages);
>>  	for (i = 0; i < num_pages; i++) {
>>  		page = eb->pages[i];
>> -
>> -		if (!PageUptodate(page)) {
>> -			if (ret) {
>> -				atomic_dec(&eb->io_pages);
>> -				unlock_page(page);
>> -				continue;
>> +		if (ret) {
>> +			unlock_len += PAGE_SIZE;
>> +			if (atomic_dec_and_test(&eb->io_pages)) {
>> +				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
>> +				smp_mb__after_atomic();
>> +				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
>>  			}
>> +			continue;
>> +		}
>>
>> -			ClearPageError(page);
>> -			err = __extent_read_full_page(tree, page,
>> -						      get_extent, &bio,
>> -						      mirror_num, &bio_flags,
>> -						      REQ_META);
>> -			if (err) {
>> -				ret = err;
>> -				/*
>> -				 * We use &bio in above __extent_read_full_page,
>> -				 * so we ensure that if it returns error, the
>> -				 * current page fails to add itself to bio and
>> -				 * it's been unlocked.
>> -				 *
>> -				 * We must dec io_pages by ourselves.
>> -				 */
>> -				atomic_dec(&eb->io_pages);
>> +		err = submit_extent_page(REQ_OP_READ, REQ_META, io_tree, NULL,
>> +					 page, offset >> 9, PAGE_SIZE, 0, bdev,
>> +					 &bio, -1,
>> +					 end_bio_extent_buffer_readpage,
>> +					 mirror_num, 0, 0, false);
>> +		if (err) {
>> +			ret = err;
>> +			/*
>> +			 * We use &bio in above submit_extent_page
>> +			 * so we ensure that if it returns error, the
>> +			 * current page fails to add itself to bio and
>> +			 * it's been unlocked.
>> +			 *
>> +			 * We must dec io_pages by ourselves.
>> +			 */
>> +			if (atomic_dec_and_test(&eb->io_pages)) {
>> +				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
>> +				smp_mb__after_atomic();
>> +				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
>>  			}
>> -		} else {
>> -			unlock_page(page);
>> +			unlock_start = eb->start;
>
> Josef, IMHO "unlock_start" should have been set to "offset". Let's say we
> have 4 pages making up a metadata block and the first page was successfully
> added to a bio. Assume that adding the second page to the bio resulted in
> submit_extent_page() returning an error. In this scenario,
> end_bio_extent_buffer_readpage() will own the responsibility of unlocking the
> first 4k range in the io tree. However, with "unlock_start" being set to
> "eb->start", read_extent_buffer_pages() may also end up unlocking that same
> first 4k range in the io tree.
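
(Presumably the V2 fix below amounts to something like the following
hypothetical hunk; the variable name comes from the code quoted above, and
the exact shape of the final change is an assumption:)

-			unlock_start = eb->start;
+			/*
+			 * Only unlock from the failing page onwards; pages
+			 * already added to the bio will be unlocked by the
+			 * read endio handler.
+			 */
+			unlock_start = offset;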

Oops, good catch, I'll fix it up.  Thanks,

Josef


* Re: [PATCH 3/7] Btrfs: add a flags field to btrfs_fs_info
  2016-09-02 19:40 ` [PATCH 3/7] Btrfs: add a flags field to btrfs_fs_info Josef Bacik
@ 2016-09-08 17:01   ` David Sterba
  0 siblings, 0 replies; 16+ messages in thread
From: David Sterba @ 2016-09-08 17:01 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Fri, Sep 02, 2016 at 03:40:02PM -0400, Josef Bacik wrote:
> We have a lot of random ints in btrfs_fs_info that can be put into flags.  This
> is mostly equivalent, with the exception of how we deal with quota going on or
> off: now we set a flag while quota is being turned on or off and deal with
> that appropriately, rather than keeping a pending state that the current
> quota_enabled gets set to.  Thanks,
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>

Reviewed-by: David Sterba <dsterba@suse.com>

I'm picking this patch independently to 4.9, but feel free to include it
in the patch series if you send more revisions.
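
As an illustration, the conversion follows the usual kernel pattern of
replacing int members with bit operations on a single flags word; a sketch
using flag names that appear in hunks later in this thread (these are not
the exact patch 3 hunks):

	/* test a state bit instead of a dedicated int member */
	if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags))
		return;

	/* set a state bit instead of assigning an int */
	set_bit(BTRFS_FS_BARRIER, &fs_info->flags);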


* Re: [PATCH 4/7] Btrfs: kill the start argument to read_extent_buffer_pages
  2016-09-02 19:40 ` [PATCH 4/7] Btrfs: kill the start argument to read_extent_buffer_pages Josef Bacik
@ 2016-09-08 17:01   ` David Sterba
  0 siblings, 0 replies; 16+ messages in thread
From: David Sterba @ 2016-09-08 17:01 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Fri, Sep 02, 2016 at 03:40:03PM -0400, Josef Bacik wrote:
> Nobody uses this; it makes no sense to do partial reads of extent buffers.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>

Reviewed-by: David Sterba <dsterba@suse.com>

Same here, picked to 4.9.


* Re: [PATCH 7/7] Btrfs: kill BUG_ON()'s in btrfs_mark_extent_written
  2016-09-02 19:40 ` [PATCH 7/7] Btrfs: kill BUG_ON()'s in btrfs_mark_extent_written Josef Bacik
@ 2016-09-08 17:07   ` David Sterba
  0 siblings, 0 replies; 16+ messages in thread
From: David Sterba @ 2016-09-08 17:07 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, kernel-team

On Fri, Sep 02, 2016 at 03:40:06PM -0400, Josef Bacik wrote:
> No reason to BUG_ON() here; fs corruption could easily cause these
> conditions to happen.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>

Reviewed-by: David Sterba <dsterba@suse.com>

Also on the way to 4.9.
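
The conversion replaces assertions of the form

	BUG_ON(ret < 0);

with ordinary error handling, roughly

	if (ret < 0)
		goto out;	/* or return ret, depending on the call site */

since a corrupted filesystem can legitimately produce these conditions; the
exact hunks are not quoted in this thread, so take the above as a sketch.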


* [PATCH 6/7][V2] Btrfs: kill the btree_inode
  2016-09-02 19:40 ` [PATCH 6/7] Btrfs: kill the btree_inode Josef Bacik
  2016-09-08  5:17   ` Chandan Rajendra
@ 2016-09-09 17:40   ` Josef Bacik
  1 sibling, 0 replies; 16+ messages in thread
From: Josef Bacik @ 2016-09-09 17:40 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

In order to more efficiently support sub-page blocksizes we need to stop
allocating pages from pagecache for our metadata.  Instead switch to using the
account_metadata* counters for making sure we are keeping the system aware of
how much dirty metadata we have, and use the ->free_cached_objects super
operation to handle freeing up extent buffers.  This greatly simplifies
how we deal with extent buffers, as we no longer have to tie the page cache
reclamation machinery to the extent buffer machinery.  It will also allow us
to simply kmalloc() our data for sub-page blocksizes.
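
A sketch of the wiring this implies, reusing the shrinker helpers posted in
the v1 hunks earlier in the thread (the actual super.c hunk is not quoted
here, so the placement is an assumption):

	static const struct super_operations btrfs_super_ops = {
		/* existing callbacks elided */
		.nr_cached_objects	= btrfs_nr_ebs,
		.free_cached_objects	= btrfs_free_ebs,
	};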

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
V1->V2
-fixed the unlock_start as pointed out by Chandan.
-fixed a panic when fs_info->eb_info is NULL.

 fs/btrfs/btrfs_inode.h                 |   3 +-
 fs/btrfs/ctree.c                       |  10 +-
 fs/btrfs/ctree.h                       |  14 +-
 fs/btrfs/disk-io.c                     | 389 ++++----------
 fs/btrfs/extent_io.c                   | 913 ++++++++++++++++++---------------
 fs/btrfs/extent_io.h                   |  49 +-
 fs/btrfs/inode.c                       |   6 +-
 fs/btrfs/root-tree.c                   |   2 +-
 fs/btrfs/super.c                       |  29 +-
 fs/btrfs/tests/btrfs-tests.c           |  37 +-
 fs/btrfs/tests/extent-io-tests.c       |   4 +-
 fs/btrfs/tests/free-space-tree-tests.c |   4 +-
 fs/btrfs/tests/qgroup-tests.c          |   4 +-
 fs/btrfs/transaction.c                 |  11 +-
 14 files changed, 727 insertions(+), 748 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1a8fa46..ad7b185 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -229,10 +229,9 @@ static inline u64 btrfs_ino(struct inode *inode)
 	u64 ino = BTRFS_I(inode)->location.objectid;
 
 	/*
-	 * !ino: btree_inode
 	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
 	 */
-	if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
+	if (BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
 		ino = inode->i_ino;
 	return ino;
 }
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d1c56c9..b267053 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1373,8 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 
 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
 		BUG_ON(tm->slot != 0);
-		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
-						eb->len);
+		eb_rewin = alloc_dummy_extent_buffer(fs_info->eb_info,
+						     eb->start, eb->len);
 		if (!eb_rewin) {
 			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
@@ -1455,8 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 	} else if (old_root) {
 		btrfs_tree_read_unlock(eb_root);
 		free_extent_buffer(eb_root);
-		eb = alloc_dummy_extent_buffer(root->fs_info, logical,
-					root->nodesize);
+		eb = alloc_dummy_extent_buffer(root->fs_info->eb_info, logical,
+					       root->nodesize);
 	} else {
 		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
 		eb = btrfs_clone_extent_buffer(eb_root);
@@ -1772,7 +1772,7 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 	int err;
 
 	if (low > high) {
-		btrfs_err(eb->fs_info,
+		btrfs_err(eb->eb_info->fs_info,
 		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
 			  __func__, low, high, eb->start,
 			  btrfs_header_owner(eb), btrfs_header_level(eb));
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 282a031..b9ee7cf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -37,6 +37,7 @@
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/sizes.h>
+#include <linux/list_lru.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -675,6 +676,7 @@ struct btrfs_device;
 struct btrfs_fs_devices;
 struct btrfs_balance_control;
 struct btrfs_delayed_root;
+struct btrfs_eb_info;
 
 #define BTRFS_FS_BARRIER			1
 #define BTRFS_FS_CLOSING_START			2
@@ -797,7 +799,7 @@ struct btrfs_fs_info {
 	struct btrfs_super_block *super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
-	struct inode *btree_inode;
+	struct btrfs_eb_info *eb_info;
 	struct backing_dev_info bdi;
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
@@ -1042,10 +1044,6 @@ struct btrfs_fs_info {
 	/* readahead works cnt */
 	atomic_t reada_works_cnt;
 
-	/* Extent buffer radix tree */
-	spinlock_t buffer_lock;
-	struct radix_tree_root buffer_radix;
-
 	/* next backup root to be overwritten */
 	int backup_root_index;
 
@@ -2884,6 +2882,9 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
 
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+	if (fs_info->eb_info)
+		list_lru_destroy(&fs_info->eb_info->lru_list);
+	kfree(fs_info->eb_info);
 	kfree(fs_info->balance_ctl);
 	kfree(fs_info->delayed_root);
 	kfree(fs_info->extent_root);
@@ -3121,9 +3122,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root,
 			     struct btrfs_root *parent_root,
 			     u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio,
-			 unsigned long bio_flags);
 void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9c42e53..03ac601 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -217,56 +217,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
 
 #endif
 
-/*
- * extents on the btree inode are pretty simple, there's one extent
- * that covers the entire device
- */
-static struct extent_map *btree_get_extent(struct inode *inode,
-		struct page *page, size_t pg_offset, u64 start, u64 len,
-		int create)
-{
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct extent_map *em;
-	int ret;
-
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, start, len);
-	if (em) {
-		em->bdev =
-			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-		read_unlock(&em_tree->lock);
-		goto out;
-	}
-	read_unlock(&em_tree->lock);
-
-	em = alloc_extent_map();
-	if (!em) {
-		em = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-	em->start = 0;
-	em->len = (u64)-1;
-	em->block_len = (u64)-1;
-	em->block_start = 0;
-	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-
-	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
-	if (ret == -EEXIST) {
-		free_extent_map(em);
-		em = lookup_extent_mapping(em_tree, start, len);
-		if (!em)
-			em = ERR_PTR(-EIO);
-	} else if (ret) {
-		free_extent_map(em);
-		em = ERR_PTR(ret);
-	}
-	write_unlock(&em_tree->lock);
-
-out:
-	return em;
-}
-
 u32 btrfs_csum_data(char *data, u32 seed, size_t len)
 {
 	return btrfs_crc32c(seed, data, len);
@@ -349,11 +299,11 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
  * detect blocks that either didn't get written at all or got written
  * in the wrong place.
  */
-static int verify_parent_transid(struct extent_io_tree *io_tree,
-				 struct extent_buffer *eb, u64 parent_transid,
+static int verify_parent_transid(struct extent_buffer *eb, u64 parent_transid,
 				 int atomic)
 {
 	struct extent_state *cached_state = NULL;
+	struct extent_io_tree *io_tree = &eb->eb_info->io_tree;
 	int ret;
 	bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
 
@@ -375,7 +325,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		ret = 0;
 		goto out;
 	}
-	btrfs_err_rl(eb->fs_info,
+	btrfs_err_rl(eb->eb_info->fs_info,
 		"parent transid verify failed on %llu wanted %llu found %llu",
 			eb->start,
 			parent_transid, btrfs_header_generation(eb));
@@ -445,7 +395,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 					  struct extent_buffer *eb,
 					  u64 parent_transid)
 {
-	struct extent_io_tree *io_tree;
 	int failed = 0;
 	int ret;
 	int num_copies = 0;
@@ -453,13 +402,10 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	int failed_mirror = 0;
 
 	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
-	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 	while (1) {
-		ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
-					       btree_get_extent, mirror_num);
+		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
 		if (!ret) {
-			if (!verify_parent_transid(io_tree, eb,
-						   parent_transid, 0))
+			if (!verify_parent_transid(eb, parent_transid, 0))
 				break;
 			else
 				ret = -EIO;
@@ -504,24 +450,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 
 static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
 {
-	u64 start = page_offset(page);
-	u64 found_start;
 	struct extent_buffer *eb;
 
 	eb = (struct extent_buffer *)page->private;
 	if (page != eb->pages[0])
 		return 0;
-
-	found_start = btrfs_header_bytenr(eb);
-	/*
-	 * Please do not consolidate these warnings into a single if.
-	 * It is useful to know what went wrong.
-	 */
-	if (WARN_ON(found_start != start))
-		return -EUCLEAN;
-	if (WARN_ON(!PageUptodate(page)))
-		return -EUCLEAN;
-
 	ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
 			btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
 
@@ -619,8 +552,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 	u64 found_start;
 	int found_level;
 	struct extent_buffer *eb;
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_root *root;
+	struct btrfs_fs_info *fs_info;
 	int ret = 0;
 	int reads_done;
 
@@ -633,6 +566,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 	 * in memory.  Make sure we have a ref for all this other checks
 	 */
 	extent_buffer_get(eb);
+	fs_info = eb->eb_info->fs_info;
+	root = fs_info->tree_root;
 
 	reads_done = atomic_dec_and_test(&eb->io_pages);
 	if (!reads_done)
@@ -693,11 +628,19 @@ err:
 		/*
 		 * our io error hook is going to dec the io pages
 		 * again, we have to make sure it has something
-		 * to decrement
+		 * to decrement.
+		 *
+		 * TODO: Kill this, we've re-arranged how this works now so we
+		 * don't need to do this io_pages dance.
 		 */
 		atomic_inc(&eb->io_pages);
 		clear_extent_buffer_uptodate(eb);
 	}
+	if (reads_done) {
+		clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+		smp_mb__after_atomic();
+		wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+	}
 	free_extent_buffer(eb);
 out:
 	return ret;
@@ -712,7 +655,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
 	eb->read_mirror = failed_mirror;
 	atomic_dec(&eb->io_pages);
 	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
-		btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
+		btree_readahead_hook(eb->eb_info->fs_info, eb, eb->start, -EIO);
 	return -EIO;	/* we fixed nothing */
 }
 
@@ -884,15 +827,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 	return 0;
 }
 
-static int btree_csum_one_bio(struct bio *bio)
+static int btree_csum_one_bio(struct btrfs_fs_info *fs_info, struct bio *bio)
 {
 	struct bio_vec *bvec;
-	struct btrfs_root *root;
 	int i, ret = 0;
 
 	bio_for_each_segment_all(bvec, bio, i) {
-		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
-		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
+		ret = csum_dirty_buffer(fs_info, bvec->bv_page);
 		if (ret)
 			break;
 	}
@@ -904,25 +845,27 @@ static int __btree_submit_bio_start(void *private_data, struct bio *bio,
 				    int mirror_num, unsigned long bio_flags,
 				    u64 bio_offset)
 {
+	struct btrfs_eb_info *eb_info = private_data;
 	/*
 	 * when we're called for a write, we're already in the async
 	 * submission context.  Just jump into btrfs_map_bio
 	 */
-	return btree_csum_one_bio(bio);
+	return btree_csum_one_bio(eb_info->fs_info, bio);
 }
 
 static int __btree_submit_bio_done(void *private_data, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
-	struct inode *inode = private_data;
+	struct btrfs_eb_info *eb_info = private_data;
+	struct btrfs_root *root = eb_info->fs_info->tree_root;
 	int ret;
 
 	/*
 	 * when we're called for a write, we're already in the async
 	 * submission context.  Just jump into btrfs_map_bio
 	 */
-	ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1);
+	ret = btrfs_map_bio(root, bio, mirror_num, 1);
 	if (ret) {
 		bio->bi_error = ret;
 		bio_endio(bio);
@@ -930,7 +873,7 @@ static int __btree_submit_bio_done(void *private_data, struct bio *bio,
 	return ret;
 }
 
-static int check_async_write(struct inode *inode, unsigned long bio_flags)
+static int check_async_write(unsigned long bio_flags)
 {
 	if (bio_flags & EXTENT_BIO_TREE_LOG)
 		return 0;
@@ -945,8 +888,9 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
-	struct inode *inode = private_data;
-	int async = check_async_write(inode, bio_flags);
+	struct btrfs_eb_info *eb_info = private_data;
+	struct btrfs_root *root = eb_info->fs_info->tree_root;
+	int async = check_async_write(bio_flags);
 	int ret;
 
 	if (bio_op(bio) != REQ_OP_WRITE) {
@@ -954,23 +898,22 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
 		 * called for a read, do the setup so that checksum validation
 		 * can happen in the async kernel threads
 		 */
-		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
-					  bio, BTRFS_WQ_ENDIO_METADATA);
+		ret = btrfs_bio_wq_end_io(eb_info->fs_info, bio,
+					  BTRFS_WQ_ENDIO_METADATA);
 		if (ret)
 			goto out_w_error;
-		ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
+		ret = btrfs_map_bio(root, bio, mirror_num, 0);
 	} else if (!async) {
-		ret = btree_csum_one_bio(bio);
+		ret = btree_csum_one_bio(eb_info->fs_info, bio);
 		if (ret)
 			goto out_w_error;
-		ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
+		ret = btrfs_map_bio(root, bio, mirror_num, 0);
 	} else {
 		/*
 		 * kthread helpers are used to submit writes so that
 		 * checksumming can happen in parallel across all CPUs
 		 */
-		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-					  bio, mirror_num, 0,
+		ret = btrfs_wq_submit_bio(eb_info->fs_info, bio, mirror_num, 0,
 					  bio_offset, private_data,
 					  __btree_submit_bio_start,
 					  __btree_submit_bio_done);
@@ -986,118 +929,14 @@ out_w_error:
 	return ret;
 }
 
-#ifdef CONFIG_MIGRATION
-static int btree_migratepage(struct address_space *mapping,
-			struct page *newpage, struct page *page,
-			enum migrate_mode mode)
-{
-	/*
-	 * we can't safely write a btree page from here,
-	 * we haven't done the locking hook
-	 */
-	if (PageDirty(page))
-		return -EAGAIN;
-	/*
-	 * Buffers may be managed in a filesystem specific way.
-	 * We must have no buffers or drop them.
-	 */
-	if (page_has_private(page) &&
-	    !try_to_release_page(page, GFP_KERNEL))
-		return -EAGAIN;
-	return migrate_page(mapping, newpage, page, mode);
-}
-#endif
-
-
-static int btree_writepages(struct address_space *mapping,
-			    struct writeback_control *wbc)
-{
-	struct btrfs_fs_info *fs_info;
-	int ret;
-
-	if (wbc->sync_mode == WB_SYNC_NONE) {
-
-		if (wbc->for_kupdate)
-			return 0;
-
-		fs_info = BTRFS_I(mapping->host)->root->fs_info;
-		/* this is a bit racy, but that's ok */
-		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
-					     BTRFS_DIRTY_METADATA_THRESH);
-		if (ret < 0)
-			return 0;
-	}
-	return btree_write_cache_pages(mapping, wbc);
-}
-
-static int btree_readpage(struct file *file, struct page *page)
-{
-	struct extent_io_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	return extent_read_full_page(tree, page, btree_get_extent, 0);
-}
-
-static int btree_releasepage(struct page *page, gfp_t gfp_flags)
-{
-	if (PageWriteback(page) || PageDirty(page))
-		return 0;
-
-	return try_release_extent_buffer(page);
-}
-
-static void btree_invalidatepage(struct page *page, unsigned int offset,
-				 unsigned int length)
-{
-	struct extent_io_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	extent_invalidatepage(tree, page, offset);
-	btree_releasepage(page, GFP_NOFS);
-	if (PagePrivate(page)) {
-		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
-			   "page private not zero on page %llu",
-			   (unsigned long long)page_offset(page));
-		ClearPagePrivate(page);
-		set_page_private(page, 0);
-		put_page(page);
-	}
-}
-
-static int btree_set_page_dirty(struct page *page)
-{
-#ifdef DEBUG
-	struct extent_buffer *eb;
-
-	BUG_ON(!PagePrivate(page));
-	eb = (struct extent_buffer *)page->private;
-	BUG_ON(!eb);
-	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-	BUG_ON(!atomic_read(&eb->refs));
-	btrfs_assert_tree_locked(eb);
-#endif
-	return __set_page_dirty_nobuffers(page);
-}
-
-static const struct address_space_operations btree_aops = {
-	.readpage	= btree_readpage,
-	.writepages	= btree_writepages,
-	.releasepage	= btree_releasepage,
-	.invalidatepage = btree_invalidatepage,
-#ifdef CONFIG_MIGRATION
-	.migratepage	= btree_migratepage,
-#endif
-	.set_page_dirty = btree_set_page_dirty,
-};
-
 void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
 {
 	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = root->fs_info->btree_inode;
 
 	buf = btrfs_find_create_tree_block(root, bytenr);
 	if (IS_ERR(buf))
 		return;
-	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, WAIT_NONE, btree_get_extent, 0);
+	read_extent_buffer_pages(buf, WAIT_NONE, 0);
 	free_extent_buffer(buf);
 }
 
@@ -1105,8 +944,6 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
 			 int mirror_num, struct extent_buffer **eb)
 {
 	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
 	int ret;
 
 	buf = btrfs_find_create_tree_block(root, bytenr);
@@ -1115,8 +952,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
 
 	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
 
-	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
-				       btree_get_extent, mirror_num);
+	ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
 	if (ret) {
 		free_extent_buffer(buf);
 		return ret;
@@ -1136,29 +972,29 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
 					    u64 bytenr)
 {
-	return find_extent_buffer(fs_info, bytenr);
+	return find_extent_buffer(fs_info->eb_info, bytenr);
 }
 
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						 u64 bytenr)
 {
 	if (btrfs_is_testing(root->fs_info))
-		return alloc_test_extent_buffer(root->fs_info, bytenr,
-				root->nodesize);
+		return alloc_test_extent_buffer(root->fs_info->eb_info, bytenr,
+						root->nodesize);
 	return alloc_extent_buffer(root->fs_info, bytenr);
 }
 
 
 int btrfs_write_tree_block(struct extent_buffer *buf)
 {
-	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
-					buf->start + buf->len - 1);
+	return btree_write_range(buf->eb_info->fs_info, buf->start,
+				 buf->start + buf->len - 1);
 }
 
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
-	return filemap_fdatawait_range(buf->pages[0]->mapping,
-				       buf->start, buf->start + buf->len - 1);
+	return btree_wait_range(buf->eb_info->fs_info, buf->start,
+				buf->start + buf->len - 1);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -1188,14 +1024,10 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
 	    fs_info->running_transaction->transid) {
 		btrfs_assert_tree_locked(buf);
 
-		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+		if (clear_extent_buffer_dirty(buf))
 			__percpu_counter_add(&fs_info->dirty_metadata_bytes,
 					     -buf->len,
 					     fs_info->dirty_metadata_batch);
-			/* ugh, clear_extent_buffer_dirty needs to lock the page */
-			btrfs_set_lock_blocking(buf);
-			clear_extent_buffer_dirty(buf);
-		}
 	}
 }
 
@@ -2247,33 +2079,20 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
 	init_waitqueue_head(&fs_info->balance_wait_q);
 }
 
-static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
-				   struct btrfs_root *tree_root)
+int btrfs_init_eb_info(struct btrfs_fs_info *fs_info)
 {
-	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
-	set_nlink(fs_info->btree_inode, 1);
-	/*
-	 * we set the i_size on the btree inode to the max possible int.
-	 * the real end of the address space is determined by all of
-	 * the devices in the system
-	 */
-	fs_info->btree_inode->i_size = OFFSET_MAX;
-	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
-
-	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
-	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
-			    fs_info->btree_inode);
-	BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
-	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
-
-	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
-
-	BTRFS_I(fs_info->btree_inode)->root = tree_root;
-	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
-	       sizeof(struct btrfs_key));
-	set_bit(BTRFS_INODE_DUMMY,
-		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
-	btrfs_insert_inode_hash(fs_info->btree_inode);
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
+
+	eb_info->fs_info = fs_info;
+	extent_io_tree_init(&eb_info->io_tree, eb_info);
+	eb_info->io_tree.track_uptodate = 0;
+	eb_info->io_tree.ops = &btree_extent_io_ops;
+	extent_io_tree_init(&eb_info->io_failure_tree, eb_info);
+	INIT_RADIX_TREE(&eb_info->buffer_radix, GFP_ATOMIC);
+	spin_lock_init(&eb_info->buffer_lock);
+	if (list_lru_init(&eb_info->lru_list))
+		return -ENOMEM;
+	return 0;
 }
 
 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2566,16 +2385,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_delalloc_bytes;
 	}
 
-	fs_info->btree_inode = new_inode(sb);
-	if (!fs_info->btree_inode) {
-		err = -ENOMEM;
-		goto fail_bio_counter;
-	}
-
-	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
-	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2590,7 +2400,6 @@ int open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->tree_mod_seq_lock);
 	spin_lock_init(&fs_info->super_lock);
 	spin_lock_init(&fs_info->qgroup_op_lock);
-	spin_lock_init(&fs_info->buffer_lock);
 	spin_lock_init(&fs_info->unused_bgs_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->unused_bg_unpin_mutex);
@@ -2643,7 +2452,7 @@ int open_ctree(struct super_block *sb,
 					GFP_KERNEL);
 	if (!fs_info->delayed_root) {
 		err = -ENOMEM;
-		goto fail_iput;
+		goto fail_alloc;
 	}
 	btrfs_init_delayed_root(fs_info->delayed_root);
 
@@ -2658,7 +2467,15 @@ int open_ctree(struct super_block *sb,
 	sb->s_blocksize_bits = blksize_bits(4096);
 	sb->s_bdi = &fs_info->bdi;
 
-	btrfs_init_btree_inode(fs_info, tree_root);
+	fs_info->eb_info = kzalloc(sizeof(struct btrfs_eb_info), GFP_KERNEL);
+	if (!fs_info->eb_info) {
+		err = -ENOMEM;
+		goto fail_alloc;
+	}
+	if (btrfs_init_eb_info(fs_info)) {
+		err = -ENOMEM;
+		goto fail_alloc;
+	}
 
 	spin_lock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree = RB_ROOT;
@@ -3085,6 +2902,14 @@ retry_root_backup:
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
+	/*
+	 * We need to make sure we are on the bdi's dirty list so we get
+	 * writeback requests for our fs properly.
+	 */
+	spin_lock(&fs_info->bdi.sb_list_lock);
+	list_add_tail(&fs_info->bdi.dirty_sb_list, &sb->s_bdi_list);
+	spin_unlock(&fs_info->bdi.sb_list_lock);
+
 	if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) &&
 	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
 		btrfs_info(fs_info, "creating free space tree");
@@ -3180,7 +3005,8 @@ fail_cleaner:
 	 * make sure we're done with the btree inode before we stop our
 	 * kthreads
 	 */
-	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+	btree_write_range(fs_info, 0, (u64)-1);
+	btree_wait_range(fs_info, 0, (u64)-1);
 
 fail_sysfs:
 	btrfs_sysfs_remove_mounted(fs_info);
@@ -3194,16 +3020,11 @@ fail_block_groups:
 
 fail_tree_roots:
 	free_root_pointers(fs_info, 1);
-	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
-
+	btrfs_invalidate_eb_info(fs_info->eb_info);
 fail_sb_buffer:
 	btrfs_stop_all_workers(fs_info);
 fail_alloc:
-fail_iput:
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
-
-	iput(fs_info->btree_inode);
-fail_bio_counter:
 	percpu_counter_destroy(&fs_info->bio_counter);
 fail_delalloc_bytes:
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
@@ -3908,14 +3729,11 @@ void close_ctree(struct btrfs_root *root)
 	 * we must make sure there is not any read request to
 	 * submit after we stopping all workers.
 	 */
-	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	btrfs_stop_all_workers(fs_info);
 
 	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
 	free_root_pointers(fs_info, 1);
 
-	iput(fs_info->btree_inode);
-
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY))
 		btrfsic_unmount(root, fs_info->fs_devices);
@@ -3924,6 +3742,8 @@ void close_ctree(struct btrfs_root *root)
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
+	btrfs_invalidate_eb_info(fs_info->eb_info);
+
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
 	percpu_counter_destroy(&fs_info->bio_counter);
@@ -3951,14 +3771,12 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
 			  int atomic)
 {
 	int ret;
-	struct inode *btree_inode = buf->pages[0]->mapping->host;
 
 	ret = extent_buffer_uptodate(buf);
 	if (!ret)
 		return ret;
 
-	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
-				    parent_transid, atomic);
+	ret = verify_parent_transid(buf, parent_transid, atomic);
 	if (ret == -EAGAIN)
 		return ret;
 	return !ret;
@@ -3979,7 +3797,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
 		return;
 #endif
-	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	root = buf->eb_info->fs_info->tree_root;
 	btrfs_assert_tree_locked(buf);
 	if (transid != root->fs_info->generation)
 		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -4015,10 +3833,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
 
 	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
 				     BTRFS_DIRTY_METADATA_THRESH);
-	if (ret > 0) {
+	if (ret > 0)
 		balance_dirty_pages_ratelimited(&root->fs_info->bdi,
 						root->fs_info->sb);
-	}
 }
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root)
@@ -4033,7 +3850,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
 
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root = buf->eb_info->fs_info->tree_root;
 	return btree_read_extent_buffer_pages(root, buf, parent_transid);
 }
 
@@ -4376,10 +4193,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 			if (!eb)
 				continue;
 			wait_on_extent_buffer_writeback(eb);
-
-			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
-					       &eb->bflags))
-				clear_extent_buffer_dirty(eb);
+			clear_extent_buffer_dirty(eb);
 			free_extent_buffer_stale(eb);
 		}
 	}
@@ -4504,16 +4318,37 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 
 static struct btrfs_fs_info *btree_fs_info(void *private_data)
 {
-	struct inode *inode = private_data;
-	return btrfs_sb(inode->i_sb);
+	struct btrfs_eb_info *eb_info = private_data;
+	return eb_info->fs_info;
+}
+
+static int btree_merge_bio_hook(struct page *page, unsigned long offset,
+				size_t size, struct bio *bio,
+				unsigned long bio_flags)
+{
+	struct extent_buffer *eb = (struct extent_buffer *)page->private;
+	struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
+	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+	u64 length = 0;
+	u64 map_length;
+	int ret;
+
+	length = bio->bi_iter.bi_size;
+	map_length = length;
+	ret = btrfs_map_block(fs_info, bio_op(bio), logical, &map_length,
+			      NULL, 0);
+	if (ret < 0)
+		return ret;
+	if (map_length < length + size)
+		return 1;
+	return 0;
 }
 
 static const struct extent_io_ops btree_extent_io_ops = {
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.readpage_io_failed_hook = btree_io_failed_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
-	/* note we're sharing with inode.c for the merge bio hook */
-	.merge_bio_hook = btrfs_merge_bio_hook,
+	.merge_bio_hook = btree_merge_bio_hook,
 	.tree_fs_info = btree_fs_info,
 	.set_range_writeback = btrfs_set_range_writeback,
 };
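
The btree_merge_bio_hook() added above recovers fs_info through
page->private rather than page->mapping, since btree pages no longer
belong to an address_space; the merge decision itself matches the data
path's hook. A minimal sketch of that decision with hypothetical numbers
(would_cross_stripe() is an illustrative name, not part of the patch):

	/* e.g. btrfs_map_block() says the stripe runs out 64K past 'logical'. */
	static int would_cross_stripe(u64 map_length, u64 length, size_t size)
	{
		/*
		 * map_length = 64K, length = 60K, size = 8K:
		 * 64K < 68K, so the page must start a new bio -> return 1.
		 */
		return map_length < length + size;
	}
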
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5dcdd3e..f669240 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -75,8 +75,8 @@ void btrfs_leak_debug_check(void)
 	while (!list_empty(&buffers)) {
 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
 		printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
-		       "refs %d\n",
-		       eb->start, eb->len, atomic_read(&eb->refs));
+		       "bflags %lu refs %d\n",
+		       eb->start, eb->len, eb->bflags, atomic_read(&eb->refs));
 		list_del(&eb->leak_list);
 		kmem_cache_free(extent_buffer_cache, eb);
 	}
@@ -3538,7 +3538,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
 			  struct btrfs_fs_info *fs_info,
 			  struct extent_page_data *epd)
 {
-	unsigned long i, num_pages;
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
 	int flush = 0;
 	int ret = 0;
 
@@ -3585,37 +3585,42 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
 
 	btrfs_tree_unlock(eb);
 
-	if (!ret)
-		return ret;
-
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		struct page *p = eb->pages[i];
-
-		if (!trylock_page(p)) {
-			if (!flush) {
-				flush_write_bio(epd);
-				flush = 1;
-			}
-			lock_page(p);
-		}
+	/*
+	 * We cleared the dirty bit on this buffer, so adjust the radix tags
+	 * to match.  The actual page accounting is done in write_one_eb.
+	 */
+	if (ret) {
+		spin_lock_irq(&eb_info->buffer_lock);
+		radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
+				   PAGECACHE_TAG_WRITEBACK);
+		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
+				     PAGECACHE_TAG_DIRTY);
+		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
+				     PAGECACHE_TAG_TOWRITE);
+		spin_unlock_irq(&eb_info->buffer_lock);
 	}
-
 	return ret;
 }
 
 static void end_extent_buffer_writeback(struct extent_buffer *eb)
 {
-	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
-	smp_mb__after_atomic();
-	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+	if (test_and_clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+		struct btrfs_eb_info *eb_info = eb->eb_info;
+		unsigned long flags;
+
+		spin_lock_irqsave(&eb_info->buffer_lock, flags);
+		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
+				     PAGECACHE_TAG_WRITEBACK);
+		spin_unlock_irqrestore(&eb_info->buffer_lock, flags);
+		wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+	}
 }
 
 static void set_btree_ioerr(struct page *page)
 {
 	struct extent_buffer *eb = (struct extent_buffer *)page->private;
+	struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
 
-	SetPageError(page);
 	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
 		return;
 
@@ -3624,8 +3629,7 @@ static void set_btree_ioerr(struct page *page)
 	 * failed, increment the counter transaction->eb_write_errors.
 	 * We do this because while the transaction is running and before it's
 	 * committing (when we call filemap_fdata[write|wait]_range against
-	 * the btree inode), we might have
-	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
+	 * the btree inode), we might have write_metadata() called - if it
 	 * returns an error or an error happens during writeback, when we're
 	 * committing the transaction we wouldn't know about it, since the pages
 	 * can be no longer dirty nor marked anymore for writeback (if a
@@ -3659,13 +3663,13 @@ static void set_btree_ioerr(struct page *page)
 	 */
 	switch (eb->log_index) {
 	case -1:
-		set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
+		set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
 		break;
 	case 0:
-		set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
+		set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
 		break;
 	case 1:
-		set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
+		set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
 		break;
 	default:
 		BUG(); /* unexpected, logic error */
@@ -3686,16 +3690,13 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
 		done = atomic_dec_and_test(&eb->io_pages);
 
 		if (bio->bi_error ||
-		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
-			ClearPageUptodate(page);
+		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
 			set_btree_ioerr(page);
-		}
-
-		end_page_writeback(page);
 
+		account_metadata_end_writeback(page,
+					       &eb->eb_info->fs_info->bdi);
 		if (!done)
 			continue;
-
 		end_extent_buffer_writeback(eb);
 	}
 
@@ -3708,7 +3709,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 			struct extent_page_data *epd)
 {
 	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
-	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+	struct extent_io_tree *tree = &fs_info->eb_info->io_tree;
 	u64 offset = eb->start;
 	unsigned long i, num_pages;
 	unsigned long bio_flags = 0;
@@ -3724,8 +3725,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = eb->pages[i];
 
-		clear_page_dirty_for_io(p);
-		set_page_writeback(p);
 		ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
 					 p, offset >> 9, PAGE_SIZE, 0, bdev,
 					 &epd->bio, -1,
@@ -3734,34 +3733,90 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 		epd->bio_flags = bio_flags;
 		if (ret) {
 			set_btree_ioerr(p);
-			end_page_writeback(p);
 			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
 				end_extent_buffer_writeback(eb);
 			ret = -EIO;
 			break;
 		}
+		account_metadata_writeback(p, &fs_info->bdi);
 		offset += PAGE_SIZE;
 		update_nr_written(p, wbc, 1);
-		unlock_page(p);
 	}
 
-	if (unlikely(ret)) {
-		for (; i < num_pages; i++) {
-			struct page *p = eb->pages[i];
-			clear_page_dirty_for_io(p);
-			unlock_page(p);
+	return ret;
+}
+
+#define EB_TAG_BATCH 4096
+static void tag_ebs_for_writeback(struct btrfs_eb_info *eb_info, pgoff_t start,
+				  pgoff_t end)
+{
+	unsigned long tagged;
+
+	do {
+		spin_lock_irq(&eb_info->buffer_lock);
+		tagged = radix_tree_range_tag_if_tagged(&eb_info->buffer_radix,
+							&start, end,
+							EB_TAG_BATCH,
+							PAGECACHE_TAG_DIRTY,
+							PAGECACHE_TAG_TOWRITE);
+		spin_unlock_irq(&eb_info->buffer_lock);
+		cond_resched();
+	} while (tagged >= EB_TAG_BATCH && start);
+}
+
+static unsigned eb_lookup_tag(struct btrfs_eb_info *eb_info,
+			      struct extent_buffer **ebs, pgoff_t *index,
+			      int tag, unsigned nr)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+	unsigned ret = 0;
+
+	if (unlikely(!nr))
+		return 0;
+
+	rcu_read_lock();
+	radix_tree_for_each_tagged(slot, &eb_info->buffer_radix, &iter, *index,
+				   tag) {
+		struct extent_buffer *eb;
+repeat:
+		eb = radix_tree_deref_slot(slot);
+		if (unlikely(!eb))
+			continue;
+
+		if (radix_tree_exception(eb)) {
+			if (radix_tree_deref_retry(eb)) {
+				slot = radix_tree_iter_retry(&iter);
+				continue;
+			}
+			continue;
 		}
-	}
 
+		if (unlikely(!atomic_inc_not_zero(&eb->refs)))
+			continue;
+
+		if (unlikely(eb != *slot)) {
+			free_extent_buffer(eb);
+			goto repeat;
+		}
+
+		ebs[ret] = eb;
+		if (++ret == nr)
+			break;
+	}
+	rcu_read_unlock();
+	if (ret)
+		*index = (ebs[ret - 1]->start >> PAGE_SHIFT) + 1;
 	return ret;
 }
 
-int btree_write_cache_pages(struct address_space *mapping,
+#define EBVEC_SIZE 16
+static int btree_write_cache_pages(struct btrfs_fs_info *fs_info,
 				   struct writeback_control *wbc)
 {
-	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
-	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
-	struct extent_buffer *eb, *prev_eb = NULL;
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
+	struct extent_io_tree *tree = &eb_info->io_tree;
+	struct extent_buffer *eb;
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
@@ -3772,16 +3827,16 @@ int btree_write_cache_pages(struct address_space *mapping,
 	int ret = 0;
 	int done = 0;
 	int nr_to_write_done = 0;
-	struct pagevec pvec;
-	int nr_pages;
+	struct extent_buffer *ebs[EBVEC_SIZE];
+	int nr_ebs;
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
+	pgoff_t done_index = 0;
 	int scanned = 0;
 	int tag;
 
-	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
-		index = mapping->writeback_index; /* Start from prev offset */
+		index = eb_info->writeback_index; /* Start from prev offset */
 		end = -1;
 	} else {
 		index = wbc->range_start >> PAGE_SHIFT;
@@ -3794,53 +3849,27 @@ int btree_write_cache_pages(struct address_space *mapping,
 		tag = PAGECACHE_TAG_DIRTY;
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		tag_pages_for_writeback(mapping, index, end);
+		tag_ebs_for_writeback(fs_info->eb_info, index, end);
 	while (!done && !nr_to_write_done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+	       (nr_ebs = eb_lookup_tag(eb_info, ebs, &index, tag,
+			min(end - index, (pgoff_t)EBVEC_SIZE-1) + 1))) {
 		unsigned i;
 
 		scanned = 1;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			if (!PagePrivate(page))
-				continue;
-
-			if (!wbc->range_cyclic && page->index > end) {
-				done = 1;
-				break;
-			}
-
-			spin_lock(&mapping->private_lock);
-			if (!PagePrivate(page)) {
-				spin_unlock(&mapping->private_lock);
-				continue;
-			}
-
-			eb = (struct extent_buffer *)page->private;
-
-			/*
-			 * Shouldn't happen and normally this would be a BUG_ON
-			 * but no sense in crashing the users box for something
-			 * we can survive anyway.
-			 */
-			if (WARN_ON(!eb)) {
-				spin_unlock(&mapping->private_lock);
+		for (i = 0; i < nr_ebs; i++) {
+			eb = ebs[i];
+			if (done) {
+				free_extent_buffer(eb);
 				continue;
 			}
 
-			if (eb == prev_eb) {
-				spin_unlock(&mapping->private_lock);
+			if (!wbc->range_cyclic && eb->start > wbc->range_end) {
+				done = 1;
+				free_extent_buffer(eb);
 				continue;
 			}
 
-			ret = atomic_inc_not_zero(&eb->refs);
-			spin_unlock(&mapping->private_lock);
-			if (!ret)
-				continue;
-
-			prev_eb = eb;
+			done_index = eb_index(eb);
 			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
 			if (!ret) {
 				free_extent_buffer(eb);
@@ -3848,12 +3877,11 @@ retry:
 			}
 
 			ret = write_one_eb(eb, fs_info, wbc, &epd);
+			free_extent_buffer(eb);
 			if (ret) {
 				done = 1;
-				free_extent_buffer(eb);
-				break;
+				continue;
 			}
-			free_extent_buffer(eb);
 
 			/*
 			 * the filesystem may choose to bump up nr_to_write.
@@ -3862,7 +3890,6 @@ retry:
 			 */
 			nr_to_write_done = wbc->nr_to_write <= 0;
 		}
-		pagevec_release(&pvec);
 		cond_resched();
 	}
 	if (!scanned && !done) {
@@ -3874,10 +3901,77 @@ retry:
 		index = 0;
 		goto retry;
 	}
+	if (wbc->range_cyclic)
+		fs_info->eb_info->writeback_index = done_index;
 	flush_write_bio(&epd);
 	return ret;
 }
 
+void btrfs_write_ebs(struct super_block *sb, struct writeback_control *wbc)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	btree_write_cache_pages(fs_info, wbc);
+}
+
+static int __btree_write_range(struct btrfs_fs_info *fs_info, u64 start,
+			       u64 end, int sync_mode)
+{
+	struct writeback_control wbc = {
+		.sync_mode = sync_mode,
+		.nr_to_write = LONG_MAX,
+		.range_start = start,
+		.range_end = end,
+	};
+
+	return btree_write_cache_pages(fs_info, &wbc);
+}
+
+void btree_flush(struct btrfs_fs_info *fs_info)
+{
+	__btree_write_range(fs_info, 0, (u64)-1, WB_SYNC_NONE);
+}
+
+int btree_write_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
+{
+	return __btree_write_range(fs_info, start, end, WB_SYNC_ALL);
+}
+
+int btree_wait_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
+{
+	struct extent_buffer *ebs[EBVEC_SIZE];
+	pgoff_t index = start >> PAGE_SHIFT;
+	pgoff_t end_index = end >> PAGE_SHIFT;
+	unsigned nr_ebs;
+	int ret = 0;
+
+	if (end < start)
+		return ret;
+
+	while ((index <= end) &&
+	       (nr_ebs = eb_lookup_tag(fs_info->eb_info, ebs, &index,
+				       PAGECACHE_TAG_WRITEBACK,
+				       min(end_index - index,
+					   (pgoff_t)EBVEC_SIZE-1) + 1)) != 0) {
+		unsigned i;
+
+		for (i = 0; i < nr_ebs; i++) {
+			struct extent_buffer *eb = ebs[i];
+
+			if (eb->start > end) {
+				free_extent_buffer(eb);
+				continue;
+			}
+
+			wait_on_extent_buffer_writeback(eb);
+			if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+				ret = -EIO;
+			free_extent_buffer(eb);
+		}
+		cond_resched();
+	}
+	return ret;
+}
+
 /**
  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
@@ -4558,7 +4652,6 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
 {
 	unsigned long index;
 	struct page *page;
-	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
 
 	BUG_ON(extent_buffer_under_io(eb));
 
@@ -4566,39 +4659,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
 	if (index == 0)
 		return;
 
+	ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 	do {
 		index--;
 		page = eb->pages[index];
 		if (!page)
 			continue;
-		if (mapped)
-			spin_lock(&page->mapping->private_lock);
-		/*
-		 * We do this since we'll remove the pages after we've
-		 * removed the eb from the radix tree, so we could race
-		 * and have this page now attached to the new eb.  So
-		 * only clear page_private if it's still connected to
-		 * this eb.
-		 */
-		if (PagePrivate(page) &&
-		    page->private == (unsigned long)eb) {
-			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-			BUG_ON(PageDirty(page));
-			BUG_ON(PageWriteback(page));
-			/*
-			 * We need to make sure we haven't be attached
-			 * to a new eb.
-			 */
-			ClearPagePrivate(page);
-			set_page_private(page, 0);
-			/* One for the page private */
-			put_page(page);
-		}
+		ASSERT(PagePrivate(page));
+		ASSERT(page->private == (unsigned long)eb);
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
 
-		if (mapped)
-			spin_unlock(&page->mapping->private_lock);
+		/* Once for the page private. */
+		put_page(page);
 
-		/* One for when we allocated the page */
+		/* Once for the alloc_page. */
 		put_page(page);
 	} while (index != 0);
 }
@@ -4613,7 +4688,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 }
 
 static struct extent_buffer *
-__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
+__alloc_extent_buffer(struct btrfs_eb_info *eb_info, u64 start,
 		      unsigned long len)
 {
 	struct extent_buffer *eb = NULL;
@@ -4621,7 +4696,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
 	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
 	eb->start = start;
 	eb->len = len;
-	eb->fs_info = fs_info;
+	eb->eb_info = eb_info;
 	eb->bflags = 0;
 	rwlock_init(&eb->lock);
 	atomic_set(&eb->write_locks, 0);
@@ -4633,6 +4708,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
 	eb->lock_nested = 0;
 	init_waitqueue_head(&eb->write_lock_wq);
 	init_waitqueue_head(&eb->read_lock_wq);
+	INIT_LIST_HEAD(&eb->lru);
 
 	btrfs_leak_debug_add(&eb->leak_list, &buffers);
 
@@ -4657,7 +4733,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
 	struct extent_buffer *new;
 	unsigned long num_pages = num_extent_pages(src->start, src->len);
 
-	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
+	new = __alloc_extent_buffer(src->eb_info, src->start, src->len);
 	if (new == NULL)
 		return NULL;
 
@@ -4668,8 +4744,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
 			return NULL;
 		}
 		attach_extent_buffer_page(new, p);
-		WARN_ON(PageDirty(p));
-		SetPageUptodate(p);
 		new->pages[i] = p;
 	}
 
@@ -4680,8 +4754,8 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
 	return new;
 }
 
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-						  u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_eb_info *eb_info,
+						u64 start, unsigned long len)
 {
 	struct extent_buffer *eb;
 	unsigned long num_pages;
@@ -4689,7 +4763,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 
 	num_pages = num_extent_pages(start, len);
 
-	eb = __alloc_extent_buffer(fs_info, start, len);
+	eb = __alloc_extent_buffer(eb_info, start, len);
 	if (!eb)
 		return NULL;
 
@@ -4697,6 +4771,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 		eb->pages[i] = alloc_page(GFP_NOFS);
 		if (!eb->pages[i])
 			goto err;
+		attach_extent_buffer_page(eb, eb->pages[i]);
 	}
 	set_extent_buffer_uptodate(eb);
 	btrfs_set_header_nritems(eb, 0);
@@ -4704,30 +4779,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 
 	return eb;
 err:
-	for (; i > 0; i--)
-		__free_page(eb->pages[i - 1]);
-	__free_extent_buffer(eb);
+	btrfs_release_extent_buffer(eb);
 	return NULL;
 }
 
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-						u64 start, u32 nodesize)
-{
-	unsigned long len;
-
-	if (!fs_info) {
-		/*
-		 * Called only from tests that don't always have a fs_info
-		 * available
-		 */
-		len = nodesize;
-	} else {
-		len = fs_info->tree_root->nodesize;
-	}
-
-	return __alloc_dummy_extent_buffer(fs_info, start, len);
-}
-
 static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
 	int refs;
@@ -4777,13 +4832,13 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,
 	}
 }
 
-struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+struct extent_buffer *find_extent_buffer(struct btrfs_eb_info *eb_info,
 					 u64 start)
 {
 	struct extent_buffer *eb;
 
 	rcu_read_lock();
-	eb = radix_tree_lookup(&fs_info->buffer_radix,
+	eb = radix_tree_lookup(&eb_info->buffer_radix,
 			       start >> PAGE_SHIFT);
 	if (eb && atomic_inc_not_zero(&eb->refs)) {
 		rcu_read_unlock();
@@ -4815,30 +4870,30 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
 }
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
-					u64 start, u32 nodesize)
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_eb_info *eb_info,
+					       u64 start, u32 nodesize)
 {
 	struct extent_buffer *eb, *exists = NULL;
 	int ret;
 
-	eb = find_extent_buffer(fs_info, start);
+	eb = find_extent_buffer(eb_info, start);
 	if (eb)
 		return eb;
-	eb = alloc_dummy_extent_buffer(fs_info, start, nodesize);
+	eb = alloc_dummy_extent_buffer(eb_info, start, nodesize);
 	if (!eb)
 		return NULL;
-	eb->fs_info = fs_info;
+	eb->eb_info = eb_info;
 again:
 	ret = radix_tree_preload(GFP_NOFS);
 	if (ret)
 		goto free_eb;
-	spin_lock(&fs_info->buffer_lock);
-	ret = radix_tree_insert(&fs_info->buffer_radix,
+	spin_lock_irq(&eb_info->buffer_lock);
+	ret = radix_tree_insert(&eb_info->buffer_radix,
 				start >> PAGE_SHIFT, eb);
-	spin_unlock(&fs_info->buffer_lock);
+	spin_unlock_irq(&eb_info->buffer_lock);
 	radix_tree_preload_end();
 	if (ret == -EEXIST) {
-		exists = find_extent_buffer(fs_info, start);
+		exists = find_extent_buffer(eb_info, start);
 		if (exists)
 			goto free_eb;
 		else
@@ -4854,6 +4909,7 @@ again:
 	 * bump the ref count again.
 	 */
 	atomic_inc(&eb->refs);
+	set_extent_buffer_uptodate(eb);
 	return eb;
 free_eb:
 	btrfs_release_extent_buffer(eb);
@@ -4867,12 +4923,12 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	unsigned long len = fs_info->tree_root->nodesize;
 	unsigned long num_pages = num_extent_pages(start, len);
 	unsigned long i;
-	unsigned long index = start >> PAGE_SHIFT;
 	struct extent_buffer *eb;
 	struct extent_buffer *exists = NULL;
 	struct page *p;
-	struct address_space *mapping = fs_info->btree_inode->i_mapping;
-	int uptodate = 1;
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
+//	struct zone *last_zone = NULL;
+//	pg_data_t *last_pgdata = NULL;
 	int ret;
 
 	if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) {
@@ -4880,62 +4936,36 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 		return ERR_PTR(-EINVAL);
 	}
 
-	eb = find_extent_buffer(fs_info, start);
+	eb = find_extent_buffer(eb_info, start);
 	if (eb)
 		return eb;
 
-	eb = __alloc_extent_buffer(fs_info, start, len);
+	eb = __alloc_extent_buffer(eb_info, start, len);
 	if (!eb)
 		return ERR_PTR(-ENOMEM);
 
-	for (i = 0; i < num_pages; i++, index++) {
-		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
+	for (i = 0; i < num_pages; i++) {
+		p = alloc_page(GFP_NOFS|__GFP_NOFAIL);
 		if (!p) {
 			exists = ERR_PTR(-ENOMEM);
 			goto free_eb;
 		}
 
-		spin_lock(&mapping->private_lock);
-		if (PagePrivate(p)) {
-			/*
-			 * We could have already allocated an eb for this page
-			 * and attached one so lets see if we can get a ref on
-			 * the existing eb, and if we can we know it's good and
-			 * we can just return that one, else we know we can just
-			 * overwrite page->private.
-			 */
-			exists = (struct extent_buffer *)p->private;
-			if (atomic_inc_not_zero(&exists->refs)) {
-				spin_unlock(&mapping->private_lock);
-				unlock_page(p);
-				put_page(p);
-				mark_extent_buffer_accessed(exists, p);
-				goto free_eb;
-			}
-			exists = NULL;
-
-			/*
-			 * Do this so attach doesn't complain and we need to
-			 * drop the ref the old guy had.
-			 */
-			ClearPagePrivate(p);
-			WARN_ON(PageDirty(p));
-			put_page(p);
-		}
+		/*
+		 * If our pages span zones or NUMA nodes we have to do
+		 * dirty/writeback accounting per page, otherwise we can do it
+		 * in bulk and save us some looping.
+		 *
+		if (!last_zone)
+			last_zone = page_zone(p);
+		if (!last_pgdata)
+			last_pgdata = page_pgdat(p);
+		if (last_zone != page_zone(p) || last_pgdata != page_pgdat(p))
+			set_bit(EXTENT_BUFFER_MIXED_PAGES, &eb->bflags);
+		*/
 		attach_extent_buffer_page(eb, p);
-		spin_unlock(&mapping->private_lock);
-		WARN_ON(PageDirty(p));
 		eb->pages[i] = p;
-		if (!PageUptodate(p))
-			uptodate = 0;
-
-		/*
-		 * see below about how we avoid a nasty race with release page
-		 * and why we unlock later
-		 */
 	}
-	if (uptodate)
-		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 again:
 	ret = radix_tree_preload(GFP_NOFS);
 	if (ret) {
@@ -4943,13 +4973,13 @@ again:
 		goto free_eb;
 	}
 
-	spin_lock(&fs_info->buffer_lock);
-	ret = radix_tree_insert(&fs_info->buffer_radix,
+	spin_lock_irq(&eb_info->buffer_lock);
+	ret = radix_tree_insert(&eb_info->buffer_radix,
 				start >> PAGE_SHIFT, eb);
-	spin_unlock(&fs_info->buffer_lock);
+	spin_unlock_irq(&eb_info->buffer_lock);
 	radix_tree_preload_end();
 	if (ret == -EEXIST) {
-		exists = find_extent_buffer(fs_info, start);
+		exists = find_extent_buffer(eb_info, start);
 		if (exists)
 			goto free_eb;
 		else
@@ -4959,31 +4989,10 @@ again:
 	check_buffer_tree_ref(eb);
 	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
 
-	/*
-	 * there is a race where release page may have
-	 * tried to find this extent buffer in the radix
-	 * but failed.  It will tell the VM it is safe to
-	 * reclaim the, and it will clear the page private bit.
-	 * We must make sure to set the page private bit properly
-	 * after the extent buffer is in the radix tree so
-	 * it doesn't get lost
-	 */
-	SetPageChecked(eb->pages[0]);
-	for (i = 1; i < num_pages; i++) {
-		p = eb->pages[i];
-		ClearPageChecked(p);
-		unlock_page(p);
-	}
-	unlock_page(eb->pages[0]);
 	return eb;
 
 free_eb:
 	WARN_ON(!atomic_dec_and_test(&eb->refs));
-	for (i = 0; i < num_pages; i++) {
-		if (eb->pages[i])
-			unlock_page(eb->pages[i]);
-	}
-
 	btrfs_release_extent_buffer(eb);
 	return exists;
 }
@@ -4999,17 +5008,19 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
 /* Expects to have eb->eb_lock already held */
 static int release_extent_buffer(struct extent_buffer *eb)
 {
+	struct btrfs_eb_info *eb_info = eb->eb_info;
+
 	WARN_ON(atomic_read(&eb->refs) == 0);
 	if (atomic_dec_and_test(&eb->refs)) {
+		if (eb_info)
+			list_lru_del(&eb_info->lru_list, &eb->lru);
 		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
-			struct btrfs_fs_info *fs_info = eb->fs_info;
-
 			spin_unlock(&eb->refs_lock);
 
-			spin_lock(&fs_info->buffer_lock);
-			radix_tree_delete(&fs_info->buffer_radix,
-					  eb->start >> PAGE_SHIFT);
-			spin_unlock(&fs_info->buffer_lock);
+			spin_lock_irq(&eb_info->buffer_lock);
+			radix_tree_delete(&eb_info->buffer_radix,
+					  eb_index(eb));
+			spin_unlock_irq(&eb_info->buffer_lock);
 		} else {
 			spin_unlock(&eb->refs_lock);
 		}
@@ -5024,6 +5035,8 @@ static int release_extent_buffer(struct extent_buffer *eb)
 #endif
 		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
 		return 1;
+	} else if (eb_info && atomic_read(&eb->refs) == 1) {
+		list_lru_add(&eb_info->lru_list, &eb->lru);
 	}
 	spin_unlock(&eb->refs_lock);
 
@@ -5057,10 +5070,6 @@ void free_extent_buffer(struct extent_buffer *eb)
 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
 		atomic_dec(&eb->refs);
 
-	/*
-	 * I know this is terrible, but it's temporary until we stop tracking
-	 * the uptodate bits and such for the extent buffers.
-	 */
 	release_extent_buffer(eb);
 }
 
@@ -5078,82 +5087,163 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
 	release_extent_buffer(eb);
 }
 
-void clear_extent_buffer_dirty(struct extent_buffer *eb)
+long btrfs_nr_ebs(struct super_block *sb, struct shrink_control *sc)
 {
-	unsigned long i;
-	unsigned long num_pages;
-	struct page *page;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
 
-	num_pages = num_extent_pages(eb->start, eb->len);
+	return list_lru_shrink_count(&eb_info->lru_list, sc);
+}
 
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		if (!PageDirty(page))
-			continue;
+static enum lru_status eb_lru_isolate(struct list_head *item,
+				      struct list_lru_one *lru,
+				      spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = (struct list_head *)arg;
+	struct extent_buffer *eb = container_of(item, struct extent_buffer,
+						lru);
+	enum lru_status ret;
+	int refs;
 
-		lock_page(page);
-		WARN_ON(!PagePrivate(page));
+	if (!spin_trylock(&eb->refs_lock))
+		return LRU_SKIP;
 
-		clear_page_dirty_for_io(page);
-		spin_lock_irq(&page->mapping->tree_lock);
-		if (!PageDirty(page)) {
-			radix_tree_tag_clear(&page->mapping->page_tree,
-						page_index(page),
-						PAGECACHE_TAG_DIRTY);
-		}
-		spin_unlock_irq(&page->mapping->tree_lock);
-		ClearPageError(page);
-		unlock_page(page);
+	if (extent_buffer_under_io(eb)) {
+		ret = LRU_ROTATE;
+		goto out;
+	}
+
+	refs = atomic_read(&eb->refs);
+	/* We can race with somebody freeing us, just skip if this happens. */
+	if (refs == 0) {
+		ret = LRU_SKIP;
+		goto out;
+	}
+
+	/* Eb is in use, don't kill it. */
+	if (refs > 1) {
+		ret = LRU_ROTATE;
+		goto out;
+	}
+
+	/*
+	 * If TREE_REF was already clear then this eb is going to disappear
+	 * soon anyway.  If we clear it here, we become responsible for
+	 * dropping the last ref on this eb, and we know it'll survive
+	 * until we call dispose_list.
+	 */
+	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+		ret = LRU_SKIP;
+		goto out;
+	}
+	list_lru_isolate_move(lru, &eb->lru, freeable);
+	ret = LRU_REMOVED;
+out:
+	spin_unlock(&eb->refs_lock);
+	return ret;
+}
+
+static void dispose_list(struct list_head *list)
+{
+	struct extent_buffer *eb;
+
+	while (!list_empty(list)) {
+		eb = list_first_entry(list, struct extent_buffer, lru);
+
+		spin_lock(&eb->refs_lock);
+		list_del_init(&eb->lru);
+		spin_unlock(&eb->refs_lock);
+		free_extent_buffer(eb);
+		cond_resched();
 	}
+}
+
+long btrfs_free_ebs(struct super_block *sb, struct shrink_control *sc)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_eb_info *eb_info = fs_info->eb_info;
+	LIST_HEAD(freeable);
+	long freed;
+
+	freed = list_lru_shrink_walk(&eb_info->lru_list, sc, eb_lru_isolate,
+				     &freeable);
+	dispose_list(&freeable);
+	return freed;
+}
+
+#define MAX_EVICT_COUNT 1024
+void btrfs_invalidate_eb_info(struct btrfs_eb_info *eb_info)
+{
+	LIST_HEAD(freeable);
+	unsigned long count;
+
+	/*
+	 * Evict in batches so we don't lock up the system trying to evict
+	 * memory.
+	 */
+	do {
+		count = list_lru_walk(&eb_info->lru_list, eb_lru_isolate,
+				      &freeable, MAX_EVICT_COUNT);
+		cond_resched();
+	} while (count);
+	dispose_list(&freeable);
+	synchronize_rcu();
+}
+
+int clear_extent_buffer_dirty(struct extent_buffer *eb)
+{
+	struct btrfs_eb_info *eb_info = eb->eb_info;
+	unsigned long i;
+	unsigned long num_pages;
+
+	if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
+		return 0;
+
+	spin_lock_irq(&eb_info->buffer_lock);
+	radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
+			     PAGECACHE_TAG_DIRTY);
+	spin_unlock_irq(&eb_info->buffer_lock);
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++)
+		account_metadata_cleaned(eb->pages[i], &eb_info->fs_info->bdi);
 	WARN_ON(atomic_read(&eb->refs) == 0);
+	return 1;
 }
 
 int set_extent_buffer_dirty(struct extent_buffer *eb)
 {
+	struct btrfs_eb_info *eb_info = eb->eb_info;
 	unsigned long i;
 	unsigned long num_pages;
 	int was_dirty = 0;
 
 	check_buffer_tree_ref(eb);
 
-	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
-
-	num_pages = num_extent_pages(eb->start, eb->len);
 	WARN_ON(atomic_read(&eb->refs) == 0);
 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+	if (test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
+		return 1;
 
+	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++)
-		set_page_dirty(eb->pages[i]);
+		account_metadata_dirtied(eb->pages[i],
+					 &eb->eb_info->fs_info->bdi);
+	spin_lock_irq(&eb_info->buffer_lock);
+	radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
+			   PAGECACHE_TAG_DIRTY);
+	spin_unlock_irq(&eb_info->buffer_lock);
 	return was_dirty;
 }
 
 void clear_extent_buffer_uptodate(struct extent_buffer *eb)
 {
-	unsigned long i;
-	struct page *page;
-	unsigned long num_pages;
-
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		if (page)
-			ClearPageUptodate(page);
-	}
 }
 
 void set_extent_buffer_uptodate(struct extent_buffer *eb)
 {
-	unsigned long i;
-	struct page *page;
-	unsigned long num_pages;
-
 	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		SetPageUptodate(page);
-	}
 }
 
 int extent_buffer_uptodate(struct extent_buffer *eb)
@@ -5161,103 +5251,166 @@ int extent_buffer_uptodate(struct extent_buffer *eb)
 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 }
 
-int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb, int wait,
-			     get_extent_t *get_extent, int mirror_num)
+static void end_bio_extent_buffer_readpage(struct bio *bio)
 {
+	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	struct extent_io_tree *tree = NULL;
+	struct bio_vec *bvec;
+	u64 unlock_start = 0, unlock_len = 0;
+	int mirror_num = io_bio->mirror_num;
+	int uptodate = !bio->bi_error;
+	int i, ret;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+		struct btrfs_eb_info *eb_info;
+		struct extent_buffer *eb;
+
+		eb = (struct extent_buffer *)page->private;
+		if (WARN_ON(!eb))
+			continue;
+
+		eb_info = eb->eb_info;
+		if (!tree)
+			tree = &eb_info->io_tree;
+		if (uptodate) {
+			/*
+			 * btree_readpage_end_io_hook doesn't care about
+			 * start/end so just pass 0.  We'll kill this later.
+			 */
+			ret = tree->ops->readpage_end_io_hook(io_bio, 0,
+							      page, 0, 0,
+							      mirror_num);
+			if (ret) {
+				uptodate = 0;
+			} else {
+				u64 start = eb->start;
+				int c, num_pages;
+
+				num_pages = num_extent_pages(eb->start,
+							     eb->len);
+				for (c = 0; c < num_pages; c++) {
+					if (eb->pages[c] == page)
+						break;
+					start += PAGE_SIZE;
+				}
+				clean_io_failure(eb_info->fs_info,
+						 &eb_info->io_failure_tree,
+						 tree, start, page, 0, 0);
+			}
+		}
+		/*
+		 * We never fix anything in btree_io_failed_hook.
+		 *
+		 * TODO: rework the io failed hook to not assume we can fix
+		 * anything.
+		 */
+		if (!uptodate)
+			tree->ops->readpage_io_failed_hook(page, mirror_num);
+
+		if (unlock_start == 0) {
+			unlock_start = eb->start;
+			unlock_len = PAGE_SIZE;
+		} else {
+			unlock_len += PAGE_SIZE;
+		}
+	}
+
+	if (unlock_start)
+		unlock_extent(tree, unlock_start,
+			      unlock_start + unlock_len - 1);
+	if (io_bio->end_io)
+		io_bio->end_io(io_bio, bio->bi_error);
+	bio_put(bio);
+}
+
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
+			     int mirror_num)
+{
+	struct btrfs_eb_info *eb_info = eb->eb_info;
+	struct extent_io_tree *io_tree = &eb_info->io_tree;
+	struct block_device *bdev = eb_info->fs_info->fs_devices->latest_bdev;
+	struct bio *bio = NULL;
+	u64 offset = eb->start;
+	u64 unlock_start = 0, unlock_len = 0;
 	unsigned long i;
 	struct page *page;
 	int err;
 	int ret = 0;
-	int locked_pages = 0;
-	int all_uptodate = 1;
 	unsigned long num_pages;
-	unsigned long num_reads = 0;
-	struct bio *bio = NULL;
-	unsigned long bio_flags = 0;
 
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
 
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		if (wait == WAIT_NONE) {
-			if (!trylock_page(page))
-				goto unlock_exit;
-		} else {
-			lock_page(page);
-		}
-		locked_pages++;
-		if (!PageUptodate(page)) {
-			num_reads++;
-			all_uptodate = 0;
-		}
-	}
-	if (all_uptodate) {
-		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-		goto unlock_exit;
+	if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) {
+		if (wait != WAIT_COMPLETE)
+			return 0;
+		wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
+			       TASK_UNINTERRUPTIBLE);
+		if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+			ret = -EIO;
+		return ret;
 	}
 
+	lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+	num_pages = num_extent_pages(eb->start, eb->len);
 	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
 	eb->read_mirror = 0;
-	atomic_set(&eb->io_pages, num_reads);
+	atomic_set(&eb->io_pages, num_pages);
 	for (i = 0; i < num_pages; i++) {
 		page = eb->pages[i];
-
-		if (!PageUptodate(page)) {
-			if (ret) {
-				atomic_dec(&eb->io_pages);
-				unlock_page(page);
-				continue;
+		if (ret) {
+			unlock_len += PAGE_SIZE;
+			if (atomic_dec_and_test(&eb->io_pages)) {
+				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+				smp_mb__after_atomic();
+				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
 			}
+			continue;
+		}
 
-			ClearPageError(page);
-			err = __extent_read_full_page(tree, page,
-						      get_extent, &bio,
-						      mirror_num, &bio_flags,
-						      REQ_META);
-			if (err) {
-				ret = err;
-				/*
-				 * We use &bio in above __extent_read_full_page,
-				 * so we ensure that if it returns error, the
-				 * current page fails to add itself to bio and
-				 * it's been unlocked.
-				 *
-				 * We must dec io_pages by ourselves.
-				 */
-				atomic_dec(&eb->io_pages);
+		err = submit_extent_page(REQ_OP_READ, REQ_META, io_tree, NULL,
+					 page, offset >> 9, PAGE_SIZE, 0, bdev,
+					 &bio, -1,
+					 end_bio_extent_buffer_readpage,
+					 mirror_num, 0, 0, false);
+		if (err) {
+			ret = err;
+			/*
+			 * We passed &bio to submit_extent_page
+			 * above, so if it returns an error we know
+			 * the current page was never added to the
+			 * bio.
+			 *
+			 * We must dec io_pages ourselves.
+			 */
+			if (atomic_dec_and_test(&eb->io_pages)) {
+				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+				smp_mb__after_atomic();
+				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
 			}
-		} else {
-			unlock_page(page);
+			unlock_start = offset;
+			unlock_len = PAGE_SIZE;
 		}
+		offset += PAGE_SIZE;
 	}
 
 	if (bio) {
-		err = submit_one_bio(bio, mirror_num, bio_flags);
+		err = submit_one_bio(bio, mirror_num, 0);
 		if (err)
 			return err;
 	}
 
+	if (ret && unlock_start)
+		unlock_extent(io_tree, unlock_start,
+			      unlock_start + unlock_len - 1);
 	if (ret || wait != WAIT_COMPLETE)
 		return ret;
 
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		wait_on_page_locked(page);
-		if (!PageUptodate(page))
-			ret = -EIO;
-	}
-
-	return ret;
-
-unlock_exit:
-	while (locked_pages > 0) {
-		locked_pages--;
-		page = eb->pages[locked_pages];
-		unlock_page(page);
-	}
+	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
+		       TASK_UNINTERRUPTIBLE);
+	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+		ret = -EIO;
 	return ret;
 }
 
@@ -5426,7 +5579,6 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 
 	while (len > 0) {
 		page = eb->pages[i];
-		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_SIZE - offset);
 		kaddr = page_address(page);
@@ -5456,7 +5608,6 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 
 	while (len > 0) {
 		page = eb->pages[i];
-		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_SIZE - offset);
 		kaddr = page_address(page);
@@ -5487,7 +5638,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 
 	while (len > 0) {
 		page = dst->pages[i];
-		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, (unsigned long)(PAGE_SIZE - offset));
 
@@ -5562,7 +5712,6 @@ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
 
 	eb_bitmap_offset(eb, start, nr, &i, &offset);
 	page = eb->pages[i];
-	WARN_ON(!PageUptodate(page));
 	kaddr = page_address(page);
 	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
 }
@@ -5587,7 +5736,6 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
 
 	eb_bitmap_offset(eb, start, pos, &i, &offset);
 	page = eb->pages[i];
-	WARN_ON(!PageUptodate(page));
 	kaddr = page_address(page);
 
 	while (len >= bits_to_set) {
@@ -5598,7 +5746,6 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
 		if (++offset >= PAGE_SIZE && len > 0) {
 			offset = 0;
 			page = eb->pages[++i];
-			WARN_ON(!PageUptodate(page));
 			kaddr = page_address(page);
 		}
 	}
@@ -5629,7 +5776,6 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
 
 	eb_bitmap_offset(eb, start, pos, &i, &offset);
 	page = eb->pages[i];
-	WARN_ON(!PageUptodate(page));
 	kaddr = page_address(page);
 
 	while (len >= bits_to_clear) {
@@ -5640,7 +5786,6 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
 		if (++offset >= PAGE_SIZE && len > 0) {
 			offset = 0;
 			page = eb->pages[++i];
-			WARN_ON(!PageUptodate(page));
 			kaddr = page_address(page);
 		}
 	}
@@ -5689,13 +5834,13 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		btrfs_err(dst->fs_info,
+		btrfs_err(dst->eb_info->fs_info,
 			"memmove bogus src_offset %lu move "
 		       "len %lu dst len %lu", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		btrfs_err(dst->fs_info,
+		btrfs_err(dst->eb_info->fs_info,
 			"memmove bogus dst_offset %lu move "
 		       "len %lu dst len %lu", dst_offset, len, dst->len);
 		BUG_ON(1);
@@ -5737,12 +5882,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
+		btrfs_err(dst->eb_info->fs_info, "memmove bogus src_offset %lu move "
 		       "len %lu len %lu", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
+		btrfs_err(dst->eb_info->fs_info, "memmove bogus dst_offset %lu move "
 		       "len %lu len %lu", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
@@ -5770,45 +5915,3 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 		len -= cur;
 	}
 }
-
-int try_release_extent_buffer(struct page *page)
-{
-	struct extent_buffer *eb;
-
-	/*
-	 * We need to make sure nobody is attaching this page to an eb right
-	 * now.
-	 */
-	spin_lock(&page->mapping->private_lock);
-	if (!PagePrivate(page)) {
-		spin_unlock(&page->mapping->private_lock);
-		return 1;
-	}
-
-	eb = (struct extent_buffer *)page->private;
-	BUG_ON(!eb);
-
-	/*
-	 * This is a little awful but should be ok, we need to make sure that
-	 * the eb doesn't disappear out from under us while we're looking at
-	 * this page.
-	 */
-	spin_lock(&eb->refs_lock);
-	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
-		spin_unlock(&eb->refs_lock);
-		spin_unlock(&page->mapping->private_lock);
-		return 0;
-	}
-	spin_unlock(&page->mapping->private_lock);
-
-	/*
-	 * If tree ref isn't set then we know the ref on this eb is a real ref,
-	 * so just return, this page will likely be freed soon anyway.
-	 */
-	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
-		spin_unlock(&eb->refs_lock);
-		return 0;
-	}
-
-	return release_extent_buffer(eb);
-}
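
set_extent_buffer_dirty() and clear_extent_buffer_dirty() above keep
three things in sync: the EXTENT_BUFFER_DIRTY bit, the PAGECACHE_TAG_DIRTY
tag in the eb_info radix tree, and the per-bdi metadata accounting (the
account_metadata_* helpers come from the prerequisite dirty metadata
accounting series; only the call signatures visible here are assumed).
A debug-only sketch of the invariant they maintain, not part of the patch:

	static void assert_eb_dirty_state(struct extent_buffer *eb)
	{
		struct btrfs_eb_info *eb_info = eb->eb_info;
		int flag = test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
		int tag;

		spin_lock_irq(&eb_info->buffer_lock);
		tag = radix_tree_tag_get(&eb_info->buffer_radix, eb_index(eb),
					 PAGECACHE_TAG_DIRTY);
		spin_unlock_irq(&eb_info->buffer_lock);

		/* The flag and the radix tag must agree. */
		WARN_ON(!flag != !tag);
	}
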
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c393c06..06b6f14 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -43,6 +43,8 @@
 #define EXTENT_BUFFER_DUMMY 9
 #define EXTENT_BUFFER_IN_TREE 10
 #define EXTENT_BUFFER_WRITE_ERR 11    /* write IO error */
+#define EXTENT_BUFFER_MIXED_PAGES 12	/* the pages span multiple zones or NUMA nodes. */
+#define EXTENT_BUFFER_READING 13 /* currently reading this eb. */
 
 /* these are flags for extent_clear_unlock_delalloc */
 #define PAGE_UNLOCK		(1 << 0)
@@ -122,13 +124,25 @@ struct extent_state {
 #endif
 };
 
+struct btrfs_eb_info {
+	struct btrfs_fs_info *fs_info;
+	struct extent_io_tree io_tree;
+	struct extent_io_tree io_failure_tree;
+
+	/* Extent buffer radix tree */
+	spinlock_t buffer_lock;
+	struct radix_tree_root buffer_radix;
+	struct list_lru lru_list;
+	pgoff_t writeback_index;
+};
+
 #define INLINE_EXTENT_BUFFER_PAGES 16
 #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
 struct extent_buffer {
 	u64 start;
 	unsigned long len;
 	unsigned long bflags;
-	struct btrfs_fs_info *fs_info;
+	struct btrfs_eb_info *eb_info;
 	spinlock_t refs_lock;
 	atomic_t refs;
 	atomic_t io_pages;
@@ -163,6 +177,7 @@ struct extent_buffer {
 #ifdef CONFIG_BTRFS_DEBUG
 	struct list_head leak_list;
 #endif
+	struct list_head lru;
 };
 
 /*
@@ -336,8 +351,6 @@ int extent_writepages(struct extent_io_tree *tree,
 		      struct address_space *mapping,
 		      get_extent_t *get_extent,
 		      struct writeback_control *wbc);
-int btree_write_cache_pages(struct address_space *mapping,
-			    struct writeback_control *wbc);
 int extent_readpages(struct extent_io_tree *tree,
 		     struct address_space *mapping,
 		     struct list_head *pages, unsigned nr_pages,
@@ -348,21 +361,18 @@ void set_page_extent_mapped(struct page *page);
 
 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 					  u64 start);
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-						  u64 start, unsigned long len);
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-						u64 start, u32 nodesize);
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_eb_info *eb_info,
+						u64 start, unsigned long len);
 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
-struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+struct extent_buffer *find_extent_buffer(struct btrfs_eb_info *eb_info,
 					 u64 start);
 void free_extent_buffer(struct extent_buffer *eb);
 void free_extent_buffer_stale(struct extent_buffer *eb);
 #define WAIT_NONE	0
 #define WAIT_COMPLETE	1
 #define WAIT_PAGE_LOCK	2
-int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb, int wait,
-			     get_extent_t *get_extent, int mirror_num);
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
+			     int mirror_num);
 void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
 
 static inline unsigned long num_extent_pages(u64 start, u64 len)
@@ -376,6 +386,11 @@ static inline void extent_buffer_get(struct extent_buffer *eb)
 	atomic_inc(&eb->refs);
 }
 
+static inline unsigned long eb_index(struct extent_buffer *eb)
+{
+	return eb->start >> PAGE_SHIFT;
+}
+
 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 			  unsigned long start,
 			  unsigned long len);
@@ -402,7 +417,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
 			      unsigned long pos, unsigned long len);
 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
 				unsigned long pos, unsigned long len);
-void clear_extent_buffer_dirty(struct extent_buffer *eb);
+int clear_extent_buffer_dirty(struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_buffer *eb);
 void set_extent_buffer_uptodate(struct extent_buffer *eb);
 void clear_extent_buffer_uptodate(struct extent_buffer *eb);
@@ -436,6 +451,14 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 			 int mirror_num);
+void btree_flush(struct btrfs_fs_info *fs_info);
+int btree_write_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
+int btree_wait_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
+long btrfs_free_ebs(struct super_block *sb, struct shrink_control *sc);
+long btrfs_nr_ebs(struct super_block *sb, struct shrink_control *sc);
+void btrfs_write_ebs(struct super_block *sb, struct writeback_control *wbc);
+void btrfs_invalidate_eb_info(struct btrfs_eb_info *eb_info);
+int btrfs_init_eb_info(struct btrfs_fs_info *fs_info);
 
 /*
  * When IO fails, either with EIO or csum verification fails, we
@@ -474,6 +497,6 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
 				      struct page *locked_page, u64 *start,
 				      u64 *end, u64 max_bytes);
 #endif
-struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_eb_info *eb_info,
 					       u64 start, u32 nodesize);
 #endif
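
btrfs_init_eb_info() is declared above but defined in a hunk outside this
excerpt; a plausible initializer, inferred from the struct fields and the
call sites in the test code below (an assumption, not the patch's actual
body), would look like:

	int btrfs_init_eb_info(struct btrfs_fs_info *fs_info)
	{
		struct btrfs_eb_info *eb_info = fs_info->eb_info;

		eb_info->fs_info = fs_info;
		spin_lock_init(&eb_info->buffer_lock);
		INIT_RADIX_TREE(&eb_info->buffer_radix, GFP_ATOMIC);
		extent_io_tree_init(&eb_info->io_tree, eb_info);
		extent_io_tree_init(&eb_info->io_failure_tree, eb_info);
		eb_info->writeback_index = 0;
		/* The LRU feeds the superblock shrinker hooks. */
		return list_lru_init(&eb_info->lru_list);
	}
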
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8ca791f..3440b52 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1837,9 +1837,9 @@ static void btrfs_clear_bit_hook(void *private_data,
  * return 0 if page can be merged to bio
  * return error otherwise
  */
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio,
-			 unsigned long bio_flags)
+static int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+				size_t size, struct bio *bio,
+				unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7fd7e18..614fdb8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -45,7 +45,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
 	if (!need_reset && btrfs_root_generation(item)
 		!= btrfs_root_generation_v2(item)) {
 		if (btrfs_root_generation_v2(item) != 0) {
-			btrfs_warn(eb->fs_info,
+			btrfs_warn(eb->eb_info->fs_info,
 					"mismatching "
 					"generation and generation_v2 "
 					"found in root item. This root "
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b9d4c43..73a1d8d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1180,7 +1180,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	trace_btrfs_sync_fs(fs_info, wait);
 
 	if (!wait) {
-		filemap_flush(fs_info->btree_inode->i_mapping);
+		btree_flush(fs_info);
 		return 0;
 	}
 
@@ -2287,18 +2287,21 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 }
 
 static const struct super_operations btrfs_super_ops = {
-	.drop_inode	= btrfs_drop_inode,
-	.evict_inode	= btrfs_evict_inode,
-	.put_super	= btrfs_put_super,
-	.sync_fs	= btrfs_sync_fs,
-	.show_options	= btrfs_show_options,
-	.show_devname	= btrfs_show_devname,
-	.write_inode	= btrfs_write_inode,
-	.alloc_inode	= btrfs_alloc_inode,
-	.destroy_inode	= btrfs_destroy_inode,
-	.statfs		= btrfs_statfs,
-	.remount_fs	= btrfs_remount,
-	.freeze_fs	= btrfs_freeze,
+	.drop_inode		= btrfs_drop_inode,
+	.evict_inode		= btrfs_evict_inode,
+	.put_super		= btrfs_put_super,
+	.sync_fs		= btrfs_sync_fs,
+	.show_options		= btrfs_show_options,
+	.show_devname		= btrfs_show_devname,
+	.write_inode		= btrfs_write_inode,
+	.alloc_inode		= btrfs_alloc_inode,
+	.destroy_inode		= btrfs_destroy_inode,
+	.statfs			= btrfs_statfs,
+	.remount_fs		= btrfs_remount,
+	.freeze_fs		= btrfs_freeze,
+	.nr_cached_objects	= btrfs_nr_ebs,
+	.free_cached_objects	= btrfs_free_ebs,
+	.write_metadata		= btrfs_write_ebs,
 };
 
 static const struct file_operations btrfs_ctl_fops = {
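
With these three hooks, extent buffer reclaim rides the generic superblock
shrinker: ->nr_cached_objects feeds the count, ->free_cached_objects walks
the LRU, and ->write_metadata is driven from the writeback path introduced
by the prerequisite series. A simplified sketch of the shrinker side (the
real fs/super.c scan also shrinks dentries and inodes; sb_scan_sketch() is
an illustrative name):

	static unsigned long sb_scan_sketch(struct shrinker *shrink,
					    struct shrink_control *sc)
	{
		struct super_block *sb = container_of(shrink, struct super_block,
						      s_shrink);
		unsigned long freed = 0;

		if (sb->s_op->nr_cached_objects &&
		    sb->s_op->nr_cached_objects(sb, sc) > 0)
			freed = sb->s_op->free_cached_objects(sb, sc);
		return freed;
	}
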
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index bf62ad9..dca90d6 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -100,14 +100,32 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
 		return NULL;
 	}
 
+	fs_info->eb_info = kzalloc(sizeof(struct btrfs_eb_info),
+				   GFP_KERNEL);
+	if (!fs_info->eb_info) {
+		kfree(fs_info->fs_devices);
+		kfree(fs_info->super_copy);
+		kfree(fs_info);
+		return NULL;
+	}
+
+	if (btrfs_init_eb_info(fs_info)) {
+		kfree(fs_info->eb_info);
+		kfree(fs_info->fs_devices);
+		kfree(fs_info->super_copy);
+		kfree(fs_info);
+		return NULL;
+	}
+
 	if (init_srcu_struct(&fs_info->subvol_srcu)) {
+		list_lru_destroy(&fs_info->eb_info->lru_list);
+		kfree(fs_info->eb_info);
 		kfree(fs_info->fs_devices);
 		kfree(fs_info->super_copy);
 		kfree(fs_info);
 		return NULL;
 	}
 
-	spin_lock_init(&fs_info->buffer_lock);
 	spin_lock_init(&fs_info->qgroup_lock);
 	spin_lock_init(&fs_info->qgroup_op_lock);
 	spin_lock_init(&fs_info->super_lock);
@@ -123,7 +141,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
 	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
-	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
 	extent_io_tree_init(&fs_info->freed_extents[0], NULL);
 	extent_io_tree_init(&fs_info->freed_extents[1], NULL);
@@ -137,6 +154,7 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
 
 void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 {
+	struct btrfs_eb_info *eb_info;
 	struct radix_tree_iter iter;
 	void **slot;
 
@@ -147,13 +165,14 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 			      &fs_info->fs_state)))
 		return;
 
+	eb_info = fs_info->eb_info;
 	test_mnt->mnt_sb->s_fs_info = NULL;
 
-	spin_lock(&fs_info->buffer_lock);
-	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+	spin_lock(&eb_info->buffer_lock);
+	radix_tree_for_each_slot(slot, &eb_info->buffer_radix, &iter, 0) {
 		struct extent_buffer *eb;
 
-		eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+		eb = radix_tree_deref_slot_protected(slot, &eb_info->buffer_lock);
 		if (!eb)
 			continue;
 		/* Shouldn't happen but that kind of thinking creates CVEs */
@@ -162,15 +181,17 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 				slot = radix_tree_iter_retry(&iter);
 			continue;
 		}
-		spin_unlock(&fs_info->buffer_lock);
+		spin_unlock(&eb_info->buffer_lock);
 		free_extent_buffer_stale(eb);
-		spin_lock(&fs_info->buffer_lock);
+		spin_lock(&eb_info->buffer_lock);
 	}
-	spin_unlock(&fs_info->buffer_lock);
+	spin_unlock(&eb_info->buffer_lock);
 
 	btrfs_free_qgroup_config(fs_info);
 	btrfs_free_fs_roots(fs_info);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
+	list_lru_destroy(&eb_info->lru_list);
+	kfree(fs_info->eb_info);
 	kfree(fs_info->super_copy);
 	kfree(fs_info->fs_devices);
 	kfree(fs_info);
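
The kfree() ladders above grow with every allocation added to the dummy
fs_info setup; a goto-based unwind would keep each failure path a single
line. A condensed sketch under that assumption (super_copy and fs_devices
handling omitted for brevity; illustrative only, not part of the patch):

	struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL);

	if (!fs_info)
		return NULL;
	fs_info->eb_info = kzalloc(sizeof(struct btrfs_eb_info), GFP_KERNEL);
	if (!fs_info->eb_info)
		goto free_fs_info;
	if (btrfs_init_eb_info(fs_info))
		goto free_eb_info;
	if (init_srcu_struct(&fs_info->subvol_srcu))
		goto destroy_lru;
	return fs_info;

destroy_lru:
	list_lru_destroy(&fs_info->eb_info->lru_list);
free_eb_info:
	kfree(fs_info->eb_info);
free_fs_info:
	kfree(fs_info);
	return NULL;
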
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 1110b8c..45524f1 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -388,7 +388,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
 		return -ENOMEM;
 	}
 
-	eb = __alloc_dummy_extent_buffer(NULL, 0, len);
+	eb = alloc_dummy_extent_buffer(NULL, 0, len);
 	if (!eb) {
 		test_msg("Couldn't allocate test extent buffer\n");
 		kfree(bitmap);
@@ -401,7 +401,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
 
 	/* Do it over again with an extent buffer which isn't page-aligned. */
 	free_extent_buffer(eb);
-	eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
+	eb = alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
 	if (!eb) {
 		test_msg("Couldn't allocate test extent buffer\n");
 		kfree(bitmap);
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 7508d3b..3bf5df1 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -469,8 +469,8 @@ static int run_test(test_func_t test_func, int bitmaps,
 	root->fs_info->free_space_root = root;
 	root->fs_info->tree_root = root;
 
-	root->node = alloc_test_extent_buffer(root->fs_info,
-		nodesize, nodesize);
+	root->node = alloc_test_extent_buffer(fs_info->eb_info, nodesize,
+					      nodesize);
 	if (!root->node) {
 		test_msg("Couldn't allocate dummy buffer\n");
 		ret = -ENOMEM;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ca7cb5e..bb277ee 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -486,8 +486,8 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
 	 * Can't use bytenr 0, some things freak out
 	 * *cough*backref walking code*cough*
 	 */
-	root->node = alloc_test_extent_buffer(root->fs_info, nodesize,
-					nodesize);
+	root->node = alloc_test_extent_buffer(fs_info->eb_info, nodesize,
+					      nodesize);
 	if (!root->node) {
 		test_msg("Couldn't allocate dummy buffer\n");
 		ret = -ENOMEM;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 56bdb2f..a54691f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -293,8 +293,7 @@ loop:
 	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
 	spin_lock_init(&cur_trans->dropped_roots_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
-	extent_io_tree_init(&cur_trans->dirty_pages,
-			     fs_info->btree_inode);
+	extent_io_tree_init(&cur_trans->dirty_pages, NULL);
 	fs_info->generation++;
 	cur_trans->transid = fs_info->generation;
 	fs_info->running_transaction = cur_trans;
@@ -935,7 +934,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 {
 	int err = 0;
 	int werr = 0;
-	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
 	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
@@ -965,11 +963,11 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 			wait_writeback = true;
 		}
 		if (!err)
-			err = filemap_fdatawrite_range(mapping, start, end);
+			err = btree_write_range(root->fs_info, start, end);
 		if (err)
 			werr = err;
 		else if (wait_writeback)
-			werr = filemap_fdatawait_range(mapping, start, end);
+			werr = btree_wait_range(root->fs_info, start, end);
 		free_extent_state(cached_state);
 		cached_state = NULL;
 		cond_resched();
@@ -989,7 +987,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 {
 	int err = 0;
 	int werr = 0;
-	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
 	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
@@ -1011,7 +1008,7 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 		if (err == -ENOMEM)
 			err = 0;
 		if (!err)
-			err = filemap_fdatawait_range(mapping, start, end);
+			err = btree_wait_range(root->fs_info, start, end);
 		if (err)
 			werr = err;
 		free_extent_state(cached_state);
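
Together, btree_write_range() and btree_wait_range() give the transaction
code the same write-then-wait semantics the filemap_fdatawrite_range()/
filemap_fdatawait_range() pair used to provide. A minimal usage sketch
mirroring what open_ctree() does over the whole key space
(sync_all_tree_blocks() is an illustrative name):

	static int sync_all_tree_blocks(struct btrfs_fs_info *fs_info)
	{
		int ret;

		/* Kick off WB_SYNC_ALL writeback for every dirty eb... */
		ret = btree_write_range(fs_info, 0, (u64)-1);
		if (ret)
			return ret;

		/* ...then wait for the WRITEBACK-tagged ebs to finish. */
		return btree_wait_range(fs_info, 0, (u64)-1);
	}
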
-- 
2.7.4


