All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 00/10] several fixes and cleanups
@ 2012-03-27  6:44 Liu Bo
  2012-03-27  6:44 ` [PATCH 01/10] Btrfs: show useful info in space reservation tracepoint Liu Bo
                   ` (9 more replies)
  0 siblings, 10 replies; 13+ messages in thread
From: Liu Bo @ 2012-03-27  6:44 UTC (permalink / raw)
  To: linux-btrfs

This patchset consists of a bug fix from allocating chunk,
six bug fixes from autodefrag, and other cleanups.

I've tested it with xfstests plus autodefrag option.

Liu Bo (10):
  Btrfs: show useful info in space reservation tracepoint
  Btrfs: fix deadlock during allocating chunks
  Btrfs: fix race between direct io and autodefrag
  Btrfs: fix the mismatch of page->mapping
  Btrfs: fix recursive defragment with autodefrag option
  Btrfs: add a check to decide if we should defrag the range
  Btrfs: do not bother to defrag an extent if it is a big real extent
  Btrfs: update to the right index of defragment
  Btrfs: use PagePrivate2 to check ordered data
  Btrfs: drop cache with VACANCY em when we fail to start a transaction

 fs/btrfs/extent-tree.c |   79 ++++++++++++++++++++++++++++++++----------
 fs/btrfs/inode-map.c   |    6 +--
 fs/btrfs/inode.c       |   59 ++++++++++++-------------------
 fs/btrfs/ioctl.c       |   89 +++++++++++++++++++++++++++++++++++-------------
 fs/btrfs/transaction.c |    3 +-
 5 files changed, 151 insertions(+), 85 deletions(-)


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 01/10] Btrfs: show useful info in space reservation tracepoint
  2012-03-27  6:44 [PATCH 00/10] several fixes and cleanups Liu Bo
@ 2012-03-27  6:44 ` Liu Bo
  2012-03-27  6:44 ` [PATCH 02/10][RESEND] Btrfs: fix deadlock during allocating chunks Liu Bo
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Liu Bo @ 2012-03-27  6:44 UTC (permalink / raw)
  To: linux-btrfs

o For space info, the type of space info is useful for debug.
o For transaction handle, its transid is useful.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c |   29 ++++++++++-------------------
 fs/btrfs/inode-map.c   |    6 ++----
 fs/btrfs/transaction.c |    3 +--
 3 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 37e0a80..f3d367a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3312,8 +3312,7 @@ commit_trans:
 	}
 	data_sinfo->bytes_may_use += bytes;
 	trace_btrfs_space_reservation(root->fs_info, "space_info",
-				      (u64)(unsigned long)data_sinfo,
-				      bytes, 1);
+				      data_sinfo->flags, bytes, 1);
 	spin_unlock(&data_sinfo->lock);
 
 	return 0;
@@ -3334,8 +3333,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 	spin_lock(&data_sinfo->lock);
 	data_sinfo->bytes_may_use -= bytes;
 	trace_btrfs_space_reservation(root->fs_info, "space_info",
-				      (u64)(unsigned long)data_sinfo,
-				      bytes, 0);
+				      data_sinfo->flags, bytes, 0);
 	spin_unlock(&data_sinfo->lock);
 }
 
@@ -3700,9 +3698,7 @@ again:
 		if (used + orig_bytes <= space_info->total_bytes) {
 			space_info->bytes_may_use += orig_bytes;
 			trace_btrfs_space_reservation(root->fs_info,
-					      "space_info",
-					      (u64)(unsigned long)space_info,
-					      orig_bytes, 1);
+				"space_info", space_info->flags, orig_bytes, 1);
 			ret = 0;
 		} else {
 			/*
@@ -3771,9 +3767,7 @@ again:
 		if (used + num_bytes < space_info->total_bytes + avail) {
 			space_info->bytes_may_use += orig_bytes;
 			trace_btrfs_space_reservation(root->fs_info,
-					      "space_info",
-					      (u64)(unsigned long)space_info,
-					      orig_bytes, 1);
+				"space_info", space_info->flags, orig_bytes, 1);
 			ret = 0;
 		} else {
 			wait_ordered = true;
@@ -3918,8 +3912,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 			spin_lock(&space_info->lock);
 			space_info->bytes_may_use -= num_bytes;
 			trace_btrfs_space_reservation(fs_info, "space_info",
-					      (u64)(unsigned long)space_info,
-					      num_bytes, 0);
+					space_info->flags, num_bytes, 0);
 			space_info->reservation_progress++;
 			spin_unlock(&space_info->lock);
 		}
@@ -4137,14 +4130,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 		block_rsv->reserved += num_bytes;
 		sinfo->bytes_may_use += num_bytes;
 		trace_btrfs_space_reservation(fs_info, "space_info",
-				      (u64)(unsigned long)sinfo, num_bytes, 1);
+				      sinfo->flags, num_bytes, 1);
 	}
 
 	if (block_rsv->reserved >= block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
 		sinfo->bytes_may_use -= num_bytes;
 		trace_btrfs_space_reservation(fs_info, "space_info",
-				      (u64)(unsigned long)sinfo, num_bytes, 0);
+				      sinfo->flags, num_bytes, 0);
 		sinfo->reservation_progress++;
 		block_rsv->reserved = block_rsv->size;
 		block_rsv->full = 1;
@@ -4198,8 +4191,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 		return;
 
 	trace_btrfs_space_reservation(root->fs_info, "transaction",
-				      (u64)(unsigned long)trans,
-				      trans->bytes_reserved, 0);
+				      trans->transid, trans->bytes_reserved, 0);
 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 	trans->bytes_reserved = 0;
 }
@@ -4716,9 +4708,8 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 			space_info->bytes_reserved += num_bytes;
 			if (reserve == RESERVE_ALLOC) {
 				trace_btrfs_space_reservation(cache->fs_info,
-					      "space_info",
-					      (u64)(unsigned long)space_info,
-					      num_bytes, 0);
+						"space_info", space_info->flags,
+						num_bytes, 0);
 				space_info->bytes_may_use -= num_bytes;
 			}
 		}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ee15d88..ad16df1 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -439,8 +439,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	if (ret)
 		goto out;
 	trace_btrfs_space_reservation(root->fs_info, "ino_cache",
-				      (u64)(unsigned long)trans,
-				      trans->bytes_reserved, 1);
+				      trans->transid, trans->bytes_reserved, 1);
 again:
 	inode = lookup_free_ino_inode(root, path);
 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -502,8 +501,7 @@ out_put:
 	iput(inode);
 out_release:
 	trace_btrfs_space_reservation(root->fs_info, "ino_cache",
-				      (u64)(unsigned long)trans,
-				      trans->bytes_reserved, 0);
+				      trans->transid, trans->bytes_reserved, 0);
 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 out:
 	trans->block_rsv = rsv;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04b77e3..ca65079 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -327,8 +327,7 @@ again:
 
 	if (num_bytes) {
 		trace_btrfs_space_reservation(root->fs_info, "transaction",
-					      (u64)(unsigned long)h,
-					      num_bytes, 1);
+					      h->transid, num_bytes, 1);
 		h->block_rsv = &root->fs_info->trans_block_rsv;
 		h->bytes_reserved = num_bytes;
 	}
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 02/10][RESEND] Btrfs: fix deadlock during allocating chunks
  2012-03-27  6:44 [PATCH 00/10] several fixes and cleanups Liu Bo
  2012-03-27  6:44 ` [PATCH 01/10] Btrfs: show useful info in space reservation tracepoint Liu Bo
@ 2012-03-27  6:44 ` Liu Bo
  2012-03-27  6:44 ` [PATCH 03/10] Btrfs: fix race between direct io and autodefrag Liu Bo
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Liu Bo @ 2012-03-27  6:44 UTC (permalink / raw)
  To: linux-btrfs

This deadlock comes from xfstests 251.

We'll hold the chunk_mutex throughout the whole of a chunk allocation.
But if we find that we've used up system chunk space, we need to allocate a
new system chunk, but this will lead to a recursion of chunk allocation and end
up with a deadlock on chunk_mutex.
So instead we need to allocate the system chunk first if we find we're in ENOSPC.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 50 insertions(+), 0 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f3d367a..fe5bbc7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3394,6 +3394,50 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	return 1;
 }
 
+static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+{
+	u64 num_dev;
+
+	if (type & BTRFS_BLOCK_GROUP_RAID10 ||
+	    type & BTRFS_BLOCK_GROUP_RAID0)
+		num_dev = root->fs_info->fs_devices->rw_devices;
+	else if (type & BTRFS_BLOCK_GROUP_RAID1)
+		num_dev = 2;
+	else
+		num_dev = 1;	/* DUP or single */
+
+	/* metadata for updaing devices and chunk tree */
+	return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+}
+
+static void check_system_chunk(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, u64 type)
+{
+	struct btrfs_space_info *info;
+	u64 left;
+	u64 thresh;
+
+	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+	spin_lock(&info->lock);
+	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
+		info->bytes_reserved - info->bytes_readonly;
+	spin_unlock(&info->lock);
+
+	thresh = get_system_chunk_thresh(root, type);
+	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
+		printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
+		       left, thresh, type);
+		dump_space_info(info, 0, 0);
+	}
+
+	if (left < thresh) {
+		u64 flags;
+
+		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
+		btrfs_alloc_chunk(trans, root, flags);
+	}
+}
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force)
@@ -3466,6 +3510,12 @@ again:
 			force_metadata_allocation(fs_info);
 	}
 
+	/*
+	 * Check if we have enough space in SYSTEM chunk because we may need
+	 * to update devices.
+	 */
+	check_system_chunk(trans, extent_root, flags);
+
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
 	if (ret < 0 && ret != -ENOSPC)
 		goto out;
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 03/10] Btrfs: fix race between direct io and autodefrag
  2012-03-27  6:44 [PATCH 00/10] several fixes and cleanups Liu Bo
  2012-03-27  6:44 ` [PATCH 01/10] Btrfs: show useful info in space reservation tracepoint Liu Bo
  2012-03-27  6:44 ` [PATCH 02/10][RESEND] Btrfs: fix deadlock during allocating chunks Liu Bo
@ 2012-03-27  6:44 ` Liu Bo
  2012-03-27  6:44 ` [PATCH 04/10] Btrfs: fix the mismatch of page->mapping Liu Bo
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Liu Bo @ 2012-03-27  6:44 UTC (permalink / raw)
  To: linux-btrfs

The bug is from running xfstests 209 with autodefrag.

The race is as follows:
       t1                       t2(autodefrag)
   direct IO
     invalidate pagecache
     dio(old data)             add_inode_defrag
     invalidate pagecache
   endio

   direct IO
     invalidate pagecache
                                run_defrag
                                  readpage(old data)
                                  set page dirty (old data)
     dio(new data, rewrite)
     invalidate pagecache (*)
     endio

t2(autodefrag) will get old data into pagecache via readpage and set
pagecache dirty.  Meanwhile, invalidate pagecache(*) will fail due to
dirty flags in pages.  So the old data may be flushed into disk by
flush thread, which will lead to data loss.

And so does the case of user defragment progs.

The patch fixes this race by holding i_mutex when we readpage and set page dirty.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d8b5471..0acc828 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1123,12 +1123,16 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			ra_index += max_cluster;
 		}
 
+		mutex_lock(&inode->i_mutex);
 		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
-		if (ret < 0)
+		if (ret < 0) {
+			mutex_unlock(&inode->i_mutex);
 			goto out_ra;
+		}
 
 		defrag_count += ret;
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+		mutex_unlock(&inode->i_mutex);
 
 		if (newer_than) {
 			if (newer_off == (u64)-1)
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 04/10] Btrfs: fix the mismatch of page->mapping
  2012-03-27  6:44 [PATCH 00/10] several fixes and cleanups Liu Bo
                   ` (2 preceding siblings ...)
  2012-03-27  6:44 ` [PATCH 03/10] Btrfs: fix race between direct io and autodefrag Liu Bo
@ 2012-03-27  6:44 ` Liu Bo
  2012-03-27  6:44 ` [PATCH 05/10][RESEND] Btrfs: fix recursive defragment with autodefrag option Liu Bo
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Liu Bo @ 2012-03-27  6:44 UTC (permalink / raw)
  To: linux-btrfs

commit 600a45e1d5e376f679ff9ecc4ce9452710a6d27c
(Btrfs: fix deadlock on page lock when doing auto-defragment)
fixes the deadlock on page, but it also introduces another bug.

A page may have been truncated after unlock & lock.
So we need to find it again to get the right one.

And since we've held i_mutex lock, inode size remains unchanged and
we can drop isize overflow checks.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c |   35 +++++++++++++++++++----------------
 1 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0acc828..81faa78 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -856,6 +856,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
 	u64 isize = i_size_read(inode);
 	u64 page_start;
 	u64 page_end;
+	u64 page_cnt;
 	int ret;
 	int i;
 	int i_done;
@@ -864,19 +865,21 @@ static int cluster_pages_for_defrag(struct inode *inode,
 	struct extent_io_tree *tree;
 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 
-	if (isize == 0)
-		return 0;
 	file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+	if (!isize || start_index > file_end)
+		return 0;
+
+	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
 	ret = btrfs_delalloc_reserve_space(inode,
-					   num_pages << PAGE_CACHE_SHIFT);
+					   page_cnt << PAGE_CACHE_SHIFT);
 	if (ret)
 		return ret;
 	i_done = 0;
 	tree = &BTRFS_I(inode)->io_tree;
 
 	/* step one, lock all the pages */
-	for (i = 0; i < num_pages; i++) {
+	for (i = 0; i < page_cnt; i++) {
 		struct page *page;
 again:
 		page = find_or_create_page(inode->i_mapping,
@@ -898,6 +901,15 @@ again:
 			btrfs_start_ordered_extent(inode, ordered, 1);
 			btrfs_put_ordered_extent(ordered);
 			lock_page(page);
+			/*
+			 * we unlocked the page above, so we need check if
+			 * it was released or not.
+			 */
+			if (page->mapping != inode->i_mapping) {
+				unlock_page(page);
+				page_cache_release(page);
+				goto again;
+			}
 		}
 
 		if (!PageUptodate(page)) {
@@ -911,15 +923,6 @@ again:
 			}
 		}
 
-		isize = i_size_read(inode);
-		file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
-		if (!isize || page->index > file_end) {
-			/* whoops, we blew past eof, skip this page */
-			unlock_page(page);
-			page_cache_release(page);
-			break;
-		}
-
 		if (page->mapping != inode->i_mapping) {
 			unlock_page(page);
 			page_cache_release(page);
@@ -953,12 +956,12 @@ again:
 			  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
 			  GFP_NOFS);
 
-	if (i_done != num_pages) {
+	if (i_done != page_cnt) {
 		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->outstanding_extents++;
 		spin_unlock(&BTRFS_I(inode)->lock);
 		btrfs_delalloc_release_space(inode,
-				     (num_pages - i_done) << PAGE_CACHE_SHIFT);
+				     (page_cnt - i_done) << PAGE_CACHE_SHIFT);
 	}
 
 
@@ -983,7 +986,7 @@ out:
 		unlock_page(pages[i]);
 		page_cache_release(pages[i]);
 	}
-	btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT);
+	btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
 	return ret;
 
 }
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 05/10][RESEND] Btrfs: fix recursive defragment with autodefrag option
  2012-03-27  6:44 [PATCH 00/10] several fixes and cleanups Liu Bo
                   ` (3 preceding siblings ...)
  2012-03-27  6:44 ` [PATCH 04/10] Btrfs: fix the mismatch of page->mapping Liu Bo
@ 2012-03-27  6:44 ` Liu Bo
  2012-03-27  6:44 ` [PATCH 06/10] Btrfs: add a check to decide if we should defrag the range Liu Bo
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Liu Bo @ 2012-03-27  6:44 UTC (permalink / raw)
  To: linux-btrfs

Reproduce:
$ mkfs.btrfs disk
$ mount disk /mnt -o autodefrag
$ dd if=/dev/zero of=/mnt/foobar bs=4k count=10 2>/dev/null && sync
$ for i in `seq 9 -2 0`; do dd if=/dev/zero of=/mnt/foobar bs=4k count=1 \
  seek=$i conv=notrunc 2> /dev/null; done && sync

then we'll get to defrag "foobar" again and again.
So does option "-o autodefrag,compress".

Reasons:
When the cleaner kthread gets to fetch inodes from the defrag tree and defrag
them, it will dirty pages and submit them, this will comes to another DATA COW
where the processing inode will be inserted to the defrag tree again.

This patch sets a rule for COW code, i.e. insert an inode when we're really
going to make some defragments.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
 fs/btrfs/inode.c |    8 +++++---
 1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 892b347..7f5018d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -344,8 +344,9 @@ static noinline int compress_file_range(struct inode *inode,
 	int will_compress;
 	int compress_type = root->fs_info->compress_type;
 
-	/* if this is a small write inside eof, kick off a defragbot */
-	if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
+	/* if this is a small write inside eof, kick off a defrag */
+	if ((end - start + 1) < 16 * 1024 &&
+	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 		btrfs_add_inode_defrag(NULL, inode);
 
 	actual_end = min_t(u64, isize, end + 1);
@@ -800,7 +801,8 @@ static noinline int cow_file_range(struct inode *inode,
 	ret = 0;
 
 	/* if this is a small write inside eof, kick off defrag */
-	if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
+	if (num_bytes < 64 * 1024 &&
+	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 		btrfs_add_inode_defrag(trans, inode);
 
 	if (start == 0) {
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 06/10] Btrfs: add a check to decide if we should defrag the range
  2012-03-27  6:44 [PATCH 00/10] several fixes and cleanups Liu Bo
                   ` (4 preceding siblings ...)
  2012-03-27  6:44 ` [PATCH 05/10][RESEND] Btrfs: fix recursive defragment with autodefrag option Liu Bo
@ 2012-03-27  6:44 ` Liu Bo
  2012-03-27  6:44 ` [PATCH 07/10] Btrfs: do not bother to defrag an extent if it is a big real extent Liu Bo
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Liu Bo @ 2012-03-27  6:44 UTC (permalink / raw)
  To: linux-btrfs

If our file's layout is as follows:
| hole | data1 | hole | data2 |

we do not need to defrag this file, because this file has holes and
cannot be merged into one extent.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c |   36 +++++++++++++++++++++++++++++++++++-
 1 files changed, 35 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 81faa78..66a4933 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -769,6 +769,31 @@ none:
 	return -ENOENT;
 }
 
+/*
+ * Validaty check of prev em and next em:
+ * 1) no prev/next em
+ * 2) prev/next em is an hole/inline extent
+ */
+static int check_adjacent_extents(struct inode *inode, struct extent_map *em)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *prev = NULL, *next = NULL;
+	int ret = 0;
+
+	read_lock(&em_tree->lock);
+	prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1);
+	next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1);
+	read_unlock(&em_tree->lock);
+
+	if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) &&
+	    (!next || next->block_start >= EXTENT_MAP_LAST_BYTE))
+		ret = 1;
+	free_extent_map(prev);
+	free_extent_map(next);
+
+	return ret;
+}
+
 static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 			       int thresh, u64 *last_len, u64 *skip,
 			       u64 *defrag_end)
@@ -806,8 +831,16 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 	}
 
 	/* this will cover holes, and inline extents */
-	if (em->block_start >= EXTENT_MAP_LAST_BYTE)
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		ret = 0;
+		goto out;
+	}
+
+	/* If we have nothing to merge with us, just skip. */
+	if (check_adjacent_extents(inode, em)) {
 		ret = 0;
+		goto out;
+	}
 
 	/*
 	 * we hit a real extent, if it is big don't bother defragging it again
@@ -815,6 +848,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 	if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
 		ret = 0;
 
+out:
 	/*
 	 * last_len ends up being a counter of how many bytes we've defragged.
 	 * every time we choose not to defrag an extent, we reset *last_len
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 07/10] Btrfs: do not bother to defrag an extent if it is a big real extent
  2012-03-27  6:44 [PATCH 00/10] several fixes and cleanups Liu Bo
                   ` (5 preceding siblings ...)
  2012-03-27  6:44 ` [PATCH 06/10] Btrfs: add a check to decide if we should defrag the range Liu Bo
@ 2012-03-27  6:44 ` Liu Bo
  2012-03-27  6:44 ` [PATCH 08/10] Btrfs: update to the right index of defragment Liu Bo
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Liu Bo @ 2012-03-27  6:44 UTC (permalink / raw)
  To: linux-btrfs

$ mkfs.btrfs /dev/sdb7
$ mount /dev/sdb7 /mnt/btrfs/ -oautodefrag
$ dd if=/dev/zero of=/mnt/btrfs/foobar bs=4k count=10 oflag=direct 2>/dev/null
$ filefrag -v /mnt/btrfs/foobar
Filesystem type is: 9123683e
File size of /mnt/btrfs/foobar is 40960 (10 blocks, blocksize 4096)
 ext logical physical expected length flags
   0       0     3072              10 eof
/mnt/btrfs/foobar: 1 extent found

Now we have a big real extent [0, 40960), but autodefrag will still defrag it.

$ sync
$ filefrag -v /mnt/btrfs/foobar
Filesystem type is: 9123683e
File size of /mnt/btrfs/foobar is 40960 (10 blocks, blocksize 4096)
 ext logical physical expected length flags
   0       0     3082              10 eof
/mnt/btrfs/foobar: 1 extent found

So if we already find a big real extent, we're ok about that, just skip it.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c |    9 +++------
 1 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 66a4933..7a6d15c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1126,12 +1126,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		if (!(inode->i_sb->s_flags & MS_ACTIVE))
 			break;
 
-		if (!newer_than &&
-		    !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
-					PAGE_CACHE_SIZE,
-					extent_thresh,
-					&last_len, &skip,
-					&defrag_end)) {
+		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+					 PAGE_CACHE_SIZE, extent_thresh,
+					 &last_len, &skip, &defrag_end)) {
 			unsigned long next;
 			/*
 			 * the should_defrag function tells us how much to skip
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 08/10] Btrfs: update to the right index of defragment
  2012-03-27  6:44 [PATCH 00/10] several fixes and cleanups Liu Bo
                   ` (6 preceding siblings ...)
  2012-03-27  6:44 ` [PATCH 07/10] Btrfs: do not bother to defrag an extent if it is a big real extent Liu Bo
@ 2012-03-27  6:44 ` Liu Bo
  2012-03-27  6:44 ` [PATCH 09/10] Btrfs: use PagePrivate2 to check ordered data Liu Bo
  2012-03-27  6:44 ` [PATCH 10/10] Btrfs: drop cache with VACANCY em when we fail to start a transaction Liu Bo
  9 siblings, 0 replies; 13+ messages in thread
From: Liu Bo @ 2012-03-27  6:44 UTC (permalink / raw)
  To: linux-btrfs

When we use autodefrag, we forget to update the index which indicates
the last page we've dirty.  And we'll set dirty flags on a same set of
pages again and again.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7a6d15c..e3cb770 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1172,6 +1172,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			if (newer_off == (u64)-1)
 				break;
 
+			if (ret > 0)
+				i += ret;
+
 			newer_off = max(newer_off + 1,
 					(u64)i << PAGE_CACHE_SHIFT);
 
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 09/10] Btrfs: use PagePrivate2 to check ordered data
  2012-03-27  6:44 [PATCH 00/10] several fixes and cleanups Liu Bo
                   ` (7 preceding siblings ...)
  2012-03-27  6:44 ` [PATCH 08/10] Btrfs: update to the right index of defragment Liu Bo
@ 2012-03-27  6:44 ` Liu Bo
  2012-03-27  6:44 ` [PATCH 10/10] Btrfs: drop cache with VACANCY em when we fail to start a transaction Liu Bo
  9 siblings, 0 replies; 13+ messages in thread
From: Liu Bo @ 2012-03-27  6:44 UTC (permalink / raw)
  To: linux-btrfs

If a page has PagePrivate2 flag, it still remains as ordered data,
so we can check this flag directly instead of looking up an ordered
extent.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
 fs/btrfs/inode.c |   45 +++++++++++++++------------------------------
 1 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7f5018d..bacf441 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6345,8 +6345,6 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct extent_io_tree *tree;
-	struct btrfs_ordered_extent *ordered;
-	struct extent_state *cached_state = NULL;
 	u64 page_start = page_offset(page);
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
@@ -6365,35 +6363,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		btrfs_releasepage(page, GFP_NOFS);
 		return;
 	}
-	lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
-			 GFP_NOFS);
-	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
-					   page_offset(page));
-	if (ordered) {
-		/*
-		 * IO on this page will never be started, so we need
-		 * to account for any ordered extents now
-		 */
-		clear_extent_bit(tree, page_start, page_end,
-				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
-				 &cached_state, GFP_NOFS);
-		/*
-		 * whoever cleared the private bit is responsible
-		 * for the finish_ordered_io
-		 */
-		if (TestClearPagePrivate2(page)) {
-			btrfs_finish_ordered_io(page->mapping->host,
-						page_start, page_end);
-		}
-		btrfs_put_ordered_extent(ordered);
-		cached_state = NULL;
-		lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
-				 GFP_NOFS);
-	}
+	/*
+	 * IO on this page will never be started, so we need
+	 * to account for any ordered extents now
+	 */
 	clear_extent_bit(tree, page_start, page_end,
-		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
+			 EXTENT_DIRTY | EXTENT_DELALLOC |
+			 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 1,
+			 NULL, GFP_NOFS);
+	/*
+	 * whoever cleared the private bit is responsible
+	 * for the finish_ordered_io
+	 */
+	if (TestClearPagePrivate2(page)) {
+		btrfs_finish_ordered_io(page->mapping->host,
+					page_start, page_end);
+	}
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 10/10] Btrfs: drop cache with VACANCY em when we fail to start a transaction
  2012-03-27  6:44 [PATCH 00/10] several fixes and cleanups Liu Bo
                   ` (8 preceding siblings ...)
  2012-03-27  6:44 ` [PATCH 09/10] Btrfs: use PagePrivate2 to check ordered data Liu Bo
@ 2012-03-27  6:44 ` Liu Bo
  2012-03-29 14:01   ` Chris Mason
  9 siblings, 1 reply; 13+ messages in thread
From: Liu Bo @ 2012-03-27  6:44 UTC (permalink / raw)
  To: linux-btrfs

We need to clean a VACANCY em(if we have) when we fail to start a transaction.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
 fs/btrfs/inode.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bacf441..2b2f0b6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3406,9 +3406,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 				break;
 			}
 
-			btrfs_drop_extent_cache(inode, hole_start,
-					last_byte - 1, 0);
-
 			btrfs_update_inode(trans, root, inode);
 			btrfs_end_transaction(trans, root);
 		}
@@ -3419,6 +3416,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			break;
 	}
 
+	if (em && test_bit(EXTENT_FLAG_VACANCY, &em->flags))
+		btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0);
+
 	free_extent_map(em);
 	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
 			     GFP_NOFS);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 10/10] Btrfs: drop cache with VACANCY em when we fail to start a transaction
  2012-03-27  6:44 ` [PATCH 10/10] Btrfs: drop cache with VACANCY em when we fail to start a transaction Liu Bo
@ 2012-03-29 14:01   ` Chris Mason
  2012-03-30  1:08     ` Liu Bo
  0 siblings, 1 reply; 13+ messages in thread
From: Chris Mason @ 2012-03-29 14:01 UTC (permalink / raw)
  To: Liu Bo; +Cc: linux-btrfs

On Tue, Mar 27, 2012 at 02:44:38PM +0800, Liu Bo wrote:
> We need to clean a VACANCY em(if we have) when we fail to start a transaction.
> 
> Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
> ---
>  fs/btrfs/inode.c |    6 +++---
>  1 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index bacf441..2b2f0b6 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -3406,9 +3406,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>  				break;
>  			}
>  
> -			btrfs_drop_extent_cache(inode, hole_start,
> -					last_byte - 1, 0);
> -
>  			btrfs_update_inode(trans, root, inode);
>  			btrfs_end_transaction(trans, root);
>  		}
> @@ -3419,6 +3416,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>  			break;
>  	}
>  
> +	if (em && test_bit(EXTENT_FLAG_VACANCY, &em->flags))
> +		btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0);
> +

Hmmm, last_byte isn't always setup at this point (uninit variable).  I'm
a little uncertain about what this one is fixing?

-chris

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 10/10] Btrfs: drop cache with VACANCY em when we fail to start a transaction
  2012-03-29 14:01   ` Chris Mason
@ 2012-03-30  1:08     ` Liu Bo
  0 siblings, 0 replies; 13+ messages in thread
From: Liu Bo @ 2012-03-30  1:08 UTC (permalink / raw)
  To: Chris Mason, linux-btrfs

On 03/29/2012 10:01 PM, Chris Mason wrote:
> On Tue, Mar 27, 2012 at 02:44:38PM +0800, Liu Bo wrote:
>> We need to clean a VACANCY em(if we have) when we fail to start a transaction.
>>
>> Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
>> ---
>>  fs/btrfs/inode.c |    6 +++---
>>  1 files changed, 3 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index bacf441..2b2f0b6 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -3406,9 +3406,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>>  				break;
>>  			}
>>  
>> -			btrfs_drop_extent_cache(inode, hole_start,
>> -					last_byte - 1, 0);
>> -
>>  			btrfs_update_inode(trans, root, inode);
>>  			btrfs_end_transaction(trans, root);
>>  		}
>> @@ -3419,6 +3416,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
>>  			break;
>>  	}
>>  
>> +	if (em && test_bit(EXTENT_FLAG_VACANCY, &em->flags))
>> +		btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0);
>> +
> 
> Hmmm, last_byte isn't always setup at this point (uninit variable).  I'm
> a little uncertain about what this one is fixing?
> 

err, sorry, plz drop this one, it's just a cleanup.

I was just thinking when we create a VACANCY em and fail to start a transaction(break out from while loop),
we'll leave a VACANCY em behind.

thanks,
liubo

> -chris
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2012-03-30  1:08 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-03-27  6:44 [PATCH 00/10] several fixes and cleanups Liu Bo
2012-03-27  6:44 ` [PATCH 01/10] Btrfs: show useful info in space reservation tracepoint Liu Bo
2012-03-27  6:44 ` [PATCH 02/10][RESEND] Btrfs: fix deadlock during allocating chunks Liu Bo
2012-03-27  6:44 ` [PATCH 03/10] Btrfs: fix race between direct io and autodefrag Liu Bo
2012-03-27  6:44 ` [PATCH 04/10] Btrfs: fix the mismatch of page->mapping Liu Bo
2012-03-27  6:44 ` [PATCH 05/10][RESEND] Btrfs: fix recursive defragment with autodefrag option Liu Bo
2012-03-27  6:44 ` [PATCH 06/10] Btrfs: add a check to decide if we should defrag the range Liu Bo
2012-03-27  6:44 ` [PATCH 07/10] Btrfs: do not bother to defrag an extent if it is a big real extent Liu Bo
2012-03-27  6:44 ` [PATCH 08/10] Btrfs: update to the right index of defragment Liu Bo
2012-03-27  6:44 ` [PATCH 09/10] Btrfs: use PagePrivate2 to check ordered data Liu Bo
2012-03-27  6:44 ` [PATCH 10/10] Btrfs: drop cache with VACANCY em when we fail to start a transaction Liu Bo
2012-03-29 14:01   ` Chris Mason
2012-03-30  1:08     ` Liu Bo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.