All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC][PATCH 2/3] Move the file data to the new blocks
@ 2007-01-16 12:05 sho
  2007-02-05 13:12 ` Jan Kara
  2007-02-07  1:33 ` Andrew Morton
  0 siblings, 2 replies; 18+ messages in thread
From: sho @ 2007-01-16 12:05 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel

Move the blocks on the temporary inode to the original inode
by a page.
1. Read the file data from the old blocks to the page
2. Move the block on the temporary inode to the original inode
3. Write the file data on the page into the new blocks

Signed-off-by: Takashi Sato <sho@tnes.nec.co.jp>
---
diff -Nrup -X linux-2.6.19-rc6-org/Documentation/dontdiff linux-2.6.19-rc6-1/fs/ext4/extents.c linux-2.6.19-rc6-FULL/fs/ext4/extents.c
--- linux-2.6.19-rc6-1/fs/ext4/extents.c	2007-01-16 09:46:13.000000000 +0900
+++ linux-2.6.19-rc6-FULL/fs/ext4/extents.c	2007-01-16 09:43:45.000000000 +0900
@@ -2539,6 +2539,470 @@ ext4_ext_next_extent(struct inode *inode
 }
 
 /**
+ * ext4_ext_merge_extents - merge new extent to the extent block
+ *
+ * @handle      journal handle
+ * @inode       target file's inode
+ * @org_path    path indicates first extent to be defraged
+ * @o_start     first original extent to be defraged
+ * @o_end       last original extent to be defraged
+ * @start_ext   first new extent to be merged
+ * @new_ext     middle of new extent to be merged
+ * @end_ext     last new extent to be merged
+ * @replaced    the number of blocks which will be replaced with new_ext
+ *
+ * This function returns 0 if succeed, otherwise returns -1.
+ */
+static int
+ext4_ext_merge_extents(handle_t *handle, struct inode *inode,
+		struct ext4_ext_path *org_path,
+		struct ext4_extent *o_start, struct ext4_extent *o_end,
+		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+		struct ext4_extent *end_ext, unsigned long replaced)
+{
+	int	i = 0;
+	unsigned need_slots;
+	unsigned slots_range, len;
+	int	 range_to_move;
+	struct	ext4_extent_header * eh;
+	struct	ext4_extent *free_start, *free_end;
+	int	depth;
+
+	/* The extents need to be inserted
+	 * start_extent + new_extent + end_extent
+	 */
+	need_slots = (le16_to_cpu(start_ext->ee_len) ? 1 : 0) +
+			(le16_to_cpu(end_ext->ee_len) ? 1 : 0) +
+			(le16_to_cpu(new_ext->ee_len) ? 1 : 0);
+
+	/* The number of slots between start and end */
+	slots_range = o_end - o_start + 1;
+
+	/* Range to move the end of extent */
+	range_to_move = need_slots - slots_range;
+	depth = org_path->p_depth;
+	org_path += depth;
+	eh = org_path->p_hdr;
+	/* expansion */
+	if (range_to_move > 0) {
+		if (range_to_move > le16_to_cpu(eh->eh_max)
+				- le16_to_cpu(eh->eh_entries)) {
+			printk("Cannot merge extents due to no space\n");
+			return -1;
+		}
+	}
+	if (depth) {
+		/* Register to journal */
+		if (ext4_journal_get_write_access(handle, org_path->p_bh)) {
+			return -1;
+		}
+	}
+
+	/* Free old blocks
+	 * dest        |---------------|
+	 * org  |---------------|
+	 */
+	free_start = o_start;
+	free_end = o_end;
+	if (le16_to_cpu(start_ext->ee_len)) {
+		if (le16_to_cpu(o_start->ee_len)
+			> le16_to_cpu(start_ext->ee_len)) {
+			ext4_free_blocks(handle, inode,
+				ext_pblock(o_start)
+					+ le16_to_cpu(start_ext->ee_len),
+				le16_to_cpu(o_start->ee_len)
+					- le16_to_cpu(start_ext->ee_len), 0);
+		}
+		free_start++;
+	}
+
+	/* dest |----------------|
+	 * org          |---------------|
+	 */
+	if (le16_to_cpu(end_ext->ee_len)) {
+		ext4_free_blocks(handle, inode, ext_pblock(o_end),
+			le16_to_cpu(o_end->ee_len)
+				- le16_to_cpu(end_ext->ee_len), 0);
+		free_end--;
+	}
+
+	/* dest |-------------------|
+	 * org   |-----------------|
+	 */
+	for (; free_start <= free_end; free_start++) {
+		ext4_free_blocks(handle, inode,
+			ext_pblock(free_start),
+			le32_to_cpu(free_start->ee_len), 0);
+	}
+
+	/* Move the existing extents */
+	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
+		len = EXT_LAST_EXTENT(eh) - (o_end + 1) + 1;
+		len = len * sizeof(struct ext4_extent);
+		memmove(o_end + 1 + range_to_move, o_end + 1, len);
+	}
+
+	/* Insert start entry */
+	if (le16_to_cpu(start_ext->ee_len)) {
+		o_start[i++].ee_len = start_ext->ee_len;
+	}
+
+	/* Insert new entry */
+	if (le16_to_cpu(new_ext->ee_len)) {
+		o_start[i].ee_block = new_ext->ee_block;
+		o_start[i].ee_len = cpu_to_le16(replaced);
+		ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
+	}
+
+	/* Insert end entry */
+	if (end_ext->ee_len) {
+		o_start[i] = *end_ext;
+	}
+
+	/* Increment the total entries counter on the extent block */
+	eh->eh_entries
+		= cpu_to_le16(le16_to_cpu(eh->eh_entries) + range_to_move);
+	if (depth) {
+		if (ext4_journal_dirty_metadata(handle, org_path->p_bh)) {
+			return -1;
+		}
+	} else {
+		if (ext4_mark_inode_dirty(handle, inode) < 0) {
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ext4_ext_defrag_leaf_block -  Defragmentation for one leaf extent block.
+ * @handle      journal handle
+ * @org_inode   target inode
+ * @org_path    path indicates first extent to be defraged
+ * @dext        destination extent
+ * @from        start offset on the target file
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext4_ext_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
+		struct ext4_ext_path *org_path, struct ext4_extent *dext,
+		unsigned long *from, unsigned long *delete_start)
+{
+	unsigned long depth, replaced = 0;
+	struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
+	struct ext4_extent new_ext, start_ext, end_ext;
+	unsigned long new_end;
+	unsigned long lblock;
+	unsigned long len;
+	unsigned long long new_phys_end;
+	int	ret;
+
+	depth = ext_depth(org_inode);
+	start_ext.ee_len = end_ext.ee_len = 0;
+	o_start = o_end = oext = org_path[depth].p_ext;
+	ext4_ext_store_pblock(&new_ext, ext_pblock(dext));	
+	len = new_ext.ee_len = dext->ee_len;
+	new_ext.ee_block = cpu_to_le32(*from);
+	lblock = le32_to_cpu(oext->ee_block);
+	new_end = le32_to_cpu(new_ext.ee_block)
+		+ le16_to_cpu(new_ext.ee_len) - 1;
+	new_phys_end = ext_pblock(&new_ext)
+		+ le16_to_cpu(new_ext.ee_len) - 1;
+
+	/* First original extent
+	 * dest         |---------------|
+	 * org  |---------------|
+	 */
+	if (le32_to_cpu(new_ext.ee_block) >
+		le32_to_cpu(oext->ee_block) &&
+		le32_to_cpu(new_ext.ee_block) <
+		le32_to_cpu(oext->ee_block)
+		+ le16_to_cpu(oext->ee_len)) {
+		start_ext.ee_len = cpu_to_le32(le32_to_cpu(new_ext.ee_block)
+					- le32_to_cpu(oext->ee_block));
+		replaced += le16_to_cpu(oext->ee_len)
+					- le16_to_cpu(start_ext.ee_len);
+	} else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
+		/* We can merge previous extent. */
+		prev_ext = oext -1;
+		if ((ext_pblock(prev_ext)
+			+ le32_to_cpu(prev_ext->ee_len))
+			== ext_pblock(&new_ext)) {
+			o_start = prev_ext;
+			start_ext.ee_len = cpu_to_le32(
+					le16_to_cpu(prev_ext->ee_len)
+					+ le16_to_cpu(new_ext.ee_len));
+			new_ext.ee_len = 0;
+		}
+	}
+
+	for (;;) {
+
+		/* The extent for destination must be found. */
+		BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block));
+		lblock += le16_to_cpu(oext->ee_len);
+
+		/* Middle of original extent
+		 * dest |-------------------|
+		 * org   |-----------------|
+		 */
+		if (le32_to_cpu(new_ext.ee_block) <=
+			le32_to_cpu(oext->ee_block) &&
+			new_end >= le32_to_cpu(oext->ee_block)
+			+ le16_to_cpu(oext->ee_len) -1) {
+			replaced += le16_to_cpu(oext->ee_len);
+		}
+
+		/* Last original extent
+		 * dest |----------------|
+		 * org          |---------------|
+		 */
+		if (new_end >= le32_to_cpu(oext->ee_block) &&
+			new_end < le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1) {
+			end_ext.ee_len
+				= cpu_to_le16(le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) -1 - new_end);
+			ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end)
+				+ cpu_to_le16(oext->ee_len)
+				- cpu_to_le16(end_ext.ee_len)));
+			end_ext.ee_block
+				= cpu_to_le32(le32_to_cpu(o_end->ee_block)
+				+ le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len));
+			replaced += le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len);
+		}
+
+		/* Detected the block end, reached the number of replaced
+		 * blocks to dext->ee_len.  Then, merge the extent.
+		 */
+		if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
+			new_end <= le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1) {
+			if (ext4_ext_merge_extents(handle, org_inode,
+				org_path, o_start, o_end,
+				&start_ext, &new_ext, &end_ext, replaced) < 0) {
+				return -EIO;
+			}
+
+			*delete_start = le32_to_cpu(new_ext.ee_block)
+							+ replaced;
+
+			/* All expected blocks are replaced */
+			if (new_ext.ee_len <= 0) {
+				if (DQUOT_ALLOC_BLOCK
+					(org_inode, len)) {
+					return -EDQUOT;
+				}
+				return 0;
+			}
+
+			/* re-calculate new_ext */
+			new_ext.ee_len = cpu_to_le32(le16_to_cpu(new_ext.ee_len)
+				- replaced);
+			new_ext.ee_block =
+				cpu_to_le32(le32_to_cpu(new_ext.ee_block)
+				+ replaced);
+			ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext)
+					 + replaced);
+			replaced = 0;
+			start_ext.ee_len = end_ext.ee_len = 0;
+			o_start = NULL;
+
+			/* All expected blocks are replaced */
+			if (new_ext.ee_len <= 0) {
+				if (DQUOT_ALLOC_BLOCK
+					(org_inode, len)) {
+					return -EDQUOT;
+				}
+				return 0;
+			}
+		}
+
+		/* Get next extent for original. */
+		if ((ret
+		 = ext4_ext_next_extent(org_inode, org_path, &oext))
+								!= 0) {
+			if (ret == 1)
+				ret = -EIO;
+			return ret;
+		}
+		o_end = oext;
+		if (!o_start)
+			o_start = oext;
+	}
+}
+
+/**
+ * ext4_ext_replace_branches - replace original extents with new extents.
+ * @org_inode    Original inode
+ * @dest_inode   temporary inode
+ * @from_page    Page offset
+ * @count_page   Page count to be replaced
+ * @delete_start block offset for deletion
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ * Replace extents for blocks from "from" to "from+count-1".
+ */
+static int
+ext4_ext_replace_branches(struct inode *org_inode, struct inode *dest_inode,
+			pgoff_t from_page,  pgoff_t dest_from_page,
+			pgoff_t count_page, unsigned long *delete_start) 
+{
+	struct ext4_ext_path *org_path = NULL;
+	struct ext4_ext_path *dest_path = NULL;
+	struct ext4_extent   *oext, *dext;
+	struct ext4_extent   tmp_ext;
+	int	err = 0;
+	int	depth;
+	unsigned long from, count, dest_off, diff, replaced_count = 0;
+	handle_t *handle = NULL;
+	unsigned jnum;
+
+	from = from_page << (PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	count = count_page << (PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	dest_off = dest_from_page << (PAGE_CACHE_SHIFT - 
+			dest_inode->i_blkbits);
+	jnum = ext4_ext_writepage_trans_blocks(org_inode, count) + 3;
+	handle = ext4_journal_start(org_inode, jnum);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out;
+	}
+
+	/* Get the original extent for the block "from" */
+	org_path = ext4_ext_find_extent(org_inode, from, NULL);
+	if (IS_ERR(org_path)) {
+		err = PTR_ERR(org_path);
+		org_path = NULL;
+		goto out;
+	}
+
+	/* Get the destination extent for the head */
+	dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
+	if (IS_ERR(dest_path)) {
+		err = PTR_ERR(dest_path);
+		dest_path = NULL;
+		goto out;
+	}
+	depth = ext_depth(dest_inode);
+	dext = dest_path[depth].p_ext;
+	/* When dext is too large, pick up the target range. */
+	diff = dest_off - le32_to_cpu(dext->ee_block);
+	ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
+	tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+	tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+	if (count < tmp_ext.ee_len) {
+		tmp_ext.ee_len = cpu_to_le16(count);
+	}
+	dext = &tmp_ext;
+
+	/* loop for the destination extents */
+	while (1) {
+		/* The extent for destination must be found. */
+		BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block));
+
+		/* loop for the original extent blocks */
+		if ((err = ext4_ext_defrag_leaf_block(handle, org_inode,
+			org_path, dext, &from, delete_start)) < 0) {
+			goto out;
+		}
+
+		replaced_count += le16_to_cpu(dext->ee_len);
+		dest_off += le16_to_cpu(dext->ee_len);
+		from += le16_to_cpu(dext->ee_len);
+
+		/* Already moved the expected blocks */
+		if (replaced_count >= count)
+			break;
+
+		/* get the next extent on both original and destination. */
+		err = ext4_ext_next_extent(dest_inode, dest_path, &dext);
+		if (err != 0) {
+			if (err > 0) {
+				err = 0;
+			}
+			goto out;
+		}
+		if ((err =
+			ext4_ext_next_extent(org_inode, org_path, &oext)) < 0) {
+			goto out;
+		}
+	}
+out:
+	if (handle) {
+		ext4_journal_stop(handle);
+	}
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (dest_path) {
+		ext4_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return err;
+}
+
+/**
+ * ext4_ext_remove_index_blocks - remove leaf blocks and index blocks
+ * @handle      handle
+ * @dest_inode  temporary inode
+ * @eh          extent header
+ * @depth       depth of extent tree
+ *
+ * This function returns 0 if succeed, otherwise returns error value
+ */
+static int ext4_ext_remove_index_blocks(handle_t *handle,
+			struct inode *dest_inode,
+			struct ext4_extent_header *eh, int depth)
+{
+	struct ext4_extent_idx *idx;
+	struct buffer_head *bh;
+	int metadata = 1;
+	int credits = 0;
+	int i, ret = 0;
+
+	if (depth == 0) {
+		eh->eh_entries = 0;
+		return 0;
+	}
+
+	idx = EXT_FIRST_INDEX(eh);
+
+	for (i = 1; i <= eh->eh_entries; i++) {
+		if (depth > 1) {
+			bh = sb_bread(dest_inode->i_sb, idx->ei_leaf);
+			if (!bh) {
+				ret = -EIO;
+			} else {
+				ret = ext4_ext_remove_index_blocks(handle,
+					dest_inode, ext_block_hdr(bh),
+					depth - 1);
+				brelse(bh);
+			}
+		}
+#ifdef CONFIG_QUOTA
+		credits = 2 * EXT4_QUOTA_TRANS_BLOCKS(dest_inode->i_sb);
+#endif
+		handle = ext4_ext_journal_restart(handle,
+				credits + EXT4_TRANS_META_BLOCKS);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+
+		ext4_free_blocks(handle, dest_inode,
+				idx->ei_leaf, 1, metadata);
+		eh->eh_entries--;
+		idx++;
+	}
+	return ret;
+}
+
+/**
  * ext4_ext_alloc_blocks - allocate contiguous blocks to temporary inode
  * @dest_inode   temporary inode for multiple block allocation
  * @iblock       file related offset
@@ -2673,6 +3137,61 @@ out2:
 }
 
 /**
+ * ext4_ext_defrag_partial - defrag original file partially
+ * @filp:		pointer to file
+ * @org_offset:		page index on original file
+ * @dest_offset:	page index on temporary file
+ *
+ * This function returns 0 if succeeded, otherwise returns error value
+ */
+static int
+ext4_ext_defrag_partial(struct inode *tmp_inode,
+			struct file *filp,
+			pgoff_t org_offset,
+			pgoff_t dest_offset, unsigned long *delete_start)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	pgoff_t offset_in_page = PAGE_SIZE;
+	int ret = 0;
+
+	page = read_cache_page(inode->i_mapping,
+		       org_offset, (filler_t *)inode->i_mapping->a_ops->readpage,
+		       NULL);
+
+	if (IS_ERR(page)) {
+		ret = PTR_ERR(page);
+		return ret;
+	}
+
+	wait_on_page_locked(page);
+	lock_page(page);
+	/* release old bh and drop refs */
+	try_to_release_page(page, 0);
+	ret = ext4_ext_replace_branches(inode, tmp_inode, org_offset, 
+			dest_offset, 1, delete_start);
+	if (ret < 0)
+		goto ERR;
+
+	if (org_offset == ((inode->i_size - 1) >> PAGE_SHIFT))
+		offset_in_page = (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+	ret = mapping->a_ops->prepare_write(filp, page,
+					    0, offset_in_page);
+	if (ret)
+		goto ERR;
+
+	ret = mapping->a_ops->commit_write(filp, page,
+					   0, offset_in_page);
+ERR:
+	unlock_page(page);
+	page_cache_release(page);
+
+	return (ret < 0 ? ret : 0);
+}
+
+/**
  * ext4_ext_new_extent_tree -  allocate contiguous blocks
  * @inode:		inode of the original file
  * @tmp_inode:		inode of the temporary file

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-01-16 12:05 [RFC][PATCH 2/3] Move the file data to the new blocks sho
@ 2007-02-05 13:12 ` Jan Kara
  2007-02-05 22:06   ` Nathan Scott
  2007-02-07  1:35   ` Andrew Morton
  2007-02-07  1:33 ` Andrew Morton
  1 sibling, 2 replies; 18+ messages in thread
From: Jan Kara @ 2007-02-05 13:12 UTC (permalink / raw)
  To: sho; +Cc: linux-ext4, linux-fsdevel

  Hi,

  I'm replying rather late but I've been busy with my PhD thesis lately.
So sorry for that.

> Move the blocks on the temporary inode to the original inode
> by a page.
> 1. Read the file data from the old blocks to the page
> 2. Move the block on the temporary inode to the original inode
> 3. Write the file data on the page into the new blocks
  I have one thing - it's probably not good to use page cache for
defragmentation. As you will read/write tons of data and that will push
out all other data from the cache. I think it may be better to allocate
some separate pages and use them as buffers.
  Also I'm intending to work on a similar functionality for ext3
(without extents/mballoc). It would be probably good to find some common
parts so that they don't have to be duplicated.

								Honza

> Signed-off-by: Takashi Sato <sho@tnes.nec.co.jp>
> ---
> diff -Nrup -X linux-2.6.19-rc6-org/Documentation/dontdiff linux-2.6.19-rc6-1/fs/ext4/extents.c linux-2.6.19-rc6-FULL/fs/ext4/extents.c
> --- linux-2.6.19-rc6-1/fs/ext4/extents.c	2007-01-16 09:46:13.000000000 +0900
> +++ linux-2.6.19-rc6-FULL/fs/ext4/extents.c	2007-01-16 09:43:45.000000000 +0900
> @@ -2539,6 +2539,470 @@ ext4_ext_next_extent(struct inode *inode
>  }
>  
>  /**
> + * ext4_ext_merge_extents - merge new extent to the extent block
> + *
> + * @handle      journal handle
> + * @inode       target file's inode
> + * @org_path    path indicates first extent to be defraged
> + * @o_start     first original extent to be defraged
> + * @o_end       last original extent to be defraged
> + * @start_ext   first new extent to be merged
> + * @new_ext     middle of new extent to be merged
> + * @end_ext     last new extent to be merged
> + * @replaced    the number of blocks which will be replaced with new_ext
> + *
> + * This function returns 0 if succeed, otherwise returns -1.
> + */
> +static int
> +ext4_ext_merge_extents(handle_t *handle, struct inode *inode,
> +		struct ext4_ext_path *org_path,
> +		struct ext4_extent *o_start, struct ext4_extent *o_end,
> +		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
> +		struct ext4_extent *end_ext, unsigned long replaced)
> +{
> +	int	i = 0;
> +	unsigned need_slots;
> +	unsigned slots_range, len;
> +	int	 range_to_move;
> +	struct	ext4_extent_header * eh;
> +	struct	ext4_extent *free_start, *free_end;
> +	int	depth;
> +
> +	/* The extents need to be inserted
> +	 * start_extent + new_extent + end_extent
> +	 */
> +	need_slots = (le16_to_cpu(start_ext->ee_len) ? 1 : 0) +
> +			(le16_to_cpu(end_ext->ee_len) ? 1 : 0) +
> +			(le16_to_cpu(new_ext->ee_len) ? 1 : 0);
> +
> +	/* The number of slots between start and end */
> +	slots_range = o_end - o_start + 1;
> +
> +	/* Range to move the end of extent */
> +	range_to_move = need_slots - slots_range;
> +	depth = org_path->p_depth;
> +	org_path += depth;
> +	eh = org_path->p_hdr;
> +	/* expansion */
> +	if (range_to_move > 0) {
> +		if (range_to_move > le16_to_cpu(eh->eh_max)
> +				- le16_to_cpu(eh->eh_entries)) {
> +			printk("Cannot merge extents due to no space\n");
> +			return -1;
> +		}
> +	}
> +	if (depth) {
> +		/* Register to journal */
> +		if (ext4_journal_get_write_access(handle, org_path->p_bh)) {
> +			return -1;
> +		}
> +	}
> +
> +	/* Free old blocks
> +	 * dest        |---------------|
> +	 * org  |---------------|
> +	 */
> +	free_start = o_start;
> +	free_end = o_end;
> +	if (le16_to_cpu(start_ext->ee_len)) {
> +		if (le16_to_cpu(o_start->ee_len)
> +			> le16_to_cpu(start_ext->ee_len)) {
> +			ext4_free_blocks(handle, inode,
> +				ext_pblock(o_start)
> +					+ le16_to_cpu(start_ext->ee_len),
> +				le16_to_cpu(o_start->ee_len)
> +					- le16_to_cpu(start_ext->ee_len), 0);
> +		}
> +		free_start++;
> +	}
> +
> +	/* dest |----------------|
> +	 * org          |---------------|
> +	 */
> +	if (le16_to_cpu(end_ext->ee_len)) {
> +		ext4_free_blocks(handle, inode, ext_pblock(o_end),
> +			le16_to_cpu(o_end->ee_len)
> +				- le16_to_cpu(end_ext->ee_len), 0);
> +		free_end--;
> +	}
> +
> +	/* dest |-------------------|
> +	 * org   |-----------------|
> +	 */
> +	for (; free_start <= free_end; free_start++) {
> +		ext4_free_blocks(handle, inode,
> +			ext_pblock(free_start),
> +			le32_to_cpu(free_start->ee_len), 0);
> +	}
> +
> +	/* Move the existing extents */
> +	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
> +		len = EXT_LAST_EXTENT(eh) - (o_end + 1) + 1;
> +		len = len * sizeof(struct ext4_extent);
> +		memmove(o_end + 1 + range_to_move, o_end + 1, len);
> +	}
> +
> +	/* Insert start entry */
> +	if (le16_to_cpu(start_ext->ee_len)) {
> +		o_start[i++].ee_len = start_ext->ee_len;
> +	}
> +
> +	/* Insert new entry */
> +	if (le16_to_cpu(new_ext->ee_len)) {
> +		o_start[i].ee_block = new_ext->ee_block;
> +		o_start[i].ee_len = cpu_to_le16(replaced);
> +		ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
> +	}
> +
> +	/* Insert end entry */
> +	if (end_ext->ee_len) {
> +		o_start[i] = *end_ext;
> +	}
> +
> +	/* Increment the total entries counter on the extent block */
> +	eh->eh_entries
> +		= cpu_to_le16(le16_to_cpu(eh->eh_entries) + range_to_move);
> +	if (depth) {
> +		if (ext4_journal_dirty_metadata(handle, org_path->p_bh)) {
> +			return -1;
> +		}
> +	} else {
> +		if (ext4_mark_inode_dirty(handle, inode) < 0) {
> +			return -1;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * ext4_ext_defrag_leaf_block -  Defragmentation for one leaf extent block.
> + * @handle      journal handle
> + * @org_inode   target inode
> + * @org_path    path indicates first extent to be defraged
> + * @dext        destination extent
> + * @from        start offset on the target file
> + *
> + * This function returns 0 if succeed, otherwise returns error value.
> + */
> +static int
> +ext4_ext_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
> +		struct ext4_ext_path *org_path, struct ext4_extent *dext,
> +		unsigned long *from, unsigned long *delete_start)
> +{
> +	unsigned long depth, replaced = 0;
> +	struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
> +	struct ext4_extent new_ext, start_ext, end_ext;
> +	unsigned long new_end;
> +	unsigned long lblock;
> +	unsigned long len;
> +	unsigned long long new_phys_end;
> +	int	ret;
> +
> +	depth = ext_depth(org_inode);
> +	start_ext.ee_len = end_ext.ee_len = 0;
> +	o_start = o_end = oext = org_path[depth].p_ext;
> +	ext4_ext_store_pblock(&new_ext, ext_pblock(dext));	
> +	len = new_ext.ee_len = dext->ee_len;
> +	new_ext.ee_block = cpu_to_le32(*from);
> +	lblock = le32_to_cpu(oext->ee_block);
> +	new_end = le32_to_cpu(new_ext.ee_block)
> +		+ le16_to_cpu(new_ext.ee_len) - 1;
> +	new_phys_end = ext_pblock(&new_ext)
> +		+ le16_to_cpu(new_ext.ee_len) - 1;
> +
> +	/* First original extent
> +	 * dest         |---------------|
> +	 * org  |---------------|
> +	 */
> +	if (le32_to_cpu(new_ext.ee_block) >
> +		le32_to_cpu(oext->ee_block) &&
> +		le32_to_cpu(new_ext.ee_block) <
> +		le32_to_cpu(oext->ee_block)
> +		+ le16_to_cpu(oext->ee_len)) {
> +		start_ext.ee_len = cpu_to_le32(le32_to_cpu(new_ext.ee_block)
> +					- le32_to_cpu(oext->ee_block));
> +		replaced += le16_to_cpu(oext->ee_len)
> +					- le16_to_cpu(start_ext.ee_len);
> +	} else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
> +		/* We can merge previous extent. */
> +		prev_ext = oext -1;
> +		if ((ext_pblock(prev_ext)
> +			+ le32_to_cpu(prev_ext->ee_len))
> +			== ext_pblock(&new_ext)) {
> +			o_start = prev_ext;
> +			start_ext.ee_len = cpu_to_le32(
> +					le16_to_cpu(prev_ext->ee_len)
> +					+ le16_to_cpu(new_ext.ee_len));
> +			new_ext.ee_len = 0;
> +		}
> +	}
> +
> +	for (;;) {
> +
> +		/* The extent for destination must be found. */
> +		BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block));
> +		lblock += le16_to_cpu(oext->ee_len);
> +
> +		/* Middle of original extent
> +		 * dest |-------------------|
> +		 * org   |-----------------|
> +		 */
> +		if (le32_to_cpu(new_ext.ee_block) <=
> +			le32_to_cpu(oext->ee_block) &&
> +			new_end >= le32_to_cpu(oext->ee_block)
> +			+ le16_to_cpu(oext->ee_len) -1) {
> +			replaced += le16_to_cpu(oext->ee_len);
> +		}
> +
> +		/* Last original extent
> +		 * dest |----------------|
> +		 * org          |---------------|
> +		 */
> +		if (new_end >= le32_to_cpu(oext->ee_block) &&
> +			new_end < le32_to_cpu(oext->ee_block)
> +				+ le16_to_cpu(oext->ee_len) - 1) {
> +			end_ext.ee_len
> +				= cpu_to_le16(le32_to_cpu(oext->ee_block)
> +				+ le16_to_cpu(oext->ee_len) -1 - new_end);
> +			ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end)
> +				+ cpu_to_le16(oext->ee_len)
> +				- cpu_to_le16(end_ext.ee_len)));
> +			end_ext.ee_block
> +				= cpu_to_le32(le32_to_cpu(o_end->ee_block)
> +				+ le16_to_cpu(oext->ee_len)
> +				- le16_to_cpu(end_ext.ee_len));
> +			replaced += le16_to_cpu(oext->ee_len)
> +				- le16_to_cpu(end_ext.ee_len);
> +		}
> +
> +		/* Detected the block end, reached the number of replaced
> +		 * blocks to dext->ee_len.  Then, merge the extent.
> +		 */
> +		if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
> +			new_end <= le32_to_cpu(oext->ee_block)
> +				+ le16_to_cpu(oext->ee_len) - 1) {
> +			if (ext4_ext_merge_extents(handle, org_inode,
> +				org_path, o_start, o_end,
> +				&start_ext, &new_ext, &end_ext, replaced) < 0) {
> +				return -EIO;
> +			}
> +
> +			*delete_start = le32_to_cpu(new_ext.ee_block)
> +							+ replaced;
> +
> +			/* All expected blocks are replaced */
> +			if (new_ext.ee_len <= 0) {
> +				if (DQUOT_ALLOC_BLOCK
> +					(org_inode, len)) {
> +					return -EDQUOT;
> +				}
> +				return 0;
> +			}
> +
> +			/* re-calculate new_ext */
> +			new_ext.ee_len = cpu_to_le32(le16_to_cpu(new_ext.ee_len)
> +				- replaced);
> +			new_ext.ee_block =
> +				cpu_to_le32(le32_to_cpu(new_ext.ee_block)
> +				+ replaced);
> +			ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext)
> +					 + replaced);
> +			replaced = 0;
> +			start_ext.ee_len = end_ext.ee_len = 0;
> +			o_start = NULL;
> +
> +			/* All expected blocks are replaced */
> +			if (new_ext.ee_len <= 0) {
> +				if (DQUOT_ALLOC_BLOCK
> +					(org_inode, len)) {
> +					return -EDQUOT;
> +				}
> +				return 0;
> +			}
> +		}
> +
> +		/* Get next extent for original. */
> +		if ((ret
> +		 = ext4_ext_next_extent(org_inode, org_path, &oext))
> +								!= 0) {
> +			if (ret == 1)
> +				ret = -EIO;
> +			return ret;
> +		}
> +		o_end = oext;
> +		if (!o_start)
> +			o_start = oext;
> +	}
> +}
> +
> +/**
> + * ext4_ext_replace_branches - replace original extents with new extents.
> + * @org_inode    Original inode
> + * @dest_inode   temporary inode
> + * @from_page    Page offset
> + * @count_page   Page count to be replaced
> + * @delete_start block offset for deletion
> + *
> + * This function returns 0 if succeed, otherwise returns error value.
> + * Replace extents for blocks from "from" to "from+count-1".
> + */
> +static int
> +ext4_ext_replace_branches(struct inode *org_inode, struct inode *dest_inode,
> +			pgoff_t from_page,  pgoff_t dest_from_page,
> +			pgoff_t count_page, unsigned long *delete_start) 
> +{
> +	struct ext4_ext_path *org_path = NULL;
> +	struct ext4_ext_path *dest_path = NULL;
> +	struct ext4_extent   *oext, *dext;
> +	struct ext4_extent   tmp_ext;
> +	int	err = 0;
> +	int	depth;
> +	unsigned long from, count, dest_off, diff, replaced_count = 0;
> +	handle_t *handle = NULL;
> +	unsigned jnum;
> +
> +	from = from_page << (PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
> +	count = count_page << (PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
> +	dest_off = dest_from_page << (PAGE_CACHE_SHIFT - 
> +			dest_inode->i_blkbits);
> +	jnum = ext4_ext_writepage_trans_blocks(org_inode, count) + 3;
> +	handle = ext4_journal_start(org_inode, jnum);
> +	if (IS_ERR(handle)) {
> +		err = PTR_ERR(handle);
> +		goto out;
> +	}
> +
> +	/* Get the original extent for the block "from" */
> +	org_path = ext4_ext_find_extent(org_inode, from, NULL);
> +	if (IS_ERR(org_path)) {
> +		err = PTR_ERR(org_path);
> +		org_path = NULL;
> +		goto out;
> +	}
> +
> +	/* Get the destination extent for the head */
> +	dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
> +	if (IS_ERR(dest_path)) {
> +		err = PTR_ERR(dest_path);
> +		dest_path = NULL;
> +		goto out;
> +	}
> +	depth = ext_depth(dest_inode);
> +	dext = dest_path[depth].p_ext;
> +	/* When dext is too large, pick up the target range. */
> +	diff = dest_off - le32_to_cpu(dext->ee_block);
> +	ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
> +	tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
> +	tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
> +	if (count < tmp_ext.ee_len) {
> +		tmp_ext.ee_len = cpu_to_le16(count);
> +	}
> +	dext = &tmp_ext;
> +
> +	/* loop for the destination extents */
> +	while (1) {
> +		/* The extent for destination must be found. */
> +		BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block));
> +
> +		/* loop for the original extent blocks */
> +		if ((err = ext4_ext_defrag_leaf_block(handle, org_inode,
> +			org_path, dext, &from, delete_start)) < 0) {
> +			goto out;
> +		}
> +
> +		replaced_count += le16_to_cpu(dext->ee_len);
> +		dest_off += le16_to_cpu(dext->ee_len);
> +		from += le16_to_cpu(dext->ee_len);
> +
> +		/* Already moved the expected blocks */
> +		if (replaced_count >= count)
> +			break;
> +
> +		/* get the next extent on both original and destination. */
> +		err = ext4_ext_next_extent(dest_inode, dest_path, &dext);
> +		if (err != 0) {
> +			if (err > 0) {
> +				err = 0;
> +			}
> +			goto out;
> +		}
> +		if ((err =
> +			ext4_ext_next_extent(org_inode, org_path, &oext)) < 0) {
> +			goto out;
> +		}
> +	}
> +out:
> +	if (handle) {
> +		ext4_journal_stop(handle);
> +	}
> +	if (org_path) {
> +		ext4_ext_drop_refs(org_path);
> +		kfree(org_path);
> +	}
> +	if (dest_path) {
> +		ext4_ext_drop_refs(dest_path);
> +		kfree(dest_path);
> +	}
> +
> +	return err;
> +}
> +
> +/**
> + * ext4_ext_remove_index_blocks - remove leaf blocks and index blocks
> + * @handle      handle
> + * @dest_inode  temporary inode
> + * @eh          extent header
> + * @depth       depth of extent tree
> + *
> + * This function returns 0 if succeed, otherwise returns error value
> + */
> +static int ext4_ext_remove_index_blocks(handle_t *handle,
> +			struct inode *dest_inode,
> +			struct ext4_extent_header *eh, int depth)
> +{
> +	struct ext4_extent_idx *idx;
> +	struct buffer_head *bh;
> +	int metadata = 1;
> +	int credits = 0;
> +	int i, ret = 0;
> +
> +	if (depth == 0) {
> +		eh->eh_entries = 0;
> +		return 0;
> +	}
> +
> +	idx = EXT_FIRST_INDEX(eh);
> +
> +	for (i = 1; i <= eh->eh_entries; i++) {
> +		if (depth > 1) {
> +			bh = sb_bread(dest_inode->i_sb, idx->ei_leaf);
> +			if (!bh) {
> +				ret = -EIO;
> +			} else {
> +				ret = ext4_ext_remove_index_blocks(handle,
> +					dest_inode, ext_block_hdr(bh),
> +					depth - 1);
> +				brelse(bh);
> +			}
> +		}
> +#ifdef CONFIG_QUOTA
> +		credits = 2 * EXT4_QUOTA_TRANS_BLOCKS(dest_inode->i_sb);
> +#endif
> +		handle = ext4_ext_journal_restart(handle,
> +				credits + EXT4_TRANS_META_BLOCKS);
> +		if (IS_ERR(handle))
> +			return PTR_ERR(handle);
> +
> +		ext4_free_blocks(handle, dest_inode,
> +				idx->ei_leaf, 1, metadata);
> +		eh->eh_entries--;
> +		idx++;
> +	}
> +	return ret;
> +}
> +
> +/**
>   * ext4_ext_alloc_blocks - allocate contiguous blocks to temporary inode
>   * @dest_inode   temporary inode for multiple block allocation
>   * @iblock       file related offset
> @@ -2673,6 +3137,61 @@ out2:
>  }
>  
>  /**
> + * ext4_ext_defrag_partial - defrag original file partially
> + * @filp:		pointer to file
> + * @org_offset:		page index on original file
> + * @dest_offset:	page index on temporary file
> + *
> + * This function returns 0 if succeeded, otherwise returns error value
> + */
> +static int
> +ext4_ext_defrag_partial(struct inode *tmp_inode,
> +			struct file *filp,
> +			pgoff_t org_offset,
> +			pgoff_t dest_offset, unsigned long *delete_start)
> +{
> +	struct inode *inode = filp->f_dentry->d_inode;
> +	struct address_space *mapping = inode->i_mapping;
> +	struct page *page;
> +	pgoff_t offset_in_page = PAGE_SIZE;
> +	int ret = 0;
> +
> +	page = read_cache_page(inode->i_mapping,
> +		       org_offset, (filler_t *)inode->i_mapping->a_ops->readpage,
> +		       NULL);
> +
> +	if (IS_ERR(page)) {
> +		ret = PTR_ERR(page);
> +		return ret;
> +	}
> +
> +	wait_on_page_locked(page);
> +	lock_page(page);
> +	/* release old bh and drop refs */
> +	try_to_release_page(page, 0);
> +	ret = ext4_ext_replace_branches(inode, tmp_inode, org_offset, 
> +			dest_offset, 1, delete_start);
> +	if (ret < 0)
> +		goto ERR;
> +
> +	if (org_offset == ((inode->i_size - 1) >> PAGE_SHIFT))
> +		offset_in_page = (inode->i_size & (PAGE_CACHE_SIZE - 1));
> +
> +	ret = mapping->a_ops->prepare_write(filp, page,
> +					    0, offset_in_page);
> +	if (ret)
> +		goto ERR;
> +
> +	ret = mapping->a_ops->commit_write(filp, page,
> +					   0, offset_in_page);
> +ERR:
> +	unlock_page(page);
> +	page_cache_release(page);
> +
> +	return (ret < 0 ? ret : 0);
> +}
> +
> +/**
>   * ext4_ext_new_extent_tree -  allocate contiguous blocks
>   * @inode:		inode of the original file
>   * @tmp_inode:		inode of the temporary file
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-- 
Jan Kara <jack@suse.cz>
SuSE CR Labs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-05 13:12 ` Jan Kara
@ 2007-02-05 22:06   ` Nathan Scott
  2007-02-07  1:35   ` Andrew Morton
  1 sibling, 0 replies; 18+ messages in thread
From: Nathan Scott @ 2007-02-05 22:06 UTC (permalink / raw)
  To: Jan Kara; +Cc: sho, linux-ext4, linux-fsdevel

Hi Jan,

On Mon, 2007-02-05 at 14:12 +0100, Jan Kara wrote:
>   Hi,
> 
>   I'm replying rather late but I've been busy with my PhD thesis lately.
> So sorry for that.
> 
> > Move the blocks on the temporary inode to the original inode
> > by a page.
> > 1. Read the file data from the old blocks to the page
> > 2. Move the block on the temporary inode to the original inode
> > 3. Write the file data on the page into the new blocks
>   I have one thing - it's probably not good to use page cache for
> defragmentation. As you will read/write tons of data and that will push
> out all other data from the cache. I think it may be better to allocate
> some separate pages and use them as buffers.

The XFS defrag tool uses direct IO to avoid this issue.  You may
do an even better job with splice(2) in this situation, since its
simply read-then-write-straight-back-out without looking at the
file data.

cheers.

-- 
Nathan


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-01-16 12:05 [RFC][PATCH 2/3] Move the file data to the new blocks sho
  2007-02-05 13:12 ` Jan Kara
@ 2007-02-07  1:33 ` Andrew Morton
  2007-02-07  3:45   ` Eric Sandeen
  1 sibling, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2007-02-07  1:33 UTC (permalink / raw)
  To: sho; +Cc: linux-ext4, linux-fsdevel

On Tue, 16 Jan 2007 21:05:20 +0900
sho@tnes.nec.co.jp wrote:

> Move the blocks on the temporary inode to the original inode
> by a page.
> 1. Read the file data from the old blocks to the page
> 2. Move the block on the temporary inode to the original inode
> 3. Write the file data on the page into the new blocks
> 
> ...
>
> +
> +/**
> + * ext4_ext_replace_branches - replace original extents with new extents.
> + * @org_inode    Original inode
> + * @dest_inode   temporary inode
> + * @from_page    Page offset
> + * @count_page   Page count to be replaced
> + * @delete_start block offset for deletion
> + *
> + * This function returns 0 if succeed, otherwise returns error value.
> + * Replace extents for blocks from "from" to "from+count-1".
> + */
> +static int
> +ext4_ext_replace_branches(struct inode *org_inode, struct inode *dest_inode,
> +			pgoff_t from_page,  pgoff_t dest_from_page,
> +			pgoff_t count_page, unsigned long *delete_start) 
> +{
> +	struct ext4_ext_path *org_path = NULL;
> +	struct ext4_ext_path *dest_path = NULL;
> +	struct ext4_extent   *oext, *dext;
> +	struct ext4_extent   tmp_ext;
> +	int	err = 0;
> +	int	depth;
> +	unsigned long from, count, dest_off, diff, replaced_count = 0;

These should be sector_t, shouldn't they?

> +	handle_t *handle = NULL;
> +	unsigned jnum;
> +
> +	from = from_page << (PAGE_CACHE_SHIFT - dest_inode->i_blkbits);

In which case one needs to be very careful to avoid overflows in
expressions such as this one.

> +	wait_on_page_locked(page);
> +	lock_page(page);

The wait_on_page_locked() is unneeded here.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-05 13:12 ` Jan Kara
  2007-02-05 22:06   ` Nathan Scott
@ 2007-02-07  1:35   ` Andrew Morton
  2007-02-07 20:46     ` Andreas Dilger
  1 sibling, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2007-02-07  1:35 UTC (permalink / raw)
  To: Jan Kara; +Cc: sho, linux-ext4, linux-fsdevel

On Mon, 5 Feb 2007 14:12:04 +0100
Jan Kara <jack@suse.cz> wrote:

> > Move the blocks on the temporary inode to the original inode
> > by a page.
> > 1. Read the file data from the old blocks to the page
> > 2. Move the block on the temporary inode to the original inode
> > 3. Write the file data on the page into the new blocks
>   I have one thing - it's probably not good to use page cache for
> defragmentation.

Then it is no longer online defragmentation.  The issues with maintaining
correctness and coherency with ongoing VFS activity would be truly ghastly.

If we're worried about pagecache pollution then it would be better to control
that from userspace via fadvise().

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-07  1:33 ` Andrew Morton
@ 2007-02-07  3:45   ` Eric Sandeen
  2007-02-07  9:46     ` Takashi Sato
  0 siblings, 1 reply; 18+ messages in thread
From: Eric Sandeen @ 2007-02-07  3:45 UTC (permalink / raw)
  To: Andrew Morton; +Cc: sho, linux-ext4, linux-fsdevel

Andrew Morton wrote:
> On Tue, 16 Jan 2007 21:05:20 +0900
> sho@tnes.nec.co.jp wrote:

...

>> +ext4_ext_replace_branches(struct inode *org_inode, struct inode *dest_inode,
>> +			pgoff_t from_page,  pgoff_t dest_from_page,
>> +			pgoff_t count_page, unsigned long *delete_start) 
>> +{
>> +	struct ext4_ext_path *org_path = NULL;
>> +	struct ext4_ext_path *dest_path = NULL;
>> +	struct ext4_extent   *oext, *dext;
>> +	struct ext4_extent   tmp_ext;
>> +	int	err = 0;
>> +	int	depth;
>> +	unsigned long from, count, dest_off, diff, replaced_count = 0;
> 
> These should be sector_t, shouldn't they?

At some point should we start using blkcnt_t properly? 
(block-in[-large]-file, not block-in[-large]-device?)  I think that's 
what it was introduced for, although it's not in wide use at this point.

I guess the type really isn't used anywhere else; just in the inode's 
i_blocks.  Hmm.

-Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-07  3:45   ` Eric Sandeen
@ 2007-02-07  9:46     ` Takashi Sato
  0 siblings, 0 replies; 18+ messages in thread
From: Takashi Sato @ 2007-02-07  9:46 UTC (permalink / raw)
  To: Eric Sandeen, Andrew Morton; +Cc: linux-ext4, linux-fsdevel

Hi,

>>> +ext4_ext_replace_branches(struct inode *org_inode, struct inode *dest_inode,
>>> + pgoff_t from_page,  pgoff_t dest_from_page,
>>> + pgoff_t count_page, unsigned long *delete_start) +{
>>> + struct ext4_ext_path *org_path = NULL;
>>> + struct ext4_ext_path *dest_path = NULL;
>>> + struct ext4_extent   *oext, *dext;
>>> + struct ext4_extent   tmp_ext;
>>> + int err = 0;
>>> + int depth;
>>> + unsigned long from, count, dest_off, diff, replaced_count = 0;
>>
>> These should be sector_t, shouldn't they?
>
> At some point should we start using blkcnt_t properly? (block-in[-large]-file, not 
> block-in[-large]-device?)  I think that's what it was introduced for, although it's not 
> in wide use at this point.
>
> I guess the type really isn't used anywhere else; just in the inode's i_blocks.  Hmm.

On reflection, I think we should use ext4_fsblk_t in this case, because
some ext4 codes such as ext4_ext_get_blocks() use it.
int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t iblock,
So I would like to change "unsigned long" into ext4_fsblk_t in my next patch.

Cheers, Takashi 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-07  1:35   ` Andrew Morton
@ 2007-02-07 20:46     ` Andreas Dilger
  2007-02-07 20:56       ` Andrew Morton
  0 siblings, 1 reply; 18+ messages in thread
From: Andreas Dilger @ 2007-02-07 20:46 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jan Kara, sho, linux-ext4, linux-fsdevel

On Feb 06, 2007  17:35 -0800, Andrew Morton wrote:
> On Mon, 5 Feb 2007 14:12:04 +0100
> Jan Kara <jack@suse.cz> wrote:
> > > Move the blocks on the temporary inode to the original inode
> > > by a page.
> > > 1. Read the file data from the old blocks to the page
> > > 2. Move the block on the temporary inode to the original inode
> > > 3. Write the file data on the page into the new blocks
> >   I have one thing - it's probably not good to use page cache for
> > defragmentation.
> 
> Then it is no longer online defragmentation.  The issues with maintaining
> correctness and coherency with ongoing VFS activity would be truly ghastly.
> 
> If we're worried about pagecache pollution then it would be better to control
> that from userspace via fadvise().

It should be possible to have the online defrag tool lock the inode against
any changes, flush all pages out of the cache for that inode, and then do
the reallocated outside of the page cache.  For inodes not already in cache
this is a no-op.  For the (hopefully rare) case were the inode already has
cached pages and also needs to be reallocated it would be a performance hit.

Alternately, we could skip files currently being modified (or mmaped), or
even recently modified (e.g. within the last 30 minutes) in the default case,
on the assumption that they might be deleted soon anyways.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-07 20:46     ` Andreas Dilger
@ 2007-02-07 20:56       ` Andrew Morton
  2007-02-08  9:29         ` Jan Kara
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2007-02-07 20:56 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Jan Kara, sho, linux-ext4, linux-fsdevel

On Wed, 7 Feb 2007 13:46:57 -0700
Andreas Dilger <adilger@clusterfs.com> wrote:

> On Feb 06, 2007  17:35 -0800, Andrew Morton wrote:
> > On Mon, 5 Feb 2007 14:12:04 +0100
> > Jan Kara <jack@suse.cz> wrote:
> > > > Move the blocks on the temporary inode to the original inode
> > > > by a page.
> > > > 1. Read the file data from the old blocks to the page
> > > > 2. Move the block on the temporary inode to the original inode
> > > > 3. Write the file data on the page into the new blocks
> > >   I have one thing - it's probably not good to use page cache for
> > > defragmentation.
> > 
> > Then it is no longer online defragmentation.  The issues with maintaining
> > correctness and coherency with ongoing VFS activity would be truly ghastly.
> > 
> > If we're worried about pagecache pollution then it would be better to control
> > that from userspace via fadvise().
> 
> It should be possible to have the online defrag tool lock the inode against
> any changes,

Sounds easy when you say it fast.  But how do we "lock" against, say, a
read pagefault?  Only by writing back then removing the pagecache page then
reinstantiating it as a locked, not-uptodate page and then removing it from
pagecache afterwards prior to unlocking it.  Or something.

I don't think we want to go there.

> flush all pages out of the cache for that inode, and then do
> the reallocated outside of the page cache.  For inodes not already in cache
> this is a no-op.  For the (hopefully rare) case were the inode already has
> cached pages and also needs to be reallocated it would be a performance hit.
> 
> Alternately, we could skip files currently being modified (or mmaped), or
> even recently modified (e.g. within the last 30 minutes) in the default case,
> on the assumption that they might be deleted soon anyways.

argh.

It's simple to just use pagecache.  The "we don't want to swamp the machine
with pagecache" argument is bogus.  If it becomes a problem (and it might
not) then it is very simple to control the pagecache from userspace.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-07 20:56       ` Andrew Morton
@ 2007-02-08  9:29         ` Jan Kara
  2007-02-08  9:45           ` Andrew Morton
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2007-02-08  9:29 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andreas Dilger, sho, linux-ext4, linux-fsdevel

On Wed 07-02-07 12:56:59, Andrew Morton wrote:
> On Wed, 7 Feb 2007 13:46:57 -0700
> Andreas Dilger <adilger@clusterfs.com> wrote:
> 
> > On Feb 06, 2007  17:35 -0800, Andrew Morton wrote:
> > > On Mon, 5 Feb 2007 14:12:04 +0100
> > > Jan Kara <jack@suse.cz> wrote:
> > > > > Move the blocks on the temporary inode to the original inode
> > > > > by a page.
> > > > > 1. Read the file data from the old blocks to the page
> > > > > 2. Move the block on the temporary inode to the original inode
> > > > > 3. Write the file data on the page into the new blocks
> > > >   I have one thing - it's probably not good to use page cache for
> > > > defragmentation.
> > > 
> > > Then it is no longer online defragmentation.  The issues with maintaining
> > > correctness and coherency with ongoing VFS activity would be truly ghastly.
> > > 
> > > If we're worried about pagecache pollution then it would be better to control
> > > that from userspace via fadvise().
> > 
> > It should be possible to have the online defrag tool lock the inode against
> > any changes,
> 
> Sounds easy when you say it fast.  But how do we "lock" against, say, a
> read pagefault?  Only by writing back then removing the pagecache page then
> reinstantiating it as a locked, not-uptodate page and then removing it from
> pagecache afterwards prior to unlocking it.  Or something.
> 
> I don't think we want to go there.
  I though Andreas meant "any write changes" - i.e. you check that noone
has open file descriptor for writing and block any new open for writing.
That can be done quite easily.
  Anyway, I agree with you that userspace solution to a possible page
cache pollution is preferable after thinking about it for a while.
As I've been thinking about it, we could actually do the copying
from user space. We could do something like:
  block any writes to file (as I described above)
  craft new inode with blocks allocated as we want (using preallocation,
    we should mostly have the kernel infrastructure we need)
  copy data using splice syscall
  call the kernel to switch data

  But maybe I miss something and it's more complicated than I think.

									Honza
-- 
Jan Kara <jack@suse.cz>
SuSE CR Labs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-08  9:29         ` Jan Kara
@ 2007-02-08  9:45           ` Andrew Morton
  2007-02-08 10:21             ` Jan Kara
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2007-02-08  9:45 UTC (permalink / raw)
  To: Jan Kara; +Cc: Andreas Dilger, sho, linux-ext4, linux-fsdevel

On Thu, 8 Feb 2007 10:29:45 +0100 Jan Kara <jack@suse.cz> wrote:

> On Wed 07-02-07 12:56:59, Andrew Morton wrote:
> > On Wed, 7 Feb 2007 13:46:57 -0700
> > Andreas Dilger <adilger@clusterfs.com> wrote:
> > 
> > > On Feb 06, 2007  17:35 -0800, Andrew Morton wrote:
> > > > On Mon, 5 Feb 2007 14:12:04 +0100
> > > > Jan Kara <jack@suse.cz> wrote:
> > > > > > Move the blocks on the temporary inode to the original inode
> > > > > > by a page.
> > > > > > 1. Read the file data from the old blocks to the page
> > > > > > 2. Move the block on the temporary inode to the original inode
> > > > > > 3. Write the file data on the page into the new blocks
> > > > >   I have one thing - it's probably not good to use page cache for
> > > > > defragmentation.
> > > > 
> > > > Then it is no longer online defragmentation.  The issues with maintaining
> > > > correctness and coherency with ongoing VFS activity would be truly ghastly.
> > > > 
> > > > If we're worried about pagecache pollution then it would be better to control
> > > > that from userspace via fadvise().
> > > 
> > > It should be possible to have the online defrag tool lock the inode against
> > > any changes,
> > 
> > Sounds easy when you say it fast.  But how do we "lock" against, say, a
> > read pagefault?  Only by writing back then removing the pagecache page then
> > reinstantiating it as a locked, not-uptodate page and then removing it from
> > pagecache afterwards prior to unlocking it.  Or something.
> > 
> > I don't think we want to go there.
>   I though Andreas meant "any write changes" - i.e. you check that noone
> has open file descriptor for writing and block any new open for writing.
> That can be done quite easily.
>   Anyway, I agree with you that userspace solution to a possible page
> cache pollution is preferable after thinking about it for a while.
> As I've been thinking about it, we could actually do the copying
> from user space. We could do something like:
>   block any writes to file (as I described above)
>   craft new inode with blocks allocated as we want (using preallocation,
>     we should mostly have the kernel infrastructure we need)
>   copy data using splice syscall
>   call the kernel to switch data
> 

I don't think we need to block any writes to any file or anything.

To move a page within a file:

	fd = open(file);
	p = mmap(fd);
	the_page_was_in_core = mincore(p, offset);
	munmap(p);
	ioctl(fd, ..., new_block);

			<kernel>
			read_cache_page(inode, offset);
			lock_page(page);
			if (try_to_free_buffers(page)) {
				<relocate the page>
				set_page_dirty(page);
			}
			unlock_page(page);

	if (the_page_was_in_core) {
		sync_file_range(fd, offset SYNC_FILE_RANGE_WAIT_BEFORE|
						SYNC_FILE_RANGE_WRITE|
						SYNC_FILE_RANGE_WAIT_AFTER);
		fadvise(fd, offset, FADV_DONTNEED);
	}

completely coherent with pagecache, quite safe in the presence of mmap,
mlock, O_DIRECT, everything else.  Also fully journallable in-kernel.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-08  9:45           ` Andrew Morton
@ 2007-02-08 10:21             ` Jan Kara
  2007-02-08 10:32               ` Andrew Morton
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2007-02-08 10:21 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andreas Dilger, sho, linux-ext4, linux-fsdevel

On Thu 08-02-07 01:45:29, Andrew Morton wrote:
 <snip>
> >   I though Andreas meant "any write changes" - i.e. you check that noone
> > has open file descriptor for writing and block any new open for writing.
> > That can be done quite easily.
> >   Anyway, I agree with you that userspace solution to a possible page
> > cache pollution is preferable after thinking about it for a while.
> > As I've been thinking about it, we could actually do the copying
> > from user space. We could do something like:
> >   block any writes to file (as I described above)
> >   craft new inode with blocks allocated as we want (using preallocation,
> >     we should mostly have the kernel infrastructure we need)
> >   copy data using splice syscall
> >   call the kernel to switch data
> > 
> 
> I don't think we need to block any writes to any file or anything.
> 
> To move a page within a file:
> 
> 	fd = open(file);
> 	p = mmap(fd);
> 	the_page_was_in_core = mincore(p, offset);
> 	munmap(p);
> 	ioctl(fd, ..., new_block);
> 
> 			<kernel>
> 			read_cache_page(inode, offset);
> 			lock_page(page);
> 			if (try_to_free_buffers(page)) {
> 				<relocate the page>
> 				set_page_dirty(page);
> 			}
> 			unlock_page(page);
> 
> 	if (the_page_was_in_core) {
> 		sync_file_range(fd, offset SYNC_FILE_RANGE_WAIT_BEFORE|
> 						SYNC_FILE_RANGE_WRITE|
> 						SYNC_FILE_RANGE_WAIT_AFTER);
> 		fadvise(fd, offset, FADV_DONTNEED);
> 	}
> 
> completely coherent with pagecache, quite safe in the presence of mmap,
> mlock, O_DIRECT, everything else.  Also fully journallable in-kernel.
  Yes, this is the simple way. But I see two disadvantages:
1) You'd like to relocate metadata (indirect blocks) too. For that you need
   a different mechanism. In my approach, you can mostly assume you've got
   sanely laid out metadata and so the existence of such mechanism is not
   so important.
2) You'd like to allocate new blocks in big chunks. So your kernel function
   should rather take a range. Also when you fail in the middle of
   relocating a file (for example the block you'd like to use is already
   taken by someone else), I find it nice if you can return at least to the
   original state. But that's probably not important.

								Honza

-- 
Jan Kara <jack@suse.cz>
SuSE CR Labs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-08 10:21             ` Jan Kara
@ 2007-02-08 10:32               ` Andrew Morton
  2007-02-08 10:47                 ` Jan Kara
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2007-02-08 10:32 UTC (permalink / raw)
  To: Jan Kara; +Cc: Andreas Dilger, sho, linux-ext4, linux-fsdevel

On Thu, 8 Feb 2007 11:21:02 +0100 Jan Kara <jack@suse.cz> wrote:

> On Thu 08-02-07 01:45:29, Andrew Morton wrote:
>  <snip>
> > >   I though Andreas meant "any write changes" - i.e. you check that noone
> > > has open file descriptor for writing and block any new open for writing.
> > > That can be done quite easily.
> > >   Anyway, I agree with you that userspace solution to a possible page
> > > cache pollution is preferable after thinking about it for a while.
> > > As I've been thinking about it, we could actually do the copying
> > > from user space. We could do something like:
> > >   block any writes to file (as I described above)
> > >   craft new inode with blocks allocated as we want (using preallocation,
> > >     we should mostly have the kernel infrastructure we need)
> > >   copy data using splice syscall
> > >   call the kernel to switch data
> > > 
> > 
> > I don't think we need to block any writes to any file or anything.
> > 
> > To move a page within a file:
> > 
> > 	fd = open(file);
> > 	p = mmap(fd);
> > 	the_page_was_in_core = mincore(p, offset);
> > 	munmap(p);
> > 	ioctl(fd, ..., new_block);
> > 
> > 			<kernel>
> > 			read_cache_page(inode, offset);
> > 			lock_page(page);
> > 			if (try_to_free_buffers(page)) {
> > 				<relocate the page>
> > 				set_page_dirty(page);
> > 			}
> > 			unlock_page(page);
> > 
> > 	if (the_page_was_in_core) {
> > 		sync_file_range(fd, offset SYNC_FILE_RANGE_WAIT_BEFORE|
> > 						SYNC_FILE_RANGE_WRITE|
> > 						SYNC_FILE_RANGE_WAIT_AFTER);
> > 		fadvise(fd, offset, FADV_DONTNEED);
> > 	}
> > 
> > completely coherent with pagecache, quite safe in the presence of mmap,
> > mlock, O_DIRECT, everything else.  Also fully journallable in-kernel.
>   Yes, this is the simple way. But I see two disadvantages:
> 1) You'd like to relocate metadata (indirect blocks) too.

Well.  Do we really?  Are we looking for a 100% solution here, or a 90% one?

Relocating data is the main thing.  After that, yeah, relocating metadata,
inodes and directories is probably a second-order thing.

> For that you need
>    a different mechanism.

I suspect a similar approach will work there: load and lock the
buffer_heads (or maybe just the top-level buffer_head) and then alter their
contents.  It could be that verify_chain() will just magically do the right
thing there, but some changes might be needed.

> In my approach, you can mostly assume you've got
>    sanely laid out metadata and so the existence of such mechanism is not
>    so important.
> 2) You'd like to allocate new blocks in big chunks. So your kernel function
>    should rather take a range. Also when you fail in the middle of
>    relocating a file (for example the block you'd like to use is already
>    taken by someone else), I find it nice if you can return at least to the
>    original state. But that's probably not important.

Well yes, that was a minimal sketch.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-08 10:32               ` Andrew Morton
@ 2007-02-08 10:47                 ` Jan Kara
  2007-02-12  3:11                   ` Theodore Tso
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2007-02-08 10:47 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andreas Dilger, sho, linux-ext4, linux-fsdevel

On Thu 08-02-07 02:32:13, Andrew Morton wrote:
> On Thu, 8 Feb 2007 11:21:02 +0100 Jan Kara <jack@suse.cz> wrote:
> 
> > On Thu 08-02-07 01:45:29, Andrew Morton wrote:
> >  <snip>
> > > >   I though Andreas meant "any write changes" - i.e. you check that noone
> > > > has open file descriptor for writing and block any new open for writing.
> > > > That can be done quite easily.
> > > >   Anyway, I agree with you that userspace solution to a possible page
> > > > cache pollution is preferable after thinking about it for a while.
> > > > As I've been thinking about it, we could actually do the copying
> > > > from user space. We could do something like:
> > > >   block any writes to file (as I described above)
> > > >   craft new inode with blocks allocated as we want (using preallocation,
> > > >     we should mostly have the kernel infrastructure we need)
> > > >   copy data using splice syscall
> > > >   call the kernel to switch data
> > > > 
> > > 
> > > I don't think we need to block any writes to any file or anything.
> > > 
> > > To move a page within a file:
> > > 
> > > 	fd = open(file);
> > > 	p = mmap(fd);
> > > 	the_page_was_in_core = mincore(p, offset);
> > > 	munmap(p);
> > > 	ioctl(fd, ..., new_block);
> > > 
> > > 			<kernel>
> > > 			read_cache_page(inode, offset);
> > > 			lock_page(page);
> > > 			if (try_to_free_buffers(page)) {
> > > 				<relocate the page>
> > > 				set_page_dirty(page);
> > > 			}
> > > 			unlock_page(page);
> > > 
> > > 	if (the_page_was_in_core) {
> > > 		sync_file_range(fd, offset SYNC_FILE_RANGE_WAIT_BEFORE|
> > > 						SYNC_FILE_RANGE_WRITE|
> > > 						SYNC_FILE_RANGE_WAIT_AFTER);
> > > 		fadvise(fd, offset, FADV_DONTNEED);
> > > 	}
> > > 
> > > completely coherent with pagecache, quite safe in the presence of mmap,
> > > mlock, O_DIRECT, everything else.  Also fully journallable in-kernel.
> >   Yes, this is the simple way. But I see two disadvantages:
> > 1) You'd like to relocate metadata (indirect blocks) too.
> 
> Well.  Do we really?  Are we looking for a 100% solution here, or a 90% one?
  Umm, I think that for ext3 having data on one end of the disk and
indirect blocks on the other end of the disk does not quite help (not
mentioning that it can create bad free space fragmentation over the time).
I have not measured it but I'd guess that it would erase the effect of
moving data closer together. At least for sequential reads..

> Relocating data is the main thing.  After that, yeah, relocating metadata,
> inodes and directories is probably a second-order thing.
> 
> > For that you need
> >    a different mechanism.
> 
> I suspect a similar approach will work there: load and lock the
> buffer_heads (or maybe just the top-level buffer_head) and then alter their
> contents.  It could be that verify_chain() will just magically do the right
> thing there, but some changes might be needed.
  Yes, it could be done. I just wanted to point to the fact that things may
not be as simple in your solution either...

									Honza
-- 
Jan Kara <jack@suse.cz>
SuSE CR Labs

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC][PATCH 2/3] Move the file data to the new blocks
  2007-02-08 10:47                 ` Jan Kara
@ 2007-02-12  3:11                   ` Theodore Tso
  0 siblings, 0 replies; 18+ messages in thread
From: Theodore Tso @ 2007-02-12  3:11 UTC (permalink / raw)
  To: Jan Kara; +Cc: Andrew Morton, Andreas Dilger, sho, linux-ext4, linux-fsdevel

On Thu, Feb 08, 2007 at 11:47:39AM +0100, Jan Kara wrote:
> > 
> > Well.  Do we really?  Are we looking for a 100% solution here, or a 90% one?
>   Umm, I think that for ext3 having data on one end of the disk and
> indirect blocks on the other end of the disk does not quite help (not
> mentioning that it can create bad free space fragmentation over the time).
> I have not measured it but I'd guess that it would erase the effect of
> moving data closer together. At least for sequential reads..

I don't think anyone is saying we can ignore the metadata; but the
fact is, the cleanest solution for 90% of the problem is to use the
page cache, and as far as the other 10%, Linus has been pushing us to
move at least the directories into the page cache, and it's not insane
to consider moving the rest of the metadata into page cache.  At least
it's something we should consider carefully.

						- Ted

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC][PATCH 2/3] Move the file data to the new blocks
@ 2007-02-08  9:01 Takashi Sato
  0 siblings, 0 replies; 18+ messages in thread
From: Takashi Sato @ 2007-02-08  9:01 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel

Move the blocks on the temporary inode to the original inode
by a page.
1. Read the file data from the old blocks to the page
2. Move the block on the temporary inode to the original inode
3. Write the file data on the page into the new blocks

Signed-off-by: Takashi Sato <sho@tnes.nec.co.jp>
---
diff -Nrup -X linux-2.6.19-rc6.org/Documentation/dontdiff linux-2.6.19-rc6-1/fs/ext4/extents.c linux-2.6.19-rc6-full/fs/ext4/extents.c
--- linux-2.6.19-rc6-1/fs/ext4/extents.c	2007-02-08 14:13:49.000000000 +0900
+++ linux-2.6.19-rc6-full/fs/ext4/extents.c	2007-02-08 14:09:43.000000000 +0900
@@ -2533,6 +2533,653 @@ ext4_ext_next_extent(struct inode *inode
 }
 
 /**
+ * ext4_ext_merge_across - merge extents across leaf block
+ * 
+ * @handle	journal handle
+ * @inode	target file's inode
+ * @o_start	first original extent to be defraged
+ * @o_end	last original extent to be defraged
+ * @start_ext	first new extent to be merged
+ * @new_ext	middle of new extent to be merged
+ * @end_ext	last new extent to be merged
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext4_ext_merge_across_blocks(handle_t *handle, struct inode *inode,
+		struct ext4_extent *o_start,
+		struct ext4_extent *o_end, struct ext4_extent *start_ext,
+		struct ext4_extent *new_ext,struct ext4_extent *end_ext)
+{
+	struct ext4_ext_path *org_path = NULL;
+	unsigned long eblock = 0;
+	int err = 0;
+	int new_flag = 0;
+	int end_flag = 0;
+
+	if (le16_to_cpu(start_ext->ee_len) &&
+		le16_to_cpu(new_ext->ee_len) &&
+		le16_to_cpu(end_ext->ee_len)) {
+
+		if ((o_start) == (o_end)) {
+
+			/*       start_ext   new_ext    end_ext
+			 * dest |---------|-----------|--------|
+			 * org  |------------------------------|
+			 */
+
+			ext4_free_blocks(handle, inode, ext_pblock(o_start) +
+                                le16_to_cpu(start_ext->ee_len),
+                                le16_to_cpu(new_ext->ee_len), 0);
+
+			end_flag = 1;
+
+		} else {
+
+			/*       start_ext   new_ext   end_ext
+			 * dest |---------|----------|---------|
+			 * org  |---------------|--------------|
+			 */
+
+			ext4_free_blocks(handle, inode, ext_pblock(o_start) +
+				le16_to_cpu(start_ext->ee_len),
+				le16_to_cpu(o_start->ee_len)
+				- le16_to_cpu(start_ext->ee_len), 0);
+
+			ext4_free_blocks(handle, inode, ext_pblock(o_end),
+				le16_to_cpu(o_end->ee_len)
+				- le16_to_cpu(end_ext->ee_len), 0);
+
+			o_end->ee_block = end_ext->ee_block;
+			o_end->ee_len = end_ext->ee_len;
+			ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+		}
+
+		o_start->ee_len = start_ext->ee_len;
+		new_flag = 1;
+
+	} else if ((le16_to_cpu(start_ext->ee_len)) &&
+			(le16_to_cpu(new_ext->ee_len)) &&
+			(!le16_to_cpu(end_ext->ee_len)) &&
+			((o_start) == (o_end))) {
+
+		/*         start_ext        new_ext
+		 * dest |--------------|---------------|
+		 * org  |------------------------------|
+		 */
+
+		ext4_free_blocks(handle, inode, ext_pblock(o_start) +
+			le16_to_cpu(start_ext->ee_len),
+			le16_to_cpu(new_ext->ee_len), 0);
+
+		o_start->ee_len = start_ext->ee_len;
+		new_flag = 1;
+
+	} else if ((!le16_to_cpu(start_ext->ee_len)) &&
+			(le16_to_cpu(new_ext->ee_len)) &&
+			(le16_to_cpu(end_ext->ee_len)) &&
+			((o_start) == (o_end))) {
+
+		/*          new_ext        end_ext
+		 * dest |--------------|---------------|
+		 * org  |------------------------------|
+		 */
+
+		ext4_free_blocks(handle, inode, ext_pblock(o_end),
+			le16_to_cpu(new_ext->ee_len), 0);
+
+		o_end->ee_block = end_ext->ee_block;
+		o_end->ee_len = end_ext->ee_len;
+		ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+
+		/* If new_ext was first block */	
+		if (!new_ext->ee_block)
+			eblock = 0;
+		else
+			eblock = le32_to_cpu(new_ext->ee_block);
+
+		new_flag = 1;
+	} else {
+		printk("Unexpected case \n");
+		return -EIO;
+	}
+
+	if (new_flag) {
+		org_path = ext4_ext_find_extent(inode, eblock, NULL);
+		if (IS_ERR(org_path)) {
+			err = PTR_ERR(org_path);
+			org_path = NULL;
+			goto ERR;
+		}
+		err = ext4_ext_insert_extent(handle, inode,
+					org_path, new_ext);
+		if (err)
+			goto ERR;
+	}
+
+	if (end_flag) {
+		org_path = ext4_ext_find_extent(inode,
+					end_ext->ee_block -1, org_path);
+		if (IS_ERR(org_path)) {
+			err = PTR_ERR(org_path);
+			org_path = NULL;
+			goto ERR;
+		}
+		err = ext4_ext_insert_extent(handle, inode,
+					org_path, end_ext);
+		if (err)
+			goto ERR;
+	}
+ERR:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+
+	return err;
+
+}
+
+/**
+ * ext4_ext_merge_inside_block - merge new extent to the extent block
+ *
+ * @handle	journal handle
+ * @inode	target file's inode
+ * @o_start	first original extent to be defraged
+ * @o_end	last original extent to be merged
+ * @start_ext	first new extent to be merged
+ * @new_ext	middle of new extent to be merged
+ * @end_ext	last new extent to be merged
+ * @eh		extent header of target leaf block
+ * @replaced	the number of blocks which will be replaced with new_ext
+ * @range_to_move used to dicide how to merge
+ *
+ * This function always returns 0.
+*/
+static int
+ext4_ext_merge_inside_block(handle_t *handle, struct inode *inode,
+		struct ext4_extent *o_start, struct ext4_extent *o_end,
+		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+		struct ext4_extent *end_ext, struct ext4_extent_header *eh,
+		ext4_fsblk_t replaced, int range_to_move)
+{
+	struct ext4_extent *free_start, *free_end;
+	int i = 0;
+	unsigned len;
+
+	/* Free old blocks
+	 * dest        |---------------|
+	 * org  |---------------|
+	 */
+	free_start = o_start;
+	free_end = o_end;
+	if (le16_to_cpu(start_ext->ee_len)) {
+		if (le16_to_cpu(o_start->ee_len)
+			> le16_to_cpu(start_ext->ee_len)) {
+			ext4_free_blocks(handle, inode,
+				ext_pblock(o_start)
+					+ le16_to_cpu(start_ext->ee_len),
+				le16_to_cpu(o_start->ee_len)
+					- le16_to_cpu(start_ext->ee_len), 0);
+		}
+		free_start++;
+	}
+
+	/* dest |----------------|
+	 * org          |---------------|
+	 */
+	if (le16_to_cpu(end_ext->ee_len)) {
+		ext4_free_blocks(handle, inode, ext_pblock(o_end),
+			le16_to_cpu(o_end->ee_len)
+				- le16_to_cpu(end_ext->ee_len), 0);
+		free_end--;
+	}
+
+	/* dest |-------------------|
+	 * org   |-----------------|
+	 */
+	for (; free_start <= free_end; free_start++) {
+		ext4_free_blocks(handle, inode,
+			ext_pblock(free_start),
+			le32_to_cpu(free_start->ee_len), 0);
+	}
+
+	/* Move the existing extents */
+	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
+		len = EXT_LAST_EXTENT(eh) - (o_end + 1) + 1;
+		len = len * sizeof(struct ext4_extent);
+		memmove(o_end + 1 + range_to_move, o_end + 1, len);
+	}
+
+	/* Insert start entry */
+	if (le16_to_cpu(start_ext->ee_len)) {
+		o_start[i++].ee_len = start_ext->ee_len;
+	}
+
+	/* Insert new entry */
+	if (le16_to_cpu(new_ext->ee_len)) {
+		o_start[i].ee_block = new_ext->ee_block;
+		o_start[i].ee_len = cpu_to_le16(replaced);
+		ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
+	}
+
+	/* Insert end entry */
+	if (end_ext->ee_len) {
+		o_start[i] = *end_ext;
+	}
+
+	/* Increment the total entries counter on the extent block */
+	eh->eh_entries
+		= cpu_to_le16(le16_to_cpu(eh->eh_entries) + range_to_move);
+
+	return 0;
+}
+
+/**
+ * ext4_ext_merge_extents - merge new extent
+ *
+ * @handle      journal handle
+ * @inode       target file's inode
+ * @org_path    path indicates first extent to be defraged
+ * @o_start     first original extent to be defraged
+ * @o_end       last original extent to be defraged
+ * @start_ext   first new extent to be merged
+ * @new_ext     middle of new extent to be merged
+ * @end_ext     last new extent to be merged
+ * @replaced    the number of blocks which will be replaced with new_ext
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext4_ext_merge_extents(handle_t *handle, struct inode *inode,
+		struct ext4_ext_path *org_path,
+		struct ext4_extent *o_start, struct ext4_extent *o_end,
+		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+		struct ext4_extent *end_ext, ext4_fsblk_t replaced)
+{
+	struct  ext4_extent_header * eh;
+	unsigned need_slots, slots_range;
+	int	range_to_move, depth, ret;
+
+	/* The extents need to be inserted
+	 * start_extent + new_extent + end_extent
+	 */
+	need_slots = (le16_to_cpu(start_ext->ee_len) ? 1 : 0) +
+			(le16_to_cpu(end_ext->ee_len) ? 1 : 0) +
+			(le16_to_cpu(new_ext->ee_len) ? 1 : 0);
+
+	/* The number of slots between start and end */
+	slots_range = o_end - o_start + 1;
+
+	/* Range to move the end of extent */
+	range_to_move = need_slots - slots_range;
+	depth = org_path->p_depth;
+	org_path += depth;
+	eh = org_path->p_hdr;
+
+	if (depth) {
+		/* Register to journal */
+		if (ext4_journal_get_write_access(handle, org_path->p_bh))
+			return -EIO;
+	}
+
+	/* expansion */
+	if ((range_to_move > 0) && 
+		(range_to_move > le16_to_cpu(eh->eh_max)
+			- le16_to_cpu(eh->eh_entries))) {
+
+		if ((ret = ext4_ext_merge_across_blocks(handle, inode,
+				o_start, o_end, start_ext,
+				new_ext, end_ext)) < 0)
+			return ret;
+	} else {
+		if ((ret = ext4_ext_merge_inside_block(handle, inode,
+			o_start, o_end, start_ext, new_ext,
+			end_ext, eh, replaced, range_to_move)) < 0)
+			return ret;
+	}
+
+	if (depth) {
+		if (ext4_journal_dirty_metadata(handle, org_path->p_bh))
+			return -EIO;
+	} else {
+		if (ext4_mark_inode_dirty(handle, inode) < 0)
+			return -EIO;
+	}
+
+	return 0;
+
+}
+
+/**
+ * ext4_ext_defrag_leaf_block -  Defragmentation for one leaf extent block.
+ * @handle      journal handle
+ * @org_inode   target inode
+ * @org_path    path indicates first extent to be defraged
+ * @dext        destination extent
+ * @from        start offset on the target file
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext4_ext_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
+		struct ext4_ext_path *org_path, struct ext4_extent *dext,
+		ext4_fsblk_t *from, ext4_fsblk_t *delete_start)
+{
+	unsigned long depth;
+	ext4_fsblk_t replaced = 0;
+	struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
+	struct ext4_extent new_ext, start_ext, end_ext;
+	ext4_fsblk_t new_end;
+	ext4_fsblk_t lblock;
+	ext4_fsblk_t len;
+	ext4_fsblk_t new_phys_end;
+	int	ret;
+
+	depth = ext_depth(org_inode);
+	start_ext.ee_len = end_ext.ee_len = 0;
+	o_start = o_end = oext = org_path[depth].p_ext;
+	ext4_ext_store_pblock(&new_ext, ext_pblock(dext));	
+	len = new_ext.ee_len = dext->ee_len;
+	new_ext.ee_block = cpu_to_le32(*from);
+	lblock = le32_to_cpu(oext->ee_block);
+	new_end = le32_to_cpu(new_ext.ee_block)
+		+ le16_to_cpu(new_ext.ee_len) - 1;
+	new_phys_end = ext_pblock(&new_ext)
+		+ le16_to_cpu(new_ext.ee_len) - 1;
+
+	/* First original extent
+	 * dest         |---------------|
+	 * org  |---------------|
+	 */
+	if (le32_to_cpu(new_ext.ee_block) >
+		le32_to_cpu(oext->ee_block) &&
+		le32_to_cpu(new_ext.ee_block) <
+		le32_to_cpu(oext->ee_block)
+		+ le16_to_cpu(oext->ee_len)) {
+		start_ext.ee_len = cpu_to_le32(le32_to_cpu(new_ext.ee_block)
+					- le32_to_cpu(oext->ee_block));
+		replaced += le16_to_cpu(oext->ee_len)
+					- le16_to_cpu(start_ext.ee_len);
+	} else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
+		/* We can merge previous extent. */
+		prev_ext = oext -1;
+		if ((ext_pblock(prev_ext)
+			+ le32_to_cpu(prev_ext->ee_len))
+			== ext_pblock(&new_ext)) {
+			o_start = prev_ext;
+			start_ext.ee_len = cpu_to_le32(
+					le16_to_cpu(prev_ext->ee_len)
+					+ le16_to_cpu(new_ext.ee_len));
+			new_ext.ee_len = 0;
+		}
+	}
+	for (;;) {
+		/* The extent for destination must be found. */
+		BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block));
+		lblock += le16_to_cpu(oext->ee_len);
+
+		/* Middle of original extent
+		 * dest |-------------------|
+		 * org   |-----------------|
+		 */
+		if (le32_to_cpu(new_ext.ee_block) <=
+			le32_to_cpu(oext->ee_block) &&
+			new_end >= le32_to_cpu(oext->ee_block)
+			+ le16_to_cpu(oext->ee_len) -1) {
+			replaced += le16_to_cpu(oext->ee_len);
+		}
+
+		/* Last original extent
+		 * dest |----------------|
+		 * org          |---------------|
+		 */
+		if (new_end >= le32_to_cpu(oext->ee_block) &&
+			new_end < le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1) {
+			end_ext.ee_len
+				= cpu_to_le16(le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) -1 - new_end);
+			ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end)
+				+ cpu_to_le16(oext->ee_len)
+				- cpu_to_le16(end_ext.ee_len)));
+			end_ext.ee_block
+				= cpu_to_le32(le32_to_cpu(o_end->ee_block)
+				+ le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len));
+			replaced += le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len);
+		}
+
+		/* Detected the block end, reached the number of replaced
+		 * blocks to dext->ee_len.  Then, merge the extent.
+		 */
+		if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
+			new_end <= le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1) {
+			if ((ret = ext4_ext_merge_extents(handle, org_inode,
+				org_path, o_start, o_end, &start_ext,
+				&new_ext, &end_ext, replaced)) < 0) {
+				return ret;
+			}
+
+			*delete_start = le32_to_cpu(new_ext.ee_block)
+							+ replaced;
+
+			/* All expected blocks are replaced */
+			if (new_ext.ee_len <= 0) {
+				if (DQUOT_ALLOC_BLOCK
+					(org_inode, len)) {
+					return -EDQUOT;
+				}
+				return 0;
+			}
+
+			/* re-calculate new_ext */
+			new_ext.ee_len = cpu_to_le32(le16_to_cpu(new_ext.ee_len)
+				- replaced);
+			new_ext.ee_block =
+				cpu_to_le32(le32_to_cpu(new_ext.ee_block)
+				+ replaced);
+			ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext)
+					 + replaced);
+			replaced = 0;
+			start_ext.ee_len = end_ext.ee_len = 0;
+			o_start = NULL;
+
+			/* All expected blocks are replaced */
+			if (new_ext.ee_len <= 0) {
+				if (DQUOT_ALLOC_BLOCK
+					(org_inode, len)) {
+					return -EDQUOT;
+				}
+				return 0;
+			}
+		}
+
+		/* Get next extent for original. */
+		if ((ret
+		 = ext4_ext_next_extent(org_inode, org_path, &oext))
+								!= 0) {
+			if (ret == 1)
+				ret = -EIO;
+			return ret;
+		}
+		o_end = oext;
+		if (!o_start)
+			o_start = oext;
+	}
+}
+
+/**
+ * ext4_ext_replace_branches - replace original extents with new extents.
+ * @org_inode    Original inode
+ * @dest_inode   temporary inode
+ * @from_page    Page offset
+ * @count_page   Page count to be replaced
+ * @delete_start block offset for deletion
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ * Replace extents for blocks from "from" to "from+count-1".
+ */
+static int
+ext4_ext_replace_branches(struct inode *org_inode, struct inode *dest_inode,
+			pgoff_t from_page,  pgoff_t dest_from_page,
+			pgoff_t count_page, ext4_fsblk_t *delete_start) 
+{
+	struct ext4_ext_path *org_path = NULL;
+	struct ext4_ext_path *dest_path = NULL;
+	struct ext4_extent   *oext, *dext;
+	struct ext4_extent   tmp_ext;
+	int	err = 0;
+	int	depth;
+	ext4_fsblk_t from, count, dest_off, diff, replaced_count = 0;
+	handle_t *handle = NULL;
+	unsigned jnum;
+
+	from = (ext4_fsblk_t)from_page << 
+			(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	count = (ext4_fsblk_t)count_page <<
+			(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	dest_off = (ext4_fsblk_t)dest_from_page <<
+			(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	jnum = ext4_ext_writepage_trans_blocks(org_inode, count) + 3;
+	handle = ext4_journal_start(org_inode, jnum);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out;
+	}
+
+	/* Get the original extent for the block "from" */
+	org_path = ext4_ext_find_extent(org_inode, from, NULL);
+	if (IS_ERR(org_path)) {
+		err = PTR_ERR(org_path);
+		org_path = NULL;
+		goto out;
+	}
+
+	/* Get the destination extent for the head */
+	dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
+	if (IS_ERR(dest_path)) {
+		err = PTR_ERR(dest_path);
+		dest_path = NULL;
+		goto out;
+	}
+	depth = ext_depth(dest_inode);
+	dext = dest_path[depth].p_ext;
+	/* When dext is too large, pick up the target range. */
+	diff = dest_off - le32_to_cpu(dext->ee_block);
+	ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
+	tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+	tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+	if (count < tmp_ext.ee_len) {
+		tmp_ext.ee_len = cpu_to_le16(count);
+	}
+	dext = &tmp_ext;
+
+	/* loop for the destination extents */
+	while (1) {
+		/* The extent for destination must be found. */
+		BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block));
+
+		/* loop for the original extent blocks */
+		if ((err = ext4_ext_defrag_leaf_block(handle, org_inode,
+			org_path, dext, &from, delete_start)) < 0) {
+			goto out;
+		}
+
+		replaced_count += le16_to_cpu(dext->ee_len);
+		dest_off += le16_to_cpu(dext->ee_len);
+		from += le16_to_cpu(dext->ee_len);
+
+		/* Already moved the expected blocks */
+		if (replaced_count >= count)
+			break;
+
+		/* get the next extent on both original and destination. */
+		err = ext4_ext_next_extent(dest_inode, dest_path, &dext);
+		if (err != 0) {
+			if (err > 0) {
+				err = 0;
+			}
+			goto out;
+		}
+		if ((err =
+			ext4_ext_next_extent(org_inode, org_path, &oext)) < 0) {
+			goto out;
+		}
+	}
+out:
+	if (handle) {
+		ext4_journal_stop(handle);
+	}
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (dest_path) {
+		ext4_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return err;
+}
+
+/**
+ * ext4_ext_remove_index_blocks - remove leaf blocks and index blocks
+ * @handle      handle
+ * @dest_inode  temporary inode
+ * @eh          extent header
+ * @depth       depth of extent tree
+ *
+ * This function returns 0 if succeed, otherwise returns error value
+ */
+static int ext4_ext_remove_index_blocks(handle_t *handle,
+			struct inode *dest_inode,
+			struct ext4_extent_header *eh, int depth)
+{
+	struct ext4_extent_idx *idx;
+	struct buffer_head *bh;
+	int metadata = 1;
+	int credits = 0;
+	int i, ret = 0;
+
+	if (depth == 0) {
+		eh->eh_entries = 0;
+		return 0;
+	}
+
+	idx = EXT_FIRST_INDEX(eh);
+
+	for (i = 1; i <= eh->eh_entries; i++) {
+		if (depth > 1) {
+			bh = sb_bread(dest_inode->i_sb, idx->ei_leaf);
+			if (!bh) {
+				ret = -EIO;
+			} else {
+				ret = ext4_ext_remove_index_blocks(handle,
+					dest_inode, ext_block_hdr(bh),
+					depth - 1);
+				brelse(bh);
+			}
+		}
+#ifdef CONFIG_QUOTA
+		credits = 2 * EXT4_QUOTA_TRANS_BLOCKS(dest_inode->i_sb);
+#endif
+		handle = ext4_ext_journal_restart(handle,
+				credits + EXT4_TRANS_META_BLOCKS);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+
+		ext4_free_blocks(handle, dest_inode,
+				idx->ei_leaf, 1, metadata);
+		eh->eh_entries--;
+		idx++;
+	}
+	return ret;
+}
+
+/**
  * ext4_ext_alloc_blocks - allocate contiguous blocks to temporary inode
  * @dest_inode   temporary inode for multiple block allocation
  * @iblock       file related offset
@@ -2667,6 +3314,60 @@ out2:
 }
 
 /**
+ * ext4_ext_defrag_partial - defrag original file partially
+ * @filp:		pointer to file
+ * @org_offset:		page index on original file
+ * @dest_offset:	page index on temporary file
+ *
+ * This function returns 0 if succeeded, otherwise returns error value
+ */
+static int
+ext4_ext_defrag_partial(struct inode *tmp_inode,
+			struct file *filp,
+			pgoff_t org_offset,
+			pgoff_t dest_offset, ext4_fsblk_t *delete_start)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	pgoff_t offset_in_page = PAGE_SIZE;
+	int ret = 0;
+
+	page = read_cache_page(inode->i_mapping,
+		       org_offset, (filler_t *)inode->i_mapping->a_ops->readpage,
+		       NULL);
+
+	if (IS_ERR(page)) {
+		ret = PTR_ERR(page);
+		return ret;
+	}
+
+	lock_page(page);
+	/* release old bh and drop refs */
+	try_to_release_page(page, 0);
+	ret = ext4_ext_replace_branches(inode, tmp_inode, org_offset, 
+			dest_offset, 1, delete_start);
+	if (ret < 0)
+		goto ERR;
+
+	if (org_offset == ((inode->i_size - 1) >> PAGE_SHIFT))
+		offset_in_page = (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+	ret = mapping->a_ops->prepare_write(filp, page,
+					    0, offset_in_page);
+	if (ret)
+		goto ERR;
+
+	ret = mapping->a_ops->commit_write(filp, page,
+					   0, offset_in_page);
+ERR:
+	unlock_page(page);
+	page_cache_release(page);
+
+	return (ret < 0 ? ret : 0);
+}
+
+/**
  * ext4_ext_new_extent_tree -  allocate contiguous blocks
  * @inode:		inode of the original file
  * @tmp_inode:		inode of the temporary file


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC][PATCH 2/3] Move the file data to the new blocks
@ 2006-12-22 10:30 sho
  0 siblings, 0 replies; 18+ messages in thread
From: sho @ 2006-12-22 10:30 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel

Move the blocks on the temporary inode to the original inode
by a page.
1. Read the file data from the old blocks to the page
2. Move the block on the temporary inode to the original inode
3. Write the file data on the page into the new blocks

Signed-off-by: Takashi Sato <sho@tnes.nec.co.jp>
---
diff -Nrup -X linux-2.6.19-rc6-1/Documentation/dontdiff linux-2.6.19-rc6-1/fs/ext4/extents.c linux-2.6.19-rc6-full/fs/ext4/extents.c
--- linux-2.6.19-rc6-1/fs/ext4/extents.c	2006-12-22 14:41:49.000000000 +0900
+++ linux-2.6.19-rc6-full/fs/ext4/extents.c	2006-12-22 14:56:17.000000000 +0900
@@ -2518,6 +2518,470 @@ ext4_ext_next_extent(struct inode *inode
 }
 
 /**
+ * ext4_ext_merge_extents - merge new extent to the extent block
+ *
+ * @handle      journal handle
+ * @inode       target file's inode
+ * @org_path    path indicates first extent to be defraged
+ * @o_start     first original extent to be defraged
+ * @o_end       last original extent to be defraged
+ * @start_ext   first new extent to be merged
+ * @new_ext     middle of new extent to be merged
+ * @end_ext     last new extent to be merged
+ * @replaced    the number of blocks which will be replaced with new_ext
+ *
+ * This function returns 0 if succeed, otherwise returns -1.
+ */
+static int
+ext4_ext_merge_extents(handle_t *handle, struct inode *inode,
+		struct ext4_ext_path *org_path,
+		struct ext4_extent *o_start, struct ext4_extent *o_end,
+		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+		struct ext4_extent *end_ext, unsigned long replaced)
+{
+	int	i = 0;
+	unsigned need_slots;
+	unsigned slots_range, len;
+	int	 range_to_move;
+	struct	ext4_extent_header * eh;
+	struct	ext4_extent *free_start, *free_end;
+	int	depth;
+
+	/* The extents need to be inserted
+	 * start_extent + new_extent + end_extent
+	 */
+	need_slots = (le16_to_cpu(start_ext->ee_len) ? 1 : 0) +
+			(le16_to_cpu(end_ext->ee_len) ? 1 : 0) +
+			(le16_to_cpu(new_ext->ee_len) ? 1 : 0);
+
+	/* The number of slots between start and end */
+	slots_range = o_end - o_start + 1;
+
+	/* Range to move the end of extent */
+	range_to_move = need_slots - slots_range;
+	depth = org_path->p_depth;
+	org_path += depth;
+	eh = org_path->p_hdr;
+	/* expansion */
+	if (range_to_move > 0) {
+		if (range_to_move > le16_to_cpu(eh->eh_max)
+				- le16_to_cpu(eh->eh_entries)) {
+			printk("Cannot merge extents due to no space\n");
+			return -1;
+		}
+	}
+	if (depth) {
+		/* Register to journal */
+		if (ext4_journal_get_write_access(handle, org_path->p_bh)) {
+			return -1;
+		}
+	}
+
+	/* Free old blocks
+	 * dest        |---------------|
+	 * org  |---------------|
+	 */
+	free_start = o_start;
+	free_end = o_end;
+	if (le16_to_cpu(start_ext->ee_len)) {
+		if (le16_to_cpu(o_start->ee_len)
+			> le16_to_cpu(start_ext->ee_len)) {
+			ext4_free_blocks(handle, inode,
+				ext_pblock(o_start)
+					+ le16_to_cpu(start_ext->ee_len),
+				le16_to_cpu(o_start->ee_len)
+					- le16_to_cpu(start_ext->ee_len), 0);
+		}
+		free_start++;
+	}
+
+	/* dest |----------------|
+	 * org          |---------------|
+	 */
+	if (le16_to_cpu(end_ext->ee_len)) {
+		ext4_free_blocks(handle, inode, ext_pblock(o_end),
+			le16_to_cpu(o_end->ee_len)
+				- le16_to_cpu(end_ext->ee_len), 0);
+		free_end--;
+	}
+
+	/* dest |-------------------|
+	 * org   |-----------------|
+	 */
+	for (; free_start <= free_end; free_start++) {
+		ext4_free_blocks(handle, inode,
+			ext_pblock(free_start),
+			le32_to_cpu(free_start->ee_len), 0);
+	}
+
+	/* Move the existing extents */
+	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
+		len = EXT_LAST_EXTENT(eh) - (o_end + 1) + 1;
+		len = len * sizeof(struct ext4_extent);
+		memmove(o_end + 1 + range_to_move, o_end + 1, len);
+	}
+
+	/* Insert start entry */
+	if (le16_to_cpu(start_ext->ee_len)) {
+		o_start[i++].ee_len = start_ext->ee_len;
+	}
+
+	/* Insert new entry */
+	if (le16_to_cpu(new_ext->ee_len)) {
+		o_start[i].ee_block = new_ext->ee_block;
+		o_start[i].ee_len = cpu_to_le16(replaced);
+		ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
+	}
+
+	/* Insert end entry */
+	if (end_ext->ee_len) {
+		o_start[i] = *end_ext;
+	}
+
+	/* Increment the total entries counter on the extent block */
+	eh->eh_entries
+		= cpu_to_le16(le16_to_cpu(eh->eh_entries) + range_to_move);
+	if (depth) {
+		if (ext4_journal_dirty_metadata(handle, org_path->p_bh)) {
+			return -1;
+		}
+	} else {
+		if (ext4_mark_inode_dirty(handle, inode) < 0) {
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ext4_ext_defrag_leaf_block -  Defragmentation for one leaf extent block.
+ * @handle      journal handle
+ * @org_inode   target inode
+ * @org_path    path indicates first extent to be defraged
+ * @dext        destination extent
+ * @from        start offset on the target file
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext4_ext_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
+		struct ext4_ext_path *org_path, struct ext4_extent *dext,
+		unsigned long *from, unsigned long *delete_start)
+{
+	unsigned long depth, replaced = 0;
+	struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
+	struct ext4_extent new_ext, start_ext, end_ext;
+	unsigned long new_end;
+	unsigned long lblock;
+	unsigned long len;
+	unsigned long long new_phys_end;
+	int	ret;
+
+	depth = ext_depth(org_inode);
+	start_ext.ee_len = end_ext.ee_len = 0;
+	o_start = o_end = oext = org_path[depth].p_ext;
+	ext4_ext_store_pblock(&new_ext, ext_pblock(dext));	
+	len = new_ext.ee_len = dext->ee_len;
+	new_ext.ee_block = cpu_to_le32(*from);
+	lblock = le32_to_cpu(oext->ee_block);
+	new_end = le32_to_cpu(new_ext.ee_block)
+		+ le16_to_cpu(new_ext.ee_len) - 1;
+	new_phys_end = ext_pblock(&new_ext)
+		+ le16_to_cpu(new_ext.ee_len) - 1;
+
+	/* First original extent
+	 * dest         |---------------|
+	 * org  |---------------|
+	 */
+	if (le32_to_cpu(new_ext.ee_block) >
+		le32_to_cpu(oext->ee_block) &&
+		le32_to_cpu(new_ext.ee_block) <
+		le32_to_cpu(oext->ee_block)
+		+ le16_to_cpu(oext->ee_len)) {
+		start_ext.ee_len = cpu_to_le32(le32_to_cpu(new_ext.ee_block)
+					- le32_to_cpu(oext->ee_block));
+		replaced += le16_to_cpu(oext->ee_len)
+					- le16_to_cpu(start_ext.ee_len);
+	} else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
+		/* We can merge previous extent. */
+		prev_ext = oext -1;
+		if ((ext_pblock(prev_ext)
+			+ le32_to_cpu(prev_ext->ee_len))
+			== ext_pblock(&new_ext)) {
+			o_start = prev_ext;
+			start_ext.ee_len = cpu_to_le32(
+					le16_to_cpu(prev_ext->ee_len)
+					+ le16_to_cpu(new_ext.ee_len));
+			new_ext.ee_len = 0;
+		}
+	}
+
+	for (;;) {
+
+		/* The extent for destination must be found. */
+		BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block));
+		lblock += le16_to_cpu(oext->ee_len);
+
+		/* Middle of original extent
+		 * dest |-------------------|
+		 * org   |-----------------|
+		 */
+		if (le32_to_cpu(new_ext.ee_block) <=
+			le32_to_cpu(oext->ee_block) &&
+			new_end >= le32_to_cpu(oext->ee_block)
+			+ le16_to_cpu(oext->ee_len) -1) {
+			replaced += le16_to_cpu(oext->ee_len);
+		}
+
+		/* Last original extent
+		 * dest |----------------|
+		 * org          |---------------|
+		 */
+		if (new_end >= le32_to_cpu(oext->ee_block) &&
+			new_end < le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1) {
+			end_ext.ee_len
+				= cpu_to_le16(le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) -1 - new_end);
+			ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end)
+				+ cpu_to_le16(oext->ee_len)
+				- cpu_to_le16(end_ext.ee_len)));
+			end_ext.ee_block
+				= cpu_to_le32(le32_to_cpu(o_end->ee_block)
+				+ le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len));
+			replaced += le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len);
+		}
+
+		/* Detected the block end, reached the number of replaced
+		 * blocks to dext->ee_len.  Then, merge the extent.
+		 */
+		if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
+			new_end <= le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1) {
+			if (ext4_ext_merge_extents(handle, org_inode,
+				org_path, o_start, o_end,
+				&start_ext, &new_ext, &end_ext, replaced) < 0) {
+				return -EIO;
+			}
+
+			*delete_start = le32_to_cpu(new_ext.ee_block)
+							+ replaced;
+
+			/* All expected blocks are replaced */
+			if (new_ext.ee_len <= 0) {
+				if (DQUOT_ALLOC_BLOCK
+					(org_inode, len)) {
+					return -EDQUOT;
+				}
+				return 0;
+			}
+
+			/* re-calculate new_ext */
+			new_ext.ee_len = cpu_to_le32(le16_to_cpu(new_ext.ee_len)
+				- replaced);
+			new_ext.ee_block =
+				cpu_to_le32(le32_to_cpu(new_ext.ee_block)
+				+ replaced);
+			ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext)
+					 + replaced);
+			replaced = 0;
+			start_ext.ee_len = end_ext.ee_len = 0;
+			o_start = NULL;
+
+			/* All expected blocks are replaced */
+			if (new_ext.ee_len <= 0) {
+				if (DQUOT_ALLOC_BLOCK
+					(org_inode, len)) {
+					return -EDQUOT;
+				}
+				return 0;
+			}
+		}
+
+		/* Get next extent for original. */
+		if ((ret
+		 = ext4_ext_next_extent(org_inode, org_path, &oext))
+								!= 0) {
+			if (ret == 1)
+				ret = -EIO;
+			return ret;
+		}
+		o_end = oext;
+		if (!o_start)
+			o_start = oext;
+	}
+}
+
+/**
+ * ext4_ext_replace_branches - replace original extents with new extents.
+ * @org_inode    Original inode
+ * @dest_inode   temporary inode
+ * @from_page    Page offset
+ * @count_page   Page count to be replaced
+ * @delete_start block offset for deletion
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ * Replace extents for blocks from "from" to "from+count-1".
+ */
+static int
+ext4_ext_replace_branches(struct inode *org_inode, struct inode *dest_inode,
+			pgoff_t from_page,  pgoff_t dest_from_page,
+			pgoff_t count_page, unsigned long *delete_start) 
+{
+	struct ext4_ext_path *org_path = NULL;
+	struct ext4_ext_path *dest_path = NULL;
+	struct ext4_extent   *oext, *dext;
+	struct ext4_extent   tmp_ext;
+	int	err = 0;
+	int	depth;
+	unsigned long from, count, dest_off, diff, replaced_count = 0;
+	handle_t *handle = NULL;
+	unsigned jnum;
+
+	from = from_page << (PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	count = count_page << (PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	dest_off = dest_from_page << (PAGE_CACHE_SHIFT - 
+			dest_inode->i_blkbits);
+	jnum = ext4_ext_writepage_trans_blocks(org_inode, count) + 3;
+	handle = ext4_journal_start(org_inode, jnum);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out;
+	}
+
+	/* Get the original extent for the block "from" */
+	org_path = ext4_ext_find_extent(org_inode, from, NULL);
+	if (IS_ERR(org_path)) {
+		err = PTR_ERR(org_path);
+		org_path = NULL;
+		goto out;
+	}
+
+	/* Get the destination extent for the head */
+	dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
+	if (IS_ERR(dest_path)) {
+		err = PTR_ERR(dest_path);
+		dest_path = NULL;
+		goto out;
+	}
+	depth = ext_depth(dest_inode);
+	dext = dest_path[depth].p_ext;
+	/* When dext is too large, pick up the target range. */
+	diff = dest_off - le32_to_cpu(dext->ee_block);
+	ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
+	tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+	tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+	if (count < tmp_ext.ee_len) {
+		tmp_ext.ee_len = cpu_to_le16(count);
+	}
+	dext = &tmp_ext;
+
+	/* loop for the destination extents */
+	while (1) {
+		/* The extent for destination must be found. */
+		BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block));
+
+		/* loop for the original extent blocks */
+		if ((err = ext4_ext_defrag_leaf_block(handle, org_inode,
+			org_path, dext, &from, delete_start)) < 0) {
+			goto out;
+		}
+
+		replaced_count += le16_to_cpu(dext->ee_len);
+		dest_off += le16_to_cpu(dext->ee_len);
+		from += le16_to_cpu(dext->ee_len);
+
+		/* Already moved the expected blocks */
+		if (replaced_count >= count)
+			break;
+
+		/* get the next extent on both original and destination. */
+		err = ext4_ext_next_extent(dest_inode, dest_path, &dext);
+		if (err != 0) {
+			if (err > 0) {
+				err = 0;
+			}
+			goto out;
+		}
+		if ((err =
+			ext4_ext_next_extent(org_inode, org_path, &oext)) < 0) {
+			goto out;
+		}
+	}
+out:
+	if (handle) {
+		ext4_journal_stop(handle);
+	}
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (dest_path) {
+		ext4_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return err;
+}
+
+/**
+ * ext4_ext_remove_index_blocks - remove leaf blocks and index blocks
+ * @handle      handle
+ * @dest_inode  temporary inode
+ * @eh          extent header
+ * @depth       depth of extent tree
+ *
+ * This function returns 0 if succeed, otherwise returns error value
+ */
+static int ext4_ext_remove_index_blocks(handle_t *handle,
+			struct inode *dest_inode,
+			struct ext4_extent_header *eh, int depth)
+{
+	struct ext4_extent_idx *idx;
+	struct buffer_head *bh;
+	int metadata = 1;
+	int credits = 0;
+	int i, ret = 0;
+
+	if (depth == 0) {
+		eh->eh_entries = 0;
+		return 0;
+	}
+
+	idx = EXT_FIRST_INDEX(eh);
+
+	for (i = 1; i <= eh->eh_entries; i++) {
+		if (depth > 1) {
+			bh = sb_bread(dest_inode->i_sb, idx->ei_leaf);
+			if (!bh) {
+				ret = -EIO;
+			} else {
+				ret = ext4_ext_remove_index_blocks(handle,
+					dest_inode, ext_block_hdr(bh),
+					depth - 1);
+				brelse(bh);
+			}
+		}
+#ifdef CONFIG_QUOTA
+		credits = 2 * EXT4_QUOTA_TRANS_BLOCKS(dest_inode->i_sb);
+#endif
+		handle = ext4_ext_journal_restart(handle,
+				credits + EXT4_TRANS_META_BLOCKS);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+
+		ext4_free_blocks(handle, dest_inode,
+				idx->ei_leaf, 1, metadata);
+		eh->eh_entries--;
+		idx++;
+	}
+	return ret;
+}
+
+/**
  * ext4_ext_alloc_blocks - allocate contiguous blocks to temporary inode
  * @dest_inode   temporary inode for multiple block allocation
  * @iblock       file related offset
@@ -2645,6 +3109,61 @@ out2:
 }
 
 /**
+ * ext4_ext_defrag_partial - defrag original file partially
+ * @filp:		pointer to file
+ * @org_offset:		page index on original file
+ * @dest_offset:	page index on temporary file
+ *
+ * This function returns 0 if succeeded, otherwise returns error value
+ */
+static int
+ext4_ext_defrag_partial(struct inode *tmp_inode,
+			struct file *filp,
+			pgoff_t org_offset,
+			pgoff_t dest_offset, unsigned long *delete_start)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	pgoff_t offset_in_page = PAGE_SIZE;
+	int ret = 0;
+
+	page = read_cache_page(inode->i_mapping,
+		       org_offset, (filler_t *)inode->i_mapping->a_ops->readpage,
+		       NULL);
+
+	if (IS_ERR(page)) {
+		ret = PTR_ERR(page);
+		return ret;
+	}
+
+	wait_on_page_locked(page);
+	lock_page(page);
+	/* release old bh and drop refs */
+	try_to_release_page(page, 0);
+	ret = ext4_ext_replace_branches(inode, tmp_inode, org_offset, 
+			dest_offset, 1, delete_start);
+	if (ret < 0)
+		goto ERR;
+
+	if (org_offset == ((inode->i_size - 1) >> PAGE_SHIFT))
+		offset_in_page = (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+	ret = mapping->a_ops->prepare_write(filp, page,
+					    0, offset_in_page);
+	if (ret)
+		goto ERR;
+
+	ret = mapping->a_ops->commit_write(filp, page,
+					   0, offset_in_page);
+ERR:
+	unlock_page(page);
+	page_cache_release(page);
+
+	return (ret < 0 ? ret : 0);
+}
+
+/**
  * ext4_ext_new_extent_tree -  allocate contiguous blocks
  * @inode:		inode of the original file
  * @tmp_inode:		inode of the temporary file

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC][PATCH 2/3] Move the file data to the new blocks
@ 2006-11-09 11:10 sho
  0 siblings, 0 replies; 18+ messages in thread
From: sho @ 2006-11-09 11:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel

Move the blocks on the temporary inode to the original inode
by a page.
1. Read the file data from the old blocks to the page
2. Move the block on the temporary inode to the original inode
3. Write the file data on the page into the new blocks

Signed-off-by: Takashi Sato <sho@tnes.nec.co.jp>
---
diff -upNr -X linux-2.6.16.8-tmp1/Documentation/dontdiff linux-2.6.16.8-tmp1/fs/ext3/extents.c linux-2.6.16.8-tmp2/fs/ext3/extents.c
--- linux-2.6.16.8-tmp1/fs/ext3/extents.c	2006-11-08 22:12:19.000000000 +0900
+++ linux-2.6.16.8-tmp2/fs/ext3/extents.c	2006-11-08 22:25:07.000000000 +0900
@@ -2424,6 +2424,421 @@ ext3_ext_next_extent(struct inode *inode
 }
 
 /**
+ * ext3_ext_merge_extents - merge new extent to the extent block
+ *
+ * @handle	journal handle
+ * @inode	target file's inode
+ * @org_path	path indicates first extent to be defraged
+ * @o_start	first original extent to be defraged
+ * @o_end	last original extent to be defraged
+ * @start_ext	first new extent to be merged
+ * @new_ext	middle of new extent to be merged
+ * @end_ext	last new extent to be merged
+ * @replaced	the number of blocks which will be replaced with new_ext
+ *
+ * This function returns 0 if succeed, otherwise returns -1.
+ */
+static int
+ext3_ext_merge_extents(handle_t *handle, struct inode *inode,
+		struct ext3_ext_path *org_path,
+		struct ext3_extent *o_start, struct ext3_extent *o_end,
+		struct ext3_extent *start_ext, struct ext3_extent *new_ext,
+		struct ext3_extent *end_ext, unsigned long replaced)
+{
+	int	 i = 0;
+	unsigned need_slots;
+	unsigned slots_range, len;
+	int	 range_to_move;
+	struct ext3_extent_header * eh;
+	struct ext3_extent *free_start, *free_end;
+	int	 depth;
+
+	/* The extents need to be inserted
+	 * start_extent + new_extent + end_extent
+	 */
+	need_slots = (le16_to_cpu(start_ext->ee_len) ? 1 : 0) +
+		     (le16_to_cpu(end_ext->ee_len) ? 1 : 0) +
+		     (le16_to_cpu(new_ext->ee_len) ? 1 : 0);
+
+	/* The number of slots between start and end */
+	slots_range = o_end - o_start + 1;
+
+	/* Range to move the end of extent */
+	range_to_move = need_slots - slots_range;
+	depth = org_path->p_depth;
+	org_path += depth;
+	eh = org_path->p_hdr;
+	/* expansion */
+	if (range_to_move > 0) {
+		if (range_to_move > le16_to_cpu(eh->eh_max)
+					- le16_to_cpu(eh->eh_entries)) {
+			printk("Cannot merge extents due to no space\n");
+			return -1;
+		}
+	}
+	if (depth) {
+		/* Register to journal */
+		if (ext3_journal_get_write_access(handle, org_path->p_bh)) {
+			return -1;
+		}
+	}
+
+	/* Free old blocks
+	 * dest        |---------------|
+	 * org  |---------------|
+	 */
+	free_start = o_start;
+	free_end = o_end;
+	if (le16_to_cpu(start_ext->ee_len)) {
+		if (le16_to_cpu(o_start->ee_len)
+			> le16_to_cpu(start_ext->ee_len)) {
+			ext3_free_blocks(handle, inode,
+				le32_to_cpu(o_start->ee_start)
+					 + le16_to_cpu(start_ext->ee_len),
+				le16_to_cpu(o_start->ee_len)
+					- le16_to_cpu(start_ext->ee_len), 0);
+		}
+		free_start++;
+	}
+
+	/* dest |----------------|
+	 * org          |---------------|
+	 */
+	if (le16_to_cpu(end_ext->ee_len)) {
+		ext3_free_blocks(handle, inode, le32_to_cpu(o_end->ee_start),
+			le16_to_cpu(o_end->ee_len)
+				 - le16_to_cpu(end_ext->ee_len), 0);
+		free_end--;
+	}
+
+	/* dest |-------------------|
+	 * org   |-----------------|
+	 */
+	for (; free_start <= free_end; free_start++) {
+		ext3_free_blocks(handle, inode,
+			le32_to_cpu(free_start->ee_start),
+			le32_to_cpu(free_start->ee_len), 0);
+	}
+
+	/* Move the existing extents */
+	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
+		len = EXT_LAST_EXTENT(eh) - (o_end + 1) + 1;
+		len = len * sizeof(struct ext3_extent);
+		memmove(o_end + 1 + range_to_move, o_end + 1, len);
+	}
+
+	/* Insert start entry */
+	if (le16_to_cpu(start_ext->ee_len)) {
+		o_start[i++].ee_len = start_ext->ee_len;
+	}
+
+	/* Insert new entry */
+	if (le16_to_cpu(new_ext->ee_len)) {
+		o_start[i].ee_block = new_ext->ee_block;
+		o_start[i].ee_len = cpu_to_le16(replaced);
+		o_start[i++].ee_start = new_ext->ee_start;
+	}
+
+	/* Insert end entry */
+	if (end_ext->ee_len) {
+		o_start[i] = *end_ext;
+	}
+
+	/* Increment the total entries counter on the extent block */
+	eh->eh_entries
+		= cpu_to_le16(le16_to_cpu(eh->eh_entries) + range_to_move);
+	if (depth) {
+		if (ext3_journal_dirty_metadata(handle, org_path->p_bh)) {
+			return -1;
+		}
+	} else {
+		if (ext3_mark_inode_dirty(handle, inode) < 0) {
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ext3_ext_defrag_leaf_block -  Defragmentation for one leaf extent block.
+ * @handle	journal handle
+ * @org_tree	extent tree for the target inode
+ * @org_path	path indicates first extent to be defraged
+ * @dext	destination extent
+ * @from	start offset on the target file
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext3_ext_defrag_leaf_block(handle_t *handle, struct ext3_extents_tree *org_tree,
+		  struct ext3_ext_path *org_path, struct ext3_extent *dext,
+		  unsigned long *from)
+{
+	unsigned long depth, replaced = 0;
+	struct ext3_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
+	struct ext3_extent new_ext, start_ext, end_ext;
+	unsigned long new_end;
+	unsigned long lblock;
+	unsigned long long new_phys_end;
+	int	ret;
+
+	depth = EXT_DEPTH(org_tree);
+	start_ext.ee_len = end_ext.ee_len = 0;
+	o_start = o_end = oext = org_path[depth].p_ext;
+	new_ext.ee_start = dext->ee_start;
+	new_ext.ee_len = dext->ee_len;
+	new_ext.ee_block = cpu_to_le32(*from);
+	new_ext.ee_start_hi = start_ext.ee_start_hi = end_ext.ee_start_hi = 0;
+	lblock = le32_to_cpu(oext->ee_block);
+	new_end = le32_to_cpu(new_ext.ee_block)
+			+ le16_to_cpu(new_ext.ee_len) - 1;
+	new_phys_end = le32_to_cpu(new_ext.ee_start)
+			+ le16_to_cpu(new_ext.ee_len) - 1;
+
+	/* First original extent
+	 * dest         |---------------|
+	 * org  |---------------|
+	*/
+	if (le32_to_cpu(new_ext.ee_block) >
+		le32_to_cpu(oext->ee_block) &&
+	    le32_to_cpu(new_ext.ee_block) <
+		le32_to_cpu(oext->ee_block)
+		+ le16_to_cpu(oext->ee_len)) {
+		start_ext.ee_len = cpu_to_le32(le32_to_cpu(new_ext.ee_block)
+					- le32_to_cpu(oext->ee_block));
+		replaced += le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(start_ext.ee_len);
+	} else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
+		/* We can merge previous extent. */
+		prev_ext = oext -1;
+		if ((le32_to_cpu(prev_ext->ee_start)
+			+ le32_to_cpu(prev_ext->ee_len))
+			== le32_to_cpu(new_ext.ee_start)) {
+			o_start = prev_ext;
+			start_ext.ee_len = cpu_to_le32(
+					le16_to_cpu(prev_ext->ee_len)
+					+ le16_to_cpu(new_ext.ee_len));
+			new_ext.ee_len = 0;
+		}
+	}
+
+	for (;;) {
+
+		/* The extent for destination must be found. */
+		EXT_ASSERT(oext && lblock == le32_to_cpu(oext->ee_block));
+		lblock += le16_to_cpu(oext->ee_len);
+
+		/* Middle of original extent
+		 * dest |-------------------|
+		 * org   |-----------------|
+		 */
+		if (le32_to_cpu(new_ext.ee_block) <=
+			le32_to_cpu(oext->ee_block) &&
+		    new_end >= le32_to_cpu(oext->ee_block)
+			 + le16_to_cpu(oext->ee_len) -1) {
+			replaced += le16_to_cpu(oext->ee_len);
+		}
+
+		/* Last original extent
+		 * dest |----------------|
+		 * org          |---------------|
+		 */
+		if (new_end >= le32_to_cpu(oext->ee_block) &&
+		    new_end < le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1) {
+			end_ext.ee_len
+			    = cpu_to_le16(le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) -1 - new_end);
+			end_ext.ee_start
+			    = cpu_to_le32(le32_to_cpu(o_end->ee_start)
+				+ cpu_to_le16(oext->ee_len)
+				- cpu_to_le16(end_ext.ee_len));
+			end_ext.ee_block
+			    = cpu_to_le32(le32_to_cpu(o_end->ee_block)
+				+ le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len));
+			replaced += le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len);
+		}
+
+		/* Detected the block end, reached the number of replaced
+		 * blocks to dext->ee_len.  Then, merge the extent.
+		 */
+		if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
+		    new_end <= le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1) {
+			if (ext3_ext_merge_extents(handle, org_tree->inode,
+				org_path, o_start, o_end,
+				&start_ext, &new_ext, &end_ext, replaced) < 0) {
+				return -EIO;
+			}
+
+			/* All expected blocks are replaced */
+			if (new_ext.ee_len <= 0) {
+				return 0;
+			}
+
+			/* re-calculate new_ext */
+			new_ext.ee_len = cpu_to_le32(le16_to_cpu(new_ext.ee_len)
+					- replaced);
+			new_ext.ee_block =
+				cpu_to_le32(le32_to_cpu(new_ext.ee_block)
+					+ replaced);
+			new_ext.ee_start =
+				cpu_to_le32(le32_to_cpu(new_ext.ee_start)
+					+ replaced);
+			replaced = 0;
+			start_ext.ee_len = end_ext.ee_len = 0;
+			o_start = NULL;
+
+			/* All expected blocks are replaced */
+			if (new_ext.ee_len <= 0) {
+				return 0;
+			}
+		}
+
+		/* Get next extent for original. */
+		if ((ret
+		  = ext3_ext_next_extent(org_tree->inode, org_path, &oext))
+								!= 0) {
+			if (ret == 1)
+				ret = -EIO;
+			return ret;
+		}
+		o_end = oext;
+		if (!o_start)
+			o_start = oext;
+	}
+}
+
+/**
+ * ext3_ext_replace_branches - replace original extents with new extents.
+ * @org_tree	Original extents tree
+ * @dest_tree	New extents tree
+ * @from_page	Page offset
+ * @count_page	Page count to be replaced
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ * Replace extents for blocks from "from" to "from+count-1".
+ */
+static int
+ext3_ext_replace_branches(struct ext3_extents_tree *org_tree,
+			  struct ext3_extents_tree *dest_tree,
+			  pgoff_t from_page,  pgoff_t dest_from_page,
+			  pgoff_t count_page)
+{
+	struct ext3_ext_path *org_path = NULL;
+	struct ext3_ext_path *dest_path = NULL;
+	struct ext3_extent   *oext, *dext;
+	struct ext3_extent   tmp_ext;
+	int	err = 0;
+	int	 depth;
+	unsigned long from, count, dest_off, diff, replaced_count = 0;
+	handle_t *handle = NULL;
+	unsigned jnum;
+	struct inode *inode;
+
+
+	from = from_page << (PAGE_CACHE_SHIFT - dest_tree->inode->i_blkbits);
+	count = count_page << (PAGE_CACHE_SHIFT - dest_tree->inode->i_blkbits);
+	dest_off = dest_from_page << (PAGE_CACHE_SHIFT -
+		 dest_tree->inode->i_blkbits);
+	inode = org_tree->inode;
+
+	/* (blocks_per_page * count) * (extent_blocks + index_blocks)
+	 * + super_block + block_bitmap + group_descriptor
+	 * jnum = ext3_ext_writepage_trans_blocks(inode, count) + 3;
+	 */
+	/* TODO:
+	 * Need to consider the way of calculating journal blocks
+	 * because j_max_transaction_buffer may exceed 2048
+	 * if we have a deep depth.
+	 */
+	jnum = 2048;
+	handle = ext3_journal_start(inode, jnum);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out;
+	}
+
+	/* Get the original extent for the block "from" */
+	org_path = ext3_ext_find_extent(org_tree, from, NULL);
+	if (IS_ERR(org_path)) {
+		err = PTR_ERR(org_path);
+		org_path = NULL;
+		goto out;
+	}
+
+	/* Get the destination extent for the head */
+	dest_path = ext3_ext_find_extent(dest_tree, dest_off, NULL);
+	if (IS_ERR(dest_path)) {
+		err = PTR_ERR(dest_path);
+		dest_path = NULL;
+		goto out;
+	}
+	depth = EXT_DEPTH(dest_tree);
+	dext = dest_path[depth].p_ext;
+	/* When dext is too large, pick up the target range. */
+	diff = dest_off - le32_to_cpu(dext->ee_block);
+	tmp_ext.ee_start = cpu_to_le32(le32_to_cpu(dext->ee_start) + diff);
+	tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+	tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+	tmp_ext.ee_start_hi = 0;
+	if (count < tmp_ext.ee_len) {
+		tmp_ext.ee_len = cpu_to_le16(count);
+	}
+	dext = &tmp_ext;
+
+	/* loop for the destination extents */
+	while (1) {
+		/* The extent for destination must be found. */
+		EXT_ASSERT(dext && dest_off == le32_to_cpu(dext->ee_block));
+
+		/* loop for the original extent blocks */
+		if ((err = ext3_ext_defrag_leaf_block(handle, org_tree,
+			org_path, dext, &from)) < 0) {
+			goto out;
+		}
+
+		replaced_count += le16_to_cpu(dext->ee_len);
+		dest_off += le16_to_cpu(dext->ee_len);
+		from += le16_to_cpu(dext->ee_len);
+
+		/* Already moved the expected blocks */
+		if (replaced_count >= count)
+			break;
+
+		/* get the next extent on both original and destination. */
+		err = ext3_ext_next_extent(dest_tree->inode, dest_path, &dext);
+		if (err != 0) {
+			if (err > 0) {
+				err = 0;
+			}
+			goto out;
+		}
+		if ((err =
+		  ext3_ext_next_extent(inode, org_path, &oext)) < 0) {
+			goto out;
+		}
+	}
+out:
+	if (handle) {
+		ext3_journal_stop(handle);
+	}
+	if (org_path) {
+		ext3_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (dest_path) {
+		ext3_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return err;
+}
+
+/**
  * ext3_ext_remove_index_blocks - remove leaf blocks and index blocks
  * @handle 	handle
  * @dest_tree	extent tree of temporary inode
@@ -2581,6 +2996,73 @@ err:
 }
 
 /**
+ * ext3_ext_defrag_partial - defrag original file partially
+ * @filp:		pointer to file
+ * @org_tree:		extent tree of the original inode
+ * @dest_tree:		extent tree of the temporary inode
+ * @org_offset:		page index on original file
+ * @dest_offset:	page index on temporary file
+ * @nr_to_read:		the number of pages to read
+ *
+ * This function returns 0 if succeeded, otherwise returns error value
+ */
+static int
+ext3_ext_defrag_partial(struct file *filp,
+			struct ext3_extents_tree *org_tree,
+			struct ext3_extents_tree *dest_tree,
+			pgoff_t org_offset,
+			pgoff_t dest_offset,
+			pgoff_t nr_to_read)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	pgoff_t page_end = org_offset + nr_to_read - 1;
+	pgoff_t offset_in_page = PAGE_SIZE;
+	int ret = 0;
+
+	for (; org_offset <= page_end; ++org_offset, ++dest_offset) {
+		page = read_cache_page(inode->i_mapping,
+				       org_offset,
+				       (filler_t *)inode->i_mapping->a_ops->readpage,
+				       NULL);
+		if (IS_ERR(page)) {
+			ret = PTR_ERR(page);
+			break;
+		}
+
+		wait_on_page_locked(page);
+		lock_page(page);
+
+		/* release old bh and drop refs */
+		try_to_release_page(page, 0);
+
+		ret = ext3_ext_replace_branches(org_tree, dest_tree,
+						org_offset, dest_offset, 1);
+		if (ret < 0)
+			goto ERR;
+
+		if (org_offset == ((inode->i_size - 1) >> PAGE_SHIFT))
+			offset_in_page = (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+		ret = mapping->a_ops->prepare_write(filp, page,
+						    0, offset_in_page);
+		if (ret)
+			goto ERR;
+
+		ret = mapping->a_ops->commit_write(filp, page,
+						   0, offset_in_page);
+ERR:
+		unlock_page(page);
+		page_cache_release(page);
+
+		if (ret < 0)
+			break;
+	}
+	return (ret < 0 ? ret : 0);
+}
+
+/**
  * ext3_ext_new_extent_tree -  allocate contiguous blocks
  * @tmp_inode:		inode of the temporary file
  * @org_tree:		extent tree of the original inode

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2007-02-12  3:11 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-01-16 12:05 [RFC][PATCH 2/3] Move the file data to the new blocks sho
2007-02-05 13:12 ` Jan Kara
2007-02-05 22:06   ` Nathan Scott
2007-02-07  1:35   ` Andrew Morton
2007-02-07 20:46     ` Andreas Dilger
2007-02-07 20:56       ` Andrew Morton
2007-02-08  9:29         ` Jan Kara
2007-02-08  9:45           ` Andrew Morton
2007-02-08 10:21             ` Jan Kara
2007-02-08 10:32               ` Andrew Morton
2007-02-08 10:47                 ` Jan Kara
2007-02-12  3:11                   ` Theodore Tso
2007-02-07  1:33 ` Andrew Morton
2007-02-07  3:45   ` Eric Sandeen
2007-02-07  9:46     ` Takashi Sato
  -- strict thread matches above, loose matches on Subject: below --
2007-02-08  9:01 Takashi Sato
2006-12-22 10:30 sho
2006-11-09 11:10 sho

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.