linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] extents support for EXT3
@ 2003-08-28  8:22 Alex Tomas
  2003-08-28 17:22 ` [Ext2-devel] " Mike Fedyk
                   ` (3 more replies)
  0 siblings, 4 replies; 30+ messages in thread
From: Alex Tomas @ 2003-08-28  8:22 UTC (permalink / raw)
  To: linux-kernel; +Cc: ext2-devel


this is 2nd version of the patch. changes:
  - error handling seems completed
  - lots of cleanups and comments
  - few minor bug fixed

this version of the patch tries to solve couple
of corner cases:
  - very long truncate
  - rewrite 

it survived dbench, bonnie++ and fsx tests.

take a look at numbers I've just got, please.

                      before      after
5GB file, creation:   2m31.197s   2m21.933s
5GB file, read:       2m25.439s   2m24.833s
5GB file, rewrite:    2m48.434s   2m20.958s
5GB file, removal:    0m8.760s    0m0.858s

             before           after
dbench 16:   99.9868 MB/sec   179.243 MB/sec 16 procs
dbench 16:   89.9919 MB/sec   203.119 MB/sec 16 procs
dbench 16:   73.5519 MB/sec   185.815 MB/sec 16 procs
dbench 16:   94.6312 MB/sec   188.519 MB/sec 16 procs


to use extents you have to use 'extents' mount option

Alex

 fs/ext3/Makefile           |    2 
 fs/ext3/extents.c          | 1571 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ext3/ialloc.c           |    2 
 fs/ext3/inode.c            |   25 
 fs/ext3/super.c            |   11 
 include/linux/ext3_fs.h    |   17 
 include/linux/ext3_fs_i.h  |    4 
 include/linux/ext3_fs_sb.h |   10 
 8 files changed, 1636 insertions(+), 6 deletions(-)

diff -puN /dev/null fs/ext3/extents.c
--- /dev/null	2003-01-30 13:24:37.000000000 +0300
+++ linux-2.6.0-test2-uml-alexey/fs/ext3/extents.c	2003-08-25 21:12:35.000000000 +0400
@@ -0,0 +1,1571 @@
+/*
+ *
+ * linux/fs/ext3/extents.c
+ *
+ * Extents support for EXT3
+ *
+ * 07/08/2003    Alex Tomas <bzzz@tmi.comex.ru>
+ * 
+ * TODO:
+ *   - ext3*_error() should be used in some situations
+ *   - find_goal() [to be tested and improved]
+ *   - quick search for index/leaf in ext3_ext_find_extent()
+ *   - tree reduction
+ *   - cache last found extent
+ *   - arch-independent
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/ext3_jbd.h>
+#include <linux/jbd.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+
+/*
+ * with AGRESSIVE_TEST defined capacity of index/leaf blocks
+ * become very little, so index split, in-depth growing and
+ * other hard changes happens much more often
+ * this is for debug purposes only
+ */
+#define AGRESSIVE_TEST_
+
+/*
+ * if EXT_DEBUG defined you can use 'extdebug' mount option
+ * to get lots of info what's going on
+ */
+#define EXT_DEBUG
+#ifdef EXT_DEBUG
+#define ext_debug(inode,fmt,a...) 		\
+do {						\
+	if (test_opt((inode)->i_sb, EXTDEBUG))	\
+		printk(fmt, ##a);		\
+} while (0);
+#else
+#define ext_debug(inode,fmt,a...)
+#endif
+
+#define EXT3_ALLOC_NEEDED	2	/* block bitmap + group descriptor */
+
+/*
+ * ext3_inode has i_block array (total 60 bytes)
+ * first 4 bytes are used to store:
+ *  - tree depth (0 mean there is no tree yet. all extents in the inode)
+ *  - number of alive extents in the inode
+ */
+
+/*
+ * this is extent on-disk structure
+ * it's used at the bottom of the tree
+ */
+struct ext3_extent {
+	__u32	e_block;	/* first logical block extent covers */
+	__u32	e_start;	/* first physical block extents lives */
+	__u32	e_num;		/* number of blocks covered by extent */
+};
+
+/*
+ * this is index on-disk structure
+ * it's used at all the levels, but the bottom
+ */
+struct ext3_extent_idx {
+	__u32	e_block;	/* index covers logical blocks from 'block' */
+	__u32	e_leaf;		/* pointer to the physical block of the next *
+				 * level. leaf or next index could bet here */
+};
+
+/*
+ * each block (leaves and indexes), even inode-stored has header
+ */
+struct ext3_extent_header {	
+	__u16	e_num;		/* number of valid entries */
+	__u16	e_max;		/* capacity of store in entries */
+};
+
+/*
+ * array of ext3_ext_path contains path to some extent
+ * creation/lookup routines use it for traversal/splitting/etc
+ * truncate uses it to simulate recursive walking
+ */
+struct ext3_ext_path {
+	__u32				p_block;
+	__u16				p_depth;
+	struct ext3_extent		*p_ext;
+	struct ext3_extent_idx		*p_idx;
+	struct ext3_extent_header	*p_hdr;
+	struct buffer_head		*p_bh;
+};
+
+#define EXT_FIRST_EXTENT(__hdr__) \
+	((struct ext3_extent *) (((char *) (__hdr__)) +		\
+				 sizeof(struct ext3_extent_header)))
+#define EXT_FIRST_INDEX(__hdr__) \
+	((struct ext3_extent_idx *) (((char *) (__hdr__)) +	\
+				     sizeof(struct ext3_extent_header)))
+#define EXT_HAS_FREE_INDEX(__path__) \
+	((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max)
+#define EXT_LAST_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1)
+#define EXT_LAST_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1)
+#define EXT_MAX_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ */
+static int ext3_ext_get_access(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	if (path->p_bh) {
+		/* path points to block */
+		return ext3_journal_get_write_access(handle, path->p_bh);
+	}
+
+	/* path points to leaf/index in inode body */
+	return 0;
+}
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ *  - EIO
+ */
+static int ext3_ext_dirty(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	if (path->p_bh) {
+		/* path points to block */
+		return ext3_journal_dirty_metadata(handle, path->p_bh);
+	}
+
+	/* path points to leaf/index in inode body */
+	return ext3_mark_inode_dirty(handle, inode);
+}
+
+static inline int ext3_ext_space_block(struct inode *inode)
+{
+	int size;
+
+	size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header))
+		/ sizeof(struct ext3_extent);
+#ifdef AGRESSIVE_TEST
+	size = 6; /* FIXME: for debug, remove this line */
+#endif
+	return size;
+}
+
+static inline int ext3_ext_space_inode(struct inode *inode)
+{
+	int size;
+
+	size = (sizeof(EXT3_I(inode)->i_data) -
+			sizeof(struct ext3_extent_header))
+			/ sizeof(struct ext3_extent);
+#ifdef AGRESSIVE_TEST
+	size = 3; /* FIXME: for debug, remove this line */
+#endif
+	return size;
+}
+
+static inline int ext3_ext_space_inode_idx(struct inode *inode)
+{
+	int size;
+
+	size = (sizeof(EXT3_I(inode)->i_data) -
+			sizeof(struct ext3_extent_header))
+			/ sizeof(struct ext3_extent_idx);
+#ifdef AGRESSIVE_TEST
+	size = 4; /* FIXME: for debug, remove this line */
+#endif
+	return size;
+}
+
+static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path)
+{
+	int k, l = path->p_depth;
+
+	ext_debug(inode, "path:");
+	for (k = 0; k <= l; k++, path++) {
+		if (path->p_idx) {
+			ext_debug(inode, "  %d->%d", path->p_idx->e_block,
+					path->p_idx->e_leaf);
+		} else if (path->p_ext) {
+			ext_debug(inode, "  %d:%d:%d",
+					path->p_ext->e_block,
+					path->p_ext->e_start,
+					path->p_ext->e_num);
+		} else
+			ext_debug(inode, "  []");
+	}
+	ext_debug(inode, "\n");
+}
+
+static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path)
+{
+	int depth = EXT3_I(inode)->i_depth;
+	struct ext3_extent_header *eh = path[depth].p_hdr;
+	struct ext3_extent *ex = EXT_FIRST_EXTENT(eh);
+	int i;
+
+	for (i = 0; i < eh->e_num; i++, ex++) {
+		ext_debug(inode, "%d:%d:%d ",
+				ex->e_block, ex->e_start, ex->e_num);
+	}
+	ext_debug(inode, "\n");
+}
+
+static void ext3_ext_drop_refs(struct inode *inode, struct ext3_ext_path *path)
+{
+	int depth = path->p_depth;
+	int i;
+
+	for (i = 0; i <= depth; i++, path++)
+		if (path->p_bh) {
+			brelse(path->p_bh);
+			path->p_bh = NULL;
+		}
+}
+
+static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	unsigned long bg_start;
+	unsigned long colour;
+	int depth;
+	
+	if (path) {
+		depth = path->p_depth;
+		/* try to find previous block */
+		if (path[depth].p_ext)
+			return path[depth].p_ext->e_start +
+				path[depth].p_ext->e_num - 1;
+		
+		/* it looks index is empty
+		 * try to find starting from index itself */
+		if (path[depth].p_bh)
+			return path[depth].p_bh->b_blocknr;
+	}
+
+	/* OK. use inode's group */
+	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+	colour = (current->pid % 16) *
+			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	return bg_start + colour;
+}
+
+static struct ext3_ext_path *
+ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct ext3_extent_header *eh = (void *) ei->i_data;
+	struct ext3_extent_idx *ix;
+	struct buffer_head *bh;
+	struct ext3_extent *ex;
+	int depth, i, k, ppos = 0;
+	
+	eh = (struct ext3_extent_header *) ei->i_data;
+
+	/* initialize capacity of leaf in inode for first time */
+	if (eh->e_max == 0)
+		eh->e_max = ext3_ext_space_inode(inode);
+	i = depth = ei->i_depth;
+	EXT_ASSERT(i == 0 || eh->e_num > 0);
+	
+	/* account possible depth increase */
+	if (!path) {
+		path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
+				GFP_NOFS);
+		if (!path)
+			return ERR_PTR(-ENOMEM);
+	}
+	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
+
+	/* walk through the tree */
+	while (i) {
+		ext_debug(inode, "depth %d: num %d, max %d\n",
+				ppos, eh->e_num, eh->e_max);
+		ix = EXT_FIRST_INDEX(eh);
+		if (eh->e_num)
+			path[ppos].p_idx = ix;
+		EXT_ASSERT(eh->e_num <= eh->e_max);
+		for (k = 0; k < eh->e_num; k++, ix++) {
+			ext_debug(inode, "index: %d -> %d\n",
+					ix->e_block, ix->e_leaf);
+			if (block < ix->e_block)
+				break;
+			path[ppos].p_idx = ix;
+		}
+		path[ppos].p_block = path[ppos].p_idx->e_leaf;
+		path[ppos].p_depth = i;
+		path[ppos].p_hdr = eh;
+		path[ppos].p_ext = NULL;
+
+		bh = sb_bread(inode->i_sb, path[ppos].p_block);
+		if (!bh) {
+			ext3_ext_drop_refs(inode, path);
+			kfree(path);
+			return ERR_PTR(-EIO);
+		}
+		eh = (struct ext3_extent_header *) bh->b_data;
+		ppos++;
+		EXT_ASSERT(ppos <= depth);
+		path[ppos].p_bh = bh;
+		i--;
+	}
+
+	path[ppos].p_depth = i;
+	path[ppos].p_hdr = eh;
+	path[ppos].p_ext = NULL;
+	
+	/* find extent */
+	ex = EXT_FIRST_EXTENT(eh);
+	if (eh->e_num)
+		path[ppos].p_ext = ex;
+	EXT_ASSERT(eh->e_num <= eh->e_max);
+	for (k = 0; k < eh->e_num; k++, ex++) {
+		if (block < ex->e_block) 
+			break;
+		path[ppos].p_ext = ex;
+	}
+
+	ext3_ext_show_path(inode, path);
+
+	return path;
+}
+
+static void ext3_ext_check_boundary(struct inode *inode,
+					struct ext3_ext_path *curp,
+					void *addr, int len)
+{
+	void *end;
+
+	if (!len)
+		return;
+	if (curp->p_bh)
+		end = (void *) curp->p_hdr + inode->i_sb->s_blocksize;
+	else
+		end = (void *) curp->p_hdr + sizeof(EXT3_I(inode)->i_data);
+	if (((unsigned long) addr) + len > (unsigned long) end) {
+		printk("overflow! 0x%p > 0x%p\n", addr + len, end);
+		BUG();
+	}
+	if ((unsigned long) addr < (unsigned long) curp->p_hdr) {
+		printk("underflow! 0x%p < 0x%p\n", addr, curp->p_hdr);
+		BUG();
+	}
+}
+
+/*
+ * insert new index [logical;ptr] into the block at cupr
+ * it check where to insert: before curp or after curp
+ */
+static int ext3_ext_insert_index(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *curp, int logical,
+				int ptr)
+{
+	struct ext3_extent_idx *ix;
+	int len, err;
+
+	if ((err = ext3_ext_get_access(handle, inode, curp)))
+		return err;
+
+	EXT_ASSERT(logical != curp->p_idx->e_block);
+	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
+	if (logical > curp->p_idx->e_block) {
+		/* insert after */
+		len = (len - 1) * sizeof(struct ext3_extent_idx);
+		len = len < 0 ? 0 : len;
+		ext_debug(inode, "insert new index %d after: %d. "
+				"move %d from 0x%p to 0x%p\n",
+				logical, ptr, len,
+				(curp->p_idx + 1), (curp->p_idx + 2));
+
+		ext3_ext_check_boundary(inode, curp, curp->p_idx + 2, len);
+		memmove(curp->p_idx + 2, curp->p_idx + 1, len);
+		ix = curp->p_idx + 1;
+	} else {
+		/* insert before */
+		len = len * sizeof(struct ext3_extent_idx);
+		len = len < 0 ? 0 : len;
+		ext_debug(inode, "insert new index %d before: %d. "
+				"move %d from 0x%p to 0x%p\n",
+				logical, ptr, len,
+				curp->p_idx, (curp->p_idx + 1));
+
+		ext3_ext_check_boundary(inode, curp, curp->p_idx + 1, len);
+		memmove(curp->p_idx + 1, curp->p_idx, len);
+		ix = curp->p_idx;
+	}
+
+	ix->e_block = logical;
+	ix->e_leaf = ptr;
+	curp->p_hdr->e_num++;
+
+	err = ext3_ext_dirty(handle, inode, curp);
+	ext3_std_error(inode->i_sb, err);
+
+	return err;
+}
+
+/*
+ * routine inserts new subtree into the path, using free index entry
+ * at depth 'at:
+ *  - allocates all needed blocks (new leaf and all intermediate index blocks)
+ *  - makes decision where to split
+ *  - moves remaining extens and index entries (right to the split point)
+ *    into the newly allocated blocks
+ *  - initialize subtree
+ */
+static int ext3_ext_split(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path,
+				struct ext3_extent *newext, int at)
+{
+	struct buffer_head *bh = NULL;
+	int depth = EXT3_I(inode)->i_depth;
+	struct ext3_extent_header *neh;
+	struct ext3_extent_idx *fidx;
+	struct ext3_extent *ex;
+	int i = at, k, m, a;
+	sector_t newblock, oldblock, border;
+	int *ablocks = NULL; /* array of allocated blocks */
+	int err = 0;
+
+	/* make decision: where to split? */
+	/* FIXME: now desicion is simplest: at current extent */
+
+	/* if current leaf will be splitted, then we should use 
+	 * border from split point */
+	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		border = path[depth].p_ext[1].e_block;
+		ext_debug(inode, "leaf will be splitted."
+				" next leaf starts at %d\n",
+				(int)border);
+	} else {
+		border = newext->e_block;
+		ext_debug(inode, "leaf will be added."
+				" next leaf starts at %d\n",
+				(int)border);
+	}
+
+	/* 
+	 * if error occurs, then we break processing
+	 * and turn filesystem read-only. so, index won't
+	 * be inserted and tree will be in consistent
+	 * state. next mount will repair buffers too
+	 */
+
+	/*
+	 * get array to track all allocated blocks
+	 * we need this to handle errors and free blocks
+	 * upon them
+	 */
+	ablocks = kmalloc(sizeof(sector_t) * depth, GFP_NOFS);
+	if (!ablocks)
+		return -ENOMEM;
+	memset(ablocks, 0, sizeof(sector_t) * depth);
+
+	/* allocate all needed blocks */
+	ext_debug(inode, "allocate %d blocks for indexes and leaf\n",
+			depth - at);
+	ablocks[0] = newext->e_start++;
+	newext->e_num--;
+	for (a = 1; a < depth - at; a++) {
+		newblock = ext3_new_block(handle, inode, newext->e_start,
+						0, 0, &err);
+		if (newblock == 0)
+			goto cleanup;
+		ablocks[a] = newblock;
+	}
+
+	/* initialize new leaf */
+	newblock = ablocks[--a];
+	EXT_ASSERT(newblock);
+	bh = sb_getblk(inode->i_sb, newblock);
+	if (!bh) {
+		err = -EIO;
+		goto cleanup;
+	}
+	lock_buffer(bh);
+
+	if ((err = ext3_journal_get_create_access(handle, bh)))
+		goto cleanup;
+
+	neh = (struct ext3_extent_header *) bh->b_data;
+	neh->e_num = 0;
+	neh->e_max = ext3_ext_space_block(inode);
+	ex = EXT_FIRST_EXTENT(neh);
+
+	/* move remain of path[depth] to the new leaf */
+	EXT_ASSERT(path[depth].p_hdr->e_num ==
+			path[depth].p_hdr->e_max);
+	/* start copy from next extent */
+	/* TODO: we could do it by single memmove */
+	m = 0;
+	path[depth].p_ext++;
+	while (path[depth].p_ext <=
+			EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		ext_debug(inode, "move %d:%d:%d in new leaf\n",
+				path[depth].p_ext->e_block,
+				path[depth].p_ext->e_start,
+				path[depth].p_ext->e_num);
+		memmove(ex++, path[depth].p_ext++,
+				sizeof(struct ext3_extent));
+		neh->e_num++;
+		m++;
+	}
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	if ((err = ext3_journal_dirty_metadata(handle, bh)))
+		goto cleanup;	
+	brelse(bh);
+	bh = NULL;
+
+	/* correct old leaf */
+	if (m) {
+		if ((err = ext3_ext_get_access(handle, inode, path)))
+			goto cleanup;
+		path[depth].p_hdr->e_num -= m;
+		if ((err = ext3_ext_dirty(handle, inode, path)))
+			goto cleanup;
+		
+	}
+
+	/* create intermediate indexes */
+	k = depth - at - 1;
+	EXT_ASSERT(k >= 0);
+	if (k)
+		ext_debug(inode,
+				"create %d intermediate indices\n", k);
+	/* insert new index into current index block */
+	/* current depth stored in i var */
+	i = depth - 1;
+	while (k--) {
+		oldblock = newblock;
+		newblock = ablocks[--a];
+		bh = sb_getblk(inode->i_sb, newblock);
+		if (!bh) {
+			err = -EIO;
+			goto cleanup;
+		}
+		lock_buffer(bh);
+
+		if ((err = ext3_journal_get_create_access(handle, bh)))
+			goto cleanup;
+
+		neh = (struct ext3_extent_header *) bh->b_data;
+		neh->e_num = 1;
+		neh->e_max = ext3_ext_space_block(inode);
+		fidx = EXT_FIRST_INDEX(neh);
+		fidx->e_block = border;
+		fidx->e_leaf = oldblock;
+
+		ext_debug(inode,
+				"int.index at %d (block %u): %d -> %d\n",
+				i, (unsigned) newblock,
+				(int) border,
+				(int) oldblock);
+		/* copy indexes */
+		m = 0;
+		path[i].p_idx++;
+		EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
+				EXT_LAST_INDEX(path[i].p_hdr));
+		ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		while (path[i].p_idx <=
+				EXT_MAX_INDEX(path[i].p_hdr)) {
+			ext_debug(inode, "%d: move %d:%d in new index\n",
+					i, path[i].p_idx->e_block,
+					path[i].p_idx->e_leaf);
+			memmove(++fidx, path[i].p_idx++,
+					sizeof(struct ext3_extent_idx));
+			neh->e_num++;
+			m++;
+		}
+
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+
+		if ((err = ext3_journal_dirty_metadata(handle, bh)))
+			goto cleanup;
+		brelse(bh);
+		bh = NULL;
+
+		/* correct old index */
+		if (m) {
+			err = ext3_ext_get_access(handle,inode,path+i);
+			if (err)
+				goto cleanup;
+			path[i].p_hdr->e_num -= m;
+			err = ext3_ext_dirty(handle, inode, path + i);
+			if (err)
+				goto cleanup;
+		}
+
+		i--;
+	}
+
+	/* insert new index */
+	if (!err) 
+		err = ext3_ext_insert_index(handle, inode, path + at,
+						border, newblock);
+
+cleanup:
+	if (bh) {
+		if (buffer_locked(bh))
+			unlock_buffer(bh);
+		brelse(bh);
+	}
+
+	if (err) {
+		/* free all allocated blocks in error case */
+		for (i = 0; i < depth; i++)
+			if (!ablocks[i])
+				continue;
+			ext3_free_blocks(handle, inode, ablocks[i], 1);
+	}
+	kfree(ablocks);
+
+	return err;
+}
+
+/*
+ * routine implements tree growing procedure:
+ *  - allocates new block
+ *  - moves top-level data (index block or leaf) into the new block
+ *  - initialize new top-level, creating index that points to the
+ *    just created block
+ */
+static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode,
+					struct ext3_ext_path *path,
+					struct ext3_extent *newext)
+{
+	struct buffer_head *bh;
+	struct ext3_ext_path *curp = path;
+	struct ext3_extent_header *neh;
+	struct ext3_extent_idx *fidx;
+	int len, err = 0;
+	sector_t newblock;
+
+	/*
+	 * use already allocated by the called block for new root block
+	 */
+	newblock = newext->e_start++;
+	newext->e_num--;
+	
+	bh = sb_getblk(inode->i_sb, newblock);
+	if (!bh) {
+		err = -EIO;
+		ext3_std_error(inode->i_sb, err);
+		return err;
+	}
+	lock_buffer(bh);
+
+	if ((err = ext3_journal_get_create_access(handle, bh))) {
+		unlock_buffer(bh);
+		goto out;	
+	}
+
+	/* move top-level index/leaf into new block */
+	len = sizeof(struct ext3_extent_header) +
+		sizeof(struct ext3_extent) * curp->p_hdr->e_max;
+	EXT_ASSERT(len >= 0 && len < 4096);
+	memmove(bh->b_data, curp->p_hdr, len);
+
+	/* set size of new block */
+	neh = (struct ext3_extent_header *) bh->b_data;
+	neh->e_max = ext3_ext_space_block(inode);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	if ((err = ext3_journal_dirty_metadata(handle, bh)))
+		goto out;
+
+	/* create index in new top-level index: num,max,pointer */
+	if ((err = ext3_ext_get_access(handle, inode, curp)))
+		goto out;
+
+	curp->p_hdr->e_max = ext3_ext_space_inode_idx(inode);
+	curp->p_hdr->e_num = 1;
+	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
+	curp->p_idx->e_block = EXT_FIRST_EXTENT(path[0].p_hdr)->e_block;
+	curp->p_idx->e_leaf = newblock;
+
+	neh = (struct ext3_extent_header *) EXT3_I(inode)->i_data;
+	fidx = EXT_FIRST_INDEX(neh);
+	ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %d\n",
+			neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); 
+
+	EXT3_I(inode)->i_depth++;
+	err = ext3_ext_dirty(handle, inode, curp);
+out:
+	brelse(bh);
+
+	return err;
+}
+
+/*
+ * routine finds empty index and adds new leaf. if no free index found
+ * then it requests in-depth growing
+ */
+static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode,
+					struct ext3_ext_path *path,
+					struct ext3_extent *newext)
+{
+	int depth = EXT3_I(inode)->i_depth;
+	struct ext3_ext_path *curp;
+	int i = depth, err = 0;
+	sector_t newblock = newext->e_start;
+
+	/* walk up to the tree and look for free index entry */
+	curp = path + depth;
+	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
+		i--;
+		curp--;
+	}
+
+	/* we use already allocated block for index block
+	 * so, subsequent data blocks should be contigoues */
+	if (EXT_HAS_FREE_INDEX(curp)) {
+		/* if we found index with free entry, then use that
+		 * entry: create all needed subtree and add new leaf */
+		err = ext3_ext_split(handle, inode, path, newext, i);
+	} else {
+		/* tree is full, time to grow in depth */
+		err = ext3_ext_grow_indepth(handle, inode, path, newext);
+	}
+
+	if (!err) {
+		/* refill path */
+		ext3_ext_drop_refs(inode, path);
+		path = ext3_ext_find_extent(inode, newext->e_block, path);
+		if (IS_ERR(path))
+			err = PTR_ERR(path);
+
+		/*
+		 * probably we've used some blocks from extent
+		 * let's allocate new block for it
+		 */
+		if (newext->e_num == 0 && !err) {
+			newext->e_start =
+				ext3_new_block(handle, inode, newblock,
+						0, 0, &err);
+			newext->e_num = 1;
+		}
+	}
+
+	return err;
+}
+
+/*
+ * returns next allocated block or 0xffffffff
+ * NOTE: it consider block number from index entry as
+ * allocated block. thus, index entries have to be consistent
+ * with leafs
+ */
+static inline unsigned ext3_ext_next_allocated_block(struct inode *inode,
+                                               struct ext3_ext_path *path)
+{
+	int depth;
+
+	EXT_ASSERT(path != NULL);
+	depth = path->p_depth;
+
+	if (depth == 0 && path->p_ext == NULL)
+		return 0xffffffff;
+
+	/* FIXME: what if index isn't full ?! */
+	while (depth >= 0) {
+		if (depth == path->p_depth) {
+			/* leaf */
+			if (path[depth].p_ext !=
+					EXT_LAST_EXTENT(path[depth].p_hdr))
+				return path[depth].p_ext[1].e_block;
+		} else {
+			/* index */
+			if (path[depth].p_idx !=
+					EXT_LAST_INDEX(path[depth].p_hdr))
+				return path[depth].p_idx[1].e_block;
+		}
+		depth--;        
+	}
+
+	return 0xffffffff;
+}
+
+/*
+ * returns first allocated block from next leaf or 0xffffffff
+ */
+static unsigned ext3_ext_next_leaf_block(struct inode *inode,
+                                               struct ext3_ext_path *path)
+{
+	int depth;
+
+	EXT_ASSERT(path != NULL);
+	depth = path->p_depth;
+
+	/* zero-tree has no leaf blocks at all */
+	if (depth == 0)
+		return 0xffffffff;
+
+	/* go to index block */
+	depth--;
+	
+	while (depth >= 0) {
+		if (path[depth].p_idx !=
+				EXT_LAST_INDEX(path[depth].p_hdr))
+			return path[depth].p_idx[1].e_block;
+		depth--;        
+	}
+
+	return 0xffffffff;
+}
+
+/*
+ * if leaf gets modified and modified extent is first in the leaf
+ * then we have to correct all indexes above
+ * TODO: do we need to correct tree in all cases?
+ */
+int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	int depth = EXT3_I(inode)->i_depth;	
+	struct ext3_extent_header *eh;
+	struct ext3_extent *ex;
+	sector_t border;
+	int k, err = 0;
+	
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+
+	EXT_ASSERT(ex);
+	EXT_ASSERT(eh);
+	
+	if (depth == 0) {
+		/* there is no tree at all */
+		return 0;
+	}
+	
+	if (ex != EXT_FIRST_EXTENT(eh)) {
+		/* we correct tree if first leaf got modified only */
+		return 0;
+	}
+	
+	k = depth - 1;
+	border = path[depth].p_ext->e_block;
+	if ((err = ext3_ext_get_access(handle, inode, path + k)))
+		return err;
+	path[k].p_idx->e_block = border;
+	if ((err = ext3_ext_dirty(handle, inode, path + k)))
+		return err;
+
+	while (k--) {
+		/* change all left-side indexes */
+		if (path[k].p_idx != EXT_FIRST_INDEX(path[k].p_hdr)
+				&& k != 0)
+			break;
+		if ((err = ext3_ext_get_access(handle, inode, path + k)))
+			break;
+		path[k].p_idx->e_block = border;
+		if ((err = ext3_ext_dirty(handle, inode, path + k)))
+			break;
+	}
+
+	return err;
+}
+
+/*
+ * this routine tries to merge requsted extent into the existing
+ * extent or inserts requested extent as new one into the tree,
+ * creating new leaf in no-space case
+ */
+int ext3_ext_insert_extent(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path,
+				struct ext3_extent *newext)
+{
+	int depth, len;
+	struct ext3_extent_header * eh;
+	struct ext3_extent *ex;
+	struct ext3_extent *nearex; /* nearest extent */
+	struct ext3_ext_path *npath = NULL;
+	int err;
+
+	depth = EXT3_I(inode)->i_depth;	
+	if ((ex = path[depth].p_ext)) {
+		/* try to insert block into found extent and return */
+		if (ex->e_block + ex->e_num == newext->e_block &&
+				ex->e_start + ex->e_num == newext->e_start) {
+#ifdef AGRESSIVE_TEST
+			if (ex->e_num >= 2)
+				goto repeat;
+#endif
+			if ((err = ext3_ext_get_access(handle, inode,
+							path + depth)))
+				return err;
+			ext_debug(inode, "append %d block to %d:%d (from %d)\n",
+					newext->e_num, ex->e_block, ex->e_num,
+					ex->e_start);
+			ex->e_num += newext->e_num;
+			err = ext3_ext_dirty(handle, inode, path + depth);
+			return err;
+		}
+	}
+
+repeat:
+	depth = EXT3_I(inode)->i_depth;	
+	eh = path[depth].p_hdr;
+	if (eh->e_num == eh->e_max) {
+		/* probably next leaf has space for us? */
+		int next = ext3_ext_next_leaf_block(inode, path);
+		if (next != 0xffffffff) {
+			ext_debug(inode, "next leaf block - %d\n", next);
+			EXT_ASSERT(!npath);
+			npath = ext3_ext_find_extent(inode, next, NULL);
+			if (IS_ERR(npath))
+				return PTR_ERR(npath);
+			EXT_ASSERT(npath->p_depth == path->p_depth);
+			eh = npath[depth].p_hdr;
+			if (eh->e_num < eh->e_max) {
+				ext_debug(inode,
+						"next leaf has free ext(%d)\n",
+						eh->e_num);
+				path = npath;
+				goto repeat;
+			}
+			ext_debug(inode, "next leaf hasno free space(%d,%d)\n",
+					eh->e_num, eh->e_max);
+		}
+		/*
+		 * there is no free space in found leaf
+		 * we're gonna add new leaf in the tree
+		 */
+		err = ext3_ext_create_new_leaf(handle, inode, path, newext);
+		if (err)
+			goto cleanup;
+		goto repeat;
+	}
+
+	nearex = path[depth].p_ext;
+
+	if ((err = ext3_ext_get_access(handle, inode, path + depth)))
+		goto cleanup;
+
+	if (!nearex) {
+		/* there is no extent in this leaf, create first one */
+		ext_debug(inode, "first extent in the leaf: %d:%d:%d\n",
+				newext->e_block, newext->e_start,
+				newext->e_num);
+		eh->e_num++;
+		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
+
+	} else if (newext->e_block > nearex->e_block) {
+		EXT_ASSERT(newext->e_block != nearex->e_block);
+		len = EXT_MAX_EXTENT(eh) - nearex;
+		len = (len - 1) * sizeof(struct ext3_extent);
+		len = len < 0 ? 0 : len;
+		ext_debug(inode, "insert %d:%d:%d after: nearest 0x%p, "
+				"move %d from 0x%p to 0x%p\n",
+				newext->e_block, newext->e_start, newext->e_num,
+				nearex, len, nearex + 1, nearex + 2);
+		ext3_ext_check_boundary(inode, path + depth, nearex + 2, len);
+		memmove(nearex + 2, nearex + 1, len);
+		path[depth].p_ext = nearex + 1;
+		eh->e_num++;
+	} else {
+		EXT_ASSERT(newext->e_block != nearex->e_block);
+		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
+		len = len < 0 ? 0 : len;
+		ext_debug(inode, "insert %d:%d:%d before: nearest 0x%p, "
+				"move %d from 0x%p to 0x%p\n",
+				newext->e_block, newext->e_start, newext->e_num,
+				nearex, len, nearex + 1, nearex + 2);
+		memmove(nearex + 1, nearex, len);
+		path[depth].p_ext = nearex;
+		eh->e_num++;
+
+		/* time to correct all indexes above */
+		err = ext3_ext_correct_indexes(handle, inode, path);
+	}
+
+	if (!err) {
+		nearex = path[depth].p_ext;
+		nearex->e_block = newext->e_block;
+		nearex->e_start = newext->e_start;
+		nearex->e_num = newext->e_num;
+	}
+
+	err = ext3_ext_dirty(handle, inode, path + depth);
+
+cleanup:
+	if (npath) {
+		ext3_ext_drop_refs(inode, npath);
+		kfree(npath);
+	}
+		
+	return err;
+}
+
+int ext3_ext_get_block(handle_t *handle, struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create,
+			int extend_disksize)
+{
+	struct ext3_ext_path *path;
+	int depth = EXT3_I(inode)->i_depth;
+	struct ext3_extent newex;
+	struct ext3_extent *ex;
+	int goal, newblock, err = 0;
+
+	ext_debug(inode, "block %d requested for inode %u, bh_result 0x%p\n",
+			(int) iblock, (unsigned) inode->i_ino, bh_result);
+	clear_buffer_new(bh_result);
+
+	down(&EXT3_I(inode)->i_ext_sem);
+
+	/* find extent for this block */
+	path = ext3_ext_find_extent(inode, iblock, NULL);
+	if (IS_ERR(path)) {
+		err = PTR_ERR(path);
+		goto out2;
+	}
+
+	if ((ex = path[depth].p_ext)) {
+		/* if found exent covers block, simple return it */
+		if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) {
+			newblock = iblock - ex->e_block + ex->e_start;
+			ext_debug(inode, "%d fit into %d:%d -> %d\n",
+					(int) iblock, ex->e_block, ex->e_num,
+					newblock);
+			goto out;
+		}
+	}
+
+	/*
+	 * we couldn't try to create block if create flag is zero 
+	 */
+	if (!create) 
+		goto out2;
+
+	/* allocate new block */
+	goal = ext3_ext_find_goal(inode, path);
+	newblock = ext3_new_block(handle, inode, goal, 0, 0, &err);
+	if (!newblock)
+		goto out2;
+	ext_debug(inode, "allocate new block: goal %d, found %d\n",
+			goal, newblock);
+
+	/* try to insert new extent into found leaf and return */
+	newex.e_block = iblock;
+	newex.e_start = newblock;
+	newex.e_num = 1;
+	err = ext3_ext_insert_extent(handle, inode, path, &newex);
+	if (err)
+		goto out2;
+	
+	/* previous routine could use block we allocated */
+	newblock = newex.e_start;
+	set_buffer_new(bh_result);
+
+out:
+	ext3_ext_show_leaf(inode, path);
+	map_bh(bh_result, inode->i_sb, newblock);
+out2:
+	ext3_ext_drop_refs(inode, path);
+	kfree(path);
+	up(&EXT3_I(inode)->i_ext_sem);
+
+	return err;	
+}
+
+/*
+ * returns 1 if current index have to be freed (even partial)
+ */
+static int ext3_ext_more_to_truncate(struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	EXT_ASSERT(path->p_idx);
+
+	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
+		return 0;
+
+	/*
+	 * if truncate on deeper level happened it it wasn't partial
+	 * so we have to consider current index for truncation
+	 */
+	if (path->p_hdr->e_num == path->p_block)
+		return 0;
+
+	/*
+	 * put actual number of indexes to know is this number got
+	 * changed at the next iteration
+	 */
+	path->p_block = path->p_hdr->e_num;
+	
+	return 1;
+}
+
+/*
+ * routine removes index from the index block
+ * it's used in truncate case only. thus all requests are for
+ * last index in the block only
+ */
+int ext3_ext_remove_index(handle_t *handle, struct inode *inode,
+					struct ext3_ext_path *path)
+{
+	struct buffer_head *bh;
+	int err;
+	
+	/* free index block */
+	path--;
+	EXT_ASSERT(path->p_hdr->e_num);
+	if ((err = ext3_ext_get_access(handle, inode, path)))
+		return err;
+	path->p_hdr->e_num--;
+	if ((err = ext3_ext_dirty(handle, inode, path)))
+		return err;
+	bh = sb_find_get_block(inode->i_sb, path->p_idx->e_leaf);
+	ext3_forget(handle, 0, inode, bh, path->p_idx->e_leaf);
+	ext3_free_blocks(handle, inode, path->p_idx->e_leaf, 1);
+
+	ext_debug(inode, "index is empty, remove it, free block %d\n",
+			path->p_idx->e_leaf);
+	return err;
+}
+
+/*
+ * returns 1 if current extent needs to be freed (even partial)
+ * instead, returns 0
+ */
+int ext3_ext_more_leaves_to_truncate(struct inode *inode,
+					struct ext3_ext_path *path)
+{
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	struct ext3_extent *ex = path->p_ext;
+	int last_block; 
+
+	EXT_ASSERT(ex);
+
+	/* is there leave in the current leaf? */
+	if (ex < EXT_FIRST_EXTENT(path->p_hdr))
+		return 0;
+	
+	last_block = (inode->i_size + blocksize-1)
+			>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
+
+	if (last_block >= ex->e_block + ex->e_num)
+		return 0;
+
+	/* seems it extent have to be freed */
+	return 1;
+}
+
+handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
+{
+	int err;
+
+	if (handle->h_buffer_credits > needed)
+		return handle;
+	if (!ext3_journal_extend(handle, needed))
+		return handle;
+	err = ext3_journal_restart(handle, needed);
+	
+	return handle;
+}
+
+/*
+ * this routine calculate max number of blocks to be modified
+ * while freeing extent and is intended to be used in truncate path
+ */
+static int ext3_ext_calc_credits(struct inode *inode,
+					struct ext3_ext_path *path,
+					int num)
+{
+	int depth = EXT3_I(inode)->i_depth;
+	int needed;
+	
+	/*
+	 * extent couldn't cross group, so we will modify
+	 * single bitmap block and single group descriptor
+	 */
+	needed = 2;
+
+	/*
+	 * if this is last extent in a leaf, then we have to
+	 * free leaf block and remove pointer from index above.
+	 * that pointer could be last in index block, so we'll
+	 * have to remove it too. this way we could modify/free
+	 * the whole path + root index (inode stored) will be
+	 * modified
+	 */
+	if (!path || (num == path->p_ext->e_num &&
+				path->p_ext == EXT_FIRST_EXTENT(path->p_hdr)))
+		needed += (depth * EXT3_ALLOC_NEEDED) + 1;
+
+	return needed;
+}
+
+/*
+ * core of the truncate procedure:
+ * - calculated what part of each extent in the requested leaf
+ *   need to be freed
+ * - frees and forgets these blocks
+ *
+ * TODO: we could optimize and free several extents during
+ *       single journal_restart()-journal_restart() cycle
+ */
+static int ext3_ext_truncate_leaf(handle_t *handle,
+					struct inode *inode,
+					struct ext3_ext_path *path,
+					int depth)
+{
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	int last_block; 
+	int i, err = 0, sf, num;
+
+	ext_debug(inode, "level %d - leaf\n", depth);
+	if (!path->p_hdr)
+		path->p_hdr =
+			(struct ext3_extent_header *) path->p_bh->b_data;
+
+	EXT_ASSERT(path->p_hdr->e_num <= path->p_hdr->e_max);
+	
+	last_block = (inode->i_size + blocksize-1)
+					>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
+	path->p_ext = EXT_LAST_EXTENT(path->p_hdr);
+	while (ext3_ext_more_leaves_to_truncate(inode, path)) {
+
+		/* what part of extent have to be freed? */
+		sf = last_block > path->p_ext->e_block ?
+			last_block : path->p_ext->e_block;
+
+		/* number of blocks from extent to be freed */
+		num = path->p_ext->e_block + path->p_ext->e_num - sf;
+
+		/* calc physical first physical block to be freed */
+		sf = path->p_ext->e_start + (sf - path->p_ext->e_block);
+
+		i = ext3_ext_calc_credits(inode, path, num);
+		handle = ext3_ext_journal_restart(handle, i);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		
+		ext_debug(inode, "free extent %d:%d:%d -> free %d:%d\n",
+				path->p_ext->e_block, path->p_ext->e_start,
+				path->p_ext->e_num, sf, num);
+		for (i = 0; i < num; i++) {
+			struct buffer_head *bh =
+				sb_find_get_block(inode->i_sb, sf + i);
+			ext3_forget(handle, 0, inode, bh, sf + i);
+		}
+		ext3_free_blocks(handle, inode, sf, num);
+
+		/* collect extents usage stats */
+		spin_lock(&EXT3_SB(inode->i_sb)->s_ext_lock);
+		EXT3_SB(inode->i_sb)->s_ext_extents++;
+		EXT3_SB(inode->i_sb)->s_ext_blocks += num;
+		spin_unlock(&EXT3_SB(inode->i_sb)->s_ext_lock);
+
+		/* reduce extent */
+		if ((err = ext3_ext_get_access(handle, inode, path)))
+			return err;
+		path->p_ext->e_num -= num;
+		if (path->p_ext->e_num == 0)
+			path->p_hdr->e_num--;
+		if ((err = ext3_ext_dirty(handle, inode, path)))
+			return err;
+
+		path->p_ext--;
+	}
+	
+	/* if this leaf is free, then we should
+	 * remove it from index block above */
+	if (path->p_hdr->e_num == 0 && depth > 0) 
+		err = ext3_ext_remove_index(handle, inode, path);
+
+	return err;
+}
+
+static void ext3_ext_collect_stats(struct inode *inode)
+{
+	int depth;
+	
+	/* skip inodes with old good bitmap */
+	if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
+		return;
+	
+	/* collect on full truncate only */
+	if (inode->i_size)
+		return;
+
+	depth = EXT3_I(inode)->i_depth;
+	if (depth < EXT3_SB(inode->i_sb)->s_ext_mindepth)
+		 EXT3_SB(inode->i_sb)->s_ext_mindepth = depth;
+	if (depth > EXT3_SB(inode->i_sb)->s_ext_maxdepth)
+		 EXT3_SB(inode->i_sb)->s_ext_maxdepth = depth;
+	EXT3_SB(inode->i_sb)->s_ext_sum += depth;
+	EXT3_SB(inode->i_sb)->s_ext_count++;
+	
+}
+
+void ext3_ext_truncate(struct inode * inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct ext3_ext_path *path;
+	struct page * page;
+	handle_t *handle;
+	int i, depth, err = 0;
+
+	down(&EXT3_I(inode)->i_ext_sem);
+
+	ext3_ext_collect_stats(inode);
+
+	/*
+	 * We have to lock the EOF page here, because lock_page() nests
+	 * outside journal_start().
+	 */
+	if ((inode->i_size & (inode->i_sb->s_blocksize - 1)) == 0) {
+		/* Block boundary? Nothing to do */
+		page = NULL;
+	} else {
+		page = grab_cache_page(mapping,
+				inode->i_size >> PAGE_CACHE_SHIFT);
+		if (!page) {
+			up(&EXT3_I(inode)->i_ext_sem);
+			return;
+		}
+	}
+
+	/*
+	 * probably first extent we're gonna free will be last in block
+	 */
+	i = ext3_ext_calc_credits(inode, NULL, 0);
+	handle = ext3_journal_start(inode, i);
+	if (IS_ERR(handle)) {
+		if (page) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+		up(&EXT3_I(inode)->i_ext_sem);
+		return;
+	}
+
+	if (page)
+		ext3_block_truncate_page(handle, page, mapping, inode->i_size);
+
+	/* 
+	 * TODO: optimization is possible here
+	 * probably we need not scaning at all,
+	 * because page truncation is enough
+	 */
+	if (ext3_orphan_add(handle, inode))
+		goto out_stop;
+
+	/* we have to know where to truncate from in crash case */
+	EXT3_I(inode)->i_disksize = inode->i_size;
+	ext3_mark_inode_dirty(handle, inode);
+
+	/*
+	 * we start scanning from right side freeing all the blocks
+	 * after i_size and walking into the deep
+	 */
+	i = 0;
+	depth = EXT3_I(inode)->i_depth;
+	path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
+	if (IS_ERR(path)) {
+		ext3_error(inode->i_sb, "ext3_ext_truncate",
+				"Can't allocate path array");
+		goto out_stop;
+	}
+	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
+
+	path[i].p_hdr = (struct ext3_extent_header *) EXT3_I(inode)->i_data;
+	while (i >= 0 && err == 0) {
+		if (i == depth) {
+			/* this is leaf block */
+			err = ext3_ext_truncate_leaf(handle, inode,
+							path + i, i);
+			/* root level have p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			i--;
+			continue;
+		}
+		
+		/* this is index block */
+		if (!path[i].p_hdr) {
+			path[i].p_hdr =
+				(struct ext3_extent_header *) path[i].p_bh->b_data;
+			ext_debug(inode, "initialize header\n");
+		}
+
+		EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max);
+		
+		if (!path[i].p_idx) {
+			/* this level hasn't touched yet */
+			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
+			path[i].p_block = path[i].p_hdr->e_num + 1;
+			ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
+					path[i].p_hdr, path[i].p_hdr->e_num);
+		} else {
+			/* we've already was here, see at next index */
+			path[i].p_idx--;
+		}
+
+		ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
+				i, EXT_FIRST_INDEX(path[i].p_hdr),
+				path[i].p_idx);
+		if (ext3_ext_more_to_truncate(inode, path + i)) {
+			/* go to the next level */
+			ext_debug(inode, "move to level %d (block %d)\n", i+1,
+					path[i].p_idx->e_leaf);
+			memset(path + i + 1, 0, sizeof(*path));
+			path[i+1].p_bh = sb_bread(inode->i_sb,
+							path[i].p_idx->e_leaf);
+			if (!path[i+1].p_bh) {
+				/* should we reset i_size? */
+				err = -EIO;
+				break;
+			}
+			i++;
+		} else {
+			/* we finish processing this index, go up */
+			if (path[i].p_hdr->e_num == 0 && i > 0) {
+				/* index is empty, remove it
+				 * handle must be already prepared by the
+				 * truncate_leaf()
+				 */
+				err = ext3_ext_remove_index(handle, inode,
+								path + i);
+			}
+			/* root level have p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			i--;
+			ext_debug(inode, "return to level %d\n", i);
+		}
+	}
+
+	/* TODO: flexible tree reduction should be here */
+	if (path->p_hdr->e_num == 0) {
+		/*
+		 * truncate to zero freed all the tree
+		 * so, we need to correct i_depth
+		 */
+		EXT3_I(inode)->i_depth = 0;
+		path->p_hdr->e_max = 0;
+		ext3_mark_inode_dirty(handle, inode);
+	}
+
+	kfree(path);
+
+	/* In a multi-transaction truncate, we only make the final
+	 * transaction synchronous */
+	if (IS_SYNC(inode))
+		handle->h_sync = 1;
+
+out_stop:
+	/*
+	 * If this was a simple ftruncate(), and the file will remain alive
+	 * then we need to clear up the orphan record which we created above.
+	 * However, if this was a real unlink then we were called by
+	 * ext3_delete_inode(), and we allow that function to clean up the
+	 * orphan info for us.
+	 */
+	if (inode->i_nlink)
+		ext3_orphan_del(handle, inode);
+
+	up(&EXT3_I(inode)->i_ext_sem);
+	ext3_journal_stop(handle);
+}
+
+/*
+ * this routine calculate max number of blocks we could modify
+ * in order to allocate new block for an inode
+ */
+int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	int depth = ei->i_depth + 1;
+	int needed;
+	
+	/*
+	 * the worste case we're expecting is creation of the
+	 * new root (growing in depth) with index splitting
+	 * for splitting we have to consider depth + 1 because
+	 * previous growing could increase it
+	 */
+
+	/* 
+	 * growing in depth:
+	 * block allocation + new root + old root
+	 */
+	needed = EXT3_ALLOC_NEEDED + 2;
+
+	/* index split. we may need:
+	 *   allocate intermediate indexes and new leaf
+	 *   change two blocks at each level, but root
+	 *   modify root block (inode)
+	 */
+	needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
+
+	/* caller want to allocate num blocks */
+	needed *= num;
+	
+#ifdef CONFIG_QUOTA
+	/* 
+	 * FIXME: real calculation should be here
+	 * it depends on blockmap format of qouta file
+	 */
+	needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
+#endif
+
+	return needed;
+}
+
+/*
+ * called at mount time
+ */
+void ext3_ext_init(struct super_block *sb)
+{
+	/*
+	 * possible initialization would be here
+	 */
+
+	if (test_opt(sb, EXTENTS))
+		printk("EXT3-fs: file extents enabled\n");
+	spin_lock_init(&EXT3_SB(sb)->s_ext_lock);
+}
+
+/*
+ * called at umount time
+ */
+void ext3_ext_release(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	/* show collected stats */
+	if (sbi->s_ext_count && sbi->s_ext_extents)
+		printk("EXT3-fs: min depth - %d, max depth - %d, "
+				"ave. depth - %d, ave. blocks/extent - %d\n",
+				sbi->s_ext_mindepth,
+				sbi->s_ext_maxdepth,
+				sbi->s_ext_sum / sbi->s_ext_count,
+				sbi->s_ext_blocks / sbi->s_ext_extents);
+}
+
diff -puN fs/ext3/ialloc.c~ext3-extents fs/ext3/ialloc.c
--- linux-2.6.0-test2-uml/fs/ext3/ialloc.c~ext3-extents	2003-08-19 08:53:27.000000000 +0400
+++ linux-2.6.0-test2-uml-alexey/fs/ext3/ialloc.c	2003-08-19 08:53:27.000000000 +0400
@@ -600,6 +600,8 @@ got:
 		DQUOT_FREE_INODE(inode);
 		goto fail2;
   	}
+	if (test_opt(sb, EXTENTS))
+		EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
 	err = ext3_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext3_std_error(sb, err);
diff -puN fs/ext3/inode.c~ext3-extents fs/ext3/inode.c
--- linux-2.6.0-test2-uml/fs/ext3/inode.c~ext3-extents	2003-08-19 08:53:27.000000000 +0400
+++ linux-2.6.0-test2-uml-alexey/fs/ext3/inode.c	2003-08-19 08:53:27.000000000 +0400
@@ -862,6 +862,15 @@ changed:
 	goto reread;
 }
 
+static inline int
+ext3_get_block_wrap(handle_t *handle, struct inode *inode, sector_t block,
+		struct buffer_head *bh, int create, int extend_disksize)
+{
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_get_block(handle, inode, block, bh, create, 1);
+	return ext3_get_block_handle(handle, inode, block, bh, create, 1);
+}
+
 static int ext3_get_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
@@ -872,7 +881,7 @@ static int ext3_get_block(struct inode *
 		handle = ext3_journal_current_handle();
 		J_ASSERT(handle != 0);
 	}
-	ret = ext3_get_block_handle(handle, inode, iblock,
+	ret = ext3_get_block_wrap(handle, inode, iblock,
 				bh_result, create, 1);
 	return ret;
 }
@@ -899,7 +908,7 @@ ext3_direct_io_get_blocks(struct inode *
 		}
 	}
 	if (ret == 0)
-		ret = ext3_get_block_handle(handle, inode, iblock,
+		ret = ext3_get_block_wrap(handle, inode, iblock,
 					bh_result, create, 0);
 	if (ret == 0)
 		bh_result->b_size = (1 << inode->i_blkbits);
@@ -921,7 +930,7 @@ struct buffer_head *ext3_getblk(handle_t
 	dummy.b_state = 0;
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
-	*errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
+	*errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1);
 	if (!*errp && buffer_mapped(&dummy)) {
 		struct buffer_head *bh;
 		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
@@ -1663,7 +1672,7 @@ void ext3_set_aops(struct inode *inode)
  * This required during truncate. We need to physically zero the tail end
  * of that block so it doesn't yield old data if the file is later grown.
  */
-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
+int ext3_block_truncate_page(handle_t *handle, struct page *page,
 		struct address_space *mapping, loff_t from)
 {
 	unsigned long index = from >> PAGE_CACHE_SHIFT;
@@ -2145,6 +2154,9 @@ void ext3_truncate(struct inode * inode)
 
 	ext3_discard_prealloc(inode);
 
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_truncate(inode);
+
 	/*
 	 * We have to lock the EOF page here, because lock_page() nests
 	 * outside journal_start().
@@ -2540,6 +2552,7 @@ void ext3_read_inode(struct inode * inod
 	ei->i_prealloc_count = 0;
 #endif
 	ei->i_block_group = iloc.block_group;
+	ei->i_depth = raw_inode->osd2.linux2.l_i_depth;
 
 	/*
 	 * NOTE! The in-memory inode i_data array is in little-endian order
@@ -2636,6 +2649,7 @@ static int ext3_do_update_inode(handle_t
 	raw_inode->i_frag = ei->i_frag_no;
 	raw_inode->i_fsize = ei->i_frag_size;
 #endif
+ 	raw_inode->osd2.linux2.l_i_depth = ei->i_depth;
 	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
 	if (!S_ISREG(inode->i_mode)) {
 		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
@@ -2840,6 +2854,9 @@ int ext3_writepage_trans_blocks(struct i
 	int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
 	int ret;
 
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_writepage_trans_blocks(inode, bpp);
+
 	if (ext3_should_journal_data(inode))
 		ret = 3 * (bpp + indirects) + 2;
 	else
diff -puN fs/ext3/Makefile~ext3-extents fs/ext3/Makefile
--- linux-2.6.0-test2-uml/fs/ext3/Makefile~ext3-extents	2003-08-19 08:53:27.000000000 +0400
+++ linux-2.6.0-test2-uml-alexey/fs/ext3/Makefile	2003-08-19 08:53:27.000000000 +0400
@@ -5,7 +5,7 @@
 obj-$(CONFIG_EXT3_FS) += ext3.o
 
 ext3-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-	   ioctl.o namei.o super.o symlink.o hash.o
+	   ioctl.o namei.o super.o symlink.o hash.o extents.o
 
 ext3-$(CONFIG_EXT3_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
 ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
diff -puN fs/ext3/super.c~ext3-extents fs/ext3/super.c
--- linux-2.6.0-test2-uml/fs/ext3/super.c~ext3-extents	2003-08-19 08:53:27.000000000 +0400
+++ linux-2.6.0-test2-uml-alexey/fs/ext3/super.c	2003-08-25 21:08:33.000000000 +0400
@@ -446,6 +446,7 @@ void ext3_put_super (struct super_block 
 	struct ext3_super_block *es = sbi->s_es;
 	int i;
 
+	ext3_ext_release(sb);
 	ext3_xattr_put_super(sb);
 	journal_destroy(sbi->s_journal);
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -504,6 +505,8 @@ static struct inode *ext3_alloc_inode(st
 	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
 #endif
 	ei->vfs_inode.i_version = 1;
+	ei->i_depth = 0;
+	sema_init(&ei->i_ext_sem, 1);
 	return &ei->vfs_inode;
 }
 
@@ -656,6 +659,12 @@ static int parse_options (char * options
 			continue;
 		if ((value = strchr (this_char, '=')) != NULL)
 			*value++ = 0;
+		if (!strcmp (this_char, "extents"))
+			set_opt (sbi->s_mount_opt, EXTENTS);
+		else
+		if (!strcmp (this_char, "extdebug"))
+			set_opt (sbi->s_mount_opt, EXTDEBUG);
+		else
 #ifdef CONFIG_EXT3_FS_XATTR
 		if (!strcmp (this_char, "user_xattr"))
 			set_opt (sbi->s_mount_opt, XATTR_USER);
@@ -1442,6 +1451,8 @@ static int ext3_fill_super (struct super
 	percpu_counter_mod(&sbi->s_dirs_counter,
 		ext3_count_dirs(sb));
 
+	ext3_ext_init(sb);
+
 	return 0;
 
 failed_mount3:
diff -puN include/linux/ext3_fs.h~ext3-extents include/linux/ext3_fs.h
--- linux-2.6.0-test2-uml/include/linux/ext3_fs.h~ext3-extents	2003-08-19 08:53:27.000000000 +0400
+++ linux-2.6.0-test2-uml-alexey/include/linux/ext3_fs.h	2003-08-19 08:53:27.000000000 +0400
@@ -186,6 +186,7 @@ struct ext3_group_desc
 #define EXT3_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
 #define EXT3_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT3_RESERVED_FL		0x80000000 /* reserved for ext3 lib */
+#define EXT3_EXTENTS_FL			0x00080000 /* Inode uses extents */
 
 #define EXT3_FL_USER_VISIBLE		0x0003DFFF /* User visible flags */
 #define EXT3_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */
@@ -244,7 +245,7 @@ struct ext3_inode {
 		struct {
 			__u8	l_i_frag;	/* Fragment number */
 			__u8	l_i_fsize;	/* Fragment size */
-			__u16	i_pad1;
+			__u16	l_i_depth;
 			__u16	l_i_uid_high;	/* these 2 fields    */
 			__u16	l_i_gid_high;	/* were reserved2[0] */
 			__u32	l_i_reserved2;
@@ -324,6 +325,8 @@ struct ext3_inode {
 #define EXT3_MOUNT_NO_UID32		0x2000  /* Disable 32-bit UIDs */
 #define EXT3_MOUNT_XATTR_USER		0x4000	/* Extended user attributes */
 #define EXT3_MOUNT_POSIX_ACL		0x8000	/* POSIX Access Control Lists */
+#define EXT3_MOUNT_EXTENTS		0x10000	/* Extents support */
+#define EXT3_MOUNT_EXTDEBUG		0x20000	/* Extents debug */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -733,6 +736,11 @@ extern int ext3_change_inode_journal_fla
 extern void ext3_truncate (struct inode *);
 extern void ext3_set_inode_flags(struct inode *);
 extern void ext3_set_aops(struct inode *inode);
+extern int ext3_block_truncate_page(handle_t *handle, struct page *page,
+					struct address_space *mapping, loff_t from);
+extern int ext3_forget(handle_t *handle, int is_metadata,
+		       struct inode *inode, struct buffer_head *bh,
+		       int blocknr);
 
 /* ioctl.c */
 extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
@@ -789,6 +797,13 @@ extern struct inode_operations ext3_spec
 extern struct inode_operations ext3_symlink_inode_operations;
 extern struct inode_operations ext3_fast_symlink_inode_operations;
 
+/* extents.c */
+extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
+extern int ext3_ext_get_block(handle_t *, struct inode *, sector_t,
+				struct buffer_head *, int, int);
+extern void ext3_ext_truncate(struct inode *);
+extern void ext3_ext_init(struct super_block *);
+extern void ext3_ext_release(struct super_block *);
 
 #endif	/* __KERNEL__ */
 
diff -puN include/linux/ext3_fs_i.h~ext3-extents include/linux/ext3_fs_i.h
--- linux-2.6.0-test2-uml/include/linux/ext3_fs_i.h~ext3-extents	2003-08-19 08:53:27.000000000 +0400
+++ linux-2.6.0-test2-uml-alexey/include/linux/ext3_fs_i.h	2003-08-25 21:08:33.000000000 +0400
@@ -108,6 +108,10 @@ struct ext3_inode_info {
 	 */
 	struct rw_semaphore truncate_sem;
 	struct inode vfs_inode;
+
+	/* extents-related data */
+	struct semaphore i_ext_sem;
+	__u16 i_depth;
 };
 
 #endif	/* _LINUX_EXT3_FS_I */
diff -puN include/linux/ext3_fs_sb.h~ext3-extents include/linux/ext3_fs_sb.h
--- linux-2.6.0-test2-uml/include/linux/ext3_fs_sb.h~ext3-extents	2003-08-19 08:53:27.000000000 +0400
+++ linux-2.6.0-test2-uml-alexey/include/linux/ext3_fs_sb.h	2003-08-19 08:53:27.000000000 +0400
@@ -68,6 +68,16 @@ struct ext3_sb_info {
 	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
 	wait_queue_head_t ro_wait_queue;	/* For people waiting for the fs to go read-only */
 #endif
+
+	/* extents */
+	int s_ext_debug;
+	int s_ext_mindepth;
+	int s_ext_maxdepth;
+	int s_ext_sum;
+	int s_ext_count;
+	spinlock_t s_ext_lock;
+	int s_ext_extents;
+	int s_ext_blocks;
 };
 
 #endif	/* _LINUX_EXT3_FS_SB */

_


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Ext2-devel] [RFC] extents support for EXT3
  2003-08-28  8:22 [RFC] extents support for EXT3 Alex Tomas
@ 2003-08-28 17:22 ` Mike Fedyk
  2003-08-29  5:55   ` Alex Tomas
  2003-08-28 18:12 ` Ed Sweetman
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 30+ messages in thread
From: Mike Fedyk @ 2003-08-28 17:22 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, ext2-devel

On Thu, Aug 28, 2003 at 12:22:04PM +0400, Alex Tomas wrote:
> 
> this is 2nd version of the patch. changes:
>   - error handling seems completed
>   - lots of cleanups and comments
>   - few minor bug fixed
> 
> this version of the patch tries to solve couple
> of corner cases:
>   - very long truncate
>   - rewrite 
> 
> it survived dbench, bonnie++ and fsx tests.
> 
> take a look at numbers I've just got, please.
> 
>                       before      after
> 5GB file, creation:   2m31.197s   2m21.933s
> 5GB file, read:       2m25.439s   2m24.833s
> 5GB file, rewrite:    2m48.434s   2m20.958s
> 5GB file, removal:    0m8.760s    0m0.858s

With extents, what are the worst/best cases for max file size on a 1k block
filesystem?  AFAICT, worst case is 16GB if it backs out to the second level
like we have now...

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-28  8:22 [RFC] extents support for EXT3 Alex Tomas
  2003-08-28 17:22 ` [Ext2-devel] " Mike Fedyk
@ 2003-08-28 18:12 ` Ed Sweetman
  2003-08-29  5:59   ` Alex Tomas
  2003-08-28 22:00 ` Ramón Rey Vicente󮠒
  2003-09-06  0:19 ` jw schultz
  3 siblings, 1 reply; 30+ messages in thread
From: Ed Sweetman @ 2003-08-28 18:12 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, ext2-devel

This patch seems to be against test2, just wondering if anyone before me 
has used it on the latest test release.  If not then i'm gonna do 
something stupid and be the first.


Alex Tomas wrote:
> this is 2nd version of the patch. changes:
>   - error handling seems completed
>   - lots of cleanups and comments
>   - few minor bug fixed
> 
> this version of the patch tries to solve couple
> of corner cases:
>   - very long truncate
>   - rewrite 
> 
> it survived dbench, bonnie++ and fsx tests.
> 
> take a look at numbers I've just got, please.
> 
>                       before      after
> 5GB file, creation:   2m31.197s   2m21.933s
> 5GB file, read:       2m25.439s   2m24.833s
> 5GB file, rewrite:    2m48.434s   2m20.958s
> 5GB file, removal:    0m8.760s    0m0.858s
> 
>              before           after
> dbench 16:   99.9868 MB/sec   179.243 MB/sec 16 procs
> dbench 16:   89.9919 MB/sec   203.119 MB/sec 16 procs
> dbench 16:   73.5519 MB/sec   185.815 MB/sec 16 procs
> dbench 16:   94.6312 MB/sec   188.519 MB/sec 16 procs
> 
> 
> to use extents you have to use 'extents' mount option
> 
> Alex


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-28  8:22 [RFC] extents support for EXT3 Alex Tomas
  2003-08-28 17:22 ` [Ext2-devel] " Mike Fedyk
  2003-08-28 18:12 ` Ed Sweetman
@ 2003-08-28 22:00 ` Ramón Rey Vicente󮠒
  2003-08-29  6:04   ` Alex Tomas
  2003-09-06  0:19 ` jw schultz
  3 siblings, 1 reply; 30+ messages in thread
From: Ramón Rey Vicente󮠒 @ 2003-08-28 22:00 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, ext2-devel

El jue, 28-08-2003 a las 10:22, Alex Tomas escribió:

> this is 2nd version of the patch. changes:
>   - error handling seems completed
>   - lots of cleanups and comments
>   - few minor bug fixed
> 
> this version of the patch tries to solve couple
> of corner cases:
>   - very long truncate
>   - rewrite 
> 
> it survived dbench, bonnie++ and fsx tests.

It seems very good job.

> take a look at numbers I've just got, please.
> 
>                       before      after
> 5GB file, creation:   2m31.197s   2m21.933s
> 5GB file, read:       2m25.439s   2m24.833s
> 5GB file, rewrite:    2m48.434s   2m20.958s
> 5GB file, removal:    0m8.760s    0m0.858s
> 
>              before           after
> dbench 16:   99.9868 MB/sec   179.243 MB/sec 16 procs
> dbench 16:   89.9919 MB/sec   203.119 MB/sec 16 procs
> dbench 16:   73.5519 MB/sec   185.815 MB/sec 16 procs
> dbench 16:   94.6312 MB/sec   188.519 MB/sec 16 procs
> 
> 
> to use extents you have to use 'extents' mount option

This patch could be included with ext3 in 2.6.x?
-- 
Ramón Rey Vicente       <ramon dot rey at hispalinux dot es>
        jabber ID       <rreylinux at jabber dot org>
------------------------------------------------------------
gpg public key ID 0xBEBD71D5 # http://pgp.escomposlinux.org/


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Ext2-devel] [RFC] extents support for EXT3
  2003-08-28 17:22 ` [Ext2-devel] " Mike Fedyk
@ 2003-08-29  5:55   ` Alex Tomas
  0 siblings, 0 replies; 30+ messages in thread
From: Alex Tomas @ 2003-08-29  5:55 UTC (permalink / raw)
  To: Mike Fedyk; +Cc: linux-kernel, ext2-devel

>>>>> Mike Fedyk (MF) writes:

 MF> With extents, what are the worst/best cases for max file size on a 1k block
 MF> filesystem?  AFAICT, worst case is 16GB if it backs out to the second level
 MF> like we have now...

well, for extents case the worst/best cases are function of block allocation. 
in fact, the best case could have ~31000 blocks per extent (limited by groups).
the worst case is having 1 block per extent -> 12 bytes per block (4 bytes
w/o extents)


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-28 18:12 ` Ed Sweetman
@ 2003-08-29  5:59   ` Alex Tomas
  2003-08-29 15:28     ` Ed Sweetman
  0 siblings, 1 reply; 30+ messages in thread
From: Alex Tomas @ 2003-08-29  5:59 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: linux-kernel, ext2-devel

>>>>> Ed Sweetman (ES) writes:

 ES> This patch seems to be against test2, just wondering if anyone before
 ES> me has used it on the latest test release.  If not then i'm gonna do
 ES> something stupid and be the first.

I would be happy to see anyone trying this patch ;)

following patch applies to -test4 ok


 fs/ext3/Makefile           |    2 
 fs/ext3/extents.c          | 1567 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ext3/ialloc.c           |    2 
 fs/ext3/inode.c            |   25 
 fs/ext3/super.c            |   11 
 include/linux/ext3_fs.h    |   17 
 include/linux/ext3_fs_i.h  |    4 
 include/linux/ext3_fs_sb.h |   10 
 8 files changed, 1632 insertions(+), 6 deletions(-)

diff -puN /dev/null fs/ext3/extents.c
--- /dev/null	2003-01-30 13:24:37.000000000 +0300
+++ linux-2.6.0-test4-alexey/fs/ext3/extents.c	2003-08-29 09:58:39.000000000 +0400
@@ -0,0 +1,1567 @@
+/*
+ *
+ * linux/fs/ext3/extents.c
+ *
+ * Extents support for EXT3
+ *
+ * 07/08/2003    Alex Tomas <bzzz@tmi.comex.ru>
+ * 
+ * TODO:
+ *   - ext3*_error() should be used in some situations
+ *   - find_goal() [to be tested and improved]
+ *   - quick search for index/leaf in ext3_ext_find_extent()
+ *   - tree reduction
+ *   - cache last found extent
+ *   - arch-independent
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/ext3_jbd.h>
+#include <linux/jbd.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+
+/*
+ * with AGRESSIVE_TEST defined capacity of index/leaf blocks
+ * become very little, so index split, in-depth growing and
+ * other hard changes happens much more often
+ * this is for debug purposes only
+ */
+#define AGRESSIVE_TEST_
+
+/*
+ * if EXT_DEBUG defined you can use 'extdebug' mount option
+ * to get lots of info what's going on
+ */
+#define EXT_DEBUG
+#ifdef EXT_DEBUG
+#define ext_debug(inode,fmt,a...) 		\
+do {						\
+	if (test_opt((inode)->i_sb, EXTDEBUG))	\
+		printk(fmt, ##a);		\
+} while (0);
+#else
+#define ext_debug(inode,fmt,a...)
+#endif
+
+#define EXT3_ALLOC_NEEDED	2	/* block bitmap + group descriptor */
+
+/*
+ * ext3_inode has i_block array (total 60 bytes)
+ * first 4 bytes are used to store:
+ *  - tree depth (0 mean there is no tree yet. all extents in the inode)
+ *  - number of alive extents in the inode
+ */
+
+/*
+ * this is extent on-disk structure
+ * it's used at the bottom of the tree
+ */
+struct ext3_extent {
+	__u32	e_block;	/* first logical block extent covers */
+	__u32	e_start;	/* first physical block extents lives */
+	__u32	e_num;		/* number of blocks covered by extent */
+};
+
+/*
+ * this is index on-disk structure
+ * it's used at all the levels, but the bottom
+ */
+struct ext3_extent_idx {
+	__u32	e_block;	/* index covers logical blocks from 'block' */
+	__u32	e_leaf;		/* pointer to the physical block of the next *
+				 * level. leaf or next index could bet here */
+};
+
+/*
+ * each block (leaves and indexes), even inode-stored has header
+ */
+struct ext3_extent_header {	
+	__u16	e_num;		/* number of valid entries */
+	__u16	e_max;		/* capacity of store in entries */
+};
+
+/*
+ * array of ext3_ext_path contains path to some extent
+ * creation/lookup routines use it for traversal/splitting/etc
+ * truncate uses it to simulate recursive walking
+ */
+struct ext3_ext_path {
+	__u32				p_block;
+	__u16				p_depth;
+	struct ext3_extent		*p_ext;
+	struct ext3_extent_idx		*p_idx;
+	struct ext3_extent_header	*p_hdr;
+	struct buffer_head		*p_bh;
+};
+
+#define EXT_FIRST_EXTENT(__hdr__) \
+	((struct ext3_extent *) (((char *) (__hdr__)) +		\
+				 sizeof(struct ext3_extent_header)))
+#define EXT_FIRST_INDEX(__hdr__) \
+	((struct ext3_extent_idx *) (((char *) (__hdr__)) +	\
+				     sizeof(struct ext3_extent_header)))
+#define EXT_HAS_FREE_INDEX(__path__) \
+	((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max)
+#define EXT_LAST_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1)
+#define EXT_LAST_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1)
+#define EXT_MAX_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ */
+static int ext3_ext_get_access(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	if (path->p_bh) {
+		/* path points to block */
+		return ext3_journal_get_write_access(handle, path->p_bh);
+	}
+
+	/* path points to leaf/index in inode body */
+	return 0;
+}
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ *  - EIO
+ */
+static int ext3_ext_dirty(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	if (path->p_bh) {
+		/* path points to block */
+		return ext3_journal_dirty_metadata(handle, path->p_bh);
+	}
+
+	/* path points to leaf/index in inode body */
+	return ext3_mark_inode_dirty(handle, inode);
+}
+
+static inline int ext3_ext_space_block(struct inode *inode)
+{
+	int size;
+
+	size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header))
+		/ sizeof(struct ext3_extent);
+#ifdef AGRESSIVE_TEST
+	size = 6; /* FIXME: for debug, remove this line */
+#endif
+	return size;
+}
+
+static inline int ext3_ext_space_inode(struct inode *inode)
+{
+	int size;
+
+	size = (sizeof(EXT3_I(inode)->i_data) -
+			sizeof(struct ext3_extent_header))
+			/ sizeof(struct ext3_extent);
+#ifdef AGRESSIVE_TEST
+	size = 3; /* FIXME: for debug, remove this line */
+#endif
+	return size;
+}
+
+static inline int ext3_ext_space_inode_idx(struct inode *inode)
+{
+	int size;
+
+	size = (sizeof(EXT3_I(inode)->i_data) -
+			sizeof(struct ext3_extent_header))
+			/ sizeof(struct ext3_extent_idx);
+#ifdef AGRESSIVE_TEST
+	size = 4; /* FIXME: for debug, remove this line */
+#endif
+	return size;
+}
+
+static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path)
+{
+	int k, l = path->p_depth;
+
+	ext_debug(inode, "path:");
+	for (k = 0; k <= l; k++, path++) {
+		if (path->p_idx) {
+			ext_debug(inode, "  %d->%d", path->p_idx->e_block,
+					path->p_idx->e_leaf);
+		} else if (path->p_ext) {
+			ext_debug(inode, "  %d:%d:%d",
+					path->p_ext->e_block,
+					path->p_ext->e_start,
+					path->p_ext->e_num);
+		} else
+			ext_debug(inode, "  []");
+	}
+	ext_debug(inode, "\n");
+}
+
+static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path)
+{
+	int depth = EXT3_I(inode)->i_depth;
+	struct ext3_extent_header *eh = path[depth].p_hdr;
+	struct ext3_extent *ex = EXT_FIRST_EXTENT(eh);
+	int i;
+
+	for (i = 0; i < eh->e_num; i++, ex++) {
+		ext_debug(inode, "%d:%d:%d ",
+				ex->e_block, ex->e_start, ex->e_num);
+	}
+	ext_debug(inode, "\n");
+}
+
+static void ext3_ext_drop_refs(struct inode *inode, struct ext3_ext_path *path)
+{
+	int depth = path->p_depth;
+	int i;
+
+	for (i = 0; i <= depth; i++, path++)
+		if (path->p_bh) {
+			brelse(path->p_bh);
+			path->p_bh = NULL;
+		}
+}
+
+static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	unsigned long bg_start;
+	unsigned long colour;
+	int depth;
+	
+	if (path) {
+		depth = path->p_depth;
+		/* try to find previous block */
+		if (path[depth].p_ext)
+			return path[depth].p_ext->e_start +
+				path[depth].p_ext->e_num - 1;
+		
+		/* it looks index is empty
+		 * try to find starting from index itself */
+		if (path[depth].p_bh)
+			return path[depth].p_bh->b_blocknr;
+	}
+
+	/* OK. use inode's group */
+	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+	colour = (current->pid % 16) *
+			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	return bg_start + colour;
+}
+
+static struct ext3_ext_path *
+ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct ext3_extent_header *eh = (void *) ei->i_data;
+	struct ext3_extent_idx *ix;
+	struct buffer_head *bh;
+	struct ext3_extent *ex;
+	int depth, i, k, ppos = 0;
+	
+	eh = (struct ext3_extent_header *) ei->i_data;
+
+	/* initialize capacity of leaf in inode for first time */
+	if (eh->e_max == 0)
+		eh->e_max = ext3_ext_space_inode(inode);
+	i = depth = ei->i_depth;
+	EXT_ASSERT(i == 0 || eh->e_num > 0);
+	
+	/* account possible depth increase */
+	if (!path) {
+		path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
+				GFP_NOFS);
+		if (!path)
+			return ERR_PTR(-ENOMEM);
+	}
+	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
+
+	/* walk through the tree */
+	while (i) {
+		ext_debug(inode, "depth %d: num %d, max %d\n",
+				ppos, eh->e_num, eh->e_max);
+		ix = EXT_FIRST_INDEX(eh);
+		if (eh->e_num)
+			path[ppos].p_idx = ix;
+		EXT_ASSERT(eh->e_num <= eh->e_max);
+		for (k = 0; k < eh->e_num; k++, ix++) {
+			ext_debug(inode, "index: %d -> %d\n",
+					ix->e_block, ix->e_leaf);
+			if (block < ix->e_block)
+				break;
+			path[ppos].p_idx = ix;
+		}
+		path[ppos].p_block = path[ppos].p_idx->e_leaf;
+		path[ppos].p_depth = i;
+		path[ppos].p_hdr = eh;
+		path[ppos].p_ext = NULL;
+
+		bh = sb_bread(inode->i_sb, path[ppos].p_block);
+		if (!bh) {
+			ext3_ext_drop_refs(inode, path);
+			kfree(path);
+			return ERR_PTR(-EIO);
+		}
+		eh = (struct ext3_extent_header *) bh->b_data;
+		ppos++;
+		EXT_ASSERT(ppos <= depth);
+		path[ppos].p_bh = bh;
+		i--;
+	}
+
+	path[ppos].p_depth = i;
+	path[ppos].p_hdr = eh;
+	path[ppos].p_ext = NULL;
+	
+	/* find extent */
+	ex = EXT_FIRST_EXTENT(eh);
+	if (eh->e_num)
+		path[ppos].p_ext = ex;
+	EXT_ASSERT(eh->e_num <= eh->e_max);
+	for (k = 0; k < eh->e_num; k++, ex++) {
+		if (block < ex->e_block) 
+			break;
+		path[ppos].p_ext = ex;
+	}
+
+	ext3_ext_show_path(inode, path);
+
+	return path;
+}
+
+static void ext3_ext_check_boundary(struct inode *inode,
+					struct ext3_ext_path *curp,
+					void *addr, int len)
+{
+	void *end;
+
+	if (!len)
+		return;
+	if (curp->p_bh)
+		end = (void *) curp->p_hdr + inode->i_sb->s_blocksize;
+	else
+		end = (void *) curp->p_hdr + sizeof(EXT3_I(inode)->i_data);
+	if (((unsigned long) addr) + len > (unsigned long) end) {
+		printk("overflow! 0x%p > 0x%p\n", addr + len, end);
+		BUG();
+	}
+	if ((unsigned long) addr < (unsigned long) curp->p_hdr) {
+		printk("underflow! 0x%p < 0x%p\n", addr, curp->p_hdr);
+		BUG();
+	}
+}
+
+/*
+ * insert new index [logical;ptr] into the block at cupr
+ * it check where to insert: before curp or after curp
+ */
+static int ext3_ext_insert_index(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *curp, int logical,
+				int ptr)
+{
+	struct ext3_extent_idx *ix;
+	int len, err;
+
+	if ((err = ext3_ext_get_access(handle, inode, curp)))
+		return err;
+
+	EXT_ASSERT(logical != curp->p_idx->e_block);
+	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
+	if (logical > curp->p_idx->e_block) {
+		/* insert after */
+		len = (len - 1) * sizeof(struct ext3_extent_idx);
+		len = len < 0 ? 0 : len;
+		ext_debug(inode, "insert new index %d after: %d. "
+				"move %d from 0x%p to 0x%p\n",
+				logical, ptr, len,
+				(curp->p_idx + 1), (curp->p_idx + 2));
+
+		ext3_ext_check_boundary(inode, curp, curp->p_idx + 2, len);
+		memmove(curp->p_idx + 2, curp->p_idx + 1, len);
+		ix = curp->p_idx + 1;
+	} else {
+		/* insert before */
+		len = len * sizeof(struct ext3_extent_idx);
+		len = len < 0 ? 0 : len;
+		ext_debug(inode, "insert new index %d before: %d. "
+				"move %d from 0x%p to 0x%p\n",
+				logical, ptr, len,
+				curp->p_idx, (curp->p_idx + 1));
+
+		ext3_ext_check_boundary(inode, curp, curp->p_idx + 1, len);
+		memmove(curp->p_idx + 1, curp->p_idx, len);
+		ix = curp->p_idx;
+	}
+
+	ix->e_block = logical;
+	ix->e_leaf = ptr;
+	curp->p_hdr->e_num++;
+
+	err = ext3_ext_dirty(handle, inode, curp);
+	ext3_std_error(inode->i_sb, err);
+
+	return err;
+}
+
+/*
+ * routine inserts new subtree into the path, using free index entry
+ * at depth 'at:
+ *  - allocates all needed blocks (new leaf and all intermediate index blocks)
+ *  - makes decision where to split
+ *  - moves remaining extens and index entries (right to the split point)
+ *    into the newly allocated blocks
+ *  - initialize subtree
+ */
+static int ext3_ext_split(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path,
+				struct ext3_extent *newext, int at)
+{
+	struct buffer_head *bh = NULL;
+	int depth = EXT3_I(inode)->i_depth;
+	struct ext3_extent_header *neh;
+	struct ext3_extent_idx *fidx;
+	struct ext3_extent *ex;
+	int i = at, k, m, a;
+	sector_t newblock, oldblock, border;
+	int *ablocks = NULL; /* array of allocated blocks */
+	int err = 0;
+
+	/* make decision: where to split? */
+	/* FIXME: now desicion is simplest: at current extent */
+
+	/* if current leaf will be splitted, then we should use 
+	 * border from split point */
+	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		border = path[depth].p_ext[1].e_block;
+		ext_debug(inode, "leaf will be splitted."
+				" next leaf starts at %d\n",
+				(int)border);
+	} else {
+		border = newext->e_block;
+		ext_debug(inode, "leaf will be added."
+				" next leaf starts at %d\n",
+				(int)border);
+	}
+
+	/* 
+	 * if error occurs, then we break processing
+	 * and turn filesystem read-only. so, index won't
+	 * be inserted and tree will be in consistent
+	 * state. next mount will repair buffers too
+	 */
+
+	/*
+	 * get array to track all allocated blocks
+	 * we need this to handle errors and free blocks
+	 * upon them
+	 */
+	ablocks = kmalloc(sizeof(sector_t) * depth, GFP_NOFS);
+	if (!ablocks)
+		return -ENOMEM;
+	memset(ablocks, 0, sizeof(sector_t) * depth);
+
+	/* allocate all needed blocks */
+	ext_debug(inode, "allocate %d blocks for indexes and leaf\n",
+			depth - at);
+	ablocks[0] = newext->e_start++;
+	newext->e_num--;
+	for (a = 1; a < depth - at; a++) {
+		newblock = ext3_new_block(handle, inode, newext->e_start,
+						0, 0, &err);
+		if (newblock == 0)
+			goto cleanup;
+		ablocks[a] = newblock;
+	}
+
+	/* initialize new leaf */
+	newblock = ablocks[--a];
+	EXT_ASSERT(newblock);
+	bh = sb_getblk(inode->i_sb, newblock);
+	if (!bh) {
+		err = -EIO;
+		goto cleanup;
+	}
+	lock_buffer(bh);
+
+	if ((err = ext3_journal_get_create_access(handle, bh)))
+		goto cleanup;
+
+	neh = (struct ext3_extent_header *) bh->b_data;
+	neh->e_num = 0;
+	neh->e_max = ext3_ext_space_block(inode);
+	ex = EXT_FIRST_EXTENT(neh);
+
+	/* move remain of path[depth] to the new leaf */
+	EXT_ASSERT(path[depth].p_hdr->e_num ==
+			path[depth].p_hdr->e_max);
+	/* start copy from next extent */
+	/* TODO: we could do it by single memmove */
+	m = 0;
+	path[depth].p_ext++;
+	while (path[depth].p_ext <=
+			EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		ext_debug(inode, "move %d:%d:%d in new leaf\n",
+				path[depth].p_ext->e_block,
+				path[depth].p_ext->e_start,
+				path[depth].p_ext->e_num);
+		memmove(ex++, path[depth].p_ext++,
+				sizeof(struct ext3_extent));
+		neh->e_num++;
+		m++;
+	}
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	if ((err = ext3_journal_dirty_metadata(handle, bh)))
+		goto cleanup;	
+	brelse(bh);
+	bh = NULL;
+
+	/* correct old leaf */
+	if (m) {
+		if ((err = ext3_ext_get_access(handle, inode, path)))
+			goto cleanup;
+		path[depth].p_hdr->e_num -= m;
+		if ((err = ext3_ext_dirty(handle, inode, path)))
+			goto cleanup;
+		
+	}
+
+	/* create intermediate indexes */
+	k = depth - at - 1;
+	EXT_ASSERT(k >= 0);
+	if (k)
+		ext_debug(inode,
+				"create %d intermediate indices\n", k);
+	/* insert new index into current index block */
+	/* current depth stored in i var */
+	i = depth - 1;
+	while (k--) {
+		oldblock = newblock;
+		newblock = ablocks[--a];
+		bh = sb_getblk(inode->i_sb, newblock);
+		if (!bh) {
+			err = -EIO;
+			goto cleanup;
+		}
+		lock_buffer(bh);
+
+		if ((err = ext3_journal_get_create_access(handle, bh)))
+			goto cleanup;
+
+		neh = (struct ext3_extent_header *) bh->b_data;
+		neh->e_num = 1;
+		neh->e_max = ext3_ext_space_block(inode);
+		fidx = EXT_FIRST_INDEX(neh);
+		fidx->e_block = border;
+		fidx->e_leaf = oldblock;
+
+		ext_debug(inode,
+				"int.index at %d (block %u): %d -> %d\n",
+				i, (unsigned) newblock,
+				(int) border,
+				(int) oldblock);
+		/* copy indexes */
+		m = 0;
+		path[i].p_idx++;
+		EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
+				EXT_LAST_INDEX(path[i].p_hdr));
+		ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		while (path[i].p_idx <=
+				EXT_MAX_INDEX(path[i].p_hdr)) {
+			ext_debug(inode, "%d: move %d:%d in new index\n",
+					i, path[i].p_idx->e_block,
+					path[i].p_idx->e_leaf);
+			memmove(++fidx, path[i].p_idx++,
+					sizeof(struct ext3_extent_idx));
+			neh->e_num++;
+			m++;
+		}
+
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+
+		if ((err = ext3_journal_dirty_metadata(handle, bh)))
+			goto cleanup;
+		brelse(bh);
+		bh = NULL;
+
+		/* correct old index */
+		if (m) {
+			err = ext3_ext_get_access(handle,inode,path+i);
+			if (err)
+				goto cleanup;
+			path[i].p_hdr->e_num -= m;
+			err = ext3_ext_dirty(handle, inode, path + i);
+			if (err)
+				goto cleanup;
+		}
+
+		i--;
+	}
+
+	/* insert new index */
+	if (!err) 
+		err = ext3_ext_insert_index(handle, inode, path + at,
+						border, newblock);
+
+cleanup:
+	if (bh) {
+		if (buffer_locked(bh))
+			unlock_buffer(bh);
+		brelse(bh);
+	}
+
+	if (err) {
+		/* free all allocated blocks in error case */
+		for (i = 0; i < depth; i++)
+			if (!ablocks[i])
+				continue;
+			ext3_free_blocks(handle, inode, ablocks[i], 1);
+	}
+	kfree(ablocks);
+
+	return err;
+}
+
+/*
+ * routine implements tree growing procedure:
+ *  - allocates new block
+ *  - moves top-level data (index block or leaf) into the new block
+ *  - initialize new top-level, creating index that points to the
+ *    just created block
+ */
+static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode,
+					struct ext3_ext_path *path,
+					struct ext3_extent *newext)
+{
+	struct buffer_head *bh;
+	struct ext3_ext_path *curp = path;
+	struct ext3_extent_header *neh;
+	struct ext3_extent_idx *fidx;
+	int len, err = 0;
+	sector_t newblock;
+
+	/*
+	 * use already allocated by the called block for new root block
+	 */
+	newblock = newext->e_start++;
+	newext->e_num--;
+	
+	bh = sb_getblk(inode->i_sb, newblock);
+	if (!bh) {
+		err = -EIO;
+		ext3_std_error(inode->i_sb, err);
+		return err;
+	}
+	lock_buffer(bh);
+
+	if ((err = ext3_journal_get_create_access(handle, bh))) {
+		unlock_buffer(bh);
+		goto out;	
+	}
+
+	/* move top-level index/leaf into new block */
+	len = sizeof(struct ext3_extent_header) +
+		sizeof(struct ext3_extent) * curp->p_hdr->e_max;
+	EXT_ASSERT(len >= 0 && len < 4096);
+	memmove(bh->b_data, curp->p_hdr, len);
+
+	/* set size of new block */
+	neh = (struct ext3_extent_header *) bh->b_data;
+	neh->e_max = ext3_ext_space_block(inode);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	if ((err = ext3_journal_dirty_metadata(handle, bh)))
+		goto out;
+
+	/* create index in new top-level index: num,max,pointer */
+	if ((err = ext3_ext_get_access(handle, inode, curp)))
+		goto out;
+
+	curp->p_hdr->e_max = ext3_ext_space_inode_idx(inode);
+	curp->p_hdr->e_num = 1;
+	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
+	curp->p_idx->e_block = EXT_FIRST_EXTENT(path[0].p_hdr)->e_block;
+	curp->p_idx->e_leaf = newblock;
+
+	neh = (struct ext3_extent_header *) EXT3_I(inode)->i_data;
+	fidx = EXT_FIRST_INDEX(neh);
+	ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %d\n",
+			neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); 
+
+	EXT3_I(inode)->i_depth++;
+	err = ext3_ext_dirty(handle, inode, curp);
+out:
+	brelse(bh);
+
+	return err;
+}
+
+/*
+ * routine finds empty index and adds new leaf. if no free index found
+ * then it requests in-depth growing
+ */
+static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode,
+					struct ext3_ext_path *path,
+					struct ext3_extent *newext)
+{
+	int depth = EXT3_I(inode)->i_depth;
+	struct ext3_ext_path *curp;
+	int i = depth, err = 0;
+	sector_t newblock = newext->e_start;
+
+	/* walk up to the tree and look for free index entry */
+	curp = path + depth;
+	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
+		i--;
+		curp--;
+	}
+
+	/* we use already allocated block for index block
+	 * so, subsequent data blocks should be contigoues */
+	if (EXT_HAS_FREE_INDEX(curp)) {
+		/* if we found index with free entry, then use that
+		 * entry: create all needed subtree and add new leaf */
+		err = ext3_ext_split(handle, inode, path, newext, i);
+	} else {
+		/* tree is full, time to grow in depth */
+		err = ext3_ext_grow_indepth(handle, inode, path, newext);
+	}
+
+	if (!err) {
+		/* refill path */
+		ext3_ext_drop_refs(inode, path);
+		path = ext3_ext_find_extent(inode, newext->e_block, path);
+		if (IS_ERR(path))
+			err = PTR_ERR(path);
+
+		/*
+		 * probably we've used some blocks from extent
+		 * let's allocate new block for it
+		 */
+		if (newext->e_num == 0 && !err) {
+			newext->e_start =
+				ext3_new_block(handle, inode, newblock,
+						0, 0, &err);
+			newext->e_num = 1;
+		}
+	}
+
+	return err;
+}
+
+/*
+ * returns next allocated block or 0xffffffff
+ * NOTE: it consider block number from index entry as
+ * allocated block. thus, index entries have to be consistent
+ * with leafs
+ */
+static inline unsigned ext3_ext_next_allocated_block(struct inode *inode,
+                                               struct ext3_ext_path *path)
+{
+	int depth;
+
+	EXT_ASSERT(path != NULL);
+	depth = path->p_depth;
+
+	if (depth == 0 && path->p_ext == NULL)
+		return 0xffffffff;
+
+	/* FIXME: what if index isn't full ?! */
+	while (depth >= 0) {
+		if (depth == path->p_depth) {
+			/* leaf */
+			if (path[depth].p_ext !=
+					EXT_LAST_EXTENT(path[depth].p_hdr))
+				return path[depth].p_ext[1].e_block;
+		} else {
+			/* index */
+			if (path[depth].p_idx !=
+					EXT_LAST_INDEX(path[depth].p_hdr))
+				return path[depth].p_idx[1].e_block;
+		}
+		depth--;        
+	}
+
+	return 0xffffffff;
+}
+
+/*
+ * returns first allocated block from next leaf or 0xffffffff
+ */
+static unsigned ext3_ext_next_leaf_block(struct inode *inode,
+                                               struct ext3_ext_path *path)
+{
+	int depth;
+
+	EXT_ASSERT(path != NULL);
+	depth = path->p_depth;
+
+	/* zero-tree has no leaf blocks at all */
+	if (depth == 0)
+		return 0xffffffff;
+
+	/* go to index block */
+	depth--;
+	
+	while (depth >= 0) {
+		if (path[depth].p_idx !=
+				EXT_LAST_INDEX(path[depth].p_hdr))
+			return path[depth].p_idx[1].e_block;
+		depth--;        
+	}
+
+	return 0xffffffff;
+}
+
+/*
+ * if leaf gets modified and modified extent is first in the leaf
+ * then we have to correct all indexes above
+ * TODO: do we need to correct tree in all cases?
+ */
+int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	int depth = EXT3_I(inode)->i_depth;	
+	struct ext3_extent_header *eh;
+	struct ext3_extent *ex;
+	sector_t border;
+	int k, err = 0;
+	
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+
+	EXT_ASSERT(ex);
+	EXT_ASSERT(eh);
+	
+	if (depth == 0) {
+		/* there is no tree at all */
+		return 0;
+	}
+	
+	if (ex != EXT_FIRST_EXTENT(eh)) {
+		/* we correct tree if first leaf got modified only */
+		return 0;
+	}
+	
+	k = depth - 1;
+	border = path[depth].p_ext->e_block;
+	if ((err = ext3_ext_get_access(handle, inode, path + k)))
+		return err;
+	path[k].p_idx->e_block = border;
+	if ((err = ext3_ext_dirty(handle, inode, path + k)))
+		return err;
+
+	while (k--) {
+		/* change all left-side indexes */
+		if (path[k].p_idx != EXT_FIRST_INDEX(path[k].p_hdr)
+				&& k != 0)
+			break;
+		if ((err = ext3_ext_get_access(handle, inode, path + k)))
+			break;
+		path[k].p_idx->e_block = border;
+		if ((err = ext3_ext_dirty(handle, inode, path + k)))
+			break;
+	}
+
+	return err;
+}
+
+/*
+ * this routine tries to merge requsted extent into the existing
+ * extent or inserts requested extent as new one into the tree,
+ * creating new leaf in no-space case
+ */
+int ext3_ext_insert_extent(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path,
+				struct ext3_extent *newext)
+{
+	int depth, len;
+	struct ext3_extent_header * eh;
+	struct ext3_extent *ex;
+	struct ext3_extent *nearex; /* nearest extent */
+	struct ext3_ext_path *npath = NULL;
+	int err;
+
+	depth = EXT3_I(inode)->i_depth;	
+	if ((ex = path[depth].p_ext)) {
+		/* try to insert block into found extent and return */
+		if (ex->e_block + ex->e_num == newext->e_block &&
+				ex->e_start + ex->e_num == newext->e_start) {
+#ifdef AGRESSIVE_TEST
+			if (ex->e_num >= 2)
+				goto repeat;
+#endif
+			if ((err = ext3_ext_get_access(handle, inode,
+							path + depth)))
+				return err;
+			ext_debug(inode, "append %d block to %d:%d (from %d)\n",
+					newext->e_num, ex->e_block, ex->e_num,
+					ex->e_start);
+			ex->e_num += newext->e_num;
+			err = ext3_ext_dirty(handle, inode, path + depth);
+			return err;
+		}
+	}
+
+repeat:
+	depth = EXT3_I(inode)->i_depth;	
+	eh = path[depth].p_hdr;
+	if (eh->e_num == eh->e_max) {
+		/* probably next leaf has space for us? */
+		int next = ext3_ext_next_leaf_block(inode, path);
+		if (next != 0xffffffff) {
+			ext_debug(inode, "next leaf block - %d\n", next);
+			EXT_ASSERT(!npath);
+			npath = ext3_ext_find_extent(inode, next, NULL);
+			if (IS_ERR(npath))
+				return PTR_ERR(npath);
+			EXT_ASSERT(npath->p_depth == path->p_depth);
+			eh = npath[depth].p_hdr;
+			if (eh->e_num < eh->e_max) {
+				ext_debug(inode,
+						"next leaf has free ext(%d)\n",
+						eh->e_num);
+				path = npath;
+				goto repeat;
+			}
+			ext_debug(inode, "next leaf hasno free space(%d,%d)\n",
+					eh->e_num, eh->e_max);
+		}
+		/*
+		 * there is no free space in found leaf
+		 * we're gonna add new leaf in the tree
+		 */
+		err = ext3_ext_create_new_leaf(handle, inode, path, newext);
+		if (err)
+			goto cleanup;
+		goto repeat;
+	}
+
+	nearex = path[depth].p_ext;
+
+	if ((err = ext3_ext_get_access(handle, inode, path + depth)))
+		goto cleanup;
+
+	if (!nearex) {
+		/* there is no extent in this leaf, create first one */
+		ext_debug(inode, "first extent in the leaf: %d:%d:%d\n",
+				newext->e_block, newext->e_start,
+				newext->e_num);
+		eh->e_num++;
+		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
+
+	} else if (newext->e_block > nearex->e_block) {
+		EXT_ASSERT(newext->e_block != nearex->e_block);
+		len = EXT_MAX_EXTENT(eh) - nearex;
+		len = (len - 1) * sizeof(struct ext3_extent);
+		len = len < 0 ? 0 : len;
+		ext_debug(inode, "insert %d:%d:%d after: nearest 0x%p, "
+				"move %d from 0x%p to 0x%p\n",
+				newext->e_block, newext->e_start, newext->e_num,
+				nearex, len, nearex + 1, nearex + 2);
+		ext3_ext_check_boundary(inode, path + depth, nearex + 2, len);
+		memmove(nearex + 2, nearex + 1, len);
+		path[depth].p_ext = nearex + 1;
+		eh->e_num++;
+	} else {
+		EXT_ASSERT(newext->e_block != nearex->e_block);
+		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
+		len = len < 0 ? 0 : len;
+		ext_debug(inode, "insert %d:%d:%d before: nearest 0x%p, "
+				"move %d from 0x%p to 0x%p\n",
+				newext->e_block, newext->e_start, newext->e_num,
+				nearex, len, nearex + 1, nearex + 2);
+		memmove(nearex + 1, nearex, len);
+		path[depth].p_ext = nearex;
+		eh->e_num++;
+
+		/* time to correct all indexes above */
+		err = ext3_ext_correct_indexes(handle, inode, path);
+	}
+
+	if (!err) {
+		nearex = path[depth].p_ext;
+		nearex->e_block = newext->e_block;
+		nearex->e_start = newext->e_start;
+		nearex->e_num = newext->e_num;
+	}
+
+	err = ext3_ext_dirty(handle, inode, path + depth);
+
+cleanup:
+	if (npath) {
+		ext3_ext_drop_refs(inode, npath);
+		kfree(npath);
+	}
+		
+	return err;
+}
+
+int ext3_ext_get_block(handle_t *handle, struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create,
+			int extend_disksize)
+{
+	struct ext3_ext_path *path;
+	int depth = EXT3_I(inode)->i_depth;
+	struct ext3_extent newex;
+	struct ext3_extent *ex;
+	int goal, newblock, err = 0;
+
+	ext_debug(inode, "block %d requested for inode %u, bh_result 0x%p\n",
+			(int) iblock, (unsigned) inode->i_ino, bh_result);
+	clear_buffer_new(bh_result);
+
+	down(&EXT3_I(inode)->i_ext_sem);
+
+	/* find extent for this block */
+	path = ext3_ext_find_extent(inode, iblock, NULL);
+	if (IS_ERR(path)) {
+		err = PTR_ERR(path);
+		goto out2;
+	}
+
+	if ((ex = path[depth].p_ext)) {
+		/* if found exent covers block, simple return it */
+		if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) {
+			newblock = iblock - ex->e_block + ex->e_start;
+			ext_debug(inode, "%d fit into %d:%d -> %d\n",
+					(int) iblock, ex->e_block, ex->e_num,
+					newblock);
+			goto out;
+		}
+	}
+
+	/*
+	 * we couldn't try to create block if create flag is zero 
+	 */
+	if (!create) 
+		goto out2;
+
+	/* allocate new block */
+	goal = ext3_ext_find_goal(inode, path);
+	newblock = ext3_new_block(handle, inode, goal, 0, 0, &err);
+	if (!newblock)
+		goto out2;
+	ext_debug(inode, "allocate new block: goal %d, found %d\n",
+			goal, newblock);
+
+	/* try to insert new extent into found leaf and return */
+	newex.e_block = iblock;
+	newex.e_start = newblock;
+	newex.e_num = 1;
+	err = ext3_ext_insert_extent(handle, inode, path, &newex);
+	if (err)
+		goto out2;
+	
+	/* previous routine could use block we allocated */
+	newblock = newex.e_start;
+	set_buffer_new(bh_result);
+
+out:
+	ext3_ext_show_leaf(inode, path);
+	map_bh(bh_result, inode->i_sb, newblock);
+out2:
+	ext3_ext_drop_refs(inode, path);
+	kfree(path);
+	up(&EXT3_I(inode)->i_ext_sem);
+
+	return err;	
+}
+
+/*
+ * returns 1 if current index have to be freed (even partial)
+ */
+static int ext3_ext_more_to_truncate(struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	EXT_ASSERT(path->p_idx);
+
+	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
+		return 0;
+
+	/*
+	 * if truncate on deeper level happened it it wasn't partial
+	 * so we have to consider current index for truncation
+	 */
+	if (path->p_hdr->e_num == path->p_block)
+		return 0;
+
+	/*
+	 * put actual number of indexes to know is this number got
+	 * changed at the next iteration
+	 */
+	path->p_block = path->p_hdr->e_num;
+	
+	return 1;
+}
+
+/*
+ * routine removes index from the index block
+ * it's used in truncate case only. thus all requests are for
+ * last index in the block only
+ */
+int ext3_ext_remove_index(handle_t *handle, struct inode *inode,
+					struct ext3_ext_path *path)
+{
+	struct buffer_head *bh;
+	int err;
+	
+	/* free index block */
+	path--;
+	EXT_ASSERT(path->p_hdr->e_num);
+	if ((err = ext3_ext_get_access(handle, inode, path)))
+		return err;
+	path->p_hdr->e_num--;
+	if ((err = ext3_ext_dirty(handle, inode, path)))
+		return err;
+	bh = sb_find_get_block(inode->i_sb, path->p_idx->e_leaf);
+	ext3_forget(handle, 0, inode, bh, path->p_idx->e_leaf);
+	ext3_free_blocks(handle, inode, path->p_idx->e_leaf, 1);
+
+	ext_debug(inode, "index is empty, remove it, free block %d\n",
+			path->p_idx->e_leaf);
+	return err;
+}
+
+/*
+ * returns 1 if current extent needs to be freed (even partial)
+ * instead, returns 0
+ */
+int ext3_ext_more_leaves_to_truncate(struct inode *inode,
+					struct ext3_ext_path *path)
+{
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	struct ext3_extent *ex = path->p_ext;
+	int last_block; 
+
+	EXT_ASSERT(ex);
+
+	/* is there leave in the current leaf? */
+	if (ex < EXT_FIRST_EXTENT(path->p_hdr))
+		return 0;
+	
+	last_block = (inode->i_size + blocksize-1)
+			>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
+
+	if (last_block >= ex->e_block + ex->e_num)
+		return 0;
+
+	/* seems it extent have to be freed */
+	return 1;
+}
+
+handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
+{
+	int err;
+
+	if (handle->h_buffer_credits > needed)
+		return handle;
+	if (!ext3_journal_extend(handle, needed))
+		return handle;
+	err = ext3_journal_restart(handle, needed);
+	
+	return handle;
+}
+
+/*
+ * this routine calculate max number of blocks to be modified
+ * while freeing extent and is intended to be used in truncate path
+ */
+static int ext3_ext_calc_credits(struct inode *inode,
+					struct ext3_ext_path *path,
+					int num)
+{
+	int depth = EXT3_I(inode)->i_depth;
+	int needed;
+	
+	/*
+	 * extent couldn't cross group, so we will modify
+	 * single bitmap block and single group descriptor
+	 */
+	needed = 2;
+
+	/*
+	 * if this is last extent in a leaf, then we have to
+	 * free leaf block and remove pointer from index above.
+	 * that pointer could be last in index block, so we'll
+	 * have to remove it too. this way we could modify/free
+	 * the whole path + root index (inode stored) will be
+	 * modified
+	 */
+	if (!path || (num == path->p_ext->e_num &&
+				path->p_ext == EXT_FIRST_EXTENT(path->p_hdr)))
+		needed += (depth * EXT3_ALLOC_NEEDED) + 1;
+
+	return needed;
+}
+
+/*
+ * core of the truncate procedure:
+ * - calculated what part of each extent in the requested leaf
+ *   need to be freed
+ * - frees and forgets these blocks
+ *
+ * TODO: we could optimize and free several extents during
+ *       single journal_restart()-journal_restart() cycle
+ */
+static int ext3_ext_truncate_leaf(handle_t *handle,
+					struct inode *inode,
+					struct ext3_ext_path *path,
+					int depth)
+{
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	int last_block; 
+	int i, err = 0, sf, num;
+
+	ext_debug(inode, "level %d - leaf\n", depth);
+	if (!path->p_hdr)
+		path->p_hdr =
+			(struct ext3_extent_header *) path->p_bh->b_data;
+
+	EXT_ASSERT(path->p_hdr->e_num <= path->p_hdr->e_max);
+	
+	last_block = (inode->i_size + blocksize-1)
+					>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
+	path->p_ext = EXT_LAST_EXTENT(path->p_hdr);
+	while (ext3_ext_more_leaves_to_truncate(inode, path)) {
+
+		/* what part of extent have to be freed? */
+		sf = last_block > path->p_ext->e_block ?
+			last_block : path->p_ext->e_block;
+
+		/* number of blocks from extent to be freed */
+		num = path->p_ext->e_block + path->p_ext->e_num - sf;
+
+		/* calc physical first physical block to be freed */
+		sf = path->p_ext->e_start + (sf - path->p_ext->e_block);
+
+		i = ext3_ext_calc_credits(inode, path, num);
+		handle = ext3_ext_journal_restart(handle, i);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		
+		ext_debug(inode, "free extent %d:%d:%d -> free %d:%d\n",
+				path->p_ext->e_block, path->p_ext->e_start,
+				path->p_ext->e_num, sf, num);
+		for (i = 0; i < num; i++) {
+			struct buffer_head *bh =
+				sb_find_get_block(inode->i_sb, sf + i);
+			ext3_forget(handle, 0, inode, bh, sf + i);
+		}
+		ext3_free_blocks(handle, inode, sf, num);
+
+		/* collect extents usage stats */
+		spin_lock(&EXT3_SB(inode->i_sb)->s_ext_lock);
+		EXT3_SB(inode->i_sb)->s_ext_extents++;
+		EXT3_SB(inode->i_sb)->s_ext_blocks += num;
+		spin_unlock(&EXT3_SB(inode->i_sb)->s_ext_lock);
+
+		/* reduce extent */
+		if ((err = ext3_ext_get_access(handle, inode, path)))
+			return err;
+		path->p_ext->e_num -= num;
+		if (path->p_ext->e_num == 0)
+			path->p_hdr->e_num--;
+		if ((err = ext3_ext_dirty(handle, inode, path)))
+			return err;
+
+		path->p_ext--;
+	}
+	
+	/* if this leaf is free, then we should
+	 * remove it from index block above */
+	if (path->p_hdr->e_num == 0 && depth > 0) 
+		err = ext3_ext_remove_index(handle, inode, path);
+
+	return err;
+}
+
+static void ext3_ext_collect_stats(struct inode *inode)
+{
+	int depth;
+	
+	/* skip inodes with old good bitmap */
+	if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
+		return;
+	
+	/* collect on full truncate only */
+	if (inode->i_size)
+		return;
+
+	depth = EXT3_I(inode)->i_depth;
+	if (depth < EXT3_SB(inode->i_sb)->s_ext_mindepth)
+		 EXT3_SB(inode->i_sb)->s_ext_mindepth = depth;
+	if (depth > EXT3_SB(inode->i_sb)->s_ext_maxdepth)
+		 EXT3_SB(inode->i_sb)->s_ext_maxdepth = depth;
+	EXT3_SB(inode->i_sb)->s_ext_sum += depth;
+	EXT3_SB(inode->i_sb)->s_ext_count++;
+	
+}
+
+void ext3_ext_truncate(struct inode * inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct ext3_ext_path *path;
+	struct page * page;
+	handle_t *handle;
+	int i, depth, err = 0;
+
+	ext3_ext_collect_stats(inode);
+
+	/*
+	 * We have to lock the EOF page here, because lock_page() nests
+	 * outside journal_start().
+	 */
+	if ((inode->i_size & (inode->i_sb->s_blocksize - 1)) == 0) {
+		/* Block boundary? Nothing to do */
+		page = NULL;
+	} else {
+		page = grab_cache_page(mapping,
+				inode->i_size >> PAGE_CACHE_SHIFT);
+		if (!page)
+			return;
+	}
+
+	/*
+	 * probably first extent we're gonna free will be last in block
+	 */
+	i = ext3_ext_calc_credits(inode, NULL, 0);
+	handle = ext3_journal_start(inode, i);
+	if (IS_ERR(handle)) {
+		if (page) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+		return;
+	}
+
+	if (page)
+		ext3_block_truncate_page(handle, page, mapping, inode->i_size);
+
+	down(&EXT3_I(inode)->i_ext_sem);
+	/* 
+	 * TODO: optimization is possible here
+	 * probably we need not scaning at all,
+	 * because page truncation is enough
+	 */
+	if (ext3_orphan_add(handle, inode))
+		goto out_stop;
+
+	/* we have to know where to truncate from in crash case */
+	EXT3_I(inode)->i_disksize = inode->i_size;
+	ext3_mark_inode_dirty(handle, inode);
+
+	/*
+	 * we start scanning from right side freeing all the blocks
+	 * after i_size and walking into the deep
+	 */
+	i = 0;
+	depth = EXT3_I(inode)->i_depth;
+	path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
+	if (IS_ERR(path)) {
+		ext3_error(inode->i_sb, "ext3_ext_truncate",
+				"Can't allocate path array");
+		goto out_stop;
+	}
+	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
+
+	path[i].p_hdr = (struct ext3_extent_header *) EXT3_I(inode)->i_data;
+	while (i >= 0 && err == 0) {
+		if (i == depth) {
+			/* this is leaf block */
+			err = ext3_ext_truncate_leaf(handle, inode,
+							path + i, i);
+			/* root level have p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			i--;
+			continue;
+		}
+		
+		/* this is index block */
+		if (!path[i].p_hdr) {
+			path[i].p_hdr =
+				(struct ext3_extent_header *) path[i].p_bh->b_data;
+			ext_debug(inode, "initialize header\n");
+		}
+
+		EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max);
+		
+		if (!path[i].p_idx) {
+			/* this level hasn't touched yet */
+			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
+			path[i].p_block = path[i].p_hdr->e_num + 1;
+			ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
+					path[i].p_hdr, path[i].p_hdr->e_num);
+		} else {
+			/* we've already was here, see at next index */
+			path[i].p_idx--;
+		}
+
+		ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
+				i, EXT_FIRST_INDEX(path[i].p_hdr),
+				path[i].p_idx);
+		if (ext3_ext_more_to_truncate(inode, path + i)) {
+			/* go to the next level */
+			ext_debug(inode, "move to level %d (block %d)\n", i+1,
+					path[i].p_idx->e_leaf);
+			memset(path + i + 1, 0, sizeof(*path));
+			path[i+1].p_bh = sb_bread(inode->i_sb,
+							path[i].p_idx->e_leaf);
+			if (!path[i+1].p_bh) {
+				/* should we reset i_size? */
+				err = -EIO;
+				break;
+			}
+			i++;
+		} else {
+			/* we finish processing this index, go up */
+			if (path[i].p_hdr->e_num == 0 && i > 0) {
+				/* index is empty, remove it
+				 * handle must be already prepared by the
+				 * truncate_leaf()
+				 */
+				err = ext3_ext_remove_index(handle, inode,
+								path + i);
+			}
+			/* root level have p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			i--;
+			ext_debug(inode, "return to level %d\n", i);
+		}
+	}
+
+	/* TODO: flexible tree reduction should be here */
+	if (path->p_hdr->e_num == 0) {
+		/*
+		 * truncate to zero freed all the tree
+		 * so, we need to correct i_depth
+		 */
+		EXT3_I(inode)->i_depth = 0;
+		path->p_hdr->e_max = 0;
+		ext3_mark_inode_dirty(handle, inode);
+	}
+
+	kfree(path);
+
+	/* In a multi-transaction truncate, we only make the final
+	 * transaction synchronous */
+	if (IS_SYNC(inode))
+		handle->h_sync = 1;
+
+out_stop:
+	/*
+	 * If this was a simple ftruncate(), and the file will remain alive
+	 * then we need to clear up the orphan record which we created above.
+	 * However, if this was a real unlink then we were called by
+	 * ext3_delete_inode(), and we allow that function to clean up the
+	 * orphan info for us.
+	 */
+	if (inode->i_nlink)
+		ext3_orphan_del(handle, inode);
+
+	up(&EXT3_I(inode)->i_ext_sem);
+	ext3_journal_stop(handle);
+}
+
+/*
+ * this routine calculate max number of blocks we could modify
+ * in order to allocate new block for an inode
+ */
+int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	int depth = ei->i_depth + 1;
+	int needed;
+	
+	/*
+	 * the worste case we're expecting is creation of the
+	 * new root (growing in depth) with index splitting
+	 * for splitting we have to consider depth + 1 because
+	 * previous growing could increase it
+	 */
+
+	/* 
+	 * growing in depth:
+	 * block allocation + new root + old root
+	 */
+	needed = EXT3_ALLOC_NEEDED + 2;
+
+	/* index split. we may need:
+	 *   allocate intermediate indexes and new leaf
+	 *   change two blocks at each level, but root
+	 *   modify root block (inode)
+	 */
+	needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
+
+	/* caller want to allocate num blocks */
+	needed *= num;
+	
+#ifdef CONFIG_QUOTA
+	/* 
+	 * FIXME: real calculation should be here
+	 * it depends on blockmap format of qouta file
+	 */
+	needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
+#endif
+
+	return needed;
+}
+
+/*
+ * called at mount time
+ */
+void ext3_ext_init(struct super_block *sb)
+{
+	/*
+	 * possible initialization would be here
+	 */
+
+	if (test_opt(sb, EXTENTS))
+		printk("EXT3-fs: file extents enabled\n");
+	spin_lock_init(&EXT3_SB(sb)->s_ext_lock);
+}
+
+/*
+ * called at umount time
+ */
+void ext3_ext_release(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	/* show collected stats */
+	if (sbi->s_ext_count && sbi->s_ext_extents)
+		printk("EXT3-fs: min depth - %d, max depth - %d, "
+				"ave. depth - %d, ave. blocks/extent - %d\n",
+				sbi->s_ext_mindepth,
+				sbi->s_ext_maxdepth,
+				sbi->s_ext_sum / sbi->s_ext_count,
+				sbi->s_ext_blocks / sbi->s_ext_extents);
+}
+
diff -puN fs/ext3/ialloc.c~ext3-extents fs/ext3/ialloc.c
--- linux-2.6.0-test4/fs/ext3/ialloc.c~ext3-extents	2003-08-29 09:58:39.000000000 +0400
+++ linux-2.6.0-test4-alexey/fs/ext3/ialloc.c	2003-08-29 09:58:39.000000000 +0400
@@ -600,6 +600,8 @@ got:
 		DQUOT_FREE_INODE(inode);
 		goto fail2;
   	}
+	if (test_opt(sb, EXTENTS))
+		EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
 	err = ext3_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext3_std_error(sb, err);
diff -puN fs/ext3/inode.c~ext3-extents fs/ext3/inode.c
--- linux-2.6.0-test4/fs/ext3/inode.c~ext3-extents	2003-08-29 09:58:39.000000000 +0400
+++ linux-2.6.0-test4-alexey/fs/ext3/inode.c	2003-08-29 09:58:39.000000000 +0400
@@ -862,6 +862,15 @@ changed:
 	goto reread;
 }
 
+static inline int
+ext3_get_block_wrap(handle_t *handle, struct inode *inode, sector_t block,
+		struct buffer_head *bh, int create, int extend_disksize)
+{
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_get_block(handle, inode, block, bh, create, 1);
+	return ext3_get_block_handle(handle, inode, block, bh, create, 1);
+}
+
 static int ext3_get_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
@@ -872,7 +881,7 @@ static int ext3_get_block(struct inode *
 		handle = ext3_journal_current_handle();
 		J_ASSERT(handle != 0);
 	}
-	ret = ext3_get_block_handle(handle, inode, iblock,
+	ret = ext3_get_block_wrap(handle, inode, iblock,
 				bh_result, create, 1);
 	return ret;
 }
@@ -899,7 +908,7 @@ ext3_direct_io_get_blocks(struct inode *
 		}
 	}
 	if (ret == 0)
-		ret = ext3_get_block_handle(handle, inode, iblock,
+		ret = ext3_get_block_wrap(handle, inode, iblock,
 					bh_result, create, 0);
 	if (ret == 0)
 		bh_result->b_size = (1 << inode->i_blkbits);
@@ -921,7 +930,7 @@ struct buffer_head *ext3_getblk(handle_t
 	dummy.b_state = 0;
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
-	*errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
+	*errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1);
 	if (!*errp && buffer_mapped(&dummy)) {
 		struct buffer_head *bh;
 		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
@@ -1663,7 +1672,7 @@ void ext3_set_aops(struct inode *inode)
  * This required during truncate. We need to physically zero the tail end
  * of that block so it doesn't yield old data if the file is later grown.
  */
-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
+int ext3_block_truncate_page(handle_t *handle, struct page *page,
 		struct address_space *mapping, loff_t from)
 {
 	unsigned long index = from >> PAGE_CACHE_SHIFT;
@@ -2145,6 +2154,9 @@ void ext3_truncate(struct inode * inode)
 
 	ext3_discard_prealloc(inode);
 
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_truncate(inode);
+
 	/*
 	 * We have to lock the EOF page here, because lock_page() nests
 	 * outside journal_start().
@@ -2540,6 +2552,7 @@ void ext3_read_inode(struct inode * inod
 	ei->i_prealloc_count = 0;
 #endif
 	ei->i_block_group = iloc.block_group;
+	ei->i_depth = raw_inode->osd2.linux2.l_i_depth;
 
 	/*
 	 * NOTE! The in-memory inode i_data array is in little-endian order
@@ -2636,6 +2649,7 @@ static int ext3_do_update_inode(handle_t
 	raw_inode->i_frag = ei->i_frag_no;
 	raw_inode->i_fsize = ei->i_frag_size;
 #endif
+ 	raw_inode->osd2.linux2.l_i_depth = ei->i_depth;
 	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
 	if (!S_ISREG(inode->i_mode)) {
 		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
@@ -2840,6 +2854,9 @@ int ext3_writepage_trans_blocks(struct i
 	int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
 	int ret;
 
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_writepage_trans_blocks(inode, bpp);
+
 	if (ext3_should_journal_data(inode))
 		ret = 3 * (bpp + indirects) + 2;
 	else
diff -puN fs/ext3/Makefile~ext3-extents fs/ext3/Makefile
--- linux-2.6.0-test4/fs/ext3/Makefile~ext3-extents	2003-08-29 09:58:39.000000000 +0400
+++ linux-2.6.0-test4-alexey/fs/ext3/Makefile	2003-08-29 09:58:39.000000000 +0400
@@ -5,7 +5,7 @@
 obj-$(CONFIG_EXT3_FS) += ext3.o
 
 ext3-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-	   ioctl.o namei.o super.o symlink.o hash.o
+	   ioctl.o namei.o super.o symlink.o hash.o extents.o
 
 ext3-$(CONFIG_EXT3_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
 ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
diff -puN fs/ext3/super.c~ext3-extents fs/ext3/super.c
--- linux-2.6.0-test4/fs/ext3/super.c~ext3-extents	2003-08-29 09:58:39.000000000 +0400
+++ linux-2.6.0-test4-alexey/fs/ext3/super.c	2003-08-29 09:58:39.000000000 +0400
@@ -446,6 +446,7 @@ void ext3_put_super (struct super_block 
 	struct ext3_super_block *es = sbi->s_es;
 	int i;
 
+	ext3_ext_release(sb);
 	ext3_xattr_put_super(sb);
 	journal_destroy(sbi->s_journal);
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -504,6 +505,8 @@ static struct inode *ext3_alloc_inode(st
 	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
 #endif
 	ei->vfs_inode.i_version = 1;
+	ei->i_depth = 0;
+	sema_init(&ei->i_ext_sem, 1);
 	return &ei->vfs_inode;
 }
 
@@ -656,6 +659,12 @@ static int parse_options (char * options
 			continue;
 		if ((value = strchr (this_char, '=')) != NULL)
 			*value++ = 0;
+		if (!strcmp (this_char, "extents"))
+			set_opt (sbi->s_mount_opt, EXTENTS);
+		else
+		if (!strcmp (this_char, "extdebug"))
+			set_opt (sbi->s_mount_opt, EXTDEBUG);
+		else
 #ifdef CONFIG_EXT3_FS_XATTR
 		if (!strcmp (this_char, "user_xattr"))
 			set_opt (sbi->s_mount_opt, XATTR_USER);
@@ -1442,6 +1451,8 @@ static int ext3_fill_super (struct super
 	percpu_counter_mod(&sbi->s_dirs_counter,
 		ext3_count_dirs(sb));
 
+	ext3_ext_init(sb);
+
 	return 0;
 
 failed_mount3:
diff -puN include/linux/ext3_fs.h~ext3-extents include/linux/ext3_fs.h
--- linux-2.6.0-test4/include/linux/ext3_fs.h~ext3-extents	2003-08-29 09:58:39.000000000 +0400
+++ linux-2.6.0-test4-alexey/include/linux/ext3_fs.h	2003-08-29 09:58:39.000000000 +0400
@@ -186,6 +186,7 @@ struct ext3_group_desc
 #define EXT3_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
 #define EXT3_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT3_RESERVED_FL		0x80000000 /* reserved for ext3 lib */
+#define EXT3_EXTENTS_FL			0x00080000 /* Inode uses extents */
 
 #define EXT3_FL_USER_VISIBLE		0x0003DFFF /* User visible flags */
 #define EXT3_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */
@@ -244,7 +245,7 @@ struct ext3_inode {
 		struct {
 			__u8	l_i_frag;	/* Fragment number */
 			__u8	l_i_fsize;	/* Fragment size */
-			__u16	i_pad1;
+			__u16	l_i_depth;
 			__u16	l_i_uid_high;	/* these 2 fields    */
 			__u16	l_i_gid_high;	/* were reserved2[0] */
 			__u32	l_i_reserved2;
@@ -324,6 +325,8 @@ struct ext3_inode {
 #define EXT3_MOUNT_NO_UID32		0x2000  /* Disable 32-bit UIDs */
 #define EXT3_MOUNT_XATTR_USER		0x4000	/* Extended user attributes */
 #define EXT3_MOUNT_POSIX_ACL		0x8000	/* POSIX Access Control Lists */
+#define EXT3_MOUNT_EXTENTS		0x10000	/* Extents support */
+#define EXT3_MOUNT_EXTDEBUG		0x20000	/* Extents debug */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -733,6 +736,11 @@ extern int ext3_change_inode_journal_fla
 extern void ext3_truncate (struct inode *);
 extern void ext3_set_inode_flags(struct inode *);
 extern void ext3_set_aops(struct inode *inode);
+extern int ext3_block_truncate_page(handle_t *handle, struct page *page,
+					struct address_space *mapping, loff_t from);
+extern int ext3_forget(handle_t *handle, int is_metadata,
+		       struct inode *inode, struct buffer_head *bh,
+		       int blocknr);
 
 /* ioctl.c */
 extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
@@ -789,6 +797,13 @@ extern struct inode_operations ext3_spec
 extern struct inode_operations ext3_symlink_inode_operations;
 extern struct inode_operations ext3_fast_symlink_inode_operations;
 
+/* extents.c */
+extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
+extern int ext3_ext_get_block(handle_t *, struct inode *, sector_t,
+				struct buffer_head *, int, int);
+extern void ext3_ext_truncate(struct inode *);
+extern void ext3_ext_init(struct super_block *);
+extern void ext3_ext_release(struct super_block *);
 
 #endif	/* __KERNEL__ */
 
diff -puN include/linux/ext3_fs_i.h~ext3-extents include/linux/ext3_fs_i.h
--- linux-2.6.0-test4/include/linux/ext3_fs_i.h~ext3-extents	2003-08-29 09:58:39.000000000 +0400
+++ linux-2.6.0-test4-alexey/include/linux/ext3_fs_i.h	2003-08-29 09:58:39.000000000 +0400
@@ -108,6 +108,10 @@ struct ext3_inode_info {
 	 */
 	struct rw_semaphore truncate_sem;
 	struct inode vfs_inode;
+
+	/* extents-related data */
+	struct semaphore i_ext_sem;
+	__u16 i_depth;
 };
 
 #endif	/* _LINUX_EXT3_FS_I */
diff -puN include/linux/ext3_fs_sb.h~ext3-extents include/linux/ext3_fs_sb.h
--- linux-2.6.0-test4/include/linux/ext3_fs_sb.h~ext3-extents	2003-08-29 09:58:39.000000000 +0400
+++ linux-2.6.0-test4-alexey/include/linux/ext3_fs_sb.h	2003-08-29 09:58:39.000000000 +0400
@@ -68,6 +68,16 @@ struct ext3_sb_info {
 	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
 	wait_queue_head_t ro_wait_queue;	/* For people waiting for the fs to go read-only */
 #endif
+
+	/* extents */
+	int s_ext_debug;
+	int s_ext_mindepth;
+	int s_ext_maxdepth;
+	int s_ext_sum;
+	int s_ext_count;
+	spinlock_t s_ext_lock;
+	int s_ext_extents;
+	int s_ext_blocks;
 };
 
 #endif	/* _LINUX_EXT3_FS_SB */

_


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-28 22:00 ` Ramón Rey Vicente󮠒
@ 2003-08-29  6:04   ` Alex Tomas
  2003-08-29  9:55     ` Ramón Rey Vicente󮠒
  0 siblings, 1 reply; 30+ messages in thread
From: Alex Tomas @ 2003-08-29  6:04 UTC (permalink / raw)
  To: Ramón Rey VicenteС╝; +Cc: linux-kernel, ext2-devel

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=utf-8, Size: 384 bytes --]

>>>>> Ramón Rey Vicenteó®  (RRV) writes:

 RRV> This patch could be included with ext3 in 2.6.x?

well, as Andreas already said I don't try to get this patch into 2.6.
this is impossible, obviously. lots of work need to be done before mainline.
userspace utility (fsck, debugfs) should be prepared. at this time
I'd like to get comments, suggestions and wider testing, of course ;)


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29  6:04   ` Alex Tomas
@ 2003-08-29  9:55     ` Ramón Rey Vicente󮠒
  0 siblings, 0 replies; 30+ messages in thread
From: Ramón Rey Vicente󮠒 @ 2003-08-29  9:55 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, ext2-devel

El vie, 29-08-2003 a las 08:04, Alex Tomas escribió:

>  RRV> This patch could be included with ext3 in 2.6.x?
> 
> well, as Andreas already said I don't try to get this patch into 2.6.
> this is impossible, obviously. lots of work need to be done before mainline.
> userspace utility (fsck, debugfs) should be prepared. at this time
> I'd like to get comments, suggestions and wider testing, of course ;)

Well, I think this is 2.7 thing :). I will comment it and try testing :)
-- 
Ramón Rey Vicente       <ramon dot rey at hispalinux dot es>
        jabber ID       <rreylinux at jabber dot org>
------------------------------------------------------------
gpg public key ID 0xBEBD71D5 # http://pgp.escomposlinux.org/


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29  5:59   ` Alex Tomas
@ 2003-08-29 15:28     ` Ed Sweetman
  2003-08-29 15:38       ` Alex Tomas
  0 siblings, 1 reply; 30+ messages in thread
From: Ed Sweetman @ 2003-08-29 15:28 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, ext2-devel

Alex Tomas wrote:
>>>>>>Ed Sweetman (ES) writes:
> 
> 
>  ES> This patch seems to be against test2, just wondering if anyone before
>  ES> me has used it on the latest test release.  If not then i'm gonna do
>  ES> something stupid and be the first.
> 
> I would be happy to see anyone trying this patch ;)
> 
> following patch applies to -test4 o





If it's the same as test2 then i've already tried it.  I got no 
performance gains as far as dbench is concerned.  Perhaps my block size 
is not optimal on that partition for extents.  Either way it seemed to 
make the kernel unstable and i've been trying to fix things since 
mid-day yesterday.  Been getting very strange problems, no error 
messages are reported or anything like that.





^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 15:28     ` Ed Sweetman
@ 2003-08-29 15:38       ` Alex Tomas
  2003-08-29 15:52         ` Ed Sweetman
  0 siblings, 1 reply; 30+ messages in thread
From: Alex Tomas @ 2003-08-29 15:38 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: linux-kernel, ext2-devel

>>>>> Ed Sweetman (ES) writes:

 ES> If it's the same as test2 then i've already tried it.  I got no
 ES> performance gains as far as dbench is concerned.  Perhaps my block
 ES> size is not optimal on that partition for extents.  Either way it
 ES> seemed to make the kernel unstable and i've been trying to fix things
 ES> since mid-day yesterday.  Been getting very strange problems, no error
 ES> messages are reported or anything like that.

I still use -test2, because -test4 detects my scsi hdds in another order
than -test2. last time I sent rediff against -test4. what kind of problem
did you see?


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 15:38       ` Alex Tomas
@ 2003-08-29 15:52         ` Ed Sweetman
  2003-08-29 16:10           ` Alex Tomas
  0 siblings, 1 reply; 30+ messages in thread
From: Ed Sweetman @ 2003-08-29 15:52 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, ext2-devel

Alex Tomas wrote:
>>>>>>Ed Sweetman (ES) writes:
> 
> 
>  ES> If it's the same as test2 then i've already tried it.  I got no
>  ES> performance gains as far as dbench is concerned.  Perhaps my block
>  ES> size is not optimal on that partition for extents.  Either way it
>  ES> seemed to make the kernel unstable and i've been trying to fix things
>  ES> since mid-day yesterday.  Been getting very strange problems, no error
>  ES> messages are reported or anything like that.
> 
> I still use -test2, because -test4 detects my scsi hdds in another order
> than -test2. last time I sent rediff against -test4. what kind of problem
> did you see?
> 

in the kernels that would boot (for some reason test4's videodev driver 
is borked so i used the mm patchset) passed the serio drivers, init was 
unable to be found, no matter what even though it mounted the root fs 
and the root fs is not as far as i can tell when booting on older 
kernels, corrupted.   I'm writing now in mozilla from the very system 
but with extents turned off.  I'm somewhat afraid though that even 
though i didn't mount the partitions with the extents option, that the 
patch may still be having an adverse effect.  Right now things seem 
pretty stable but last night apt was hanging while generating locales 
reproducably causing the entire kernel to lose the ability to do 
anything to the fs. This was all being tested on test3-mm1.  I am aware 
that mm does have some patches to ext3 that aren't in the main kernel i 
believe. perhaps the xattr stuff is conflicting in some way?  I really 
have no way of testing the linus tree directly because the drivers i use 
wont compile.


All in all though, when it was enabled, i saw really no difference from 
when it was not enabled. dbench 16 gave me ~140MB/sec either way. 
md5summing large files resulted in equal performance as well.  I got 
nothing even close to the kind of performance increases you showed in 
the first mail.


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 15:52         ` Ed Sweetman
@ 2003-08-29 16:10           ` Alex Tomas
  2003-08-29 16:16             ` Alex Tomas
  2003-08-29 16:20             ` Ed Sweetman
  0 siblings, 2 replies; 30+ messages in thread
From: Alex Tomas @ 2003-08-29 16:10 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: linux-kernel, ext2-devel

>>>>> Ed Sweetman (ES) writes:

 ES> in the kernels that would boot (for some reason test4's videodev
 ES> driver is borked so i used the mm patchset) passed the serio drivers,
 ES> init was unable to be found, no matter what even though it mounted the
 ES> root fs and the root fs is not as far as i can tell when booting on
 ES> older kernels, corrupted.   I'm writing now in mozilla from the very
 ES> system but with extents turned off.  I'm somewhat afraid though that
 ES> even though i didn't mount the partitions with the extents option,
 ES> that the patch may still be having an adverse effect.  Right now
 ES> things seem pretty stable but last night apt was hanging while
 ES> generating locales reproducably causing the entire kernel to lose the
 ES> ability to do anything to the fs. This was all being tested on
 ES> test3-mm1.  I am aware that mm does have some patches to ext3 that
 ES> aren't in the main kernel i believe. perhaps the xattr stuff is
 ES> conflicting in some way?  I really have no way of testing the linus
 ES> tree directly because the drivers i use wont compile.

first of all, once fs gets mounted with extents support any newly created
files/dirs will be stored in extents-format. thus, if you remount that fs
w/o extents support you won't be able to access those files/dirs.

I really propose you don't use extents on a partitions you care about for a while.

 ES> All in all though, when it was enabled, i saw really no difference
 ES> from when it was not enabled. dbench 16 gave me ~140MB/sec either
 ES> way. md5summing large files resulted in equal performance as well.  I
 ES> got nothing even close to the kind of performance increases you showed
 ES> in the first mail.

quite interesting result. could you help me to investigate that?
it would be great to go through following steps:
1) create fresh ext3 fs
2) mount it w/o extents option
3) run dbench 16 for few times (say, 4)
   make sure it performs on that filesystem (cd <mntpoint>; dbench -c ... 16)
4) unmount fs
5) recreate that fs
6) mount it with extents option
   'EXT3-fs: file extents enabled' should be printed in logs
7) run dbench 16 for few times
8) unmount that fs and take a look in logs, you should see stats info about
   extents usage

thank you!


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 16:10           ` Alex Tomas
@ 2003-08-29 16:16             ` Alex Tomas
  2003-08-29 16:20             ` Ed Sweetman
  1 sibling, 0 replies; 30+ messages in thread
From: Alex Tomas @ 2003-08-29 16:16 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ed Sweetman, linux-kernel, ext2-devel


I forgot the most important thing - I need vmstat output for all runs
to anylize performing

>>>>> Alex Tomas (AT) writes:

 AT> quite interesting result. could you help me to investigate that?
 AT> it would be great to go through following steps:
 AT> 1) create fresh ext3 fs
 AT> 2) mount it w/o extents option
 AT> 3) run dbench 16 for few times (say, 4)
 AT>    make sure it performs on that filesystem (cd <mntpoint>; dbench -c ... 16)
 AT> 4) unmount fs
 AT> 5) recreate that fs
 AT> 6) mount it with extents option
 AT>    'EXT3-fs: file extents enabled' should be printed in logs
 AT> 7) run dbench 16 for few times
 AT> 8) unmount that fs and take a look in logs, you should see stats info about
 AT>    extents usage



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 16:10           ` Alex Tomas
  2003-08-29 16:16             ` Alex Tomas
@ 2003-08-29 16:20             ` Ed Sweetman
  2003-08-29 16:34               ` Alex Tomas
  1 sibling, 1 reply; 30+ messages in thread
From: Ed Sweetman @ 2003-08-29 16:20 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, ext2-devel

Alex Tomas wrote:
>>>>>>Ed Sweetman (ES) writes:
> 
> 
>  ES> in the kernels that would boot (for some reason test4's videodev
>  ES> driver is borked so i used the mm patchset) passed the serio drivers,
>  ES> init was unable to be found, no matter what even though it mounted the
>  ES> root fs and the root fs is not as far as i can tell when booting on
>  ES> older kernels, corrupted.   I'm writing now in mozilla from the very
>  ES> system but with extents turned off.  I'm somewhat afraid though that
>  ES> even though i didn't mount the partitions with the extents option,
>  ES> that the patch may still be having an adverse effect.  Right now
>  ES> things seem pretty stable but last night apt was hanging while
>  ES> generating locales reproducably causing the entire kernel to lose the
>  ES> ability to do anything to the fs. This was all being tested on
>  ES> test3-mm1.  I am aware that mm does have some patches to ext3 that
>  ES> aren't in the main kernel i believe. perhaps the xattr stuff is
>  ES> conflicting in some way?  I really have no way of testing the linus
>  ES> tree directly because the drivers i use wont compile.
> 
> first of all, once fs gets mounted with extents support any newly created
> files/dirs will be stored in extents-format. thus, if you remount that fs
> w/o extents support you won't be able to access those files/dirs

> I really propose you don't use extents on a partitions you care about for a while.

I was testing this with only a single partition mounted with extents 
enabled when benchmarking.  Ext3 gave no messages of being mounted 
afterbootup with or without extents so to make sure i had extents 
enabled i booted with all my partitions with the extents option.  I 
suspect then my problems began.  I'm completely unaware of the extent of 
the damage enabling extents has done since most of the important things 
were opened, not created during my extents use.  In any case it may be 
that the reason why init is not able to be found is because i used apt 
and upgraded my system ...and I dont remember if i had extents enabled 
at the time or not.  If my init is in extents format though, then why is 
a patched kernel able to read it with extents not being enabled via the 
omunt option where as kernels without the patch cannot.  Is extents able 
to be read from a fs even when it's not mounted with the option but not 
written?   I'm kinda confused, this aspect of extents wasn't in the 
original email.

>  ES> All in all though, when it was enabled, i saw really no difference
>  ES> from when it was not enabled. dbench 16 gave me ~140MB/sec either
>  ES> way. md5summing large files resulted in equal performance as well.  I
>  ES> got nothing even close to the kind of performance increases you showed
>  ES> in the first mail.
> 
> quite interesting result. could you help me to investigate that?
> it would be great to go through following steps:
> 1) create fresh ext3 fs
> 2) mount it w/o extents option
> 3) run dbench 16 for few times (say, 4)
>    make sure it performs on that filesystem (cd <mntpoint>; dbench -c ... 16)
> 4) unmount fs
> 5) recreate that fs
> 6) mount it with extents option
>    'EXT3-fs: file extents enabled' should be printed in logs
> 7) run dbench 16 for few times
> 8) unmount that fs and take a look in logs, you should see stats info about
>    extents usage
> 
> thank you!

i'm going to try and boot a kernel without the extents patch (so far 
hasn't been possible) and run dbench again and see if i get different 
numbers.  I'm almost suspecting extents being enabled no matter what i 
mount the fs's as.




^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 16:20             ` Ed Sweetman
@ 2003-08-29 16:34               ` Alex Tomas
  2003-08-29 17:49                 ` Ed Sweetman
  0 siblings, 1 reply; 30+ messages in thread
From: Alex Tomas @ 2003-08-29 16:34 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: linux-kernel, ext2-devel

>>>>> Ed Sweetman (ES) writes:

 ES> I was testing this with only a single partition mounted with extents
 ES> enabled when benchmarking.  Ext3 gave no messages of being mounted
 ES> afterbootup with or without extents so to make sure i had extents
 ES> enabled i booted with all my partitions with the extents option.  I
 ES> suspect then my problems began.  I'm completely unaware of the extent
 ES> of the damage enabling extents has done since most of the important
 ES> things were opened, not created during my extents use.  In any case it
 ES> may be that the reason why init is not able to be found is because i
 ES> used apt and upgraded my system ...and I dont remember if i had
 ES> extents enabled at the time or not.  If my init is in extents format
 ES> though, then why is a patched kernel able to read it with extents not
 ES> being enabled via the omunt option where as kernels without the patch
 ES> cannot.  Is extents able to be read from a fs even when it's not
 ES> mounted with the option but not written?   I'm kinda confused, this
 ES> aspect of extents wasn't in the original email.

well, on my testbox I use _patched with extents_ ext3 as / and /boot partitions.
I haven't seen any problems on them. with patch, ext3 look at special EXTENTS
flag in inode (this flag is set up only for newly created files on fs being
mounted with extents enabled) and calls apropriate routines. thus, it will
call extents routines for those file even if fs is being mounted with extents
disabled. I really do believe that your root filesystem haven't been mounted
with extents enabled, so init must be stored in good old format.

 ES> i'm going to try and boot a kernel without the extents patch (so far
 ES> hasn't been possible) and run dbench again and see if i get different
 ES> numbers.  I'm almost suspecting extents being enabled no matter what i
 ES> mount the fs's as.

that would be fine!





^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 16:34               ` Alex Tomas
@ 2003-08-29 17:49                 ` Ed Sweetman
  2003-08-29 18:09                   ` Alex Tomas
                                     ` (2 more replies)
  0 siblings, 3 replies; 30+ messages in thread
From: Ed Sweetman @ 2003-08-29 17:49 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, ext2-devel

Alex Tomas wrote:
>>>>>>Ed Sweetman (ES) writes:
> 
> 
>  ES> I was testing this with only a single partition mounted with extents
>  ES> enabled when benchmarking.  Ext3 gave no messages of being mounted
>  ES> afterbootup with or without extents so to make sure i had extents
>  ES> enabled i booted with all my partitions with the extents option.  I
>  ES> suspect then my problems began.  I'm completely unaware of the extent
>  ES> of the damage enabling extents has done since most of the important
>  ES> things were opened, not created during my extents use.  In any case it
>  ES> may be that the reason why init is not able to be found is because i
>  ES> used apt and upgraded my system ...and I dont remember if i had
>  ES> extents enabled at the time or not.  If my init is in extents format
>  ES> though, then why is a patched kernel able to read it with extents not
>  ES> being enabled via the omunt option where as kernels without the patch
>  ES> cannot.  Is extents able to be read from a fs even when it's not
>  ES> mounted with the option but not written?   I'm kinda confused, this
>  ES> aspect of extents wasn't in the original email.
> 
> well, on my testbox I use _patched with extents_ ext3 as / and /boot partitions.
> I haven't seen any problems on them. with patch, ext3 look at special EXTENTS
> flag in inode (this flag is set up only for newly created files on fs being
> mounted with extents enabled) and calls apropriate routines. thus, it will
> call extents routines for those file even if fs is being mounted with extents
> disabled. I really do believe that your root filesystem haven't been mounted
> with extents enabled, so init must be stored in good old format.

Ok well little wait on the non-patched bootup.

I booted with test4-mm2 patched



Throughput 221.812 MB/sec 16 procs    ext2
Throughput 159.495 MB/sec 16 procs    ext3-extents (definitely enabled)
Throughput 147.598 MB/sec 16 procs    ext3 (patched but disabled)

There is an obvious improvement, but nothing near the 70+% increase you 
saw.  Subsequent runs run anything from a little lower than above for 
extents to 167MB/s.

I'm using the largest inode size possible for ext3 for the filesystem 
tested.


By the way, what's the behavior of opening an existing non-extent file 
and writing and reading to it while the partition is mounted with 
extents enabled?




>  ES> i'm going to try and boot a kernel without the extents patch (so far
>  ES> hasn't been possible) and run dbench again and see if i get different
>  ES> numbers.  I'm almost suspecting extents being enabled no matter what i
>  ES> mount the fs's as.
> 
> that would be fine!
> 
> 



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 17:49                 ` Ed Sweetman
@ 2003-08-29 18:09                   ` Alex Tomas
  2003-08-29 19:55                     ` Ed Sweetman
  2003-08-30  8:55                   ` Alex Tomas
  2003-08-30 10:13                   ` Alex Tomas
  2 siblings, 1 reply; 30+ messages in thread
From: Alex Tomas @ 2003-08-29 18:09 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: linux-kernel, ext2-devel

>>>>> Ed Sweetman (ES) writes:

 ES> Throughput 221.812 MB/sec 16 procs    ext2
 ES> Throughput 159.495 MB/sec 16 procs    ext3-extents (definitely enabled)
 ES> Throughput 147.598 MB/sec 16 procs    ext3 (patched but disabled)

 ES> There is an obvious improvement, but nothing near the 70+% increase
 ES> you saw.  Subsequent runs run anything from a little lower than above
 ES> for extents to 167MB/s.

it seems one of my scsi drive is a bit broken (caching, at least).
sorry for invalid numbers. on another drive I see following:

w/o extents:
[root@zefir root]# /root/db2.sh 2 16
Throughput 119.199 MB/sec 16 procs
Throughput 106.09 MB/sec 16 procs
Average: 112.64450


with extents:
[root@zefir root]# /root/db2.sh 2 16
Throughput 156.846 MB/sec 16 procs
Throughput 170.591 MB/sec 16 procs
Average: 163.71850

so, this time improvement is about 45%

I can't explain this yet. need to be investigated carefully

 ES> By the way, what's the behavior of opening an existing non-extent file
 ES> and writing and reading to it while the partition is mounted with
 ES> extents enabled?

those files are handled usual way


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 18:09                   ` Alex Tomas
@ 2003-08-29 19:55                     ` Ed Sweetman
  2003-08-29 21:39                       ` [Ext2-devel] " Mike Fedyk
  2003-08-30  9:09                       ` Alex Tomas
  0 siblings, 2 replies; 30+ messages in thread
From: Ed Sweetman @ 2003-08-29 19:55 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, ext2-devel

Well, it appears that you need about 10+ blocks per extent to see any 
noticable performance gain.  The problem is most files are not large 
enough to achieve this.  Most range from a few kB to a couple mB. High 
activity directories like /tmp and /usr deal mostly with numerous small 
files.  Now the reason i bring this up is that extents basically make 
your fs incompatible with any kernel not compiled with the patch, which 
means if something bad happened and you needed to use a bootable cdrom 
with some safety kernel, it wouldn't be that useful.  for such small 
improvements, it doesn't seem worth the risk to make / or directories 
like /tmp,/var,/usr,/boot,/lib etc, with extents.  The files just dont 
get large enough to make performance gains worth more than backward 
compatibility.

Now for media, like music and movies and such, this makes a lot of 
sense. Files get large enough so that the block to extent use is very 
high and the files aren't necessary to use the system.  extents are 5 
seconds faster when md5summing a 622MB file than the same file written 
without extents enabled.


I would not recommend using the patch for system directories only 
because it leaves you with no way to rescue the system and does very 
little in the way of performance for those directories. Ext3 is 
backwards compatible with ext2, this patch seemingly breaks that. 
Because of that it doesn't seem to be ext3 anymore, rather a one way 
compatibility with ext3 with a purely large media bias.



Alex Tomas wrote:
>>>>>>Ed Sweetman (ES) writes:
> 
> 
>  ES> Throughput 221.812 MB/sec 16 procs    ext2
>  ES> Throughput 159.495 MB/sec 16 procs    ext3-extents (definitely enabled)
>  ES> Throughput 147.598 MB/sec 16 procs    ext3 (patched but disabled)
> 
>  ES> There is an obvious improvement, but nothing near the 70+% increase
>  ES> you saw.  Subsequent runs run anything from a little lower than above
>  ES> for extents to 167MB/s.
> 
> it seems one of my scsi drive is a bit broken (caching, at least).
> sorry for invalid numbers. on another drive I see following:
> 
> w/o extents:
> [root@zefir root]# /root/db2.sh 2 16
> Throughput 119.199 MB/sec 16 procs
> Throughput 106.09 MB/sec 16 procs
> Average: 112.64450
> 
> 
> with extents:
> [root@zefir root]# /root/db2.sh 2 16
> Throughput 156.846 MB/sec 16 procs
> Throughput 170.591 MB/sec 16 procs
> Average: 163.71850
> 
> so, this time improvement is about 45%


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Ext2-devel] Re: [RFC] extents support for EXT3
  2003-08-29 19:55                     ` Ed Sweetman
@ 2003-08-29 21:39                       ` Mike Fedyk
  2003-08-29 22:25                         ` Ed Sweetman
  2003-08-30  9:09                       ` Alex Tomas
  1 sibling, 1 reply; 30+ messages in thread
From: Mike Fedyk @ 2003-08-29 21:39 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: Alex Tomas, linux-kernel, ext2-devel

On Fri, Aug 29, 2003 at 03:55:14PM -0400, Ed Sweetman wrote:
> I would not recommend using the patch for system directories only 
> because it leaves you with no way to rescue the system and does very 
> little in the way of performance for those directories. Ext3 is 
> backwards compatible with ext2, this patch seemingly breaks that. 
> Because of that it doesn't seem to be ext3 anymore, rather a one way 
> compatibility with ext3 with a purely large media bias.

Do you get any slowdown with the extents on small files though?

The plan is to add extent reading code to the three other stable trees so
that at the very least you could have read-only access to the extent based
ext2/3.

Remember, if the implementation of journaling hadn't have been so extensive,
we wouldn't have an ext3, but ext2 with journaling (and called ext2).

So what this will be is ext2 format with extents, and with (ext3) or without
journaling (ext2).

And this is planned for 2.7, so if anything goes into 2.6, it'll be
read-only support of extents.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Ext2-devel] Re: [RFC] extents support for EXT3
  2003-08-29 21:39                       ` [Ext2-devel] " Mike Fedyk
@ 2003-08-29 22:25                         ` Ed Sweetman
  2003-08-29 23:17                           ` Mike Fedyk
  0 siblings, 1 reply; 30+ messages in thread
From: Ed Sweetman @ 2003-08-29 22:25 UTC (permalink / raw)
  To: Mike Fedyk; +Cc: Alex Tomas, linux-kernel, ext2-devel

Mike Fedyk wrote:
> On Fri, Aug 29, 2003 at 03:55:14PM -0400, Ed Sweetman wrote:
> 
>>I would not recommend using the patch for system directories only 
>>because it leaves you with no way to rescue the system and does very 
>>little in the way of performance for those directories. Ext3 is 
>>backwards compatible with ext2, this patch seemingly breaks that. 
>>Because of that it doesn't seem to be ext3 anymore, rather a one way 
>>compatibility with ext3 with a purely large media bias.
> 
> 
> Do you get any slowdown with the extents on small files though?
> 
> The plan is to add extent reading code to the three other stable trees so
> that at the very least you could have read-only access to the extent based
> ext2/3.
> 
> Remember, if the implementation of journaling hadn't have been so extensive,
> we wouldn't have an ext3, but ext2 with journaling (and called ext2).
> 
> So what this will be is ext2 format with extents, and with (ext3) or without
> journaling (ext2).
> 
> And this is planned for 2.7, so if anything goes into 2.6, it'll be
> read-only support of extents.
> 

ext3 fs's could be read by ext2 drivers without having knowledge of what 
ext3 is.  Hence when ext3 was introduced, i could use present stable 
kernels to still fall back on in case things didn't go as planned 
(happened a lot in the beginning of ext3).  so ext3 fs's are compatible 
with ext2 without any knowledge of ext3.  This is more like the upgrade 
to ext2 that happened in 2.2 compared to 2.0 that made later ext2 fs's 
unreadable to the older 2.0 kernel produced ones.

you get no real slowdown as far as rough benchmarks are concerned, 
perhaps with a microbenchmark you would see one and also, doesn't it 
take up more space to save the extent info and such? Either way, all of 
it's real benefits occur on large files.


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Ext2-devel] Re: [RFC] extents support for EXT3
  2003-08-29 22:25                         ` Ed Sweetman
@ 2003-08-29 23:17                           ` Mike Fedyk
  2003-08-31 20:25                             ` Eric W. Biederman
  0 siblings, 1 reply; 30+ messages in thread
From: Mike Fedyk @ 2003-08-29 23:17 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: Alex Tomas, linux-kernel, ext2-devel

On Fri, Aug 29, 2003 at 06:25:02PM -0400, Ed Sweetman wrote:
> you get no real slowdown as far as rough benchmarks are concerned, 
> perhaps with a microbenchmark you would see one and also, doesn't it 
> take up more space to save the extent info and such? Either way, all of 
> it's real benefits occur on large files.

IIRC, if your blocks are contiguous, you can save as soon as soon as the
file size goes above one block (witout extents, the first 12 blocks are
pointed to by what?  I forget... :-/ )

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 17:49                 ` Ed Sweetman
  2003-08-29 18:09                   ` Alex Tomas
@ 2003-08-30  8:55                   ` Alex Tomas
  2003-08-30 10:13                   ` Alex Tomas
  2 siblings, 0 replies; 30+ messages in thread
From: Alex Tomas @ 2003-08-30  8:55 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: Alex Tomas, linux-kernel, ext2-devel

>>>>> Ed Sweetman (ES) writes:

 ES> I booted with test4-mm2 patched

 ES> Throughput 221.812 MB/sec 16 procs    ext2
 ES> Throughput 159.495 MB/sec 16 procs    ext3-extents (definitely enabled)
 ES> Throughput 147.598 MB/sec 16 procs    ext3 (patched but disabled)

 ES> There is an obvious improvement, but nothing near the 70+% increase
 ES> you saw.  Subsequent runs run anything from a little lower than above
 ES> for extents to 167MB/s.

yep. Andrew pointed out that I was using SMP on testbox. so, I reran tests on UP:

[root@zefir root]# vim /root/db2.sh 
[root@zefir root]# /root/db2.sh 2 16
Throughput 116.412 MB/sec 16 procs
Throughput 108.163 MB/sec 16 procs
Average: 112.28750

[root@zefir root]# /root/db2.sh 2 16
Throughput 120.401 MB/sec 16 procs
Throughput 118.392 MB/sec 16 procs
Average: 119.39650

on UP improvement is about 6% (8% in your case). this result looks much more fair.

now I have following items on todo list:
1) debugfs support
   I really need it in order to understand extents usage for diff apps. having
   this info we could tune extents for more improvements
2) multiblock allocation
3) delayed allocation

thank you, Ed!


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 19:55                     ` Ed Sweetman
  2003-08-29 21:39                       ` [Ext2-devel] " Mike Fedyk
@ 2003-08-30  9:09                       ` Alex Tomas
  1 sibling, 0 replies; 30+ messages in thread
From: Alex Tomas @ 2003-08-30  9:09 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: linux-kernel, ext2-devel


yes, you're correct. extents make sense for large files generally speaking.
having extents, it's simpler to implement delayed allocation, imho. delayed
allocation makes sense for all files. especially for temporary files (say,
.S files during make bzImage). this technique allows to avoid block allocation
at all for such files and make file's placement more contigoues.

I agree about system fs (/, /boot, /usr, /var), because most of files are
quite small and change rarely.

>>>>> Ed Sweetman (ES) writes:

 ES> Well, it appears that you need about 10+ blocks per extent to see any
 ES> noticable performance gain.  The problem is most files are not large
 ES> enough to achieve this.  Most range from a few kB to a couple mB. High
 ES> activity directories like /tmp and /usr deal mostly with numerous
 ES> small files.  Now the reason i bring this up is that extents basically
 ES> make your fs incompatible with any kernel not compiled with the patch,
 ES> which means if something bad happened and you needed to use a bootable
 ES> cdrom with some safety kernel, it wouldn't be that useful.  for such
 ES> small improvements, it doesn't seem worth the risk to make / or
 ES> directories like /tmp,/var,/usr,/boot,/lib etc, with extents.  The
 ES> files just dont get large enough to make performance gains worth more
 ES> than backward compatibility.

 ES> Now for media, like music and movies and such, this makes a lot of
 ES> sense. Files get large enough so that the block to extent use is very
 ES> high and the files aren't necessary to use the system.  extents are 5
 ES> seconds faster when md5summing a 622MB file than the same file written
 ES> without extents enabled.


 ES> I would not recommend using the patch for system directories only
 ES> because it leaves you with no way to rescue the system and does very
 ES> little in the way of performance for those directories. Ext3 is
 ES> backwards compatible with ext2, this patch seemingly breaks
 ES> that. Because of that it doesn't seem to be ext3 anymore, rather a one
 ES> way compatibility with ext3 with a purely large media bias.



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-29 17:49                 ` Ed Sweetman
  2003-08-29 18:09                   ` Alex Tomas
  2003-08-30  8:55                   ` Alex Tomas
@ 2003-08-30 10:13                   ` Alex Tomas
  2 siblings, 0 replies; 30+ messages in thread
From: Alex Tomas @ 2003-08-30 10:13 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: linux-kernel, ext2-devel

>>>>> Ed Sweetman (ES) writes:


 ES> I'm using the largest inode size possible for ext3 for the filesystem
 ES> tested.

inode size doesn't make sense because extents use i_data space only



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Ext2-devel] Re: [RFC] extents support for EXT3
  2003-08-29 23:17                           ` Mike Fedyk
@ 2003-08-31 20:25                             ` Eric W. Biederman
  2003-08-31 20:37                               ` Alex Tomas
  2003-09-05 10:06                               ` Pavel Machek
  0 siblings, 2 replies; 30+ messages in thread
From: Eric W. Biederman @ 2003-08-31 20:25 UTC (permalink / raw)
  To: Mike Fedyk; +Cc: Ed Sweetman, Alex Tomas, linux-kernel, ext2-devel

Mike Fedyk <mfedyk@matchmail.com> writes:

> On Fri, Aug 29, 2003 at 06:25:02PM -0400, Ed Sweetman wrote:
> > you get no real slowdown as far as rough benchmarks are concerned, 
> > perhaps with a microbenchmark you would see one and also, doesn't it 
> > take up more space to save the extent info and such? Either way, all of 
> > it's real benefits occur on large files.
> 
> IIRC, if your blocks are contiguous, you can save as soon as soon as the
> file size goes above one block (witout extents, the first 12 blocks are
> pointed to by what?  I forget... :-/ )

They are pointed to directly from the inode.

In light of other concerns how reasonable is a switch to e2fsck that
will remove extents so people can downgrade filesystems?

Also given the incompatibility on the file format any chance of this
being developed as ext4?

Eric

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Ext2-devel] Re: [RFC] extents support for EXT3
  2003-08-31 20:25                             ` Eric W. Biederman
@ 2003-08-31 20:37                               ` Alex Tomas
  2003-09-05 10:06                               ` Pavel Machek
  1 sibling, 0 replies; 30+ messages in thread
From: Alex Tomas @ 2003-08-31 20:37 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Mike Fedyk, Ed Sweetman, linux-kernel, ext2-devel

>>>>> Eric W Biederman (EWB) writes:

 EWB> In light of other concerns how reasonable is a switch to e2fsck that
 EWB> will remove extents so people can downgrade filesystems?

also this could be done following way:
1) remount fs w/o extents support
2) copy files
3) remove old ones


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Ext2-devel] Re: [RFC] extents support for EXT3
  2003-08-31 20:25                             ` Eric W. Biederman
  2003-08-31 20:37                               ` Alex Tomas
@ 2003-09-05 10:06                               ` Pavel Machek
  2003-09-05 14:55                                 ` Eric W. Biederman
  1 sibling, 1 reply; 30+ messages in thread
From: Pavel Machek @ 2003-09-05 10:06 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Mike Fedyk, Ed Sweetman, Alex Tomas, linux-kernel, ext2-devel

Hi!

> > > you get no real slowdown as far as rough benchmarks are concerned, 
> > > perhaps with a microbenchmark you would see one and also, doesn't it 
> > > take up more space to save the extent info and such? Either way, all of 
> > > it's real benefits occur on large files.
> > 
> > IIRC, if your blocks are contiguous, you can save as soon as soon as the
> > file size goes above one block (witout extents, the first 12 blocks are
> > pointed to by what?  I forget... :-/ )
> 
> They are pointed to directly from the inode.
> 
> In light of other concerns how reasonable is a switch to e2fsck that
> will remove extents so people can downgrade filesystems?

It is going to be non-trivial: downgrading filesystem will likely need
free space. And now: what happens when there's no free space?

								Pavel
-- 
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Ext2-devel] Re: [RFC] extents support for EXT3
  2003-09-05 10:06                               ` Pavel Machek
@ 2003-09-05 14:55                                 ` Eric W. Biederman
  0 siblings, 0 replies; 30+ messages in thread
From: Eric W. Biederman @ 2003-09-05 14:55 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Mike Fedyk, Ed Sweetman, Alex Tomas, linux-kernel, ext2-devel

Pavel Machek <pavel@ucw.cz> writes:

> Hi!
> 
> > > > you get no real slowdown as far as rough benchmarks are concerned, 
> > > > perhaps with a microbenchmark you would see one and also, doesn't it 
> > > > take up more space to save the extent info and such? Either way, all of 
> > > > it's real benefits occur on large files.
> > > 
> > > IIRC, if your blocks are contiguous, you can save as soon as soon as the
> > > file size goes above one block (witout extents, the first 12 blocks are
> > > pointed to by what?  I forget... :-/ )
> > 
> > They are pointed to directly from the inode.
> > 
> > In light of other concerns how reasonable is a switch to e2fsck that
> > will remove extents so people can downgrade filesystems?
> 
> It is going to be non-trivial: downgrading filesystem will likely need
> free space. And now: what happens when there's no free space?

Having a full filesystem is generally a rare event.  And the actual size
difference between an extent tree based solution and a block tree solution
is usually quite small.  

And if it fails, well filesystems checkers are not required to succeed.
Safety wise it should be possible to allocate and probably even write
the entire block tree before the extent tree is removed so no data
should be lost.

Maybe downgrading is just silly but it is a nice option to have while
everything is still in development.  For the most part people seem to
be completely capable of making a back up and totally recreating a
filesystem as people have shown.  But if you are going to require that
what is the point of staying with the ext2 file format.


Eric

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-08-28  8:22 [RFC] extents support for EXT3 Alex Tomas
                   ` (2 preceding siblings ...)
  2003-08-28 22:00 ` Ramón Rey Vicente󮠒
@ 2003-09-06  0:19 ` jw schultz
  2003-09-06  6:09   ` Alex Tomas
  3 siblings, 1 reply; 30+ messages in thread
From: jw schultz @ 2003-09-06  0:19 UTC (permalink / raw)
  To: linux-kernel

On Thu, Aug 28, 2003 at 12:22:04PM +0400, Alex Tomas wrote:
> 
> this is 2nd version of the patch. changes:
>   - error handling seems completed
>   - lots of cleanups and comments
>   - few minor bug fixed
> 
> this version of the patch tries to solve couple
> of corner cases:
>   - very long truncate
>   - rewrite 
> 
> it survived dbench, bonnie++ and fsx tests.

Seems to me that moving to extents offers the perfect
opportunity to eliminate the 1 block limit of extended
attributes.  I don't see any sign of that in this patch but
i might have missed it.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC] extents support for EXT3
  2003-09-06  0:19 ` jw schultz
@ 2003-09-06  6:09   ` Alex Tomas
  0 siblings, 0 replies; 30+ messages in thread
From: Alex Tomas @ 2003-09-06  6:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-kernel

>>>>> jw schultz (js) writes:

 js> Seems to me that moving to extents offers the perfect
 js> opportunity to eliminate the 1 block limit of extended
 js> attributes.  I don't see any sign of that in this patch but
 js> i might have missed it.

let's wait for Daniel's design paper about shared EAs


^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2003-09-06  6:04 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-08-28  8:22 [RFC] extents support for EXT3 Alex Tomas
2003-08-28 17:22 ` [Ext2-devel] " Mike Fedyk
2003-08-29  5:55   ` Alex Tomas
2003-08-28 18:12 ` Ed Sweetman
2003-08-29  5:59   ` Alex Tomas
2003-08-29 15:28     ` Ed Sweetman
2003-08-29 15:38       ` Alex Tomas
2003-08-29 15:52         ` Ed Sweetman
2003-08-29 16:10           ` Alex Tomas
2003-08-29 16:16             ` Alex Tomas
2003-08-29 16:20             ` Ed Sweetman
2003-08-29 16:34               ` Alex Tomas
2003-08-29 17:49                 ` Ed Sweetman
2003-08-29 18:09                   ` Alex Tomas
2003-08-29 19:55                     ` Ed Sweetman
2003-08-29 21:39                       ` [Ext2-devel] " Mike Fedyk
2003-08-29 22:25                         ` Ed Sweetman
2003-08-29 23:17                           ` Mike Fedyk
2003-08-31 20:25                             ` Eric W. Biederman
2003-08-31 20:37                               ` Alex Tomas
2003-09-05 10:06                               ` Pavel Machek
2003-09-05 14:55                                 ` Eric W. Biederman
2003-08-30  9:09                       ` Alex Tomas
2003-08-30  8:55                   ` Alex Tomas
2003-08-30 10:13                   ` Alex Tomas
2003-08-28 22:00 ` Ramón Rey Vicente󮠒
2003-08-29  6:04   ` Alex Tomas
2003-08-29  9:55     ` Ramón Rey Vicente󮠒
2003-09-06  0:19 ` jw schultz
2003-09-06  6:09   ` Alex Tomas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).