All of lore.kernel.org
 help / color / mirror / Atom feed
* [Cluster-devel] [GFS2 Patch] [Try 2] GFS2: Reduce file fragmentation
       [not found] <785c19c3-631a-4976-84fd-048876691678@zmail12.collab.prod.int.phx2.redhat.com>
@ 2012-07-05 21:58 ` Bob Peterson
  2012-07-10 14:23   ` Steven Whitehouse
  0 siblings, 1 reply; 13+ messages in thread
From: Bob Peterson @ 2012-07-05 21:58 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

Here is my replacement GFS2 file defragmentation patch for upstream:

Description:

This patch reduces GFS2 file fragmentation by pre-reserving blocks.
In this patch, I've implemented almost all of Steve's suggestions.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d957a8..205eddc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	if (error)
 		goto out_rlist;
 
+	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
+		gfs2_rs_treedel(ip->i_res, true);
+
 	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
 				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
 				 revokes);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6fbf3cb..779b6222 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		return ret;
 
+	atomic_set(&ip->i_res->rs_sizehint,
+		   PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
+
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
@@ -571,22 +574,15 @@ fail:
 
 static int gfs2_release(struct inode *inode, struct file *file)
 {
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-	struct gfs2_file *fp;
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	fp = file->private_data;
+	kfree(file->private_data);
 	file->private_data = NULL;
 
-	if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
-	    (atomic_read(&inode->i_writecount) == 1))
+	if ((file->f_mode & FMODE_WRITE) && gfs2_rs_active(ip->i_res) &&
+	    (atomic_read(&inode->i_writecount) == 1)) {
 		gfs2_rs_delete(ip);
-
-	if (gfs2_assert_warn(sdp, fp))
-		return -EIO;
-
-	kfree(fp);
-
+	}
 	return 0;
 }
 
@@ -662,14 +658,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				   unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	size_t writesize = iov_length(iov, nr_segs);
 	struct dentry *dentry = file->f_dentry;
 	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_sbd *sdp;
 	int ret;
 
+	sdp = GFS2_SB(file->f_mapping->host);
 	ret = gfs2_rs_alloc(ip);
 	if (ret)
 		return ret;
 
+	atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
 	if (file->f_flags & O_APPEND) {
 		struct gfs2_holder gh;
 
@@ -795,6 +795,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	if (unlikely(error))
 		goto out_uninit;
 
+	atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
+
 	while (len > 0) {
 		if (len < bytes)
 			bytes = len;
@@ -803,10 +805,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 			offset += bytes;
 			continue;
 		}
-		error = gfs2_rindex_update(sdp);
-		if (error)
-			goto out_unlock;
-
 		error = gfs2_quota_lock_check(ip);
 		if (error)
 			goto out_unlock;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index dc73070..f7bd2e3 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -84,6 +84,7 @@ struct gfs2_rgrpd {
 	u32 rd_data;			/* num of data blocks in rgrp */
 	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
 	u32 rd_free;
+	u32 rd_reserved;                /* number of blocks reserved */
 	u32 rd_free_clone;
 	u32 rd_dinodes;
 	u64 rd_igeneration;
@@ -91,11 +92,15 @@ struct gfs2_rgrpd {
 	struct gfs2_sbd *rd_sbd;
 	struct gfs2_rgrp_lvb *rd_rgl;
 	u32 rd_last_alloc;
+	u32 rd_next_rsrv;               /* preferred next reservation */
 	u32 rd_flags;
 #define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
 #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
 #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
 #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
+	struct mutex rd_mutex;          /* protects reservation related vars */
+	struct rb_root rd_rstree;       /* multi-block reservation tree */
+	u32 rd_rs_cnt;                  /* count of current reservations */
 };
 
 enum gfs2_state_bits {
@@ -233,6 +238,41 @@ struct gfs2_holder {
 	unsigned long gh_ip;
 };
 
+/* Resource group multi-block reservation, in order of appearance:
+
+   Step 1. Function prepares to write, allocates a mb, sets the size hint.
+   Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
+   Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
+   Step 4. Bits are assigned from the rgrp based on either the reservation
+           or wherever it can.
+*/
+
+struct gfs2_blkreserv {
+	/* components used during write (step 1): */
+	u64 rs_inum; /* inode address */
+	atomic_t rs_sizehint;         /* hint of the write size */
+
+	/* components used during inplace_reserve (step 2): */
+	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
+
+	/* components used during get_local_rgrp (step 3): */
+	struct gfs2_rgrpd *rs_rgd;    /* pointer to the gfs2_rgrpd */
+	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
+	struct rb_node rs_node;       /* link to other block reservations */
+
+	/* components used during block searches and assignments (step 4): */
+	struct gfs2_bitmap *rs_bi;    /* bitmap for the current allocation */
+	u64 rs_start;                 /* absolute fs block address */
+	u32 rs_biblk;                 /* start block relative to the bi */
+	u32 rs_blks;                  /* number of blocks reserved */
+	u32 rs_free;                  /* how many blocks are still free */
+
+	/* ancillary quota stuff */
+	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
+	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
+	unsigned int rs_qa_qd_num;
+};
+
 enum {
 	GLF_LOCK			= 1,
 	GLF_DEMOTE			= 3,
@@ -290,16 +330,6 @@ struct gfs2_glock {
 
 #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
 
-struct gfs2_blkreserv {
-	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
-	struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
-
-	/* ancillary quota stuff */
-	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
-	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
-	unsigned int rs_qa_qd_num;
-};
-
 enum {
 	GIF_INVALID		= 0,
 	GIF_QD_LOCKED		= 1,
@@ -307,7 +337,6 @@ enum {
 	GIF_SW_PAGED		= 3,
 };
 
-
 struct gfs2_inode {
 	struct inode i_inode;
 	u64 i_no_addr;
@@ -318,7 +347,7 @@ struct gfs2_inode {
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
-	struct gfs2_blkreserv *i_res; /* resource group block reservation */
+	struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
 	struct gfs2_rgrpd *i_rgd;
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2b035e0..12c7b6a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	int error;
 
 	munge_mode_uid_gid(dip, &mode, &uid, &gid);
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
 
 	error = gfs2_quota_lock(dip, uid, gid);
 	if (error)
@@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 	struct buffer_head *dibh;
 	int error;
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		goto fail;
@@ -596,7 +603,8 @@ fail_end_trans:
 	gfs2_trans_end(sdp);
 
 fail_ipreserv:
-	gfs2_inplace_release(dip);
+	if (alloc_required)
+		gfs2_inplace_release(dip);
 
 fail_quota_locks:
 	gfs2_quota_unlock(dip);
@@ -738,6 +746,7 @@ fail_gunlock:
 		iput(inode);
 	}
 fail:
+	gfs2_rs_delete(dip);
 	if (bh)
 		brelse(bh);
 	return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index e53d0a1..4d1f4f8 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -35,6 +35,9 @@
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
 
+#define RSRV_CONTENTION_FACTOR 2
+#define RGRP_RSRV_MAX_CONTENDERS 4
+
 #if BITS_PER_LONG == 32
 #define LBITMASK   (0x55555555UL)
 #define LBITSKIP55 (0x55555555UL)
@@ -178,6 +181,60 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
 }
 
 /**
+ * rs_cmp - multi-block reservation range compare
+ * @blk: absolute file system block number of the new reservation
+ * @len: number of blocks in the new reservation
+ * @rs: existing reservation to compare against
+ *
+ * returns: 1 if the block range is beyond the reach of the reservation
+ *         -1 if the block range is before the start of the reservation
+ *          0 if the block range overlaps with the reservation
+ */
+static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
+{
+	if (blk > rs->rs_start + rs->rs_blks - 1)
+		return 1;
+	if (blk + len - 1 < rs->rs_start)
+		return -1;
+	return 0;
+}
+
+/**
+ * rs_find - Find a rgrp multi-block reservation that contains a given block
+ * @rgd: The rgrp
+ * @rgblk: The block we're looking for, relative to the rgrp
+ */
+static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
+{
+	struct rb_node **newn;
+	int rc;
+	u64 fsblk = rgblk + rgd->rd_data0;
+
+	mutex_lock(&rgd->rd_mutex);
+	newn = &rgd->rd_rstree.rb_node;
+	while (*newn) {
+		struct gfs2_blkreserv *cur =
+			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+		rc = rs_cmp(fsblk, 1, cur);
+		if (rc < 0)
+			newn = &((*newn)->rb_left);
+		else if (rc > 0)
+			newn = &((*newn)->rb_right);
+		else {
+			mutex_unlock(&rgd->rd_mutex);
+			return cur;
+		}
+	}
+	mutex_unlock(&rgd->rd_mutex);
+	return NULL;
+}
+
+static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
+{
+	return (bi->bi_start * GFS2_NBBY) + blk;
+}
+
+/**
  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
  *       a block in a given allocation state.
  * @buf: the buffer that holds the bitmaps
@@ -435,8 +492,66 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
 	return error;
 }
 
+static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
+{
+	gfs2_print_dbg(seq, "  r: %llu s:%llu b:%u i:%u f:%u\n",
+		       rs->rs_rgd->rd_addr, rs->rs_start, rs->rs_biblk,
+		       rs->rs_blks, rs->rs_free);
+}
+
+/**
+ * gfs2_rs_treedel - remove a multi-block reservation from the rgd tree
+ * @ip: The inode for this reservation
+ * @lock: we need to lock the rgrp
+ *
+ */
+void gfs2_rs_treedel(struct gfs2_blkreserv *rs, bool lock)
+{
+	struct gfs2_rgrpd *rgd;
+
+	if (!gfs2_rs_active(rs))
+		return;
+
+	rgd = rs->rs_rgd;
+	/* We can't do this: The reason is that when the rgrp is invalidated,
+	   it's in the "middle" of acquiring the glock, but the HOLDER bit
+	   isn't set yet:
+	   BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
+	trace_gfs2_rs(rs, 0, TRACE_RS_TREEDEL);
+
+	if (lock)
+		mutex_lock(&rgd->rd_mutex);
+	/* Now that we have the mutex, check again if the tree is empty just
+	   in case someone else deleted this entry. */
+	if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
+		rb_erase(&rs->rs_node, &rgd->rd_rstree);
+	BUG_ON(!rgd->rd_rs_cnt);
+	rgd->rd_rs_cnt--;
+
+	if (rs->rs_free) {
+		/* return reserved blocks to the rgrp and the ip */
+		BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
+		rs->rs_rgd->rd_reserved -= rs->rs_free;
+		rs->rs_free = 0;
+		clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
+		smp_mb__after_clear_bit();
+	}
+	if (lock)
+		mutex_unlock(&rgd->rd_mutex);
+	/* We can't change any of the step 1 or step 2 components of the rs.
+	   E.g. We can't set rs_rgd to NULL because the rgd glock is held and
+	   dequeued through this pointer.
+	   Can't: atomic_set(&rs->rs_sizehint, 0);
+	   Can't: rs->rs_requested = 0;
+	   Can't: rs->rs_rgd = NULL;*/
+	rs->rs_bi = NULL;
+	rs->rs_start = 0;
+	rs->rs_biblk = 0;
+	rs->rs_blks = 0;
+}
+
 /**
- * gfs2_rs_delete - delete a reservation
+ * gfs2_rs_delete - delete a multi-block reservation
  * @ip: The inode for this reservation
  *
  */
@@ -444,12 +559,37 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
 {
 	down_write(&ip->i_rw_mutex);
 	if (ip->i_res) {
+		if (gfs2_rs_active(ip->i_res))
+			gfs2_rs_treedel(ip->i_res, true);
+		trace_gfs2_rs(ip->i_res, ip->i_res->rs_start, TRACE_RS_DELETE);
+		BUG_ON(ip->i_res->rs_free);
 		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
 		ip->i_res = NULL;
 	}
 	up_write(&ip->i_rw_mutex);
 }
 
+/**
+ * return_all_reservations - return all reserved blocks back to the rgrp.
+ * @rgd: the rgrp that needs its space back
+ *
+ * We previously reserved a bunch of blocks for allocation. Now we need to
+ * give them back. This leave the reservation structures in tact, but removes
+ * all of their corresponding "no-fly zones".
+ */
+static void return_all_reservations(struct gfs2_rgrpd *rgd)
+{
+	struct rb_node *n;
+	struct gfs2_blkreserv *rs;
+
+	mutex_lock(&rgd->rd_mutex);
+	while ((n = rb_first(&rgd->rd_rstree))) {
+		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		gfs2_rs_treedel(rs, false);
+	}
+	mutex_unlock(&rgd->rd_mutex);
+}
+
 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
 {
 	struct rb_node *n;
@@ -472,6 +612,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
 
 		gfs2_free_clones(rgd);
 		kfree(rgd->rd_bits);
+		return_all_reservations(rgd);
 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
 	}
 }
@@ -649,6 +790,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
 	rgd->rd_data = be32_to_cpu(buf.ri_data);
 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
+	mutex_init(&rgd->rd_mutex);
 
 	error = compute_bitstructs(rgd);
 	if (error)
@@ -1115,29 +1257,228 @@ out:
 }
 
 /**
+ * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
+ * @bi: the bitmap with the blocks
+ * @ip: the inode structure
+ * @biblk: the 32-bit block number relative to the start of the bitmap
+ * @amount: the number of blocks to reserve
+ *
+ * Returns: NULL - reservation was already taken, so not inserted
+ *          pointer to the inserted reservation
+ */
+static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
+				       struct gfs2_inode *ip, u32 biblk,
+				       int amount)
+{
+	struct rb_node **newn, *parent = NULL;
+	int rc;
+	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_rgrpd *rgd = rs->rs_rgd;
+	u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
+
+	mutex_lock(&rgd->rd_mutex);
+	newn = &rgd->rd_rstree.rb_node;
+	BUG_ON(!ip->i_res);
+	BUG_ON(gfs2_rs_active(rs));
+	/* Figure out where to put new node */
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
+	while (*newn) {
+		struct gfs2_blkreserv *cur =
+			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+
+		parent = *newn;
+		rc = rs_cmp(fsblock, amount, cur);
+		if (rc > 0)
+			newn = &((*newn)->rb_right);
+		else if (rc < 0)
+			newn = &((*newn)->rb_left);
+		else {
+			mutex_unlock(&rgd->rd_mutex);
+			return NULL; /* reservation already in use */
+		}
+	}
+
+	/* Do our reservation work */
+	rs = ip->i_res;
+	rs->rs_start = fsblock;
+	rs->rs_blks = amount;
+	rs->rs_free = amount;
+	rs->rs_biblk = biblk;
+	rs->rs_bi = bi;
+	rs->rs_inum = ip->i_no_addr;
+	rb_link_node(&rs->rs_node, parent, newn);
+	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
+
+	/* Do our inode accounting for the reservation */
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
+
+	/* Do our rgrp accounting for the reservation */
+	rgd->rd_reserved += amount; /* blocks reserved */
+	rgd->rd_next_rsrv = rs->rs_start + rs->rs_blks;
+	rgd->rd_rs_cnt++; /* number of in-tree reservations */
+	if (!rgrp_contains_block(rgd, rgd->rd_next_rsrv))
+		rgd->rd_next_rsrv = 0;
+	mutex_unlock(&rgd->rd_mutex);
+	trace_gfs2_rs(rs, fsblock, TRACE_RS_INSERT);
+	return rs;
+}
+
+/**
+ * unclaimed_blocks - return number of blocks that aren't spoken for
+ */
+static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
+{
+	return rgd->rd_free_clone - rgd->rd_reserved;
+}
+
+/**
+ * rg_mblk_search - find a group of multiple free blocks
+ * @rgd: the resource group descriptor
+ * @rs: the block reservation
+ * @ip: pointer to the inode for which we're reserving blocks
+ *
+ * This is very similar to rgblk_search, except we're looking for whole
+ * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
+ * on aligned dwords for speed's sake.
+ *
+ * Returns: 0 if successful or BFITNOENT if there isn't enough free space
+ */
+
+static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+{
+	struct gfs2_bitmap *bi = rgd->rd_bits;
+	const u32 length = rgd->rd_length;
+	u32 blk;
+	unsigned int buf, x, search_bytes;
+	u8 *buffer = NULL;
+	u8 *ptr, *end, *nonzero;
+	u32 goal, rsv_bytes;
+	struct gfs2_blkreserv *rs;
+	u32 best_rs_bytes, unclaimed;
+	int best_rs_blocks;
+
+	/* Find bitmap block that contains bits for goal block */
+	if (rgd->rd_next_rsrv) {
+		goal = rgd->rd_next_rsrv - rgd->rd_data0;
+		for (buf = 0; buf < length; buf++) {
+			bi = rgd->rd_bits + buf;
+			/* Convert scope of "goal" from rgrp-wide to within
+			   found bit block */
+			if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
+				goal -= bi->bi_start * GFS2_NBBY;
+				goto do_search;
+			}
+		}
+	}
+	buf = 0;
+	goal = 0;
+
+do_search:
+	best_rs_blocks = max(atomic_read(&ip->i_res->rs_sizehint),
+			     (int)(RGRP_RSRV_MINBLKS * rgd->rd_length));
+	best_rs_bytes = (best_rs_blocks *
+			 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
+		GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
+	best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
+	unclaimed = unclaimed_blocks(rgd);
+	if (best_rs_bytes * GFS2_NBBY > unclaimed)
+		best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
+
+	for (x = 0; x <= length; x++) {
+		bi = rgd->rd_bits + buf;
+
+		if (test_bit(GBF_FULL, &bi->bi_flags))
+			goto skip;
+
+		WARN_ON(!buffer_uptodate(bi->bi_bh));
+		if (bi->bi_clone)
+			buffer = bi->bi_clone + bi->bi_offset;
+		else
+			buffer = bi->bi_bh->b_data + bi->bi_offset;
+
+		/* We have to keep the reservations aligned on u64 boundaries
+		   otherwise we could get situations where a byte can't be
+		   used because it's after a reservation, but a free bit still
+		   is within the reservation's area. */
+		ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
+		end = (buffer + bi->bi_len);
+		while (ptr < end) {
+			rsv_bytes = 0;
+			if ((ptr + best_rs_bytes) <= end)
+				search_bytes = best_rs_bytes;
+			else
+				search_bytes = end - ptr;
+			BUG_ON(!search_bytes);
+			nonzero = memchr_inv(ptr, 0, search_bytes);
+			/* If the lot is all zeroes, reserve the whole size. If
+			   there's enough zeroes to satisfy the request, use
+			   what we can. If there's not enough, keep looking. */
+			if (nonzero == NULL)
+				rsv_bytes = search_bytes;
+			else if ((nonzero - ptr) * GFS2_NBBY >=
+				 ip->i_res->rs_requested)
+				rsv_bytes = (nonzero - ptr);
+
+			if (rsv_bytes) {
+				blk = ((ptr - buffer) * GFS2_NBBY);
+				BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
+				rs = rs_insert(bi, ip, blk,
+					       rsv_bytes * GFS2_NBBY);
+				if (IS_ERR(rs))
+					return PTR_ERR(rs);
+				if (rs)
+					return 0;
+			}
+			ptr += ALIGN(search_bytes, sizeof(u64));
+		}
+skip:
+		/* Try next bitmap block (wrap back to rgrp header
+		   if at end) */
+		buf++;
+		buf %= length;
+		goal = 0;
+	}
+
+	return BFITNOENT;
+}
+
+/**
  * try_rgrp_fit - See if a given reservation will fit in a given RG
  * @rgd: the RG data
  * @ip: the inode
  *
  * If there's room for the requested blocks to be allocated from the RG:
+ * This will try to get a multi-block reservation first, and if that doesn't
+ * fit, it will take what it can.
  *
  * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
  */
 
-static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
+static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 {
 	const struct gfs2_blkreserv *rs = ip->i_res;
 
 	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
 		return 0;
-	if (rgd->rd_free_clone >= rs->rs_requested)
+	/* Look for a multi-block reservation. We only do this for files,
+	   not directories or other types. The problem with directories is
+	   that items are often created and deleted in the directory in the
+	   same breath, which can create "holes". In other words, we could get
+	   into situations where two or more blocks are reserved, then used,
+	   then one or more of the earlier blocks is freed. When we delete the
+	   reservation, the rs_free will be off due to the hole, so the rgrp's
+	   rg_free count can get off. Other non-files, like symlinks and
+	   char or block devices, are very small in size, so it doesn't make
+	   sense to reserve multiple blocks for them. */
+	if (S_ISREG(ip->i_inode.i_mode) &&
+	    unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS) {
+		if (rg_mblk_search(rgd, ip) != BFITNOENT)
+			return 1;
+	}
+	if (unclaimed_blocks(rgd) >= rs->rs_requested)
 		return 1;
-	return 0;
-}
 
-static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
-{
-	return (bi->bi_start * GFS2_NBBY) + blk;
+	return 0;
 }
 
 /**
@@ -1217,7 +1558,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrpd *rgd, *begin = NULL;
+	struct gfs2_rgrpd *begin = NULL;
 	struct gfs2_blkreserv *rs = ip->i_res;
 	int error = 0, rg_locked, flags = LM_FLAG_TRY;
 	u64 last_unlinked = NO_BLOCK;
@@ -1225,32 +1566,40 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 
 	if (sdp->sd_args.ar_rgrplvb)
 		flags |= GL_SKIP;
-	rs = ip->i_res;
 	rs->rs_requested = requested;
 	if (gfs2_assert_warn(sdp, requested)) {
 		error = -EINVAL;
 		goto out;
 	}
-
-	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
-		rgd = begin = ip->i_rgd;
-	else
-		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
-
-	if (rgd == NULL)
+	if (gfs2_rs_active(rs)) {
+		begin = rs->rs_rgd;
+		flags = 0; /* Yoda: Do or do not. There is no try */
+	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
+		rs->rs_rgd = begin = ip->i_rgd;
+	} else {
+		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
+	}
+	if (rs->rs_rgd == NULL)
 		return -EBADSLT;
 
 	while (loops < 3) {
 		rg_locked = 0;
 
-		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
+		if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
 			rg_locked = 1;
 			error = 0;
+		} else if (!loops && !gfs2_rs_active(rs) &&
+			   rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
+			/* If the rgrp already is maxed out for contenders,
+			   we can eliminate it as a "first pass" without even
+			   requesting the rgrp glock. */
+			error = GLR_TRYFAILED;
 		} else {
-			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
-						   flags, &rs->rs_rgd_gh);
+			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
+						   LM_ST_EXCLUSIVE, flags,
+						   &rs->rs_rgd_gh);
 			if (!error && sdp->sd_args.ar_rgrplvb) {
-				error = update_rgrp_lvb(rgd);
+				error = update_rgrp_lvb(rs->rs_rgd);
 				if (error) {
 					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 					return error;
@@ -1259,24 +1608,36 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 		}
 		switch (error) {
 		case 0:
-			if (try_rgrp_fit(rgd, ip)) {
+			if (gfs2_rs_active(rs)) {
+				if (unclaimed_blocks(rs->rs_rgd) +
+				    rs->rs_free >= rs->rs_requested) {
+					ip->i_rgd = rs->rs_rgd;
+					return 0;
+				}
+				/* We have a multi-block reservation, but the
+				   rgrp doesn't have enough free blocks to
+				   satisfy the request. Free the reservation
+				   and look for a suitable rgrp. */
+				gfs2_rs_treedel(rs, true);
+			}
+			if (try_rgrp_fit(rs->rs_rgd, ip)) {
 				if (sdp->sd_args.ar_rgrplvb)
-					gfs2_rgrp_bh_get(rgd);
-				ip->i_rgd = rgd;
+					gfs2_rgrp_bh_get(rs->rs_rgd);
+				ip->i_rgd = rs->rs_rgd;
 				return 0;
 			}
-			if (rgd->rd_flags & GFS2_RDF_CHECK) {
+			if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
 				if (sdp->sd_args.ar_rgrplvb)
-					gfs2_rgrp_bh_get(rgd);
-				try_rgrp_unlink(rgd, &last_unlinked,
+					gfs2_rgrp_bh_get(rs->rs_rgd);
+				try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
 						ip->i_no_addr);
 			}
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 			/* fall through */
 		case GLR_TRYFAILED:
-			rgd = gfs2_rgrpd_get_next(rgd);
-			if (rgd != begin) /* If we didn't wrap */
+			rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
+			if (rs->rs_rgd != begin) /* If we didn't wrap */
 				break;
 
 			flags &= ~LM_FLAG_TRY;
@@ -1314,6 +1675,12 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 {
 	struct gfs2_blkreserv *rs = ip->i_res;
 
+	if (!rs)
+		return;
+
+	if (gfs2_rs_active(rs) && !rs->rs_free)
+		gfs2_rs_treedel(rs, true);
+
 	if (rs->rs_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 	rs->rs_requested = 0;
@@ -1412,7 +1779,27 @@ do_search:
 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
 			buffer = bi->bi_clone + bi->bi_offset;
 
-		biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
+		while (1) {
+			struct gfs2_blkreserv *rs;
+			u32 rgblk;
+
+			biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
+			if (biblk == BFITNOENT)
+				break;
+			/* Check if this block is reserved() */
+			rgblk = gfs2_bi2rgd_blk(bi, biblk);
+			rs = rs_find(rgd, rgblk);
+			if (rs == NULL)
+				break;
+
+			BUG_ON(rs->rs_bi != bi);
+			biblk = BFITNOENT;
+			/* This should jump to the first block after the
+			   reservation. */
+			goal = rs->rs_biblk + rs->rs_free;
+			if (goal >= bi->bi_len * GFS2_NBBY)
+				break;
+		}
 		if (biblk != BFITNOENT)
 			break;
 
@@ -1448,8 +1835,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
 			     u32 blk, bool dinode, unsigned int *n)
 {
 	const unsigned int elen = *n;
-	u32 goal;
+	u32 goal, rgblk;
 	const u8 *buffer = NULL;
+	struct gfs2_blkreserv *rs;
 
 	*n = 0;
 	buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1462,6 +1850,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
 		goal++;
 		if (goal >= (bi->bi_len * GFS2_NBBY))
 			break;
+		rgblk = gfs2_bi2rgd_blk(bi, goal);
+		rs = rs_find(rgd, rgblk);
+		if (rs) /* Oops, we bumped into someone's reservation */
+			break;
 		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
 		    GFS2_BLKST_FREE)
 			break;
@@ -1538,11 +1930,19 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 {
 	const struct gfs2_rgrpd *rgd = gl->gl_object;
+	struct gfs2_blkreserv *trs;
+	const struct rb_node *n;
+
 	if (rgd == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
-		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
+		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
+		       rgd->rd_reserved);
+	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
+		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		dump_rs(seq, trs);
+	}
 	return 0;
 }
 
@@ -1557,10 +1957,74 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
 }
 
 /**
+ * claim_reserved_blks - Claim previously reserved blocks
+ * @ip: the inode that's claiming the reservation
+ * @dinode: 1 if this block is a dinode block, otherwise data block
+ * @nblocks: desired extent length
+ *
+ * Lay claim to previously allocated block reservation blocks.
+ * Returns: Starting block number of the blocks claimed.
+ * Sets *nblocks to the actual extent length allocated.
+ */
+static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
+			       unsigned int *nblocks)
+{
+	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_rgrpd *rgd = rs->rs_rgd;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_bitmap *bi;
+	u32 biblk;
+	u64 start_block = 0, fsblock;
+	const unsigned int elen = *nblocks;
+
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
+	gfs2_assert_withdraw(sdp, rgd);
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
+	bi = rs->rs_bi;
+	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+
+	for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
+		biblk = rs->rs_biblk;
+
+		fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
+		/* Check if the next free block is not contiguous with
+		   the others in the extent. */
+		if (start_block && fsblock != start_block + (*nblocks))
+			break;
+
+		rs->rs_biblk++;
+		rs->rs_free--;
+		/* Make sure the bitmap hasn't changed */
+		gfs2_setbit(rgd, bi->bi_clone, bi, biblk, GFS2_BLKST_USED);
+
+		BUG_ON(!rgd->rd_reserved);
+		rgd->rd_reserved--;
+		if (!start_block) {
+			start_block = fsblock;
+			dinode = false;
+		}
+		trace_gfs2_rs(rs, fsblock, TRACE_RS_CLAIM);
+	}
+
+	if (!rs->rs_free) {
+		struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
+
+		gfs2_rs_treedel(rs, true);
+		/* -nblocks because we haven't returned to do the math yet.
+		   I'm doing the math backwards to prevent negative numbers,
+		   but think of it as:
+		   if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */
+		if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
+			rg_mblk_search(rgd, ip);
+	}
+	return start_block;
+}
+
+/**
  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
  * @ip: the inode to allocate the block for
  * @bn: Used to return the starting block number
- * @ndata: requested number of blocks/extent length (value/result)
+ * @nblocks: requested number of blocks/extent length (value/result)
  * @dinode: 1 if we're allocating a dinode block, else 0
  * @generation: the generation number of the inode
  *
@@ -1585,20 +2049,34 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 	if (ip->i_res->rs_requested == 0)
 		return -ECANCELED;
 
-	rgd = ip->i_rgd;
-
-	if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
-		goal = ip->i_goal - rgd->rd_data0;
-	else
-		goal = rgd->rd_last_alloc;
-
-	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
+	/* Check if we have a multi-block reservation, and if so, claim the
+	   next free block from it. */
+	if (gfs2_rs_active(ip->i_res)) {
+		BUG_ON(!ip->i_res->rs_free);
+		rgd = ip->i_res->rs_rgd;
+		block = claim_reserved_blks(ip, dinode, nblocks);
+	} else {
+		rgd = ip->i_rgd;
 
-	/* Since all blocks are reserved in advance, this shouldn't happen */
-	if (blk == BFITNOENT)
-		goto rgrp_error;
+		if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
+			goal = ip->i_goal - rgd->rd_data0;
+		else
+			goal = rgd->rd_last_alloc;
+
+		blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
+
+		/* Since all blocks are reserved in advance, this shouldn't
+		   happen */
+		if (blk == BFITNOENT) {
+			printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
+			       *nblocks);
+			printk(KERN_WARNING "FULL=%d\n",
+			       test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
+			goto rgrp_error;
+		}
 
-	block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
+		block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
+	}
 	ndata = *nblocks;
 	if (dinode)
 		ndata--;
@@ -1615,8 +2093,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 			brelse(dibh);
 		}
 	}
-	if (rgd->rd_free < *nblocks)
+	if (rgd->rd_free < *nblocks) {
+		printk(KERN_WARNING "nblocks=%u\n", *nblocks);
 		goto rgrp_error;
+	}
 
 	rgd->rd_free -= *nblocks;
 	if (dinode) {
@@ -1672,6 +2152,9 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
 		return;
 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
 	rgd->rd_free += blen;
+	if (bstart < rgd->rd_next_rsrv)
+		rgd->rd_next_rsrv = bstart;
+
 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5d8314d..b604009 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -13,6 +13,14 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 
+/* Since each block in the file system is represented by two bits in the
+ * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
+ * By reserving 32 blocks at a time, we can optimize / shortcut how we search
+ * through the bitmaps by looking a word at a time.
+ */
+#define RGRP_RSRV_MINBYTES 8
+#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
+
 struct gfs2_rgrpd;
 struct gfs2_sbd;
 struct gfs2_holder;
@@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
 extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
 extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
 
+extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+
 extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
@@ -36,6 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
 			     bool dinode, u64 *generation);
 
 extern int gfs2_rs_alloc(struct gfs2_inode *ip);
+extern void gfs2_rs_treedel(struct gfs2_blkreserv *rs, bool lock);
 extern void gfs2_rs_delete(struct gfs2_inode *ip);
 extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
@@ -62,7 +73,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
 extern int gfs2_fitrim(struct file *filp, void __user *argp);
 
-/* This is how to tell if a reservation is "inplace" reserved: */
+/* This is how to tell if a multi-block reservation is "inplace" reserved: */
 static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
 {
 	if (ip->i_res && ip->i_res->rs_requested)
@@ -70,4 +81,12 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
 	return 0;
 }
 
+/* This is how to tell if a multi-block reservation is in the rgrp tree: */
+static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
+{
+	if (rs && rs->rs_start)
+		return 1;
+	return 0;
+}
+
 #endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7880687..433da86 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 		return -EIO;
 	}
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		return error;
@@ -1550,6 +1554,9 @@ out_truncate:
 
 out_unlock:
 	/* Error path for case 1 */
+	if (gfs2_rs_active(ip->i_res))
+		gfs2_rs_treedel(ip->i_res, true);
+
 	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
 		gfs2_glock_dq(&ip->i_iopen_gh);
 	gfs2_holder_uninit(&ip->i_iopen_gh);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 1b8b815..134bb9d 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -31,6 +31,17 @@
 			    { GFS2_BLKST_DINODE, "dinode" },	\
 			    { GFS2_BLKST_UNLINKED, "unlinked" })
 
+#define TRACE_RS_DELETE  0
+#define TRACE_RS_TREEDEL 1
+#define TRACE_RS_INSERT  2
+#define TRACE_RS_CLAIM   3
+
+#define rs_func_name(x) __print_symbolic(x,	\
+					 { 0, "del " },	\
+					 { 1, "tdel" },	\
+					 { 2, "ins " },	\
+					 { 3, "clm " })
+
 #define show_glock_flags(flags) __print_flags(flags, "",	\
 	{(1UL << GLF_LOCK),			"l" },		\
 	{(1UL << GLF_DEMOTE),			"D" },		\
@@ -470,6 +481,7 @@ TRACE_EVENT(gfs2_block_alloc,
 		__field(	u8,	block_state		)
 		__field(        u64,	rd_addr			)
 		__field(        u32,	rd_free_clone		)
+		__field(	u32,	rd_reserved		)
 	),
 
 	TP_fast_assign(
@@ -480,16 +492,63 @@ TRACE_EVENT(gfs2_block_alloc,
 		__entry->block_state	= block_state;
 		__entry->rd_addr	= rgd->rd_addr;
 		__entry->rd_free_clone	= rgd->rd_free_clone;
+		__entry->rd_reserved	= rgd->rd_reserved;
 	),
 
-	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u",
+	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long long)__entry->inum,
 		  (unsigned long long)__entry->start,
 		  (unsigned long)__entry->len,
 		  block_state_name(__entry->block_state),
 		  (unsigned long long)__entry->rd_addr,
-		  __entry->rd_free_clone)
+		  __entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
+);
+
+/* Keep track of multi-block reservations as they are allocated/freed */
+TRACE_EVENT(gfs2_rs,
+
+	TP_PROTO(const struct gfs2_blkreserv *rs, u64 claimed, u8 func),
+
+	TP_ARGS(rs, claimed, func),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	u64,	rd_addr			)
+		__field(	u32,	rd_free_clone		)
+		__field(	u32,	rd_reserved		)
+		__field(	u64,	inum			)
+		__field(	u64,	start			)
+		__field(	u32,	len			)
+		__field(	u64,	claimed			)
+		__field(	u32,	free			)
+		__field(	u8,	func			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0;
+		__entry->rd_addr	= rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
+		__entry->rd_free_clone	= rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
+		__entry->rd_reserved	= rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
+		__entry->inum		= rs->rs_inum;
+		__entry->start		= rs->rs_start;
+		__entry->len		= rs->rs_blks;
+		__entry->claimed	= claimed;
+		__entry->free		= rs->rs_free;
+		__entry->func		= func;
+	),
+
+	TP_printk("%u,%u bmap %llu resrv %llu/%lu rg:%llu rf:%lu rr:%lu %s "
+		  "c:%llu f:%lu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->inum,
+		  (unsigned long long)__entry->start,
+		  (unsigned long)__entry->len,
+		  (unsigned long long)__entry->rd_addr,
+		  (unsigned long)__entry->rd_free_clone,
+		  (unsigned long)__entry->rd_reserved,
+		  rs_func_name(__entry->func),
+		  __entry->claimed, (unsigned long)__entry->free)
 );
 
 #endif /* _TRACE_GFS2_H */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 523c0de..27a0b4a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -327,6 +327,10 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 {
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		goto out_alloc;
@@ -710,6 +714,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	struct buffer_head *dibh;
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_lock_check(ip);
 	if (error)
 		return error;
@@ -1483,6 +1491,10 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
 {
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		return error;



^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 2] GFS2: Reduce file fragmentation
  2012-07-05 21:58 ` [Cluster-devel] [GFS2 Patch] [Try 2] GFS2: Reduce file fragmentation Bob Peterson
@ 2012-07-10 14:23   ` Steven Whitehouse
  2012-07-11 18:07     ` Bob Peterson
  2012-07-11 18:10     ` [Cluster-devel] [GFS2 Patch] [Try 3] " Bob Peterson
  0 siblings, 2 replies; 13+ messages in thread
From: Steven Whitehouse @ 2012-07-10 14:23 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

Generally looking much better, but some further comments are included
below...

I'm still not sure that I understand fully the implications of the new
allocation algorithm, but the code surrounding that core part of the
patch is looking much better now.


On Thu, 2012-07-05 at 17:58 -0400, Bob Peterson wrote:
> Hi,
> 
> Here is my replacement GFS2 file defragmentation patch for upstream:
> 
> Description:
> 
> This patch reduces GFS2 file fragmentation by pre-reserving blocks.
> In this patch, I've implemented almost all of Steve's suggestions.
> 
> Regards,
> 
> Bob Peterson
> Red Hat File Systems
> 
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> ---
> diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
> index 6d957a8..205eddc 100644
> --- a/fs/gfs2/bmap.c
> +++ b/fs/gfs2/bmap.c
> @@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
>  	if (error)
>  		goto out_rlist;
>  
> +	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
> +		gfs2_rs_treedel(ip->i_res, true);
> +
>  	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
>  				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
>  				 revokes);
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index 6fbf3cb..779b6222 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  	if (ret)
>  		return ret;
>  
> +	atomic_set(&ip->i_res->rs_sizehint,
> +		   PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
> +
>  	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
>  	ret = gfs2_glock_nq(&gh);
>  	if (ret)
> @@ -571,22 +574,15 @@ fail:
>  
>  static int gfs2_release(struct inode *inode, struct file *file)
>  {
> -	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
> -	struct gfs2_file *fp;
>  	struct gfs2_inode *ip = GFS2_I(inode);
>  
> -	fp = file->private_data;
> +	kfree(file->private_data);
>  	file->private_data = NULL;
>  
> -	if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
> -	    (atomic_read(&inode->i_writecount) == 1))
> +	if ((file->f_mode & FMODE_WRITE) && gfs2_rs_active(ip->i_res) &&
> +	    (atomic_read(&inode->i_writecount) == 1)) {
>  		gfs2_rs_delete(ip);
Don't we always need to drop this data structure at this point, whether
or not the reservation is active?

> -
> -	if (gfs2_assert_warn(sdp, fp))
> -		return -EIO;
> -
> -	kfree(fp);
> -
> +	}
>  	return 0;
>  }
>  
> @@ -662,14 +658,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
>  				   unsigned long nr_segs, loff_t pos)
>  {
>  	struct file *file = iocb->ki_filp;
> +	size_t writesize = iov_length(iov, nr_segs);
>  	struct dentry *dentry = file->f_dentry;
>  	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
> +	struct gfs2_sbd *sdp;
>  	int ret;
>  
> +	sdp = GFS2_SB(file->f_mapping->host);
>  	ret = gfs2_rs_alloc(ip);
>  	if (ret)
>  		return ret;
>  
> +	atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
>  	if (file->f_flags & O_APPEND) {
>  		struct gfs2_holder gh;
>  
> @@ -795,6 +795,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
>  	if (unlikely(error))
>  		goto out_uninit;
>  
> +	atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
> +
>  	while (len > 0) {
>  		if (len < bytes)
>  			bytes = len;
> @@ -803,10 +805,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
>  			offset += bytes;
>  			continue;
>  		}
> -		error = gfs2_rindex_update(sdp);
> -		if (error)
> -			goto out_unlock;
> -
>  		error = gfs2_quota_lock_check(ip);
>  		if (error)
>  			goto out_unlock;
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index dc73070..f7bd2e3 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -84,6 +84,7 @@ struct gfs2_rgrpd {
>  	u32 rd_data;			/* num of data blocks in rgrp */
>  	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
>  	u32 rd_free;
> +	u32 rd_reserved;                /* number of blocks reserved */
>  	u32 rd_free_clone;
>  	u32 rd_dinodes;
>  	u64 rd_igeneration;
> @@ -91,11 +92,15 @@ struct gfs2_rgrpd {
>  	struct gfs2_sbd *rd_sbd;
>  	struct gfs2_rgrp_lvb *rd_rgl;
>  	u32 rd_last_alloc;
> +	u32 rd_next_rsrv;               /* preferred next reservation */
>  	u32 rd_flags;
>  #define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
>  #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
>  #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
>  #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
> +	struct mutex rd_mutex;          /* protects reservation related vars */
Does this need to be a mutex? I think a spinlock would do just as well
in this case since we never schedule while holding the lock.

> +	struct rb_root rd_rstree;       /* multi-block reservation tree */
> +	u32 rd_rs_cnt;                  /* count of current reservations */
>  };
>  
>  enum gfs2_state_bits {
> @@ -233,6 +238,41 @@ struct gfs2_holder {
>  	unsigned long gh_ip;
>  };
>  
> +/* Resource group multi-block reservation, in order of appearance:
> +
> +   Step 1. Function prepares to write, allocates a mb, sets the size hint.
> +   Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
> +   Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
> +   Step 4. Bits are assigned from the rgrp based on either the reservation
> +           or wherever it can.
> +*/
> +
> +struct gfs2_blkreserv {
> +	/* components used during write (step 1): */
> +	u64 rs_inum; /* inode address */
Why are we adding an extra field just to pass to the tracepoint? Surely
it is easier just to pass the inode if required, rather than this
structure, then you can access all the required fields.

> +	atomic_t rs_sizehint;         /* hint of the write size */
> +
> +	/* components used during inplace_reserve (step 2): */
> +	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
> +
> +	/* components used during get_local_rgrp (step 3): */
> +	struct gfs2_rgrpd *rs_rgd;    /* pointer to the gfs2_rgrpd */
> +	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
> +	struct rb_node rs_node;       /* link to other block reservations */
> +
> +	/* components used during block searches and assignments (step 4): */
> +	struct gfs2_bitmap *rs_bi;    /* bitmap for the current allocation */
> +	u64 rs_start;                 /* absolute fs block address */
> +	u32 rs_biblk;                 /* start block relative to the bi */
> +	u32 rs_blks;                  /* number of blocks reserved */
> +	u32 rs_free;                  /* how many blocks are still free */
What is the difference between rs_free and rs_blks? Shouldn't these two
always be identical, since there is no point in reserving blocks which
are not free.
 
> +
> +	/* ancillary quota stuff */
> +	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
> +	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
> +	unsigned int rs_qa_qd_num;
> +};
> +
>  enum {
>  	GLF_LOCK			= 1,
>  	GLF_DEMOTE			= 3,
> @@ -290,16 +330,6 @@ struct gfs2_glock {
>  
>  #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
>  
> -struct gfs2_blkreserv {
> -	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
> -	struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
> -
> -	/* ancillary quota stuff */
> -	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
> -	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
> -	unsigned int rs_qa_qd_num;
> -};
> -
>  enum {
>  	GIF_INVALID		= 0,
>  	GIF_QD_LOCKED		= 1,
> @@ -307,7 +337,6 @@ enum {
>  	GIF_SW_PAGED		= 3,
>  };
>  
> -
>  struct gfs2_inode {
>  	struct inode i_inode;
>  	u64 i_no_addr;
> @@ -318,7 +347,7 @@ struct gfs2_inode {
>  	struct gfs2_glock *i_gl; /* Move into i_gh? */
>  	struct gfs2_holder i_iopen_gh;
>  	struct gfs2_holder i_gh; /* for prepare/commit_write only */
> -	struct gfs2_blkreserv *i_res; /* resource group block reservation */
> +	struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
>  	struct gfs2_rgrpd *i_rgd;
>  	u64 i_goal;	/* goal block for allocations */
>  	struct rw_semaphore i_rw_mutex;
> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 2b035e0..12c7b6a 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
>  	int error;
>  
>  	munge_mode_uid_gid(dip, &mode, &uid, &gid);
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
>  
>  	error = gfs2_quota_lock(dip, uid, gid);
>  	if (error)
> @@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
>  	struct buffer_head *dibh;
>  	int error;
>  
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		goto fail;
> @@ -596,7 +603,8 @@ fail_end_trans:
>  	gfs2_trans_end(sdp);
>  
>  fail_ipreserv:
> -	gfs2_inplace_release(dip);
> +	if (alloc_required)
> +		gfs2_inplace_release(dip);
>  
>  fail_quota_locks:
>  	gfs2_quota_unlock(dip);
> @@ -738,6 +746,7 @@ fail_gunlock:
>  		iput(inode);
>  	}
>  fail:
> +	gfs2_rs_delete(dip);
>  	if (bh)
>  		brelse(bh);
>  	return error;
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index e53d0a1..4d1f4f8 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -35,6 +35,9 @@
>  #define BFITNOENT ((u32)~0)
>  #define NO_BLOCK ((u64)~0)
>  
> +#define RSRV_CONTENTION_FACTOR 2
> +#define RGRP_RSRV_MAX_CONTENDERS 4
> +
>  #if BITS_PER_LONG == 32
>  #define LBITMASK   (0x55555555UL)
>  #define LBITSKIP55 (0x55555555UL)
> @@ -178,6 +181,60 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
>  }
>  
>  /**
> + * rs_cmp - multi-block reservation range compare
> + * @blk: absolute file system block number of the new reservation
> + * @len: number of blocks in the new reservation
> + * @rs: existing reservation to compare against
> + *
> + * returns: 1 if the block range is beyond the reach of the reservation
> + *         -1 if the block range is before the start of the reservation
> + *          0 if the block range overlaps with the reservation
> + */
> +static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
> +{
> +	if (blk > rs->rs_start + rs->rs_blks - 1)
> +		return 1;
> +	if (blk + len - 1 < rs->rs_start)
> +		return -1;
> +	return 0;
> +}
> +
> +/**
> + * rs_find - Find a rgrp multi-block reservation that contains a given block
> + * @rgd: The rgrp
> + * @rgblk: The block we're looking for, relative to the rgrp
> + */
> +static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
> +{
> +	struct rb_node **newn;
> +	int rc;
> +	u64 fsblk = rgblk + rgd->rd_data0;
> +
> +	mutex_lock(&rgd->rd_mutex);
> +	newn = &rgd->rd_rstree.rb_node;
> +	while (*newn) {
> +		struct gfs2_blkreserv *cur =
> +			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
> +		rc = rs_cmp(fsblk, 1, cur);
> +		if (rc < 0)
> +			newn = &((*newn)->rb_left);
> +		else if (rc > 0)
> +			newn = &((*newn)->rb_right);
> +		else {
> +			mutex_unlock(&rgd->rd_mutex);
> +			return cur;
> +		}
> +	}
> +	mutex_unlock(&rgd->rd_mutex);
> +	return NULL;
> +}
> +
> +static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
> +{
> +	return (bi->bi_start * GFS2_NBBY) + blk;
> +}
> +
> +/**
>   * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
>   *       a block in a given allocation state.
>   * @buf: the buffer that holds the bitmaps
> @@ -435,8 +492,66 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
>  	return error;
>  }
>  
> +static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
> +{
> +	gfs2_print_dbg(seq, "  r: %llu s:%llu b:%u i:%u f:%u\n",
> +		       rs->rs_rgd->rd_addr, rs->rs_start, rs->rs_biblk,
> +		       rs->rs_blks, rs->rs_free);
> +}
> +
> +/**
> + * gfs2_rs_treedel - remove a multi-block reservation from the rgd tree
> + * @ip: The inode for this reservation
> + * @lock: we need to lock the rgrp
> + *
> + */
> +void gfs2_rs_treedel(struct gfs2_blkreserv *rs, bool lock)
> +{
> +	struct gfs2_rgrpd *rgd;
> +
> +	if (!gfs2_rs_active(rs))
> +		return;
> +
> +	rgd = rs->rs_rgd;
> +	/* We can't do this: The reason is that when the rgrp is invalidated,
> +	   it's in the "middle" of acquiring the glock, but the HOLDER bit
> +	   isn't set yet:
> +	   BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
> +	trace_gfs2_rs(rs, 0, TRACE_RS_TREEDEL);
> +
> +	if (lock)
> +		mutex_lock(&rgd->rd_mutex);

Can we split this into two functions? That way we can have a
__gfs2_tree_del() which doesn't do anything with the lock, and a
gfs2_tree_del() which grabs and drop the lock and calls the former.

> +	/* Now that we have the mutex, check again if the tree is empty just
> +	   in case someone else deleted this entry. */
> +	if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
> +		rb_erase(&rs->rs_node, &rgd->rd_rstree);
> +	BUG_ON(!rgd->rd_rs_cnt);
> +	rgd->rd_rs_cnt--;
> +
> +	if (rs->rs_free) {
> +		/* return reserved blocks to the rgrp and the ip */
> +		BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
> +		rs->rs_rgd->rd_reserved -= rs->rs_free;
> +		rs->rs_free = 0;
> +		clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
> +		smp_mb__after_clear_bit();
> +	}
> +	if (lock)
> +		mutex_unlock(&rgd->rd_mutex);
> +	/* We can't change any of the step 1 or step 2 components of the rs.
> +	   E.g. We can't set rs_rgd to NULL because the rgd glock is held and
> +	   dequeued through this pointer.
> +	   Can't: atomic_set(&rs->rs_sizehint, 0);
> +	   Can't: rs->rs_requested = 0;
> +	   Can't: rs->rs_rgd = NULL;*/
> +	rs->rs_bi = NULL;
> +	rs->rs_start = 0;
> +	rs->rs_biblk = 0;
> +	rs->rs_blks = 0;
> +}
> +
>  /**
> - * gfs2_rs_delete - delete a reservation
> + * gfs2_rs_delete - delete a multi-block reservation
>   * @ip: The inode for this reservation
>   *
>   */
> @@ -444,12 +559,37 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
>  {
>  	down_write(&ip->i_rw_mutex);
>  	if (ip->i_res) {
> +		if (gfs2_rs_active(ip->i_res))
> +			gfs2_rs_treedel(ip->i_res, true);
> +		trace_gfs2_rs(ip->i_res, ip->i_res->rs_start, TRACE_RS_DELETE);
> +		BUG_ON(ip->i_res->rs_free);
>  		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
>  		ip->i_res = NULL;
>  	}
>  	up_write(&ip->i_rw_mutex);
>  }
>  
> +/**
> + * return_all_reservations - return all reserved blocks back to the rgrp.
> + * @rgd: the rgrp that needs its space back
> + *
> + * We previously reserved a bunch of blocks for allocation. Now we need to
> + * give them back. This leave the reservation structures in tact, but removes
> + * all of their corresponding "no-fly zones".
> + */
> +static void return_all_reservations(struct gfs2_rgrpd *rgd)
> +{
> +	struct rb_node *n;
> +	struct gfs2_blkreserv *rs;
> +
> +	mutex_lock(&rgd->rd_mutex);
> +	while ((n = rb_first(&rgd->rd_rstree))) {
> +		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
> +		gfs2_rs_treedel(rs, false);
> +	}
> +	mutex_unlock(&rgd->rd_mutex);
> +}
> +
>  void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
>  {
>  	struct rb_node *n;
> @@ -472,6 +612,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
>  
>  		gfs2_free_clones(rgd);
>  		kfree(rgd->rd_bits);
> +		return_all_reservations(rgd);
>  		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
>  	}
>  }
> @@ -649,6 +790,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
>  	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
>  	rgd->rd_data = be32_to_cpu(buf.ri_data);
>  	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
> +	mutex_init(&rgd->rd_mutex);
>  
>  	error = compute_bitstructs(rgd);
>  	if (error)
> @@ -1115,29 +1257,228 @@ out:
>  }
>  
>  /**
> + * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
> + * @bi: the bitmap with the blocks
> + * @ip: the inode structure
> + * @biblk: the 32-bit block number relative to the start of the bitmap
> + * @amount: the number of blocks to reserve
> + *
> + * Returns: NULL - reservation was already taken, so not inserted
> + *          pointer to the inserted reservation
> + */
> +static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
> +				       struct gfs2_inode *ip, u32 biblk,
> +				       int amount)
> +{
> +	struct rb_node **newn, *parent = NULL;
> +	int rc;
> +	struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_rgrpd *rgd = rs->rs_rgd;
> +	u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
> +
> +	mutex_lock(&rgd->rd_mutex);
> +	newn = &rgd->rd_rstree.rb_node;
> +	BUG_ON(!ip->i_res);
> +	BUG_ON(gfs2_rs_active(rs));
> +	/* Figure out where to put new node */
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
> +	while (*newn) {
> +		struct gfs2_blkreserv *cur =
> +			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
> +
> +		parent = *newn;
> +		rc = rs_cmp(fsblock, amount, cur);
> +		if (rc > 0)
> +			newn = &((*newn)->rb_right);
> +		else if (rc < 0)
> +			newn = &((*newn)->rb_left);
> +		else {
> +			mutex_unlock(&rgd->rd_mutex);
> +			return NULL; /* reservation already in use */
> +		}
> +	}
> +
> +	/* Do our reservation work */
> +	rs = ip->i_res;
> +	rs->rs_start = fsblock;
> +	rs->rs_blks = amount;
> +	rs->rs_free = amount;
> +	rs->rs_biblk = biblk;
> +	rs->rs_bi = bi;
> +	rs->rs_inum = ip->i_no_addr;
> +	rb_link_node(&rs->rs_node, parent, newn);
> +	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
> +
> +	/* Do our inode accounting for the reservation */
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
> +
> +	/* Do our rgrp accounting for the reservation */
> +	rgd->rd_reserved += amount; /* blocks reserved */
> +	rgd->rd_next_rsrv = rs->rs_start + rs->rs_blks;
> +	rgd->rd_rs_cnt++; /* number of in-tree reservations */
> +	if (!rgrp_contains_block(rgd, rgd->rd_next_rsrv))
> +		rgd->rd_next_rsrv = 0;
> +	mutex_unlock(&rgd->rd_mutex);
> +	trace_gfs2_rs(rs, fsblock, TRACE_RS_INSERT);
> +	return rs;
> +}
> +
> +/**
> + * unclaimed_blocks - return number of blocks that aren't spoken for
> + */
> +static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
> +{
> +	return rgd->rd_free_clone - rgd->rd_reserved;
> +}
> +
> +/**
> + * rg_mblk_search - find a group of multiple free blocks
> + * @rgd: the resource group descriptor
> + * @rs: the block reservation
> + * @ip: pointer to the inode for which we're reserving blocks
> + *
> + * This is very similar to rgblk_search, except we're looking for whole
> + * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
> + * on aligned dwords for speed's sake.
> + *
> + * Returns: 0 if successful or BFITNOENT if there isn't enough free space
> + */
> +
> +static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
> +{
> +	struct gfs2_bitmap *bi = rgd->rd_bits;
> +	const u32 length = rgd->rd_length;
> +	u32 blk;
> +	unsigned int buf, x, search_bytes;
> +	u8 *buffer = NULL;
> +	u8 *ptr, *end, *nonzero;
> +	u32 goal, rsv_bytes;
> +	struct gfs2_blkreserv *rs;
> +	u32 best_rs_bytes, unclaimed;
> +	int best_rs_blocks;
> +
> +	/* Find bitmap block that contains bits for goal block */
> +	if (rgd->rd_next_rsrv) {
> +		goal = rgd->rd_next_rsrv - rgd->rd_data0;
> +		for (buf = 0; buf < length; buf++) {
> +			bi = rgd->rd_bits + buf;
> +			/* Convert scope of "goal" from rgrp-wide to within
> +			   found bit block */
> +			if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
> +				goal -= bi->bi_start * GFS2_NBBY;
> +				goto do_search;
> +			}
> +		}
> +	}
> +	buf = 0;
> +	goal = 0;
> +
> +do_search:
> +	best_rs_blocks = max(atomic_read(&ip->i_res->rs_sizehint),
> +			     (int)(RGRP_RSRV_MINBLKS * rgd->rd_length));
Use max_t here?

> +	best_rs_bytes = (best_rs_blocks *
> +			 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
> +		GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
> +	best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
> +	unclaimed = unclaimed_blocks(rgd);
> +	if (best_rs_bytes * GFS2_NBBY > unclaimed)
> +		best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
> +
> +	for (x = 0; x <= length; x++) {
> +		bi = rgd->rd_bits + buf;
> +
> +		if (test_bit(GBF_FULL, &bi->bi_flags))
> +			goto skip;
> +
> +		WARN_ON(!buffer_uptodate(bi->bi_bh));
> +		if (bi->bi_clone)
> +			buffer = bi->bi_clone + bi->bi_offset;
> +		else
> +			buffer = bi->bi_bh->b_data + bi->bi_offset;
> +
> +		/* We have to keep the reservations aligned on u64 boundaries
> +		   otherwise we could get situations where a byte can't be
> +		   used because it's after a reservation, but a free bit still
> +		   is within the reservation's area. */
> +		ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
> +		end = (buffer + bi->bi_len);
> +		while (ptr < end) {
> +			rsv_bytes = 0;
> +			if ((ptr + best_rs_bytes) <= end)
> +				search_bytes = best_rs_bytes;
> +			else
> +				search_bytes = end - ptr;
> +			BUG_ON(!search_bytes);
> +			nonzero = memchr_inv(ptr, 0, search_bytes);
> +			/* If the lot is all zeroes, reserve the whole size. If
> +			   there's enough zeroes to satisfy the request, use
> +			   what we can. If there's not enough, keep looking. */
> +			if (nonzero == NULL)
> +				rsv_bytes = search_bytes;
> +			else if ((nonzero - ptr) * GFS2_NBBY >=
> +				 ip->i_res->rs_requested)
> +				rsv_bytes = (nonzero - ptr);
> +
> +			if (rsv_bytes) {
> +				blk = ((ptr - buffer) * GFS2_NBBY);
> +				BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
> +				rs = rs_insert(bi, ip, blk,
> +					       rsv_bytes * GFS2_NBBY);
> +				if (IS_ERR(rs))
> +					return PTR_ERR(rs);
> +				if (rs)
> +					return 0;
> +			}
> +			ptr += ALIGN(search_bytes, sizeof(u64));
> +		}
> +skip:
> +		/* Try next bitmap block (wrap back to rgrp header
> +		   if at end) */
> +		buf++;
> +		buf %= length;
> +		goal = 0;
> +	}
> +
> +	return BFITNOENT;
> +}
> +
> +/**
>   * try_rgrp_fit - See if a given reservation will fit in a given RG
>   * @rgd: the RG data
>   * @ip: the inode
>   *
>   * If there's room for the requested blocks to be allocated from the RG:
> + * This will try to get a multi-block reservation first, and if that doesn't
> + * fit, it will take what it can.
>   *
>   * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
>   */
>  
> -static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
> +static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
>  {
>  	const struct gfs2_blkreserv *rs = ip->i_res;
>  
>  	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
>  		return 0;
> -	if (rgd->rd_free_clone >= rs->rs_requested)
> +	/* Look for a multi-block reservation. We only do this for files,
> +	   not directories or other types. The problem with directories is
> +	   that items are often created and deleted in the directory in the
> +	   same breath, which can create "holes". In other words, we could get
> +	   into situations where two or more blocks are reserved, then used,
> +	   then one or more of the earlier blocks is freed. When we delete the
> +	   reservation, the rs_free will be off due to the hole, so the rgrp's
> +	   rg_free count can get off. Other non-files, like symlinks and
> +	   char or block devices, are very small in size, so it doesn't make
> +	   sense to reserve multiple blocks for them. */

I'm not sure that I understand this comment at all. Currently with
directories we never deallocate any blocks at all until the directory is
deallocated when it is unlinked. We will want to extend this to
directories eventually, even if we don't do that immediately.

> +	if (S_ISREG(ip->i_inode.i_mode) &&
> +	    unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS) {
> +		if (rg_mblk_search(rgd, ip) != BFITNOENT)
> +			return 1;
> +	}
> +	if (unclaimed_blocks(rgd) >= rs->rs_requested)
>  		return 1;
> -	return 0;
> -}
>  
> -static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
> -{
> -	return (bi->bi_start * GFS2_NBBY) + blk;
> +	return 0;
>  }
>  
>  /**
> @@ -1217,7 +1558,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
>  int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  {
>  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> -	struct gfs2_rgrpd *rgd, *begin = NULL;
> +	struct gfs2_rgrpd *begin = NULL;
>  	struct gfs2_blkreserv *rs = ip->i_res;
>  	int error = 0, rg_locked, flags = LM_FLAG_TRY;
>  	u64 last_unlinked = NO_BLOCK;
> @@ -1225,32 +1566,40 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  
>  	if (sdp->sd_args.ar_rgrplvb)
>  		flags |= GL_SKIP;
> -	rs = ip->i_res;
>  	rs->rs_requested = requested;
>  	if (gfs2_assert_warn(sdp, requested)) {
>  		error = -EINVAL;
>  		goto out;
>  	}
> -
> -	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
> -		rgd = begin = ip->i_rgd;
> -	else
> -		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
> -
> -	if (rgd == NULL)
> +	if (gfs2_rs_active(rs)) {
> +		begin = rs->rs_rgd;
> +		flags = 0; /* Yoda: Do or do not. There is no try */
> +	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
> +		rs->rs_rgd = begin = ip->i_rgd;
> +	} else {
> +		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
> +	}
> +	if (rs->rs_rgd == NULL)
>  		return -EBADSLT;
>  
>  	while (loops < 3) {
>  		rg_locked = 0;
>  
> -		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
> +		if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
>  			rg_locked = 1;
>  			error = 0;
> +		} else if (!loops && !gfs2_rs_active(rs) &&
> +			   rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
> +			/* If the rgrp already is maxed out for contenders,
> +			   we can eliminate it as a "first pass" without even
> +			   requesting the rgrp glock. */
> +			error = GLR_TRYFAILED;
>  		} else {
> -			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
> -						   flags, &rs->rs_rgd_gh);
> +			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
> +						   LM_ST_EXCLUSIVE, flags,
> +						   &rs->rs_rgd_gh);
>  			if (!error && sdp->sd_args.ar_rgrplvb) {
> -				error = update_rgrp_lvb(rgd);
> +				error = update_rgrp_lvb(rs->rs_rgd);
>  				if (error) {
>  					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  					return error;
> @@ -1259,24 +1608,36 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  		}
>  		switch (error) {
>  		case 0:
> -			if (try_rgrp_fit(rgd, ip)) {
> +			if (gfs2_rs_active(rs)) {
> +				if (unclaimed_blocks(rs->rs_rgd) +
> +				    rs->rs_free >= rs->rs_requested) {
> +					ip->i_rgd = rs->rs_rgd;
> +					return 0;
> +				}
> +				/* We have a multi-block reservation, but the
> +				   rgrp doesn't have enough free blocks to
> +				   satisfy the request. Free the reservation
> +				   and look for a suitable rgrp. */
> +				gfs2_rs_treedel(rs, true);
> +			}
> +			if (try_rgrp_fit(rs->rs_rgd, ip)) {
>  				if (sdp->sd_args.ar_rgrplvb)
> -					gfs2_rgrp_bh_get(rgd);
> -				ip->i_rgd = rgd;
> +					gfs2_rgrp_bh_get(rs->rs_rgd);
> +				ip->i_rgd = rs->rs_rgd;
>  				return 0;
>  			}
> -			if (rgd->rd_flags & GFS2_RDF_CHECK) {
> +			if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
>  				if (sdp->sd_args.ar_rgrplvb)
> -					gfs2_rgrp_bh_get(rgd);
> -				try_rgrp_unlink(rgd, &last_unlinked,
> +					gfs2_rgrp_bh_get(rs->rs_rgd);
> +				try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
>  						ip->i_no_addr);
>  			}
>  			if (!rg_locked)
>  				gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  			/* fall through */
>  		case GLR_TRYFAILED:
> -			rgd = gfs2_rgrpd_get_next(rgd);
> -			if (rgd != begin) /* If we didn't wrap */
> +			rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
> +			if (rs->rs_rgd != begin) /* If we didn't wrap */
>  				break;
>  
>  			flags &= ~LM_FLAG_TRY;
> @@ -1314,6 +1675,12 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
>  {
>  	struct gfs2_blkreserv *rs = ip->i_res;
>  
> +	if (!rs)
> +		return;
> +
> +	if (gfs2_rs_active(rs) && !rs->rs_free)
> +		gfs2_rs_treedel(rs, true);
> +
>  	if (rs->rs_rgd_gh.gh_gl)
>  		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  	rs->rs_requested = 0;
> @@ -1412,7 +1779,27 @@ do_search:
>  		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
>  			buffer = bi->bi_clone + bi->bi_offset;
>  
> -		biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
> +		while (1) {
> +			struct gfs2_blkreserv *rs;
> +			u32 rgblk;
> +
> +			biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
> +			if (biblk == BFITNOENT)
> +				break;
> +			/* Check if this block is reserved() */
> +			rgblk = gfs2_bi2rgd_blk(bi, biblk);
> +			rs = rs_find(rgd, rgblk);
> +			if (rs == NULL)
> +				break;
> +
> +			BUG_ON(rs->rs_bi != bi);
> +			biblk = BFITNOENT;
> +			/* This should jump to the first block after the
> +			   reservation. */
> +			goal = rs->rs_biblk + rs->rs_free;
> +			if (goal >= bi->bi_len * GFS2_NBBY)
> +				break;
> +		}
>  		if (biblk != BFITNOENT)
>  			break;
>  
> @@ -1448,8 +1835,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
>  			     u32 blk, bool dinode, unsigned int *n)
>  {
>  	const unsigned int elen = *n;
> -	u32 goal;
> +	u32 goal, rgblk;
>  	const u8 *buffer = NULL;
> +	struct gfs2_blkreserv *rs;
>  
>  	*n = 0;
>  	buffer = bi->bi_bh->b_data + bi->bi_offset;
> @@ -1462,6 +1850,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
>  		goal++;
>  		if (goal >= (bi->bi_len * GFS2_NBBY))
>  			break;
> +		rgblk = gfs2_bi2rgd_blk(bi, goal);
> +		rs = rs_find(rgd, rgblk);
> +		if (rs) /* Oops, we bumped into someone's reservation */
> +			break;
>  		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
>  		    GFS2_BLKST_FREE)
>  			break;
> @@ -1538,11 +1930,19 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
>  int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
>  {
>  	const struct gfs2_rgrpd *rgd = gl->gl_object;
> +	struct gfs2_blkreserv *trs;
> +	const struct rb_node *n;
> +
>  	if (rgd == NULL)
>  		return 0;
> -	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
> +	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
>  		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
> -		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
> +		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
> +		       rgd->rd_reserved);
> +	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
> +		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
> +		dump_rs(seq, trs);
> +	}
Is there some missing locking here? What protects the rb tree at this
point?


>  	return 0;
>  }
>  
> @@ -1557,10 +1957,74 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
>  }
>  
>  /**
> + * claim_reserved_blks - Claim previously reserved blocks
> + * @ip: the inode that's claiming the reservation
> + * @dinode: 1 if this block is a dinode block, otherwise data block
> + * @nblocks: desired extent length
> + *
> + * Lay claim to previously allocated block reservation blocks.
> + * Returns: Starting block number of the blocks claimed.
> + * Sets *nblocks to the actual extent length allocated.
> + */
> +static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
> +			       unsigned int *nblocks)
> +{
> +	struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_rgrpd *rgd = rs->rs_rgd;
> +	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> +	struct gfs2_bitmap *bi;
> +	u32 biblk;
> +	u64 start_block = 0, fsblock;
> +	const unsigned int elen = *nblocks;
> +
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
> +	gfs2_assert_withdraw(sdp, rgd);
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
> +	bi = rs->rs_bi;
> +	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
> +
> +	for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
> +		biblk = rs->rs_biblk;
> +
> +		fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
> +		/* Check if the next free block is not contiguous with
> +		   the others in the extent. */
> +		if (start_block && fsblock != start_block + (*nblocks))
> +			break;
> +
> +		rs->rs_biblk++;
> +		rs->rs_free--;
> +		/* Make sure the bitmap hasn't changed */
> +		gfs2_setbit(rgd, bi->bi_clone, bi, biblk, GFS2_BLKST_USED);
> +
> +		BUG_ON(!rgd->rd_reserved);
> +		rgd->rd_reserved--;
> +		if (!start_block) {
> +			start_block = fsblock;
> +			dinode = false;
> +		}
> +		trace_gfs2_rs(rs, fsblock, TRACE_RS_CLAIM);
> +	}
> +
> +	if (!rs->rs_free) {
> +		struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
> +
> +		gfs2_rs_treedel(rs, true);
> +		/* -nblocks because we haven't returned to do the math yet.
> +		   I'm doing the math backwards to prevent negative numbers,
> +		   but think of it as:
> +		   if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */
> +		if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
> +			rg_mblk_search(rgd, ip);
> +	}
> +	return start_block;
> +}
> +
> +/**
>   * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
>   * @ip: the inode to allocate the block for
>   * @bn: Used to return the starting block number
> - * @ndata: requested number of blocks/extent length (value/result)
> + * @nblocks: requested number of blocks/extent length (value/result)
>   * @dinode: 1 if we're allocating a dinode block, else 0
>   * @generation: the generation number of the inode
>   *
> @@ -1585,20 +2049,34 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>  	if (ip->i_res->rs_requested == 0)
>  		return -ECANCELED;
>  
> -	rgd = ip->i_rgd;
> -
> -	if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
> -		goal = ip->i_goal - rgd->rd_data0;
> -	else
> -		goal = rgd->rd_last_alloc;
> -
> -	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
> +	/* Check if we have a multi-block reservation, and if so, claim the
> +	   next free block from it. */
> +	if (gfs2_rs_active(ip->i_res)) {
> +		BUG_ON(!ip->i_res->rs_free);
> +		rgd = ip->i_res->rs_rgd;
> +		block = claim_reserved_blks(ip, dinode, nblocks);
> +	} else {
> +		rgd = ip->i_rgd;
>  
> -	/* Since all blocks are reserved in advance, this shouldn't happen */
> -	if (blk == BFITNOENT)
> -		goto rgrp_error;
> +		if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
> +			goal = ip->i_goal - rgd->rd_data0;
> +		else
> +			goal = rgd->rd_last_alloc;
> +
> +		blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
> +
> +		/* Since all blocks are reserved in advance, this shouldn't
> +		   happen */
> +		if (blk == BFITNOENT) {
> +			printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
> +			       *nblocks);
> +			printk(KERN_WARNING "FULL=%d\n",
> +			       test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
> +			goto rgrp_error;
> +		}
>  
> -	block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
> +		block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
> +	}
>  	ndata = *nblocks;
>  	if (dinode)
>  		ndata--;
> @@ -1615,8 +2093,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>  			brelse(dibh);
>  		}
>  	}
> -	if (rgd->rd_free < *nblocks)
> +	if (rgd->rd_free < *nblocks) {
> +		printk(KERN_WARNING "nblocks=%u\n", *nblocks);
>  		goto rgrp_error;
> +	}
>  
>  	rgd->rd_free -= *nblocks;
>  	if (dinode) {
> @@ -1672,6 +2152,9 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
>  		return;
>  	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
>  	rgd->rd_free += blen;
> +	if (bstart < rgd->rd_next_rsrv)
> +		rgd->rd_next_rsrv = bstart;
> +
>  	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
>  	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
>  	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
> diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
> index 5d8314d..b604009 100644
> --- a/fs/gfs2/rgrp.h
> +++ b/fs/gfs2/rgrp.h
> @@ -13,6 +13,14 @@
>  #include <linux/slab.h>
>  #include <linux/uaccess.h>
>  
> +/* Since each block in the file system is represented by two bits in the
> + * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
> + * By reserving 32 blocks at a time, we can optimize / shortcut how we search
> + * through the bitmaps by looking a word at a time.
> + */
> +#define RGRP_RSRV_MINBYTES 8
> +#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
> +
>  struct gfs2_rgrpd;
>  struct gfs2_sbd;
>  struct gfs2_holder;
> @@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
>  extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
>  extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
>  
> +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
> +
>  extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
>  extern void gfs2_inplace_release(struct gfs2_inode *ip);
>  
> @@ -36,6 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
>  			     bool dinode, u64 *generation);
>  
>  extern int gfs2_rs_alloc(struct gfs2_inode *ip);
> +extern void gfs2_rs_treedel(struct gfs2_blkreserv *rs, bool lock);
>  extern void gfs2_rs_delete(struct gfs2_inode *ip);
>  extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
>  extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
> @@ -62,7 +73,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
>  				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
>  extern int gfs2_fitrim(struct file *filp, void __user *argp);
>  
> -/* This is how to tell if a reservation is "inplace" reserved: */
> +/* This is how to tell if a multi-block reservation is "inplace" reserved: */
>  static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
>  {
>  	if (ip->i_res && ip->i_res->rs_requested)
> @@ -70,4 +81,12 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
>  	return 0;
>  }
>  
> +/* This is how to tell if a multi-block reservation is in the rgrp tree: */
> +static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
> +{
> +	if (rs && rs->rs_start)
> +		return 1;
> +	return 0;
> +}
> +
>  #endif /* __RGRP_DOT_H__ */
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index 7880687..433da86 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
>  		return -EIO;
>  	}
>  
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		return error;
> @@ -1550,6 +1554,9 @@ out_truncate:
>  
>  out_unlock:
>  	/* Error path for case 1 */
> +	if (gfs2_rs_active(ip->i_res))
> +		gfs2_rs_treedel(ip->i_res, true);
> +
>  	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
>  		gfs2_glock_dq(&ip->i_iopen_gh);
>  	gfs2_holder_uninit(&ip->i_iopen_gh);
> diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
> index 1b8b815..134bb9d 100644
> --- a/fs/gfs2/trace_gfs2.h
> +++ b/fs/gfs2/trace_gfs2.h
> @@ -31,6 +31,17 @@
>  			    { GFS2_BLKST_DINODE, "dinode" },	\
>  			    { GFS2_BLKST_UNLINKED, "unlinked" })
>  
> +#define TRACE_RS_DELETE  0
> +#define TRACE_RS_TREEDEL 1
> +#define TRACE_RS_INSERT  2
> +#define TRACE_RS_CLAIM   3
> +
> +#define rs_func_name(x) __print_symbolic(x,	\
> +					 { 0, "del " },	\
> +					 { 1, "tdel" },	\
> +					 { 2, "ins " },	\
> +					 { 3, "clm " })
> +
>  #define show_glock_flags(flags) __print_flags(flags, "",	\
>  	{(1UL << GLF_LOCK),			"l" },		\
>  	{(1UL << GLF_DEMOTE),			"D" },		\
> @@ -470,6 +481,7 @@ TRACE_EVENT(gfs2_block_alloc,
>  		__field(	u8,	block_state		)
>  		__field(        u64,	rd_addr			)
>  		__field(        u32,	rd_free_clone		)
> +		__field(	u32,	rd_reserved		)
>  	),
>  
>  	TP_fast_assign(
> @@ -480,16 +492,63 @@ TRACE_EVENT(gfs2_block_alloc,
>  		__entry->block_state	= block_state;
>  		__entry->rd_addr	= rgd->rd_addr;
>  		__entry->rd_free_clone	= rgd->rd_free_clone;
> +		__entry->rd_reserved	= rgd->rd_reserved;
>  	),
>  
> -	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u",
> +	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  (unsigned long long)__entry->inum,
>  		  (unsigned long long)__entry->start,
>  		  (unsigned long)__entry->len,
>  		  block_state_name(__entry->block_state),
>  		  (unsigned long long)__entry->rd_addr,
> -		  __entry->rd_free_clone)
> +		  __entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
> +);
> +
> +/* Keep track of multi-block reservations as they are allocated/freed */
> +TRACE_EVENT(gfs2_rs,
> +
> +	TP_PROTO(const struct gfs2_blkreserv *rs, u64 claimed, u8 func),
> +
> +	TP_ARGS(rs, claimed, func),
> +
> +	TP_STRUCT__entry(
> +		__field(        dev_t,  dev                     )
> +		__field(	u64,	rd_addr			)
> +		__field(	u32,	rd_free_clone		)
> +		__field(	u32,	rd_reserved		)
> +		__field(	u64,	inum			)
> +		__field(	u64,	start			)
> +		__field(	u32,	len			)
> +		__field(	u64,	claimed			)
> +		__field(	u32,	free			)
> +		__field(	u8,	func			)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		= rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0;
> +		__entry->rd_addr	= rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
> +		__entry->rd_free_clone	= rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
> +		__entry->rd_reserved	= rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
> +		__entry->inum		= rs->rs_inum;
> +		__entry->start		= rs->rs_start;
> +		__entry->len		= rs->rs_blks;
> +		__entry->claimed	= claimed;
> +		__entry->free		= rs->rs_free;
> +		__entry->func		= func;
> +	),
> +
> +	TP_printk("%u,%u bmap %llu resrv %llu/%lu rg:%llu rf:%lu rr:%lu %s "
> +		  "c:%llu f:%lu",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  (unsigned long long)__entry->inum,
> +		  (unsigned long long)__entry->start,
> +		  (unsigned long)__entry->len,
> +		  (unsigned long long)__entry->rd_addr,
> +		  (unsigned long)__entry->rd_free_clone,
> +		  (unsigned long)__entry->rd_reserved,
> +		  rs_func_name(__entry->func),
> +		  __entry->claimed, (unsigned long)__entry->free)
>  );
>  
>  #endif /* _TRACE_GFS2_H */
> diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
> index 523c0de..27a0b4a 100644
> --- a/fs/gfs2/xattr.c
> +++ b/fs/gfs2/xattr.c
> @@ -327,6 +327,10 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
>  {
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		goto out_alloc;
> @@ -710,6 +714,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
>  	struct buffer_head *dibh;
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_lock_check(ip);
>  	if (error)
>  		return error;
> @@ -1483,6 +1491,10 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
>  {
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		return error;
> 




^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 2] GFS2: Reduce file fragmentation
  2012-07-10 14:23   ` Steven Whitehouse
@ 2012-07-11 18:07     ` Bob Peterson
  2012-07-11 18:14       ` Steven Whitehouse
  2012-07-11 18:10     ` [Cluster-devel] [GFS2 Patch] [Try 3] " Bob Peterson
  1 sibling, 1 reply; 13+ messages in thread
From: Bob Peterson @ 2012-07-11 18:07 UTC (permalink / raw)
  To: cluster-devel.redhat.com

----- Original Message -----
| Hi,
| 
| Generally looking much better, but some further comments are included
| below...
| 
| I'm still not sure that I understand fully the implications of the
| new
| allocation algorithm, but the code surrounding that core part of the
| patch is looking much better now.
(snip)
| Don't we always need to drop this data structure at this point,
| whether
| or not the reservation is active?
Yes we do. Good catch. Fixed in the latest version.

| Does this need to be a mutex? I think a spinlock would do just as
| well
| in this case since we never schedule while holding the lock.

Good suggestion; changed to a spinlock.
 
| Why are we adding an extra field just to pass to the tracepoint?
| Surely
| it is easier just to pass the inode if required, rather than this
| structure, then you can access all the required fields.

Good idea. Changed to pass a parameter to the trace point.
 
| What is the difference between rs_free and rs_blks? Shouldn't these
| two
| always be identical, since there is no point in reserving blocks
| which
| are not free.

I guess I used a misleading variable name. The two variables have
two meanings and both are needed. I renamed variable rs_blks to rs_len
because it represents the length of the reservation.

| Can we split this into two functions? That way we can have a
| __gfs2_tree_del() which doesn't do anything with the lock, and a
| gfs2_tree_del() which grabs and drop the lock and calls the former.

Good idea. Done.

| Use max_t here?

Good idea. Done.

| 
| I'm not sure that I understand this comment at all. Currently with
| directories we never deallocate any blocks at all until the directory
| is
| deallocated when it is unlinked. We will want to extend this to
| directories eventually, even if we don't do that immediately.

I clarified the comment to make it more clear what's going on.
I'm talking about gaps in the _reservation_ not gaps in the blocks.
The current algorithm makes assumptions based on the fact that block
reservations don't have gaps, and the "next" free block will be the
successor to the last claimed. If you use reservations for directories,
what can happen is that two files may be created, which claims two
blocks in the reservation. If the first file is deleted from the
directory, that block becomes a "hole" in the reservation, which breaks
the code with its current assumptions. We either have to:
(a) keep the current assumptions which make block claims faster, or
(b) Make no such assumptions and implement a bitmap-like search of the
    reservation that can fill holes. It wouldn't be too tough to do,
    especially since we already have nicely tuned functions to do it.
    I'm just worried that it's going to hurt performance.
 
| Is there some missing locking here? What protects the rb tree at this
| point?

Good catch; fixed in the latest and greatest.

I'll post the replacement patch as a separate email.

Regards,

Bob Peterson
Red Hat File Systems



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 3] GFS2: Reduce file fragmentation
  2012-07-10 14:23   ` Steven Whitehouse
  2012-07-11 18:07     ` Bob Peterson
@ 2012-07-11 18:10     ` Bob Peterson
  2012-07-12 10:01       ` Steven Whitehouse
  1 sibling, 1 reply; 13+ messages in thread
From: Bob Peterson @ 2012-07-11 18:10 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

Here is my third version of the GFS2 file defragmentation patch for
upstream:

Description:

This patch reduces GFS2 file fragmentation by pre-reserving blocks.
In this patch, I've implemented almost all of Steve's suggestions.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d957a8..49cd7dd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	if (error)
 		goto out_rlist;
 
+	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
+		gfs2_rs_deltree(ip->i_res);
+
 	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
 				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
 				 revokes);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6fbf3cb..b93275f 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		return ret;
 
+	atomic_set(&ip->i_res->rs_sizehint,
+		   PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
+
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
@@ -571,22 +574,15 @@ fail:
 
 static int gfs2_release(struct inode *inode, struct file *file)
 {
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-	struct gfs2_file *fp;
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	fp = file->private_data;
+	kfree(file->private_data);
 	file->private_data = NULL;
 
-	if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
-	    (atomic_read(&inode->i_writecount) == 1))
+	if ((file->f_mode & FMODE_WRITE) &&
+	    (atomic_read(&inode->i_writecount) == 1)) {
 		gfs2_rs_delete(ip);
-
-	if (gfs2_assert_warn(sdp, fp))
-		return -EIO;
-
-	kfree(fp);
-
+	}
 	return 0;
 }
 
@@ -662,14 +658,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				   unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	size_t writesize = iov_length(iov, nr_segs);
 	struct dentry *dentry = file->f_dentry;
 	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_sbd *sdp;
 	int ret;
 
+	sdp = GFS2_SB(file->f_mapping->host);
 	ret = gfs2_rs_alloc(ip);
 	if (ret)
 		return ret;
 
+	atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
 	if (file->f_flags & O_APPEND) {
 		struct gfs2_holder gh;
 
@@ -795,6 +795,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	if (unlikely(error))
 		goto out_uninit;
 
+	atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
+
 	while (len > 0) {
 		if (len < bytes)
 			bytes = len;
@@ -803,10 +805,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 			offset += bytes;
 			continue;
 		}
-		error = gfs2_rindex_update(sdp);
-		if (error)
-			goto out_unlock;
-
 		error = gfs2_quota_lock_check(ip);
 		if (error)
 			goto out_unlock;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index dc73070..4384d77 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -84,6 +84,7 @@ struct gfs2_rgrpd {
 	u32 rd_data;			/* num of data blocks in rgrp */
 	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
 	u32 rd_free;
+	u32 rd_reserved;                /* number of blocks reserved */
 	u32 rd_free_clone;
 	u32 rd_dinodes;
 	u64 rd_igeneration;
@@ -91,11 +92,15 @@ struct gfs2_rgrpd {
 	struct gfs2_sbd *rd_sbd;
 	struct gfs2_rgrp_lvb *rd_rgl;
 	u32 rd_last_alloc;
+	u32 rd_next_rsrv;               /* preferred next reservation */
 	u32 rd_flags;
 #define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
 #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
 #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
 #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
+	spinlock_t rd_rsspin;           /* protects reservation related vars */
+	struct rb_root rd_rstree;       /* multi-block reservation tree */
+	u32 rd_rs_cnt;                  /* count of current reservations */
 };
 
 enum gfs2_state_bits {
@@ -233,6 +238,40 @@ struct gfs2_holder {
 	unsigned long gh_ip;
 };
 
+/* Resource group multi-block reservation, in order of appearance:
+
+   Step 1. Function prepares to write, allocates a mb, sets the size hint.
+   Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
+   Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
+   Step 4. Bits are assigned from the rgrp based on either the reservation
+           or wherever it can.
+*/
+
+struct gfs2_blkreserv {
+	/* components used during write (step 1): */
+	atomic_t rs_sizehint;         /* hint of the write size */
+
+	/* components used during inplace_reserve (step 2): */
+	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
+
+	/* components used during get_local_rgrp (step 3): */
+	struct gfs2_rgrpd *rs_rgd;    /* pointer to the gfs2_rgrpd */
+	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
+	struct rb_node rs_node;       /* link to other block reservations */
+
+	/* components used during block searches and assignments (step 4): */
+	struct gfs2_bitmap *rs_bi;    /* bitmap for the current allocation */
+	u64 rs_start;                 /* absolute fs block address */
+	u32 rs_biblk;                 /* start block relative to the bi */
+	u32 rs_len;                  /* number of blocks reserved */
+	u32 rs_free;                  /* how many blocks are still free */
+
+	/* ancillary quota stuff */
+	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
+	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
+	unsigned int rs_qa_qd_num;
+};
+
 enum {
 	GLF_LOCK			= 1,
 	GLF_DEMOTE			= 3,
@@ -290,16 +329,6 @@ struct gfs2_glock {
 
 #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
 
-struct gfs2_blkreserv {
-	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
-	struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
-
-	/* ancillary quota stuff */
-	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
-	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
-	unsigned int rs_qa_qd_num;
-};
-
 enum {
 	GIF_INVALID		= 0,
 	GIF_QD_LOCKED		= 1,
@@ -307,7 +336,6 @@ enum {
 	GIF_SW_PAGED		= 3,
 };
 
-
 struct gfs2_inode {
 	struct inode i_inode;
 	u64 i_no_addr;
@@ -318,7 +346,7 @@ struct gfs2_inode {
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
-	struct gfs2_blkreserv *i_res; /* resource group block reservation */
+	struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
 	struct gfs2_rgrpd *i_rgd;
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2b035e0..12c7b6a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	int error;
 
 	munge_mode_uid_gid(dip, &mode, &uid, &gid);
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
 
 	error = gfs2_quota_lock(dip, uid, gid);
 	if (error)
@@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 	struct buffer_head *dibh;
 	int error;
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		goto fail;
@@ -596,7 +603,8 @@ fail_end_trans:
 	gfs2_trans_end(sdp);
 
 fail_ipreserv:
-	gfs2_inplace_release(dip);
+	if (alloc_required)
+		gfs2_inplace_release(dip);
 
 fail_quota_locks:
 	gfs2_quota_unlock(dip);
@@ -738,6 +746,7 @@ fail_gunlock:
 		iput(inode);
 	}
 fail:
+	gfs2_rs_delete(dip);
 	if (bh)
 		brelse(bh);
 	return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index e53d0a1..ba32001 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -35,6 +35,9 @@
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
 
+#define RSRV_CONTENTION_FACTOR 2
+#define RGRP_RSRV_MAX_CONTENDERS 4
+
 #if BITS_PER_LONG == 32
 #define LBITMASK   (0x55555555UL)
 #define LBITSKIP55 (0x55555555UL)
@@ -178,6 +181,60 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
 }
 
 /**
+ * rs_cmp - multi-block reservation range compare
+ * @blk: absolute file system block number of the new reservation
+ * @len: number of blocks in the new reservation
+ * @rs: existing reservation to compare against
+ *
+ * returns: 1 if the block range is beyond the reach of the reservation
+ *         -1 if the block range is before the start of the reservation
+ *          0 if the block range overlaps with the reservation
+ */
+static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
+{
+	if (blk > rs->rs_start + rs->rs_len - 1)
+		return 1;
+	if (blk + len - 1 < rs->rs_start)
+		return -1;
+	return 0;
+}
+
+/**
+ * rs_find - Find a rgrp multi-block reservation that contains a given block
+ * @rgd: The rgrp
+ * @rgblk: The block we're looking for, relative to the rgrp
+ */
+static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
+{
+	struct rb_node **newn;
+	int rc;
+	u64 fsblk = rgblk + rgd->rd_data0;
+
+	spin_lock(&rgd->rd_rsspin);
+	newn = &rgd->rd_rstree.rb_node;
+	while (*newn) {
+		struct gfs2_blkreserv *cur =
+			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+		rc = rs_cmp(fsblk, 1, cur);
+		if (rc < 0)
+			newn = &((*newn)->rb_left);
+		else if (rc > 0)
+			newn = &((*newn)->rb_right);
+		else {
+			spin_unlock(&rgd->rd_rsspin);
+			return cur;
+		}
+	}
+	spin_unlock(&rgd->rd_rsspin);
+	return NULL;
+}
+
+static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
+{
+	return (bi->bi_start * GFS2_NBBY) + blk;
+}
+
+/**
  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
  *       a block in a given allocation state.
  * @buf: the buffer that holds the bitmaps
@@ -435,8 +492,79 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
 	return error;
 }
 
+static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
+{
+	gfs2_print_dbg(seq, "  r: %llu s:%llu b:%u i:%u f:%u\n",
+		       rs->rs_rgd->rd_addr, rs->rs_start, rs->rs_biblk,
+		       rs->rs_len, rs->rs_free);
+}
+
 /**
- * gfs2_rs_delete - delete a reservation
+ * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+static void __gfs2_rs_deltree(struct gfs2_blkreserv *rs)
+{
+	struct gfs2_rgrpd *rgd;
+
+	if (!gfs2_rs_active(rs))
+		return;
+
+	rgd = rs->rs_rgd;
+	/* We can't do this: The reason is that when the rgrp is invalidated,
+	   it's in the "middle" of acquiring the glock, but the HOLDER bit
+	   isn't set yet:
+	   BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
+	trace_gfs2_rs(NULL, rs, 0, TRACE_RS_TREEDEL);
+
+	/* Now that we have the mutex, check again if the tree is empty just
+	   in case someone else deleted this entry. */
+	if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
+		rb_erase(&rs->rs_node, &rgd->rd_rstree);
+	BUG_ON(!rgd->rd_rs_cnt);
+	rgd->rd_rs_cnt--;
+
+	if (rs->rs_free) {
+		/* return reserved blocks to the rgrp and the ip */
+		BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
+		rs->rs_rgd->rd_reserved -= rs->rs_free;
+		rs->rs_free = 0;
+		clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
+		smp_mb__after_clear_bit();
+	}
+	/* We can't change any of the step 1 or step 2 components of the rs.
+	   E.g. We can't set rs_rgd to NULL because the rgd glock is held and
+	   dequeued through this pointer.
+	   Can't: atomic_set(&rs->rs_sizehint, 0);
+	   Can't: rs->rs_requested = 0;
+	   Can't: rs->rs_rgd = NULL;*/
+	rs->rs_bi = NULL;
+	rs->rs_start = 0;
+	rs->rs_biblk = 0;
+	rs->rs_len = 0;
+}
+
+/**
+ * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
+{
+	struct gfs2_rgrpd *rgd;
+
+	if (!gfs2_rs_active(rs))
+		return;
+
+	rgd = rs->rs_rgd;
+	spin_lock(&rgd->rd_rsspin);
+	__gfs2_rs_deltree(rs);
+	spin_unlock(&rgd->rd_rsspin);
+}
+
+/**
+ * gfs2_rs_delete - delete a multi-block reservation
  * @ip: The inode for this reservation
  *
  */
@@ -444,12 +572,37 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
 {
 	down_write(&ip->i_rw_mutex);
 	if (ip->i_res) {
+		gfs2_rs_deltree(ip->i_res);
+		trace_gfs2_rs(ip, ip->i_res, ip->i_res->rs_start,
+			      TRACE_RS_DELETE);
+		BUG_ON(ip->i_res->rs_free);
 		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
 		ip->i_res = NULL;
 	}
 	up_write(&ip->i_rw_mutex);
 }
 
+/**
+ * return_all_reservations - return all reserved blocks back to the rgrp.
+ * @rgd: the rgrp that needs its space back
+ *
+ * We previously reserved a bunch of blocks for allocation. Now we need to
+ * give them back. This leave the reservation structures in tact, but removes
+ * all of their corresponding "no-fly zones".
+ */
+static void return_all_reservations(struct gfs2_rgrpd *rgd)
+{
+	struct rb_node *n;
+	struct gfs2_blkreserv *rs;
+
+	spin_lock(&rgd->rd_rsspin);
+	while ((n = rb_first(&rgd->rd_rstree))) {
+		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		__gfs2_rs_deltree(rs);
+	}
+	spin_unlock(&rgd->rd_rsspin);
+}
+
 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
 {
 	struct rb_node *n;
@@ -472,6 +625,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
 
 		gfs2_free_clones(rgd);
 		kfree(rgd->rd_bits);
+		return_all_reservations(rgd);
 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
 	}
 }
@@ -649,6 +803,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
 	rgd->rd_data = be32_to_cpu(buf.ri_data);
 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
+	spin_lock_init(&rgd->rd_rsspin);
 
 	error = compute_bitstructs(rgd);
 	if (error)
@@ -1115,29 +1270,228 @@ out:
 }
 
 /**
+ * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
+ * @bi: the bitmap with the blocks
+ * @ip: the inode structure
+ * @biblk: the 32-bit block number relative to the start of the bitmap
+ * @amount: the number of blocks to reserve
+ *
+ * Returns: NULL - reservation was already taken, so not inserted
+ *          pointer to the inserted reservation
+ */
+static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
+				       struct gfs2_inode *ip, u32 biblk,
+				       int amount)
+{
+	struct rb_node **newn, *parent = NULL;
+	int rc;
+	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_rgrpd *rgd = rs->rs_rgd;
+	u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
+
+	spin_lock(&rgd->rd_rsspin);
+	newn = &rgd->rd_rstree.rb_node;
+	BUG_ON(!ip->i_res);
+	BUG_ON(gfs2_rs_active(rs));
+	/* Figure out where to put new node */
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
+	while (*newn) {
+		struct gfs2_blkreserv *cur =
+			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+
+		parent = *newn;
+		rc = rs_cmp(fsblock, amount, cur);
+		if (rc > 0)
+			newn = &((*newn)->rb_right);
+		else if (rc < 0)
+			newn = &((*newn)->rb_left);
+		else {
+			spin_unlock(&rgd->rd_rsspin);
+			return NULL; /* reservation already in use */
+		}
+	}
+
+	/* Do our reservation work */
+	rs = ip->i_res;
+	rs->rs_start = fsblock;
+	rs->rs_len = amount;
+	rs->rs_free = amount;
+	rs->rs_biblk = biblk;
+	rs->rs_bi = bi;
+	rb_link_node(&rs->rs_node, parent, newn);
+	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
+
+	/* Do our inode accounting for the reservation */
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
+
+	/* Do our rgrp accounting for the reservation */
+	rgd->rd_reserved += amount; /* blocks reserved */
+	rgd->rd_next_rsrv = rs->rs_start + rs->rs_len;
+	rgd->rd_rs_cnt++; /* number of in-tree reservations */
+	if (!rgrp_contains_block(rgd, rgd->rd_next_rsrv))
+		rgd->rd_next_rsrv = 0;
+	spin_unlock(&rgd->rd_rsspin);
+	trace_gfs2_rs(ip, rs, fsblock, TRACE_RS_INSERT);
+	return rs;
+}
+
+/**
+ * unclaimed_blocks - return number of blocks that aren't spoken for
+ */
+static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
+{
+	return rgd->rd_free_clone - rgd->rd_reserved;
+}
+
+/**
+ * rg_mblk_search - find a group of multiple free blocks
+ * @rgd: the resource group descriptor
+ * @rs: the block reservation
+ * @ip: pointer to the inode for which we're reserving blocks
+ *
+ * This is very similar to rgblk_search, except we're looking for whole
+ * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
+ * on aligned dwords for speed's sake.
+ *
+ * Returns: 0 if successful or BFITNOENT if there isn't enough free space
+ */
+
+static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+{
+	struct gfs2_bitmap *bi = rgd->rd_bits;
+	const u32 length = rgd->rd_length;
+	u32 blk;
+	unsigned int buf, x, search_bytes;
+	u8 *buffer = NULL;
+	u8 *ptr, *end, *nonzero;
+	u32 goal, rsv_bytes;
+	struct gfs2_blkreserv *rs;
+	u32 best_rs_bytes, unclaimed;
+	int best_rs_blocks;
+
+	/* Find bitmap block that contains bits for goal block */
+	if (rgd->rd_next_rsrv) {
+		goal = rgd->rd_next_rsrv - rgd->rd_data0;
+		for (buf = 0; buf < length; buf++) {
+			bi = rgd->rd_bits + buf;
+			/* Convert scope of "goal" from rgrp-wide to within
+			   found bit block */
+			if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
+				goal -= bi->bi_start * GFS2_NBBY;
+				goto do_search;
+			}
+		}
+	}
+	buf = 0;
+	goal = 0;
+
+do_search:
+	best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint),
+			       (RGRP_RSRV_MINBLKS * rgd->rd_length));
+	best_rs_bytes = (best_rs_blocks *
+			 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
+		GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
+	best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
+	unclaimed = unclaimed_blocks(rgd);
+	if (best_rs_bytes * GFS2_NBBY > unclaimed)
+		best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
+
+	for (x = 0; x <= length; x++) {
+		bi = rgd->rd_bits + buf;
+
+		if (test_bit(GBF_FULL, &bi->bi_flags))
+			goto skip;
+
+		WARN_ON(!buffer_uptodate(bi->bi_bh));
+		if (bi->bi_clone)
+			buffer = bi->bi_clone + bi->bi_offset;
+		else
+			buffer = bi->bi_bh->b_data + bi->bi_offset;
+
+		/* We have to keep the reservations aligned on u64 boundaries
+		   otherwise we could get situations where a byte can't be
+		   used because it's after a reservation, but a free bit still
+		   is within the reservation's area. */
+		ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
+		end = (buffer + bi->bi_len);
+		while (ptr < end) {
+			rsv_bytes = 0;
+			if ((ptr + best_rs_bytes) <= end)
+				search_bytes = best_rs_bytes;
+			else
+				search_bytes = end - ptr;
+			BUG_ON(!search_bytes);
+			nonzero = memchr_inv(ptr, 0, search_bytes);
+			/* If the lot is all zeroes, reserve the whole size. If
+			   there's enough zeroes to satisfy the request, use
+			   what we can. If there's not enough, keep looking. */
+			if (nonzero == NULL)
+				rsv_bytes = search_bytes;
+			else if ((nonzero - ptr) * GFS2_NBBY >=
+				 ip->i_res->rs_requested)
+				rsv_bytes = (nonzero - ptr);
+
+			if (rsv_bytes) {
+				blk = ((ptr - buffer) * GFS2_NBBY);
+				BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
+				rs = rs_insert(bi, ip, blk,
+					       rsv_bytes * GFS2_NBBY);
+				if (IS_ERR(rs))
+					return PTR_ERR(rs);
+				if (rs)
+					return 0;
+			}
+			ptr += ALIGN(search_bytes, sizeof(u64));
+		}
+skip:
+		/* Try next bitmap block (wrap back to rgrp header
+		   if at end) */
+		buf++;
+		buf %= length;
+		goal = 0;
+	}
+
+	return BFITNOENT;
+}
+
+/**
  * try_rgrp_fit - See if a given reservation will fit in a given RG
  * @rgd: the RG data
  * @ip: the inode
  *
  * If there's room for the requested blocks to be allocated from the RG:
+ * This will try to get a multi-block reservation first, and if that doesn't
+ * fit, it will take what it can.
  *
  * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
  */
 
-static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
+static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 {
-	const struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_blkreserv *rs = ip->i_res;
 
 	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
 		return 0;
-	if (rgd->rd_free_clone >= rs->rs_requested)
+	/* Look for a multi-block reservation. We only do this for files,
+	   not directories or other types. The problem with directories is
+	   that items are often created and deleted in the directory in the
+	   same breath, which can create "holes" in the reservation. By holes
+	   I mean that your next "claim" may not be the next free block in the
+	   reservation. In other words, we could get into situations where two
+	   or more blocks are reserved, then used, then one or more of the
+	   earlier blocks is freed. When we delete the reservation, the rs_free
+	   will be off due to the hole, so the rgrp's rg_free count can get
+	   off. Other non-files, like symlinks and char or block devices, are
+	   very small in size, so it doesn't make sense to reserve multiple
+	   blocks for them. */
+	if (S_ISREG(ip->i_inode.i_mode) &&
+	    unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS &&
+	    rg_mblk_search(rgd, ip) != BFITNOENT)
+		return 1;
+	if (unclaimed_blocks(rgd) >= rs->rs_requested)
 		return 1;
-	return 0;
-}
 
-static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
-{
-	return (bi->bi_start * GFS2_NBBY) + blk;
+	return 0;
 }
 
 /**
@@ -1217,7 +1571,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrpd *rgd, *begin = NULL;
+	struct gfs2_rgrpd *begin = NULL;
 	struct gfs2_blkreserv *rs = ip->i_res;
 	int error = 0, rg_locked, flags = LM_FLAG_TRY;
 	u64 last_unlinked = NO_BLOCK;
@@ -1225,32 +1579,40 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 
 	if (sdp->sd_args.ar_rgrplvb)
 		flags |= GL_SKIP;
-	rs = ip->i_res;
 	rs->rs_requested = requested;
 	if (gfs2_assert_warn(sdp, requested)) {
 		error = -EINVAL;
 		goto out;
 	}
-
-	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
-		rgd = begin = ip->i_rgd;
-	else
-		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
-
-	if (rgd == NULL)
+	if (gfs2_rs_active(rs)) {
+		begin = rs->rs_rgd;
+		flags = 0; /* Yoda: Do or do not. There is no try */
+	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
+		rs->rs_rgd = begin = ip->i_rgd;
+	} else {
+		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
+	}
+	if (rs->rs_rgd == NULL)
 		return -EBADSLT;
 
 	while (loops < 3) {
 		rg_locked = 0;
 
-		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
+		if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
 			rg_locked = 1;
 			error = 0;
+		} else if (!loops && !gfs2_rs_active(rs) &&
+			   rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
+			/* If the rgrp already is maxed out for contenders,
+			   we can eliminate it as a "first pass" without even
+			   requesting the rgrp glock. */
+			error = GLR_TRYFAILED;
 		} else {
-			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
-						   flags, &rs->rs_rgd_gh);
+			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
+						   LM_ST_EXCLUSIVE, flags,
+						   &rs->rs_rgd_gh);
 			if (!error && sdp->sd_args.ar_rgrplvb) {
-				error = update_rgrp_lvb(rgd);
+				error = update_rgrp_lvb(rs->rs_rgd);
 				if (error) {
 					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 					return error;
@@ -1259,24 +1621,36 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 		}
 		switch (error) {
 		case 0:
-			if (try_rgrp_fit(rgd, ip)) {
+			if (gfs2_rs_active(rs)) {
+				if (unclaimed_blocks(rs->rs_rgd) +
+				    rs->rs_free >= rs->rs_requested) {
+					ip->i_rgd = rs->rs_rgd;
+					return 0;
+				}
+				/* We have a multi-block reservation, but the
+				   rgrp doesn't have enough free blocks to
+				   satisfy the request. Free the reservation
+				   and look for a suitable rgrp. */
+				gfs2_rs_deltree(rs);
+			}
+			if (try_rgrp_fit(rs->rs_rgd, ip)) {
 				if (sdp->sd_args.ar_rgrplvb)
-					gfs2_rgrp_bh_get(rgd);
-				ip->i_rgd = rgd;
+					gfs2_rgrp_bh_get(rs->rs_rgd);
+				ip->i_rgd = rs->rs_rgd;
 				return 0;
 			}
-			if (rgd->rd_flags & GFS2_RDF_CHECK) {
+			if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
 				if (sdp->sd_args.ar_rgrplvb)
-					gfs2_rgrp_bh_get(rgd);
-				try_rgrp_unlink(rgd, &last_unlinked,
+					gfs2_rgrp_bh_get(rs->rs_rgd);
+				try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
 						ip->i_no_addr);
 			}
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 			/* fall through */
 		case GLR_TRYFAILED:
-			rgd = gfs2_rgrpd_get_next(rgd);
-			if (rgd != begin) /* If we didn't wrap */
+			rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
+			if (rs->rs_rgd != begin) /* If we didn't wrap */
 				break;
 
 			flags &= ~LM_FLAG_TRY;
@@ -1314,6 +1688,12 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 {
 	struct gfs2_blkreserv *rs = ip->i_res;
 
+	if (!rs)
+		return;
+
+	if (!rs->rs_free)
+		gfs2_rs_deltree(rs);
+
 	if (rs->rs_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 	rs->rs_requested = 0;
@@ -1412,7 +1792,27 @@ do_search:
 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
 			buffer = bi->bi_clone + bi->bi_offset;
 
-		biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
+		while (1) {
+			struct gfs2_blkreserv *rs;
+			u32 rgblk;
+
+			biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
+			if (biblk == BFITNOENT)
+				break;
+			/* Check if this block is reserved() */
+			rgblk = gfs2_bi2rgd_blk(bi, biblk);
+			rs = rs_find(rgd, rgblk);
+			if (rs == NULL)
+				break;
+
+			BUG_ON(rs->rs_bi != bi);
+			biblk = BFITNOENT;
+			/* This should jump to the first block after the
+			   reservation. */
+			goal = rs->rs_biblk + rs->rs_free;
+			if (goal >= bi->bi_len * GFS2_NBBY)
+				break;
+		}
 		if (biblk != BFITNOENT)
 			break;
 
@@ -1448,8 +1848,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
 			     u32 blk, bool dinode, unsigned int *n)
 {
 	const unsigned int elen = *n;
-	u32 goal;
+	u32 goal, rgblk;
 	const u8 *buffer = NULL;
+	struct gfs2_blkreserv *rs;
 
 	*n = 0;
 	buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1462,6 +1863,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
 		goal++;
 		if (goal >= (bi->bi_len * GFS2_NBBY))
 			break;
+		rgblk = gfs2_bi2rgd_blk(bi, goal);
+		rs = rs_find(rgd, rgblk);
+		if (rs) /* Oops, we bumped into someone's reservation */
+			break;
 		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
 		    GFS2_BLKST_FREE)
 			break;
@@ -1537,12 +1942,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 
 int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 {
-	const struct gfs2_rgrpd *rgd = gl->gl_object;
+	struct gfs2_rgrpd *rgd = gl->gl_object;
+	struct gfs2_blkreserv *trs;
+	const struct rb_node *n;
+
 	if (rgd == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
-		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
+		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
+		       rgd->rd_reserved);
+	spin_lock(&rgd->rd_rsspin);
+	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
+		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		dump_rs(seq, trs);
+	}
+	spin_unlock(&rgd->rd_rsspin);
 	return 0;
 }
 
@@ -1557,10 +1972,74 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
 }
 
 /**
+ * claim_reserved_blks - Claim previously reserved blocks
+ * @ip: the inode that's claiming the reservation
+ * @dinode: 1 if this block is a dinode block, otherwise data block
+ * @nblocks: desired extent length
+ *
+ * Lay claim to previously allocated block reservation blocks.
+ * Returns: Starting block number of the blocks claimed.
+ * Sets *nblocks to the actual extent length allocated.
+ */
+static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
+			       unsigned int *nblocks)
+{
+	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_rgrpd *rgd = rs->rs_rgd;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_bitmap *bi;
+	u32 biblk;
+	u64 start_block = 0, fsblock;
+	const unsigned int elen = *nblocks;
+
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
+	gfs2_assert_withdraw(sdp, rgd);
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
+	bi = rs->rs_bi;
+	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+
+	for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
+		biblk = rs->rs_biblk;
+
+		fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
+		/* Check if the next free block is not contiguous with
+		   the others in the extent. */
+		if (start_block && fsblock != start_block + (*nblocks))
+			break;
+
+		rs->rs_biblk++;
+		rs->rs_free--;
+		/* Make sure the bitmap hasn't changed */
+		gfs2_setbit(rgd, bi->bi_clone, bi, biblk, GFS2_BLKST_USED);
+
+		BUG_ON(!rgd->rd_reserved);
+		rgd->rd_reserved--;
+		if (!start_block) {
+			start_block = fsblock;
+			dinode = false;
+		}
+		trace_gfs2_rs(ip, rs, fsblock, TRACE_RS_CLAIM);
+	}
+
+	if (!rs->rs_free) {
+		struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
+
+		gfs2_rs_deltree(rs);
+		/* -nblocks because we haven't returned to do the math yet.
+		   I'm doing the math backwards to prevent negative numbers,
+		   but think of it as:
+		   if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */
+		if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
+			rg_mblk_search(rgd, ip);
+	}
+	return start_block;
+}
+
+/**
  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
  * @ip: the inode to allocate the block for
  * @bn: Used to return the starting block number
- * @ndata: requested number of blocks/extent length (value/result)
+ * @nblocks: requested number of blocks/extent length (value/result)
  * @dinode: 1 if we're allocating a dinode block, else 0
  * @generation: the generation number of the inode
  *
@@ -1585,20 +2064,34 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 	if (ip->i_res->rs_requested == 0)
 		return -ECANCELED;
 
-	rgd = ip->i_rgd;
-
-	if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
-		goal = ip->i_goal - rgd->rd_data0;
-	else
-		goal = rgd->rd_last_alloc;
-
-	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
+	/* Check if we have a multi-block reservation, and if so, claim the
+	   next free block from it. */
+	if (gfs2_rs_active(ip->i_res)) {
+		BUG_ON(!ip->i_res->rs_free);
+		rgd = ip->i_res->rs_rgd;
+		block = claim_reserved_blks(ip, dinode, nblocks);
+	} else {
+		rgd = ip->i_rgd;
 
-	/* Since all blocks are reserved in advance, this shouldn't happen */
-	if (blk == BFITNOENT)
-		goto rgrp_error;
+		if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
+			goal = ip->i_goal - rgd->rd_data0;
+		else
+			goal = rgd->rd_last_alloc;
+
+		blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
+
+		/* Since all blocks are reserved in advance, this shouldn't
+		   happen */
+		if (blk == BFITNOENT) {
+			printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
+			       *nblocks);
+			printk(KERN_WARNING "FULL=%d\n",
+			       test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
+			goto rgrp_error;
+		}
 
-	block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
+		block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
+	}
 	ndata = *nblocks;
 	if (dinode)
 		ndata--;
@@ -1615,8 +2108,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 			brelse(dibh);
 		}
 	}
-	if (rgd->rd_free < *nblocks)
+	if (rgd->rd_free < *nblocks) {
+		printk(KERN_WARNING "nblocks=%u\n", *nblocks);
 		goto rgrp_error;
+	}
 
 	rgd->rd_free -= *nblocks;
 	if (dinode) {
@@ -1672,6 +2167,9 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
 		return;
 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
 	rgd->rd_free += blen;
+	if (bstart < rgd->rd_next_rsrv)
+		rgd->rd_next_rsrv = bstart;
+
 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5d8314d..95b00d5 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -13,6 +13,14 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 
+/* Since each block in the file system is represented by two bits in the
+ * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
+ * By reserving 32 blocks at a time, we can optimize / shortcut how we search
+ * through the bitmaps by looking a word at a time.
+ */
+#define RGRP_RSRV_MINBYTES 8
+#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
+
 struct gfs2_rgrpd;
 struct gfs2_sbd;
 struct gfs2_holder;
@@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
 extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
 extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
 
+extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+
 extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
@@ -36,6 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
 			     bool dinode, u64 *generation);
 
 extern int gfs2_rs_alloc(struct gfs2_inode *ip);
+extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
 extern void gfs2_rs_delete(struct gfs2_inode *ip);
 extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
@@ -62,7 +73,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
 extern int gfs2_fitrim(struct file *filp, void __user *argp);
 
-/* This is how to tell if a reservation is "inplace" reserved: */
+/* This is how to tell if a multi-block reservation is "inplace" reserved: */
 static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
 {
 	if (ip->i_res && ip->i_res->rs_requested)
@@ -70,4 +81,12 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
 	return 0;
 }
 
+/* This is how to tell if a multi-block reservation is in the rgrp tree: */
+static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
+{
+	if (rs && rs->rs_start)
+		return 1;
+	return 0;
+}
+
 #endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7880687..b1502c4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 		return -EIO;
 	}
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		return error;
@@ -1550,6 +1554,9 @@ out_truncate:
 
 out_unlock:
 	/* Error path for case 1 */
+	if (gfs2_rs_active(ip->i_res))
+		gfs2_rs_deltree(ip->i_res);
+
 	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
 		gfs2_glock_dq(&ip->i_iopen_gh);
 	gfs2_holder_uninit(&ip->i_iopen_gh);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 1b8b815..98fd480 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -31,6 +31,17 @@
 			    { GFS2_BLKST_DINODE, "dinode" },	\
 			    { GFS2_BLKST_UNLINKED, "unlinked" })
 
+#define TRACE_RS_DELETE  0
+#define TRACE_RS_TREEDEL 1
+#define TRACE_RS_INSERT  2
+#define TRACE_RS_CLAIM   3
+
+#define rs_func_name(x) __print_symbolic(x,	\
+					 { 0, "del " },	\
+					 { 1, "tdel" },	\
+					 { 2, "ins " },	\
+					 { 3, "clm " })
+
 #define show_glock_flags(flags) __print_flags(flags, "",	\
 	{(1UL << GLF_LOCK),			"l" },		\
 	{(1UL << GLF_DEMOTE),			"D" },		\
@@ -470,6 +481,7 @@ TRACE_EVENT(gfs2_block_alloc,
 		__field(	u8,	block_state		)
 		__field(        u64,	rd_addr			)
 		__field(        u32,	rd_free_clone		)
+		__field(	u32,	rd_reserved		)
 	),
 
 	TP_fast_assign(
@@ -480,16 +492,64 @@ TRACE_EVENT(gfs2_block_alloc,
 		__entry->block_state	= block_state;
 		__entry->rd_addr	= rgd->rd_addr;
 		__entry->rd_free_clone	= rgd->rd_free_clone;
+		__entry->rd_reserved	= rgd->rd_reserved;
 	),
 
-	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u",
+	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long long)__entry->inum,
 		  (unsigned long long)__entry->start,
 		  (unsigned long)__entry->len,
 		  block_state_name(__entry->block_state),
 		  (unsigned long long)__entry->rd_addr,
-		  __entry->rd_free_clone)
+		  __entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
+);
+
+/* Keep track of multi-block reservations as they are allocated/freed */
+TRACE_EVENT(gfs2_rs,
+
+	TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs,
+		 u64 claimed, u8 func),
+
+	TP_ARGS(ip, rs, claimed, func),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	u64,	rd_addr			)
+		__field(	u32,	rd_free_clone		)
+		__field(	u32,	rd_reserved		)
+		__field(	u64,	inum			)
+		__field(	u64,	start			)
+		__field(	u32,	len			)
+		__field(	u64,	claimed			)
+		__field(	u32,	free			)
+		__field(	u8,	func			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0;
+		__entry->rd_addr	= rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
+		__entry->rd_free_clone	= rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
+		__entry->rd_reserved	= rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
+		__entry->inum		= ip ? ip->i_no_addr : 0;
+		__entry->start		= rs->rs_start;
+		__entry->len		= rs->rs_len;
+		__entry->claimed	= claimed;
+		__entry->free		= rs->rs_free;
+		__entry->func		= func;
+	),
+
+	TP_printk("%u,%u bmap %llu resrv %llu/%lu rg:%llu rf:%lu rr:%lu %s "
+		  "c:%llu f:%lu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->inum,
+		  (unsigned long long)__entry->start,
+		  (unsigned long)__entry->len,
+		  (unsigned long long)__entry->rd_addr,
+		  (unsigned long)__entry->rd_free_clone,
+		  (unsigned long)__entry->rd_reserved,
+		  rs_func_name(__entry->func),
+		  __entry->claimed, (unsigned long)__entry->free)
 );
 
 #endif /* _TRACE_GFS2_H */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 523c0de..27a0b4a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -327,6 +327,10 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 {
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		goto out_alloc;
@@ -710,6 +714,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	struct buffer_head *dibh;
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_lock_check(ip);
 	if (error)
 		return error;
@@ -1483,6 +1491,10 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
 {
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		return error;



^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 2] GFS2: Reduce file fragmentation
  2012-07-11 18:07     ` Bob Peterson
@ 2012-07-11 18:14       ` Steven Whitehouse
  2012-07-11 18:51         ` Bob Peterson
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Whitehouse @ 2012-07-11 18:14 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

On Wed, 2012-07-11 at 14:07 -0400, Bob Peterson wrote:
[snip]
>  
> | What is the difference between rs_free and rs_blks? Shouldn't these
> | two
> | always be identical, since there is no point in reserving blocks
> | which
> | are not free.
> 
> I guess I used a misleading variable name. The two variables have
> two meanings and both are needed. I renamed variable rs_blks to rs_len
> because it represents the length of the reservation.
> 
Thats not really answering the question though... all the blocks in the
reservation must be free, otherwise there is no point in reserving them.
So rs_free should be identical to rs_len or whatever it is called.
Either that or maybe I'm not understanding why there are two different
variables?

[snip]
> | 
> | I'm not sure that I understand this comment at all. Currently with
> | directories we never deallocate any blocks at all until the directory
> | is
> | deallocated when it is unlinked. We will want to extend this to
> | directories eventually, even if we don't do that immediately.
> 
> I clarified the comment to make it more clear what's going on.
> I'm talking about gaps in the _reservation_ not gaps in the blocks.
> The current algorithm makes assumptions based on the fact that block
> reservations don't have gaps, and the "next" free block will be the
> successor to the last claimed. If you use reservations for directories,
> what can happen is that two files may be created, which claims two
> blocks in the reservation. If the first file is deleted from the
> directory, that block becomes a "hole" in the reservation, which breaks
> the code with its current assumptions. We either have to:
> (a) keep the current assumptions which make block claims faster, or
> (b) Make no such assumptions and implement a bitmap-like search of the
>     reservation that can fill holes. It wouldn't be too tough to do,
>     especially since we already have nicely tuned functions to do it.
>     I'm just worried that it's going to hurt performance.
>  
That is just a bug in the way we are doing the allocations. The
allocation of new inodes should be done based on the inode's own
reservation, and not on the reservation of its parent directory. That is
something else on the "to fix" list, but it is complicated to do,

Steve.




^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 2] GFS2: Reduce file fragmentation
  2012-07-11 18:14       ` Steven Whitehouse
@ 2012-07-11 18:51         ` Bob Peterson
  0 siblings, 0 replies; 13+ messages in thread
From: Bob Peterson @ 2012-07-11 18:51 UTC (permalink / raw)
  To: cluster-devel.redhat.com

----- Original Message -----
| Hi,
| 
| On Wed, 2012-07-11 at 14:07 -0400, Bob Peterson wrote:
| [snip]
| >  
| > | What is the difference between rs_free and rs_blks? Shouldn't
| > | these
| > | two
| > | always be identical, since there is no point in reserving blocks
| > | which
| > | are not free.
| > 
| > I guess I used a misleading variable name. The two variables have
| > two meanings and both are needed. I renamed variable rs_blks to
| > rs_len
| > because it represents the length of the reservation.
| > 
| Thats not really answering the question though... all the blocks in
| the
| reservation must be free, otherwise there is no point in reserving
| them.
| So rs_free should be identical to rs_len or whatever it is called.
| Either that or maybe I'm not understanding why there are two
| different
| variables?

The variables and their meanings are as follows:

1. rs_start - this is where the block reservation originally started.
   This never changes during the life of the reservation.
2. rs_len - this is the length of the reservation, in blocks.
   This never changes during the life of the reservation.
3. rs_free - this is how many of those blocks are free.
   This is decremented every time a block is claimed from the reservation.

So the number of blocks "used" or "claimed" from the reservation is len - free.

We could likely accomplish the same thing with only two variables,
by bumping rs_start and subtracting rs_len when every block is claimed
from the reservation, but I'm also using rs_start as a means of
keeping the reservations aligned in the bitmap.

I think keeping the reservations on u64 boundaries gives us the best
performance for function memchr_inv, which I think is optimized to use
word compares where it can.

Doing it my way also makes it easier to read the trace points: You can
see where the reservation started, what's been claimed and what's free.
It's easier to detect problems with overlapping reservations and such.

| [snip]
| > | 
| > | I'm not sure that I understand this comment at all. Currently
| > | with
| > | directories we never deallocate any blocks at all until the
| > | directory
| > | is
| > | deallocated when it is unlinked. We will want to extend this to
| > | directories eventually, even if we don't do that immediately.
| > 
| > I clarified the comment to make it more clear what's going on.
| > I'm talking about gaps in the _reservation_ not gaps in the blocks.
| > The current algorithm makes assumptions based on the fact that
| > block
| > reservations don't have gaps, and the "next" free block will be the
| > successor to the last claimed. If you use reservations for
| > directories,
| > what can happen is that two files may be created, which claims two
| > blocks in the reservation. If the first file is deleted from the
| > directory, that block becomes a "hole" in the reservation, which
| > breaks
| > the code with its current assumptions. We either have to:
| > (a) keep the current assumptions which make block claims faster, or
| > (b) Make no such assumptions and implement a bitmap-like search of
| > the
| >     reservation that can fill holes. It wouldn't be too tough to
| >     do,
| >     especially since we already have nicely tuned functions to do
| >     it.
| >     I'm just worried that it's going to hurt performance.
| >  
| That is just a bug in the way we are doing the allocations. The
| allocation of new inodes should be done based on the inode's own
| reservation, and not on the reservation of its parent directory. That
| is
| something else on the "to fix" list, but it is complicated to do,

No, it goes beyond that. It has to do with the way the block accounting is
done for the reservations. If a big write is trying to write 7 blocks
(let's say in a multi-page write, which isn't implemented yet, but similar
things happen today) and rs_free says there are 7 free blocks in the
reservation, it claims them all starting with rs_start + (rs_len - rs_free).
If there are "holes" in the reservation, that would throw the whole thing
off. It would have to do a bitmap search for the reservation to figure out
where the first available block is. If there are 7 free blocks, but one of
them is a "hole" and six are contiguous, it has to bitmap-search of the
reservation to find each of the 7.

On the other hand, if we allow holes and adjust the algorithm
appropriately, I think the file system will end up being more fragmented
than the current algorithm. This is written with the thought that files
will have larger runs of data blocks and metadata blocks can fill in the
holes left behind.

The other approach that I talked about above (incrementing the starting
block and decrementing the size) would solve this problem, but the
deleted file would force a block to be "left behind" for a future
reservation to find, which would likely add to the fragmentation of the
file system. I could be wrong about that, and we could prototype it to
find out for sure. (IOW, it may not be any worse, since we're talking about
directories which bypass the reservations and do individual searches
anyway).

Regards,

Bob Peterson
Red Hat File Systems



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 3] GFS2: Reduce file fragmentation
  2012-07-11 18:10     ` [Cluster-devel] [GFS2 Patch] [Try 3] " Bob Peterson
@ 2012-07-12 10:01       ` Steven Whitehouse
  2012-07-12 19:28         ` [Cluster-devel] [GFS2 Patch] [Try 4] " Bob Peterson
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Whitehouse @ 2012-07-12 10:01 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

On Wed, 2012-07-11 at 14:10 -0400, Bob Peterson wrote:
> Hi,
> 
> Here is my third version of the GFS2 file defragmentation patch for
> upstream:
> 
> Description:
> 
> This patch reduces GFS2 file fragmentation by pre-reserving blocks.
> In this patch, I've implemented almost all of Steve's suggestions.
> 
> Regards,
> 
> Bob Peterson
> Red Hat File Systems
> 
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> ---

I've found a workload that seems to do something a bit odd...


[root at chywoon /]# for i in `seq 0 9`; do dd if=/dev/zero of=file.${i} bs=4096 count=1; done

(i.e. create 10 single block files, sequentially)

Then I looked at the allocation pattern using stat & filefrag. We get 10
inodes, at positions: 33150, 33151, 33152, 33153, 33154, 33155, 33156,
33157, 33158, 33159 (so sequential with each other)

The lets look at where the data lands up: 33302, 33462, 33622, 33782,
33942, 34102, 34262, 34422, 34582, 34742 (respectively for each inode)

What I'd hope to see is that the data blocks are being allocated
contiguously with the inodes, at least in most cases. The data blocks
appear to be spaced out with gaps of 160 blocks between them. This was
on newly created filesystem, btw, so there should be nothing to get in
the way of the allocations.

Given that directories don't have reservations, and the reservations
relating to the individual inodes that are being created should be
cleared on close (i.e. before the subsequent creation) this should not
have changed in behaviour from the current kernels,

Steve.


> diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
> index 6d957a8..49cd7dd 100644
> --- a/fs/gfs2/bmap.c
> +++ b/fs/gfs2/bmap.c
> @@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
>  	if (error)
>  		goto out_rlist;
>  
> +	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
> +		gfs2_rs_deltree(ip->i_res);
> +
>  	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
>  				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
>  				 revokes);
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index 6fbf3cb..b93275f 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  	if (ret)
>  		return ret;
>  
> +	atomic_set(&ip->i_res->rs_sizehint,
> +		   PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
> +
>  	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
>  	ret = gfs2_glock_nq(&gh);
>  	if (ret)
> @@ -571,22 +574,15 @@ fail:
>  
>  static int gfs2_release(struct inode *inode, struct file *file)
>  {
> -	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
> -	struct gfs2_file *fp;
>  	struct gfs2_inode *ip = GFS2_I(inode);
>  
> -	fp = file->private_data;
> +	kfree(file->private_data);
>  	file->private_data = NULL;
>  
> -	if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
> -	    (atomic_read(&inode->i_writecount) == 1))
> +	if ((file->f_mode & FMODE_WRITE) &&
> +	    (atomic_read(&inode->i_writecount) == 1)) {
>  		gfs2_rs_delete(ip);
> -
> -	if (gfs2_assert_warn(sdp, fp))
> -		return -EIO;
> -
> -	kfree(fp);
> -
> +	}
>  	return 0;
>  }
>  
> @@ -662,14 +658,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
>  				   unsigned long nr_segs, loff_t pos)
>  {
>  	struct file *file = iocb->ki_filp;
> +	size_t writesize = iov_length(iov, nr_segs);
>  	struct dentry *dentry = file->f_dentry;
>  	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
> +	struct gfs2_sbd *sdp;
>  	int ret;
>  
> +	sdp = GFS2_SB(file->f_mapping->host);
>  	ret = gfs2_rs_alloc(ip);
>  	if (ret)
>  		return ret;
>  
> +	atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
>  	if (file->f_flags & O_APPEND) {
>  		struct gfs2_holder gh;
>  
> @@ -795,6 +795,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
>  	if (unlikely(error))
>  		goto out_uninit;
>  
> +	atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
> +
>  	while (len > 0) {
>  		if (len < bytes)
>  			bytes = len;
> @@ -803,10 +805,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
>  			offset += bytes;
>  			continue;
>  		}
> -		error = gfs2_rindex_update(sdp);
> -		if (error)
> -			goto out_unlock;
> -
>  		error = gfs2_quota_lock_check(ip);
>  		if (error)
>  			goto out_unlock;
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index dc73070..4384d77 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -84,6 +84,7 @@ struct gfs2_rgrpd {
>  	u32 rd_data;			/* num of data blocks in rgrp */
>  	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
>  	u32 rd_free;
> +	u32 rd_reserved;                /* number of blocks reserved */
>  	u32 rd_free_clone;
>  	u32 rd_dinodes;
>  	u64 rd_igeneration;
> @@ -91,11 +92,15 @@ struct gfs2_rgrpd {
>  	struct gfs2_sbd *rd_sbd;
>  	struct gfs2_rgrp_lvb *rd_rgl;
>  	u32 rd_last_alloc;
> +	u32 rd_next_rsrv;               /* preferred next reservation */
>  	u32 rd_flags;
>  #define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
>  #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
>  #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
>  #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
> +	spinlock_t rd_rsspin;           /* protects reservation related vars */
> +	struct rb_root rd_rstree;       /* multi-block reservation tree */
> +	u32 rd_rs_cnt;                  /* count of current reservations */
>  };
>  
>  enum gfs2_state_bits {
> @@ -233,6 +238,40 @@ struct gfs2_holder {
>  	unsigned long gh_ip;
>  };
>  
> +/* Resource group multi-block reservation, in order of appearance:
> +
> +   Step 1. Function prepares to write, allocates a mb, sets the size hint.
> +   Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
> +   Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
> +   Step 4. Bits are assigned from the rgrp based on either the reservation
> +           or wherever it can.
> +*/
> +
> +struct gfs2_blkreserv {
> +	/* components used during write (step 1): */
> +	atomic_t rs_sizehint;         /* hint of the write size */
> +
> +	/* components used during inplace_reserve (step 2): */
> +	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
> +
> +	/* components used during get_local_rgrp (step 3): */
> +	struct gfs2_rgrpd *rs_rgd;    /* pointer to the gfs2_rgrpd */
> +	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
> +	struct rb_node rs_node;       /* link to other block reservations */
> +
> +	/* components used during block searches and assignments (step 4): */
> +	struct gfs2_bitmap *rs_bi;    /* bitmap for the current allocation */
> +	u64 rs_start;                 /* absolute fs block address */
> +	u32 rs_biblk;                 /* start block relative to the bi */
> +	u32 rs_len;                  /* number of blocks reserved */
> +	u32 rs_free;                  /* how many blocks are still free */
> +
> +	/* ancillary quota stuff */
> +	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
> +	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
> +	unsigned int rs_qa_qd_num;
> +};
> +
>  enum {
>  	GLF_LOCK			= 1,
>  	GLF_DEMOTE			= 3,
> @@ -290,16 +329,6 @@ struct gfs2_glock {
>  
>  #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
>  
> -struct gfs2_blkreserv {
> -	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
> -	struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
> -
> -	/* ancillary quota stuff */
> -	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
> -	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
> -	unsigned int rs_qa_qd_num;
> -};
> -
>  enum {
>  	GIF_INVALID		= 0,
>  	GIF_QD_LOCKED		= 1,
> @@ -307,7 +336,6 @@ enum {
>  	GIF_SW_PAGED		= 3,
>  };
>  
> -
>  struct gfs2_inode {
>  	struct inode i_inode;
>  	u64 i_no_addr;
> @@ -318,7 +346,7 @@ struct gfs2_inode {
>  	struct gfs2_glock *i_gl; /* Move into i_gh? */
>  	struct gfs2_holder i_iopen_gh;
>  	struct gfs2_holder i_gh; /* for prepare/commit_write only */
> -	struct gfs2_blkreserv *i_res; /* resource group block reservation */
> +	struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
>  	struct gfs2_rgrpd *i_rgd;
>  	u64 i_goal;	/* goal block for allocations */
>  	struct rw_semaphore i_rw_mutex;
> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 2b035e0..12c7b6a 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
>  	int error;
>  
>  	munge_mode_uid_gid(dip, &mode, &uid, &gid);
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
>  
>  	error = gfs2_quota_lock(dip, uid, gid);
>  	if (error)
> @@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
>  	struct buffer_head *dibh;
>  	int error;
>  
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		goto fail;
> @@ -596,7 +603,8 @@ fail_end_trans:
>  	gfs2_trans_end(sdp);
>  
>  fail_ipreserv:
> -	gfs2_inplace_release(dip);
> +	if (alloc_required)
> +		gfs2_inplace_release(dip);
>  
>  fail_quota_locks:
>  	gfs2_quota_unlock(dip);
> @@ -738,6 +746,7 @@ fail_gunlock:
>  		iput(inode);
>  	}
>  fail:
> +	gfs2_rs_delete(dip);
>  	if (bh)
>  		brelse(bh);
>  	return error;
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index e53d0a1..ba32001 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -35,6 +35,9 @@
>  #define BFITNOENT ((u32)~0)
>  #define NO_BLOCK ((u64)~0)
>  
> +#define RSRV_CONTENTION_FACTOR 2
> +#define RGRP_RSRV_MAX_CONTENDERS 4
> +
>  #if BITS_PER_LONG == 32
>  #define LBITMASK   (0x55555555UL)
>  #define LBITSKIP55 (0x55555555UL)
> @@ -178,6 +181,60 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
>  }
>  
>  /**
> + * rs_cmp - multi-block reservation range compare
> + * @blk: absolute file system block number of the new reservation
> + * @len: number of blocks in the new reservation
> + * @rs: existing reservation to compare against
> + *
> + * returns: 1 if the block range is beyond the reach of the reservation
> + *         -1 if the block range is before the start of the reservation
> + *          0 if the block range overlaps with the reservation
> + */
> +static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
> +{
> +	if (blk > rs->rs_start + rs->rs_len - 1)
> +		return 1;
> +	if (blk + len - 1 < rs->rs_start)
> +		return -1;
> +	return 0;
> +}
> +
> +/**
> + * rs_find - Find a rgrp multi-block reservation that contains a given block
> + * @rgd: The rgrp
> + * @rgblk: The block we're looking for, relative to the rgrp
> + */
> +static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
> +{
> +	struct rb_node **newn;
> +	int rc;
> +	u64 fsblk = rgblk + rgd->rd_data0;
> +
> +	spin_lock(&rgd->rd_rsspin);
> +	newn = &rgd->rd_rstree.rb_node;
> +	while (*newn) {
> +		struct gfs2_blkreserv *cur =
> +			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
> +		rc = rs_cmp(fsblk, 1, cur);
> +		if (rc < 0)
> +			newn = &((*newn)->rb_left);
> +		else if (rc > 0)
> +			newn = &((*newn)->rb_right);
> +		else {
> +			spin_unlock(&rgd->rd_rsspin);
> +			return cur;
> +		}
> +	}
> +	spin_unlock(&rgd->rd_rsspin);
> +	return NULL;
> +}
> +
> +static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
> +{
> +	return (bi->bi_start * GFS2_NBBY) + blk;
> +}
> +
> +/**
>   * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
>   *       a block in a given allocation state.
>   * @buf: the buffer that holds the bitmaps
> @@ -435,8 +492,79 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
>  	return error;
>  }
>  
> +static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
> +{
> +	gfs2_print_dbg(seq, "  r: %llu s:%llu b:%u i:%u f:%u\n",
> +		       rs->rs_rgd->rd_addr, rs->rs_start, rs->rs_biblk,
> +		       rs->rs_len, rs->rs_free);
> +}
> +
>  /**
> - * gfs2_rs_delete - delete a reservation
> + * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
> + * @rs: The reservation to remove
> + *
> + */
> +static void __gfs2_rs_deltree(struct gfs2_blkreserv *rs)
> +{
> +	struct gfs2_rgrpd *rgd;
> +
> +	if (!gfs2_rs_active(rs))
> +		return;
> +
> +	rgd = rs->rs_rgd;
> +	/* We can't do this: The reason is that when the rgrp is invalidated,
> +	   it's in the "middle" of acquiring the glock, but the HOLDER bit
> +	   isn't set yet:
> +	   BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
> +	trace_gfs2_rs(NULL, rs, 0, TRACE_RS_TREEDEL);
> +
> +	/* Now that we have the mutex, check again if the tree is empty just
> +	   in case someone else deleted this entry. */
> +	if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
> +		rb_erase(&rs->rs_node, &rgd->rd_rstree);
> +	BUG_ON(!rgd->rd_rs_cnt);
> +	rgd->rd_rs_cnt--;
> +
> +	if (rs->rs_free) {
> +		/* return reserved blocks to the rgrp and the ip */
> +		BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
> +		rs->rs_rgd->rd_reserved -= rs->rs_free;
> +		rs->rs_free = 0;
> +		clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
> +		smp_mb__after_clear_bit();
> +	}
> +	/* We can't change any of the step 1 or step 2 components of the rs.
> +	   E.g. We can't set rs_rgd to NULL because the rgd glock is held and
> +	   dequeued through this pointer.
> +	   Can't: atomic_set(&rs->rs_sizehint, 0);
> +	   Can't: rs->rs_requested = 0;
> +	   Can't: rs->rs_rgd = NULL;*/
> +	rs->rs_bi = NULL;
> +	rs->rs_start = 0;
> +	rs->rs_biblk = 0;
> +	rs->rs_len = 0;
> +}
> +
> +/**
> + * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
> + * @rs: The reservation to remove
> + *
> + */
> +void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
> +{
> +	struct gfs2_rgrpd *rgd;
> +
> +	if (!gfs2_rs_active(rs))
> +		return;
> +
> +	rgd = rs->rs_rgd;
> +	spin_lock(&rgd->rd_rsspin);
> +	__gfs2_rs_deltree(rs);
> +	spin_unlock(&rgd->rd_rsspin);
> +}
> +
> +/**
> + * gfs2_rs_delete - delete a multi-block reservation
>   * @ip: The inode for this reservation
>   *
>   */
> @@ -444,12 +572,37 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
>  {
>  	down_write(&ip->i_rw_mutex);
>  	if (ip->i_res) {
> +		gfs2_rs_deltree(ip->i_res);
> +		trace_gfs2_rs(ip, ip->i_res, ip->i_res->rs_start,
> +			      TRACE_RS_DELETE);
> +		BUG_ON(ip->i_res->rs_free);
>  		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
>  		ip->i_res = NULL;
>  	}
>  	up_write(&ip->i_rw_mutex);
>  }
>  
> +/**
> + * return_all_reservations - return all reserved blocks back to the rgrp.
> + * @rgd: the rgrp that needs its space back
> + *
> + * We previously reserved a bunch of blocks for allocation. Now we need to
> + * give them back. This leave the reservation structures in tact, but removes
> + * all of their corresponding "no-fly zones".
> + */
> +static void return_all_reservations(struct gfs2_rgrpd *rgd)
> +{
> +	struct rb_node *n;
> +	struct gfs2_blkreserv *rs;
> +
> +	spin_lock(&rgd->rd_rsspin);
> +	while ((n = rb_first(&rgd->rd_rstree))) {
> +		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
> +		__gfs2_rs_deltree(rs);
> +	}
> +	spin_unlock(&rgd->rd_rsspin);
> +}
> +
>  void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
>  {
>  	struct rb_node *n;
> @@ -472,6 +625,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
>  
>  		gfs2_free_clones(rgd);
>  		kfree(rgd->rd_bits);
> +		return_all_reservations(rgd);
>  		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
>  	}
>  }
> @@ -649,6 +803,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
>  	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
>  	rgd->rd_data = be32_to_cpu(buf.ri_data);
>  	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
> +	spin_lock_init(&rgd->rd_rsspin);
>  
>  	error = compute_bitstructs(rgd);
>  	if (error)
> @@ -1115,29 +1270,228 @@ out:
>  }
>  
>  /**
> + * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
> + * @bi: the bitmap with the blocks
> + * @ip: the inode structure
> + * @biblk: the 32-bit block number relative to the start of the bitmap
> + * @amount: the number of blocks to reserve
> + *
> + * Returns: NULL - reservation was already taken, so not inserted
> + *          pointer to the inserted reservation
> + */
> +static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
> +				       struct gfs2_inode *ip, u32 biblk,
> +				       int amount)
> +{
> +	struct rb_node **newn, *parent = NULL;
> +	int rc;
> +	struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_rgrpd *rgd = rs->rs_rgd;
> +	u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
> +
> +	spin_lock(&rgd->rd_rsspin);
> +	newn = &rgd->rd_rstree.rb_node;
> +	BUG_ON(!ip->i_res);
> +	BUG_ON(gfs2_rs_active(rs));
> +	/* Figure out where to put new node */
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
> +	while (*newn) {
> +		struct gfs2_blkreserv *cur =
> +			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
> +
> +		parent = *newn;
> +		rc = rs_cmp(fsblock, amount, cur);
> +		if (rc > 0)
> +			newn = &((*newn)->rb_right);
> +		else if (rc < 0)
> +			newn = &((*newn)->rb_left);
> +		else {
> +			spin_unlock(&rgd->rd_rsspin);
> +			return NULL; /* reservation already in use */
> +		}
> +	}
> +
> +	/* Do our reservation work */
> +	rs = ip->i_res;
> +	rs->rs_start = fsblock;
> +	rs->rs_len = amount;
> +	rs->rs_free = amount;
> +	rs->rs_biblk = biblk;
> +	rs->rs_bi = bi;
> +	rb_link_node(&rs->rs_node, parent, newn);
> +	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
> +
> +	/* Do our inode accounting for the reservation */
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
> +
> +	/* Do our rgrp accounting for the reservation */
> +	rgd->rd_reserved += amount; /* blocks reserved */
> +	rgd->rd_next_rsrv = rs->rs_start + rs->rs_len;
> +	rgd->rd_rs_cnt++; /* number of in-tree reservations */
> +	if (!rgrp_contains_block(rgd, rgd->rd_next_rsrv))
> +		rgd->rd_next_rsrv = 0;
> +	spin_unlock(&rgd->rd_rsspin);
> +	trace_gfs2_rs(ip, rs, fsblock, TRACE_RS_INSERT);
> +	return rs;
> +}
> +
> +/**
> + * unclaimed_blocks - return number of blocks that aren't spoken for
> + */
> +static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
> +{
> +	return rgd->rd_free_clone - rgd->rd_reserved;
> +}
> +
> +/**
> + * rg_mblk_search - find a group of multiple free blocks
> + * @rgd: the resource group descriptor
> + * @rs: the block reservation
> + * @ip: pointer to the inode for which we're reserving blocks
> + *
> + * This is very similar to rgblk_search, except we're looking for whole
> + * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
> + * on aligned dwords for speed's sake.
> + *
> + * Returns: 0 if successful or BFITNOENT if there isn't enough free space
> + */
> +
> +static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
> +{
> +	struct gfs2_bitmap *bi = rgd->rd_bits;
> +	const u32 length = rgd->rd_length;
> +	u32 blk;
> +	unsigned int buf, x, search_bytes;
> +	u8 *buffer = NULL;
> +	u8 *ptr, *end, *nonzero;
> +	u32 goal, rsv_bytes;
> +	struct gfs2_blkreserv *rs;
> +	u32 best_rs_bytes, unclaimed;
> +	int best_rs_blocks;
> +
> +	/* Find bitmap block that contains bits for goal block */
> +	if (rgd->rd_next_rsrv) {
> +		goal = rgd->rd_next_rsrv - rgd->rd_data0;
> +		for (buf = 0; buf < length; buf++) {
> +			bi = rgd->rd_bits + buf;
> +			/* Convert scope of "goal" from rgrp-wide to within
> +			   found bit block */
> +			if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
> +				goal -= bi->bi_start * GFS2_NBBY;
> +				goto do_search;
> +			}
> +		}
> +	}
> +	buf = 0;
> +	goal = 0;
> +
> +do_search:
> +	best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint),
> +			       (RGRP_RSRV_MINBLKS * rgd->rd_length));
> +	best_rs_bytes = (best_rs_blocks *
> +			 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
> +		GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
> +	best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
> +	unclaimed = unclaimed_blocks(rgd);
> +	if (best_rs_bytes * GFS2_NBBY > unclaimed)
> +		best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
> +
> +	for (x = 0; x <= length; x++) {
> +		bi = rgd->rd_bits + buf;
> +
> +		if (test_bit(GBF_FULL, &bi->bi_flags))
> +			goto skip;
> +
> +		WARN_ON(!buffer_uptodate(bi->bi_bh));
> +		if (bi->bi_clone)
> +			buffer = bi->bi_clone + bi->bi_offset;
> +		else
> +			buffer = bi->bi_bh->b_data + bi->bi_offset;
> +
> +		/* We have to keep the reservations aligned on u64 boundaries
> +		   otherwise we could get situations where a byte can't be
> +		   used because it's after a reservation, but a free bit still
> +		   is within the reservation's area. */
> +		ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
> +		end = (buffer + bi->bi_len);
> +		while (ptr < end) {
> +			rsv_bytes = 0;
> +			if ((ptr + best_rs_bytes) <= end)
> +				search_bytes = best_rs_bytes;
> +			else
> +				search_bytes = end - ptr;
> +			BUG_ON(!search_bytes);
> +			nonzero = memchr_inv(ptr, 0, search_bytes);
> +			/* If the lot is all zeroes, reserve the whole size. If
> +			   there's enough zeroes to satisfy the request, use
> +			   what we can. If there's not enough, keep looking. */
> +			if (nonzero == NULL)
> +				rsv_bytes = search_bytes;
> +			else if ((nonzero - ptr) * GFS2_NBBY >=
> +				 ip->i_res->rs_requested)
> +				rsv_bytes = (nonzero - ptr);
> +
> +			if (rsv_bytes) {
> +				blk = ((ptr - buffer) * GFS2_NBBY);
> +				BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
> +				rs = rs_insert(bi, ip, blk,
> +					       rsv_bytes * GFS2_NBBY);
> +				if (IS_ERR(rs))
> +					return PTR_ERR(rs);
> +				if (rs)
> +					return 0;
> +			}
> +			ptr += ALIGN(search_bytes, sizeof(u64));
> +		}
> +skip:
> +		/* Try next bitmap block (wrap back to rgrp header
> +		   if at end) */
> +		buf++;
> +		buf %= length;
> +		goal = 0;
> +	}
> +
> +	return BFITNOENT;
> +}
> +
> +/**
>   * try_rgrp_fit - See if a given reservation will fit in a given RG
>   * @rgd: the RG data
>   * @ip: the inode
>   *
>   * If there's room for the requested blocks to be allocated from the RG:
> + * This will try to get a multi-block reservation first, and if that doesn't
> + * fit, it will take what it can.
>   *
>   * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
>   */
>  
> -static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
> +static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
>  {
> -	const struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_blkreserv *rs = ip->i_res;
>  
>  	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
>  		return 0;
> -	if (rgd->rd_free_clone >= rs->rs_requested)
> +	/* Look for a multi-block reservation. We only do this for files,
> +	   not directories or other types. The problem with directories is
> +	   that items are often created and deleted in the directory in the
> +	   same breath, which can create "holes" in the reservation. By holes
> +	   I mean that your next "claim" may not be the next free block in the
> +	   reservation. In other words, we could get into situations where two
> +	   or more blocks are reserved, then used, then one or more of the
> +	   earlier blocks is freed. When we delete the reservation, the rs_free
> +	   will be off due to the hole, so the rgrp's rg_free count can get
> +	   off. Other non-files, like symlinks and char or block devices, are
> +	   very small in size, so it doesn't make sense to reserve multiple
> +	   blocks for them. */
> +	if (S_ISREG(ip->i_inode.i_mode) &&
> +	    unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS &&
> +	    rg_mblk_search(rgd, ip) != BFITNOENT)
> +		return 1;
> +	if (unclaimed_blocks(rgd) >= rs->rs_requested)
>  		return 1;
> -	return 0;
> -}
>  
> -static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
> -{
> -	return (bi->bi_start * GFS2_NBBY) + blk;
> +	return 0;
>  }
>  
>  /**
> @@ -1217,7 +1571,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
>  int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  {
>  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> -	struct gfs2_rgrpd *rgd, *begin = NULL;
> +	struct gfs2_rgrpd *begin = NULL;
>  	struct gfs2_blkreserv *rs = ip->i_res;
>  	int error = 0, rg_locked, flags = LM_FLAG_TRY;
>  	u64 last_unlinked = NO_BLOCK;
> @@ -1225,32 +1579,40 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  
>  	if (sdp->sd_args.ar_rgrplvb)
>  		flags |= GL_SKIP;
> -	rs = ip->i_res;
>  	rs->rs_requested = requested;
>  	if (gfs2_assert_warn(sdp, requested)) {
>  		error = -EINVAL;
>  		goto out;
>  	}
> -
> -	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
> -		rgd = begin = ip->i_rgd;
> -	else
> -		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
> -
> -	if (rgd == NULL)
> +	if (gfs2_rs_active(rs)) {
> +		begin = rs->rs_rgd;
> +		flags = 0; /* Yoda: Do or do not. There is no try */
> +	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
> +		rs->rs_rgd = begin = ip->i_rgd;
> +	} else {
> +		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
> +	}
> +	if (rs->rs_rgd == NULL)
>  		return -EBADSLT;
>  
>  	while (loops < 3) {
>  		rg_locked = 0;
>  
> -		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
> +		if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
>  			rg_locked = 1;
>  			error = 0;
> +		} else if (!loops && !gfs2_rs_active(rs) &&
> +			   rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
> +			/* If the rgrp already is maxed out for contenders,
> +			   we can eliminate it as a "first pass" without even
> +			   requesting the rgrp glock. */
> +			error = GLR_TRYFAILED;
>  		} else {
> -			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
> -						   flags, &rs->rs_rgd_gh);
> +			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
> +						   LM_ST_EXCLUSIVE, flags,
> +						   &rs->rs_rgd_gh);
>  			if (!error && sdp->sd_args.ar_rgrplvb) {
> -				error = update_rgrp_lvb(rgd);
> +				error = update_rgrp_lvb(rs->rs_rgd);
>  				if (error) {
>  					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  					return error;
> @@ -1259,24 +1621,36 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  		}
>  		switch (error) {
>  		case 0:
> -			if (try_rgrp_fit(rgd, ip)) {
> +			if (gfs2_rs_active(rs)) {
> +				if (unclaimed_blocks(rs->rs_rgd) +
> +				    rs->rs_free >= rs->rs_requested) {
> +					ip->i_rgd = rs->rs_rgd;
> +					return 0;
> +				}
> +				/* We have a multi-block reservation, but the
> +				   rgrp doesn't have enough free blocks to
> +				   satisfy the request. Free the reservation
> +				   and look for a suitable rgrp. */
> +				gfs2_rs_deltree(rs);
> +			}
> +			if (try_rgrp_fit(rs->rs_rgd, ip)) {
>  				if (sdp->sd_args.ar_rgrplvb)
> -					gfs2_rgrp_bh_get(rgd);
> -				ip->i_rgd = rgd;
> +					gfs2_rgrp_bh_get(rs->rs_rgd);
> +				ip->i_rgd = rs->rs_rgd;
>  				return 0;
>  			}
> -			if (rgd->rd_flags & GFS2_RDF_CHECK) {
> +			if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
>  				if (sdp->sd_args.ar_rgrplvb)
> -					gfs2_rgrp_bh_get(rgd);
> -				try_rgrp_unlink(rgd, &last_unlinked,
> +					gfs2_rgrp_bh_get(rs->rs_rgd);
> +				try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
>  						ip->i_no_addr);
>  			}
>  			if (!rg_locked)
>  				gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  			/* fall through */
>  		case GLR_TRYFAILED:
> -			rgd = gfs2_rgrpd_get_next(rgd);
> -			if (rgd != begin) /* If we didn't wrap */
> +			rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
> +			if (rs->rs_rgd != begin) /* If we didn't wrap */
>  				break;
>  
>  			flags &= ~LM_FLAG_TRY;
> @@ -1314,6 +1688,12 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
>  {
>  	struct gfs2_blkreserv *rs = ip->i_res;
>  
> +	if (!rs)
> +		return;
> +
> +	if (!rs->rs_free)
> +		gfs2_rs_deltree(rs);
> +
>  	if (rs->rs_rgd_gh.gh_gl)
>  		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  	rs->rs_requested = 0;
> @@ -1412,7 +1792,27 @@ do_search:
>  		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
>  			buffer = bi->bi_clone + bi->bi_offset;
>  
> -		biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
> +		while (1) {
> +			struct gfs2_blkreserv *rs;
> +			u32 rgblk;
> +
> +			biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
> +			if (biblk == BFITNOENT)
> +				break;
> +			/* Check if this block is reserved() */
> +			rgblk = gfs2_bi2rgd_blk(bi, biblk);
> +			rs = rs_find(rgd, rgblk);
> +			if (rs == NULL)
> +				break;
> +
> +			BUG_ON(rs->rs_bi != bi);
> +			biblk = BFITNOENT;
> +			/* This should jump to the first block after the
> +			   reservation. */
> +			goal = rs->rs_biblk + rs->rs_free;
> +			if (goal >= bi->bi_len * GFS2_NBBY)
> +				break;
> +		}
>  		if (biblk != BFITNOENT)
>  			break;
>  
> @@ -1448,8 +1848,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
>  			     u32 blk, bool dinode, unsigned int *n)
>  {
>  	const unsigned int elen = *n;
> -	u32 goal;
> +	u32 goal, rgblk;
>  	const u8 *buffer = NULL;
> +	struct gfs2_blkreserv *rs;
>  
>  	*n = 0;
>  	buffer = bi->bi_bh->b_data + bi->bi_offset;
> @@ -1462,6 +1863,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
>  		goal++;
>  		if (goal >= (bi->bi_len * GFS2_NBBY))
>  			break;
> +		rgblk = gfs2_bi2rgd_blk(bi, goal);
> +		rs = rs_find(rgd, rgblk);
> +		if (rs) /* Oops, we bumped into someone's reservation */
> +			break;
>  		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
>  		    GFS2_BLKST_FREE)
>  			break;
> @@ -1537,12 +1942,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
>  
>  int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
>  {
> -	const struct gfs2_rgrpd *rgd = gl->gl_object;
> +	struct gfs2_rgrpd *rgd = gl->gl_object;
> +	struct gfs2_blkreserv *trs;
> +	const struct rb_node *n;
> +
>  	if (rgd == NULL)
>  		return 0;
> -	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
> +	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
>  		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
> -		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
> +		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
> +		       rgd->rd_reserved);
> +	spin_lock(&rgd->rd_rsspin);
> +	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
> +		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
> +		dump_rs(seq, trs);
> +	}
> +	spin_unlock(&rgd->rd_rsspin);
>  	return 0;
>  }
>  
> @@ -1557,10 +1972,74 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
>  }
>  
>  /**
> + * claim_reserved_blks - Claim previously reserved blocks
> + * @ip: the inode that's claiming the reservation
> + * @dinode: 1 if this block is a dinode block, otherwise data block
> + * @nblocks: desired extent length
> + *
> + * Lay claim to previously allocated block reservation blocks.
> + * Returns: Starting block number of the blocks claimed.
> + * Sets *nblocks to the actual extent length allocated.
> + */
> +static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
> +			       unsigned int *nblocks)
> +{
> +	struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_rgrpd *rgd = rs->rs_rgd;
> +	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> +	struct gfs2_bitmap *bi;
> +	u32 biblk;
> +	u64 start_block = 0, fsblock;
> +	const unsigned int elen = *nblocks;
> +
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
> +	gfs2_assert_withdraw(sdp, rgd);
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
> +	bi = rs->rs_bi;
> +	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
> +
> +	for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
> +		biblk = rs->rs_biblk;
> +
> +		fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
> +		/* Check if the next free block is not contiguous with
> +		   the others in the extent. */
> +		if (start_block && fsblock != start_block + (*nblocks))
> +			break;
> +
> +		rs->rs_biblk++;
> +		rs->rs_free--;
> +		/* Make sure the bitmap hasn't changed */
> +		gfs2_setbit(rgd, bi->bi_clone, bi, biblk, GFS2_BLKST_USED);
> +
> +		BUG_ON(!rgd->rd_reserved);
> +		rgd->rd_reserved--;
> +		if (!start_block) {
> +			start_block = fsblock;
> +			dinode = false;
> +		}
> +		trace_gfs2_rs(ip, rs, fsblock, TRACE_RS_CLAIM);
> +	}
> +
> +	if (!rs->rs_free) {
> +		struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
> +
> +		gfs2_rs_deltree(rs);
> +		/* -nblocks because we haven't returned to do the math yet.
> +		   I'm doing the math backwards to prevent negative numbers,
> +		   but think of it as:
> +		   if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */
> +		if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
> +			rg_mblk_search(rgd, ip);
> +	}
> +	return start_block;
> +}
> +
> +/**
>   * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
>   * @ip: the inode to allocate the block for
>   * @bn: Used to return the starting block number
> - * @ndata: requested number of blocks/extent length (value/result)
> + * @nblocks: requested number of blocks/extent length (value/result)
>   * @dinode: 1 if we're allocating a dinode block, else 0
>   * @generation: the generation number of the inode
>   *
> @@ -1585,20 +2064,34 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>  	if (ip->i_res->rs_requested == 0)
>  		return -ECANCELED;
>  
> -	rgd = ip->i_rgd;
> -
> -	if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
> -		goal = ip->i_goal - rgd->rd_data0;
> -	else
> -		goal = rgd->rd_last_alloc;
> -
> -	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
> +	/* Check if we have a multi-block reservation, and if so, claim the
> +	   next free block from it. */
> +	if (gfs2_rs_active(ip->i_res)) {
> +		BUG_ON(!ip->i_res->rs_free);
> +		rgd = ip->i_res->rs_rgd;
> +		block = claim_reserved_blks(ip, dinode, nblocks);
> +	} else {
> +		rgd = ip->i_rgd;
>  
> -	/* Since all blocks are reserved in advance, this shouldn't happen */
> -	if (blk == BFITNOENT)
> -		goto rgrp_error;
> +		if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
> +			goal = ip->i_goal - rgd->rd_data0;
> +		else
> +			goal = rgd->rd_last_alloc;
> +
> +		blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
> +
> +		/* Since all blocks are reserved in advance, this shouldn't
> +		   happen */
> +		if (blk == BFITNOENT) {
> +			printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
> +			       *nblocks);
> +			printk(KERN_WARNING "FULL=%d\n",
> +			       test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
> +			goto rgrp_error;
> +		}
>  
> -	block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
> +		block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
> +	}
>  	ndata = *nblocks;
>  	if (dinode)
>  		ndata--;
> @@ -1615,8 +2108,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>  			brelse(dibh);
>  		}
>  	}
> -	if (rgd->rd_free < *nblocks)
> +	if (rgd->rd_free < *nblocks) {
> +		printk(KERN_WARNING "nblocks=%u\n", *nblocks);
>  		goto rgrp_error;
> +	}
>  
>  	rgd->rd_free -= *nblocks;
>  	if (dinode) {
> @@ -1672,6 +2167,9 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
>  		return;
>  	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
>  	rgd->rd_free += blen;
> +	if (bstart < rgd->rd_next_rsrv)
> +		rgd->rd_next_rsrv = bstart;
> +
>  	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
>  	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
>  	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
> diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
> index 5d8314d..95b00d5 100644
> --- a/fs/gfs2/rgrp.h
> +++ b/fs/gfs2/rgrp.h
> @@ -13,6 +13,14 @@
>  #include <linux/slab.h>
>  #include <linux/uaccess.h>
>  
> +/* Since each block in the file system is represented by two bits in the
> + * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
> + * By reserving 32 blocks at a time, we can optimize / shortcut how we search
> + * through the bitmaps by looking a word at a time.
> + */
> +#define RGRP_RSRV_MINBYTES 8
> +#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
> +
>  struct gfs2_rgrpd;
>  struct gfs2_sbd;
>  struct gfs2_holder;
> @@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
>  extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
>  extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
>  
> +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
> +
>  extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
>  extern void gfs2_inplace_release(struct gfs2_inode *ip);
>  
> @@ -36,6 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
>  			     bool dinode, u64 *generation);
>  
>  extern int gfs2_rs_alloc(struct gfs2_inode *ip);
> +extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
>  extern void gfs2_rs_delete(struct gfs2_inode *ip);
>  extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
>  extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
> @@ -62,7 +73,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
>  				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
>  extern int gfs2_fitrim(struct file *filp, void __user *argp);
>  
> -/* This is how to tell if a reservation is "inplace" reserved: */
> +/* This is how to tell if a multi-block reservation is "inplace" reserved: */
>  static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
>  {
>  	if (ip->i_res && ip->i_res->rs_requested)
> @@ -70,4 +81,12 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
>  	return 0;
>  }
>  
> +/* This is how to tell if a multi-block reservation is in the rgrp tree: */
> +static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
> +{
> +	if (rs && rs->rs_start)
> +		return 1;
> +	return 0;
> +}
> +
>  #endif /* __RGRP_DOT_H__ */
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index 7880687..b1502c4 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
>  		return -EIO;
>  	}
>  
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		return error;
> @@ -1550,6 +1554,9 @@ out_truncate:
>  
>  out_unlock:
>  	/* Error path for case 1 */
> +	if (gfs2_rs_active(ip->i_res))
> +		gfs2_rs_deltree(ip->i_res);
> +
>  	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
>  		gfs2_glock_dq(&ip->i_iopen_gh);
>  	gfs2_holder_uninit(&ip->i_iopen_gh);
> diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
> index 1b8b815..98fd480 100644
> --- a/fs/gfs2/trace_gfs2.h
> +++ b/fs/gfs2/trace_gfs2.h
> @@ -31,6 +31,17 @@
>  			    { GFS2_BLKST_DINODE, "dinode" },	\
>  			    { GFS2_BLKST_UNLINKED, "unlinked" })
>  
> +#define TRACE_RS_DELETE  0
> +#define TRACE_RS_TREEDEL 1
> +#define TRACE_RS_INSERT  2
> +#define TRACE_RS_CLAIM   3
> +
> +#define rs_func_name(x) __print_symbolic(x,	\
> +					 { 0, "del " },	\
> +					 { 1, "tdel" },	\
> +					 { 2, "ins " },	\
> +					 { 3, "clm " })
> +
>  #define show_glock_flags(flags) __print_flags(flags, "",	\
>  	{(1UL << GLF_LOCK),			"l" },		\
>  	{(1UL << GLF_DEMOTE),			"D" },		\
> @@ -470,6 +481,7 @@ TRACE_EVENT(gfs2_block_alloc,
>  		__field(	u8,	block_state		)
>  		__field(        u64,	rd_addr			)
>  		__field(        u32,	rd_free_clone		)
> +		__field(	u32,	rd_reserved		)
>  	),
>  
>  	TP_fast_assign(
> @@ -480,16 +492,64 @@ TRACE_EVENT(gfs2_block_alloc,
>  		__entry->block_state	= block_state;
>  		__entry->rd_addr	= rgd->rd_addr;
>  		__entry->rd_free_clone	= rgd->rd_free_clone;
> +		__entry->rd_reserved	= rgd->rd_reserved;
>  	),
>  
> -	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u",
> +	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  (unsigned long long)__entry->inum,
>  		  (unsigned long long)__entry->start,
>  		  (unsigned long)__entry->len,
>  		  block_state_name(__entry->block_state),
>  		  (unsigned long long)__entry->rd_addr,
> -		  __entry->rd_free_clone)
> +		  __entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
> +);
> +
> +/* Keep track of multi-block reservations as they are allocated/freed */
> +TRACE_EVENT(gfs2_rs,
> +
> +	TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs,
> +		 u64 claimed, u8 func),
> +
> +	TP_ARGS(ip, rs, claimed, func),
> +
> +	TP_STRUCT__entry(
> +		__field(        dev_t,  dev                     )
> +		__field(	u64,	rd_addr			)
> +		__field(	u32,	rd_free_clone		)
> +		__field(	u32,	rd_reserved		)
> +		__field(	u64,	inum			)
> +		__field(	u64,	start			)
> +		__field(	u32,	len			)
> +		__field(	u64,	claimed			)
> +		__field(	u32,	free			)
> +		__field(	u8,	func			)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		= rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0;
> +		__entry->rd_addr	= rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
> +		__entry->rd_free_clone	= rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
> +		__entry->rd_reserved	= rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
> +		__entry->inum		= ip ? ip->i_no_addr : 0;
> +		__entry->start		= rs->rs_start;
> +		__entry->len		= rs->rs_len;
> +		__entry->claimed	= claimed;
> +		__entry->free		= rs->rs_free;
> +		__entry->func		= func;
> +	),
> +
> +	TP_printk("%u,%u bmap %llu resrv %llu/%lu rg:%llu rf:%lu rr:%lu %s "
> +		  "c:%llu f:%lu",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  (unsigned long long)__entry->inum,
> +		  (unsigned long long)__entry->start,
> +		  (unsigned long)__entry->len,
> +		  (unsigned long long)__entry->rd_addr,
> +		  (unsigned long)__entry->rd_free_clone,
> +		  (unsigned long)__entry->rd_reserved,
> +		  rs_func_name(__entry->func),
> +		  __entry->claimed, (unsigned long)__entry->free)
>  );
>  
>  #endif /* _TRACE_GFS2_H */
> diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
> index 523c0de..27a0b4a 100644
> --- a/fs/gfs2/xattr.c
> +++ b/fs/gfs2/xattr.c
> @@ -327,6 +327,10 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
>  {
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		goto out_alloc;
> @@ -710,6 +714,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
>  	struct buffer_head *dibh;
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_lock_check(ip);
>  	if (error)
>  		return error;
> @@ -1483,6 +1491,10 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
>  {
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		return error;




^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 4] GFS2: Reduce file fragmentation
  2012-07-12 10:01       ` Steven Whitehouse
@ 2012-07-12 19:28         ` Bob Peterson
  2012-07-13  9:36           ` Steven Whitehouse
  0 siblings, 1 reply; 13+ messages in thread
From: Bob Peterson @ 2012-07-12 19:28 UTC (permalink / raw)
  To: cluster-devel.redhat.com

----- Original Message -----
| I've found a workload that seems to do something a bit odd...
| 
| 
| [root at chywoon /]# for i in `seq 0 9`; do dd if=/dev/zero of=file.${i}
| bs=4096 count=1; done
| 
| (i.e. create 10 single block files, sequentially)
| 
| Then I looked at the allocation pattern using stat & filefrag. We get
| 10
| inodes, at positions: 33150, 33151, 33152, 33153, 33154, 33155,
| 33156,
| 33157, 33158, 33159 (so sequential with each other)
| 
| The lets look at where the data lands up: 33302, 33462, 33622, 33782,
| 33942, 34102, 34262, 34422, 34582, 34742 (respectively for each
| inode)
| 
| What I'd hope to see is that the data blocks are being allocated
| contiguously with the inodes, at least in most cases. The data blocks
| appear to be spaced out with gaps of 160 blocks between them. This
| was
| on newly created filesystem, btw, so there should be nothing to get
| in
| the way of the allocations.
| 
| Given that directories don't have reservations, and the reservations
| relating to the individual inodes that are being created should be
| cleared on close (i.e. before the subsequent creation) this should
| not
| have changed in behaviour from the current kernels,
| 
| Steve.

Hi,

This was not unexpected behavior, but I see the merits in making the
allocations for the dinode follow the dinode when it's created.

This fourth version of the patch addresses the problem by allowing
directories to have allocations, like files, but when a file is
created, the allocation is moved to the new gfs2_inode. That solves
the problem of preventing "holes" in the directory's reservation, plus
allows future block allocations to follow in line. This continuity
may allow it to be somewhat faster than the previous version. Thanks,
Steve!

Description:

This patch reduces GFS2 file fragmentation by pre-reserving blocks.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d957a8..49cd7dd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	if (error)
 		goto out_rlist;
 
+	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
+		gfs2_rs_deltree(ip->i_res);
+
 	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
 				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
 				 revokes);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6fbf3cb..b93275f 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		return ret;
 
+	atomic_set(&ip->i_res->rs_sizehint,
+		   PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
+
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
@@ -571,22 +574,15 @@ fail:
 
 static int gfs2_release(struct inode *inode, struct file *file)
 {
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-	struct gfs2_file *fp;
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	fp = file->private_data;
+	kfree(file->private_data);
 	file->private_data = NULL;
 
-	if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
-	    (atomic_read(&inode->i_writecount) == 1))
+	if ((file->f_mode & FMODE_WRITE) &&
+	    (atomic_read(&inode->i_writecount) == 1)) {
 		gfs2_rs_delete(ip);
-
-	if (gfs2_assert_warn(sdp, fp))
-		return -EIO;
-
-	kfree(fp);
-
+	}
 	return 0;
 }
 
@@ -662,14 +658,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				   unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	size_t writesize = iov_length(iov, nr_segs);
 	struct dentry *dentry = file->f_dentry;
 	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_sbd *sdp;
 	int ret;
 
+	sdp = GFS2_SB(file->f_mapping->host);
 	ret = gfs2_rs_alloc(ip);
 	if (ret)
 		return ret;
 
+	atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
 	if (file->f_flags & O_APPEND) {
 		struct gfs2_holder gh;
 
@@ -795,6 +795,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	if (unlikely(error))
 		goto out_uninit;
 
+	atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
+
 	while (len > 0) {
 		if (len < bytes)
 			bytes = len;
@@ -803,10 +805,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 			offset += bytes;
 			continue;
 		}
-		error = gfs2_rindex_update(sdp);
-		if (error)
-			goto out_unlock;
-
 		error = gfs2_quota_lock_check(ip);
 		if (error)
 			goto out_unlock;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index dc73070..d887faf 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -84,6 +84,7 @@ struct gfs2_rgrpd {
 	u32 rd_data;			/* num of data blocks in rgrp */
 	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
 	u32 rd_free;
+	u32 rd_reserved;                /* number of blocks reserved */
 	u32 rd_free_clone;
 	u32 rd_dinodes;
 	u64 rd_igeneration;
@@ -91,11 +92,15 @@ struct gfs2_rgrpd {
 	struct gfs2_sbd *rd_sbd;
 	struct gfs2_rgrp_lvb *rd_rgl;
 	u32 rd_last_alloc;
+	u32 rd_next_rsrv;               /* preferred next reservation */
 	u32 rd_flags;
 #define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
 #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
 #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
 #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
+	spinlock_t rd_rsspin;           /* protects reservation related vars */
+	struct rb_root rd_rstree;       /* multi-block reservation tree */
+	u32 rd_rs_cnt;                  /* count of current reservations */
 };
 
 enum gfs2_state_bits {
@@ -233,6 +238,40 @@ struct gfs2_holder {
 	unsigned long gh_ip;
 };
 
+/* Resource group multi-block reservation, in order of appearance:
+
+   Step 1. Function prepares to write, allocates a mb, sets the size hint.
+   Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
+   Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
+   Step 4. Bits are assigned from the rgrp based on either the reservation
+           or wherever it can.
+*/
+
+struct gfs2_blkreserv {
+	/* components used during write (step 1): */
+	atomic_t rs_sizehint;         /* hint of the write size */
+
+	/* components used during inplace_reserve (step 2): */
+	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
+
+	/* components used during get_local_rgrp (step 3): */
+	struct gfs2_rgrpd *rs_rgd;    /* pointer to the gfs2_rgrpd */
+	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
+	struct rb_node rs_node;       /* link to other block reservations */
+
+	/* components used during block searches and assignments (step 4): */
+	struct gfs2_bitmap *rs_bi;    /* bitmap for the current allocation */
+	u64 rs_start;                 /* absolute fs block address */
+	u32 rs_biblk;                 /* start block relative to the bi */
+	u32 rs_len;                   /* number of blocks reserved */
+	u32 rs_free;                  /* how many blocks are still free */
+
+	/* ancillary quota stuff */
+	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
+	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
+	unsigned int rs_qa_qd_num;
+};
+
 enum {
 	GLF_LOCK			= 1,
 	GLF_DEMOTE			= 3,
@@ -290,16 +329,6 @@ struct gfs2_glock {
 
 #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
 
-struct gfs2_blkreserv {
-	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
-	struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
-
-	/* ancillary quota stuff */
-	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
-	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
-	unsigned int rs_qa_qd_num;
-};
-
 enum {
 	GIF_INVALID		= 0,
 	GIF_QD_LOCKED		= 1,
@@ -307,7 +336,6 @@ enum {
 	GIF_SW_PAGED		= 3,
 };
 
-
 struct gfs2_inode {
 	struct inode i_inode;
 	u64 i_no_addr;
@@ -318,7 +346,7 @@ struct gfs2_inode {
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
-	struct gfs2_blkreserv *i_res; /* resource group block reservation */
+	struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
 	struct gfs2_rgrpd *i_rgd;
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2b035e0..d4e6882 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	int error;
 
 	munge_mode_uid_gid(dip, &mode, &uid, &gid);
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
 
 	error = gfs2_quota_lock(dip, uid, gid);
 	if (error)
@@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 	struct buffer_head *dibh;
 	int error;
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		goto fail;
@@ -596,7 +603,8 @@ fail_end_trans:
 	gfs2_trans_end(sdp);
 
 fail_ipreserv:
-	gfs2_inplace_release(dip);
+	if (alloc_required)
+		gfs2_inplace_release(dip);
 
 fail_quota_locks:
 	gfs2_quota_unlock(dip);
@@ -647,7 +655,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	const struct qstr *name = &dentry->d_name;
 	struct gfs2_holder ghs[2];
 	struct inode *inode = NULL;
-	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_inode *dip = GFS2_I(dir), *ip;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
 	int error;
@@ -694,24 +702,35 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
-	error = gfs2_inode_refresh(GFS2_I(inode));
+	ip = GFS2_I(inode);
+	error = gfs2_inode_refresh(ip);
 	if (error)
 		goto fail_gunlock2;
 
-	/* the new inode needs a reservation so it can allocate xattrs. */
-	error = gfs2_rs_alloc(GFS2_I(inode));
-	if (error)
-		goto fail_gunlock2;
+	/* Tricky: The newly created inode needs a reservation so it can
+	   allocate xattrs. At the same time, we don't want the directory
+	   to retain its reservation, and here's why: With directories, items
+	   are often created and deleted in the directory in the same breath,
+	   which can create "holes" in the reservation. By holes I mean that
+	   your next "claim" may not be the next free block in the reservation.
+	   In other words, we could get into situations where two or more
+	   blocks are reserved, then used, then one or more of the earlier
+	   blocks is freed. When we delete the reservation, the rs_free
+	   will be off due to the hole, so the rgrp's rg_free count can get
+	   off. The solution is that we transfer ownership of the reservation
+	   from the directory to the new inode. */
+	ip->i_res = dip->i_res;
+	dip->i_res = NULL;
 
 	error = gfs2_acl_create(dip, inode);
 	if (error)
 		goto fail_gunlock2;
 
-	error = gfs2_security_init(dip, GFS2_I(inode), name);
+	error = gfs2_security_init(dip, ip, name);
 	if (error)
 		goto fail_gunlock2;
 
-	error = link_dinode(dip, name, GFS2_I(inode));
+	error = link_dinode(dip, name, ip);
 	if (error)
 		goto fail_gunlock2;
 
@@ -738,6 +757,7 @@ fail_gunlock:
 		iput(inode);
 	}
 fail:
+	gfs2_rs_delete(dip);
 	if (bh)
 		brelse(bh);
 	return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index e53d0a1..dcb8f0c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -35,6 +35,9 @@
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
 
+#define RSRV_CONTENTION_FACTOR 2
+#define RGRP_RSRV_MAX_CONTENDERS 4
+
 #if BITS_PER_LONG == 32
 #define LBITMASK   (0x55555555UL)
 #define LBITSKIP55 (0x55555555UL)
@@ -178,6 +181,60 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
 }
 
 /**
+ * rs_cmp - multi-block reservation range compare
+ * @blk: absolute file system block number of the new reservation
+ * @len: number of blocks in the new reservation
+ * @rs: existing reservation to compare against
+ *
+ * returns: 1 if the block range is beyond the reach of the reservation
+ *         -1 if the block range is before the start of the reservation
+ *          0 if the block range overlaps with the reservation
+ */
+static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
+{
+	if (blk > rs->rs_start + rs->rs_len - 1)
+		return 1;
+	if (blk + len - 1 < rs->rs_start)
+		return -1;
+	return 0;
+}
+
+/**
+ * rs_find - Find a rgrp multi-block reservation that contains a given block
+ * @rgd: The rgrp
+ * @rgblk: The block we're looking for, relative to the rgrp
+ */
+static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
+{
+	struct rb_node **newn;
+	int rc;
+	u64 fsblk = rgblk + rgd->rd_data0;
+
+	spin_lock(&rgd->rd_rsspin);
+	newn = &rgd->rd_rstree.rb_node;
+	while (*newn) {
+		struct gfs2_blkreserv *cur =
+			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+		rc = rs_cmp(fsblk, 1, cur);
+		if (rc < 0)
+			newn = &((*newn)->rb_left);
+		else if (rc > 0)
+			newn = &((*newn)->rb_right);
+		else {
+			spin_unlock(&rgd->rd_rsspin);
+			return cur;
+		}
+	}
+	spin_unlock(&rgd->rd_rsspin);
+	return NULL;
+}
+
+static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
+{
+	return (bi->bi_start * GFS2_NBBY) + blk;
+}
+
+/**
  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
  *       a block in a given allocation state.
  * @buf: the buffer that holds the bitmaps
@@ -435,8 +492,79 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
 	return error;
 }
 
+static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
+{
+	gfs2_print_dbg(seq, "  r: %llu s:%llu b:%u i:%u f:%u\n",
+		       rs->rs_rgd->rd_addr, rs->rs_start, rs->rs_biblk,
+		       rs->rs_len, rs->rs_free);
+}
+
+/**
+ * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+static void __gfs2_rs_deltree(struct gfs2_blkreserv *rs)
+{
+	struct gfs2_rgrpd *rgd;
+
+	if (!gfs2_rs_active(rs))
+		return;
+
+	rgd = rs->rs_rgd;
+	/* We can't do this: The reason is that when the rgrp is invalidated,
+	   it's in the "middle" of acquiring the glock, but the HOLDER bit
+	   isn't set yet:
+	   BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
+	trace_gfs2_rs(NULL, rs, 0, TRACE_RS_TREEDEL);
+
+	/* Now that we have the mutex, check again if the tree is empty just
+	   in case someone else deleted this entry. */
+	if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
+		rb_erase(&rs->rs_node, &rgd->rd_rstree);
+	BUG_ON(!rgd->rd_rs_cnt);
+	rgd->rd_rs_cnt--;
+
+	if (rs->rs_free) {
+		/* return reserved blocks to the rgrp and the ip */
+		BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
+		rs->rs_rgd->rd_reserved -= rs->rs_free;
+		rs->rs_free = 0;
+		clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
+		smp_mb__after_clear_bit();
+	}
+	/* We can't change any of the step 1 or step 2 components of the rs.
+	   E.g. We can't set rs_rgd to NULL because the rgd glock is held and
+	   dequeued through this pointer.
+	   Can't: atomic_set(&rs->rs_sizehint, 0);
+	   Can't: rs->rs_requested = 0;
+	   Can't: rs->rs_rgd = NULL;*/
+	rs->rs_bi = NULL;
+	rs->rs_start = 0;
+	rs->rs_biblk = 0;
+	rs->rs_len = 0;
+}
+
+/**
+ * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
+{
+	struct gfs2_rgrpd *rgd;
+
+	if (!gfs2_rs_active(rs))
+		return;
+
+	rgd = rs->rs_rgd;
+	spin_lock(&rgd->rd_rsspin);
+	__gfs2_rs_deltree(rs);
+	spin_unlock(&rgd->rd_rsspin);
+}
+
 /**
- * gfs2_rs_delete - delete a reservation
+ * gfs2_rs_delete - delete a multi-block reservation
  * @ip: The inode for this reservation
  *
  */
@@ -444,12 +572,37 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
 {
 	down_write(&ip->i_rw_mutex);
 	if (ip->i_res) {
+		gfs2_rs_deltree(ip->i_res);
+		trace_gfs2_rs(ip, ip->i_res, ip->i_res->rs_start,
+			      TRACE_RS_DELETE);
+		BUG_ON(ip->i_res->rs_free);
 		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
 		ip->i_res = NULL;
 	}
 	up_write(&ip->i_rw_mutex);
 }
 
+/**
+ * return_all_reservations - return all reserved blocks back to the rgrp.
+ * @rgd: the rgrp that needs its space back
+ *
+ * We previously reserved a bunch of blocks for allocation. Now we need to
+ * give them back. This leave the reservation structures in tact, but removes
+ * all of their corresponding "no-fly zones".
+ */
+static void return_all_reservations(struct gfs2_rgrpd *rgd)
+{
+	struct rb_node *n;
+	struct gfs2_blkreserv *rs;
+
+	spin_lock(&rgd->rd_rsspin);
+	while ((n = rb_first(&rgd->rd_rstree))) {
+		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		__gfs2_rs_deltree(rs);
+	}
+	spin_unlock(&rgd->rd_rsspin);
+}
+
 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
 {
 	struct rb_node *n;
@@ -472,6 +625,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
 
 		gfs2_free_clones(rgd);
 		kfree(rgd->rd_bits);
+		return_all_reservations(rgd);
 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
 	}
 }
@@ -649,6 +803,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
 	rgd->rd_data = be32_to_cpu(buf.ri_data);
 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
+	spin_lock_init(&rgd->rd_rsspin);
 
 	error = compute_bitstructs(rgd);
 	if (error)
@@ -1115,29 +1270,216 @@ out:
 }
 
 /**
+ * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
+ * @bi: the bitmap with the blocks
+ * @ip: the inode structure
+ * @biblk: the 32-bit block number relative to the start of the bitmap
+ * @amount: the number of blocks to reserve
+ *
+ * Returns: NULL - reservation was already taken, so not inserted
+ *          pointer to the inserted reservation
+ */
+static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
+				       struct gfs2_inode *ip, u32 biblk,
+				       int amount)
+{
+	struct rb_node **newn, *parent = NULL;
+	int rc;
+	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_rgrpd *rgd = rs->rs_rgd;
+	u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
+
+	spin_lock(&rgd->rd_rsspin);
+	newn = &rgd->rd_rstree.rb_node;
+	BUG_ON(!ip->i_res);
+	BUG_ON(gfs2_rs_active(rs));
+	/* Figure out where to put new node */
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
+	while (*newn) {
+		struct gfs2_blkreserv *cur =
+			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+
+		parent = *newn;
+		rc = rs_cmp(fsblock, amount, cur);
+		if (rc > 0)
+			newn = &((*newn)->rb_right);
+		else if (rc < 0)
+			newn = &((*newn)->rb_left);
+		else {
+			spin_unlock(&rgd->rd_rsspin);
+			return NULL; /* reservation already in use */
+		}
+	}
+
+	/* Do our reservation work */
+	rs = ip->i_res;
+	rs->rs_start = fsblock;
+	rs->rs_len = amount;
+	rs->rs_free = amount;
+	rs->rs_biblk = biblk;
+	rs->rs_bi = bi;
+	rb_link_node(&rs->rs_node, parent, newn);
+	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
+
+	/* Do our inode accounting for the reservation */
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
+
+	/* Do our rgrp accounting for the reservation */
+	rgd->rd_reserved += amount; /* blocks reserved */
+	rgd->rd_next_rsrv = rs->rs_start + rs->rs_len;
+	rgd->rd_rs_cnt++; /* number of in-tree reservations */
+	if (!rgrp_contains_block(rgd, rgd->rd_next_rsrv))
+		rgd->rd_next_rsrv = 0;
+	spin_unlock(&rgd->rd_rsspin);
+	trace_gfs2_rs(ip, rs, fsblock, TRACE_RS_INSERT);
+	return rs;
+}
+
+/**
+ * unclaimed_blocks - return number of blocks that aren't spoken for
+ */
+static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
+{
+	return rgd->rd_free_clone - rgd->rd_reserved;
+}
+
+/**
+ * rg_mblk_search - find a group of multiple free blocks
+ * @rgd: the resource group descriptor
+ * @rs: the block reservation
+ * @ip: pointer to the inode for which we're reserving blocks
+ *
+ * This is very similar to rgblk_search, except we're looking for whole
+ * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
+ * on aligned dwords for speed's sake.
+ *
+ * Returns: 0 if successful or BFITNOENT if there isn't enough free space
+ */
+
+static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+{
+	struct gfs2_bitmap *bi = rgd->rd_bits;
+	const u32 length = rgd->rd_length;
+	u32 blk;
+	unsigned int buf, x, search_bytes;
+	u8 *buffer = NULL;
+	u8 *ptr, *end, *nonzero;
+	u32 goal, rsv_bytes;
+	struct gfs2_blkreserv *rs;
+	u32 best_rs_bytes, unclaimed;
+	int best_rs_blocks;
+
+	/* Find bitmap block that contains bits for goal block */
+	if (rgd->rd_next_rsrv) {
+		goal = rgd->rd_next_rsrv - rgd->rd_data0;
+		for (buf = 0; buf < length; buf++) {
+			bi = rgd->rd_bits + buf;
+			/* Convert scope of "goal" from rgrp-wide to within
+			   found bit block */
+			if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
+				goal -= bi->bi_start * GFS2_NBBY;
+				goto do_search;
+			}
+		}
+	}
+	buf = 0;
+	goal = 0;
+
+do_search:
+	best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint),
+			       (RGRP_RSRV_MINBLKS * rgd->rd_length));
+	best_rs_bytes = (best_rs_blocks *
+			 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
+		GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
+	best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
+	unclaimed = unclaimed_blocks(rgd);
+	if (best_rs_bytes * GFS2_NBBY > unclaimed)
+		best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
+
+	for (x = 0; x <= length; x++) {
+		bi = rgd->rd_bits + buf;
+
+		if (test_bit(GBF_FULL, &bi->bi_flags))
+			goto skip;
+
+		WARN_ON(!buffer_uptodate(bi->bi_bh));
+		if (bi->bi_clone)
+			buffer = bi->bi_clone + bi->bi_offset;
+		else
+			buffer = bi->bi_bh->b_data + bi->bi_offset;
+
+		/* We have to keep the reservations aligned on u64 boundaries
+		   otherwise we could get situations where a byte can't be
+		   used because it's after a reservation, but a free bit still
+		   is within the reservation's area. */
+		ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
+		end = (buffer + bi->bi_len);
+		while (ptr < end) {
+			rsv_bytes = 0;
+			if ((ptr + best_rs_bytes) <= end)
+				search_bytes = best_rs_bytes;
+			else
+				search_bytes = end - ptr;
+			BUG_ON(!search_bytes);
+			nonzero = memchr_inv(ptr, 0, search_bytes);
+			/* If the lot is all zeroes, reserve the whole size. If
+			   there's enough zeroes to satisfy the request, use
+			   what we can. If there's not enough, keep looking. */
+			if (nonzero == NULL)
+				rsv_bytes = search_bytes;
+			else if ((nonzero - ptr) * GFS2_NBBY >=
+				 ip->i_res->rs_requested)
+				rsv_bytes = (nonzero - ptr);
+
+			if (rsv_bytes) {
+				blk = ((ptr - buffer) * GFS2_NBBY);
+				BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
+				rs = rs_insert(bi, ip, blk,
+					       rsv_bytes * GFS2_NBBY);
+				if (IS_ERR(rs))
+					return PTR_ERR(rs);
+				if (rs)
+					return 0;
+			}
+			ptr += ALIGN(search_bytes, sizeof(u64));
+		}
+skip:
+		/* Try next bitmap block (wrap back to rgrp header
+		   if at end) */
+		buf++;
+		buf %= length;
+		goal = 0;
+	}
+
+	return BFITNOENT;
+}
+
+/**
  * try_rgrp_fit - See if a given reservation will fit in a given RG
  * @rgd: the RG data
  * @ip: the inode
  *
  * If there's room for the requested blocks to be allocated from the RG:
+ * This will try to get a multi-block reservation first, and if that doesn't
+ * fit, it will take what it can.
  *
  * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
  */
 
-static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
+static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 {
-	const struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_blkreserv *rs = ip->i_res;
 
 	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
 		return 0;
-	if (rgd->rd_free_clone >= rs->rs_requested)
+	/* Look for a multi-block reservation. */
+	if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS &&
+	    rg_mblk_search(rgd, ip) != BFITNOENT)
+		return 1;
+	if (unclaimed_blocks(rgd) >= rs->rs_requested)
 		return 1;
-	return 0;
-}
 
-static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
-{
-	return (bi->bi_start * GFS2_NBBY) + blk;
+	return 0;
 }
 
 /**
@@ -1217,7 +1559,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrpd *rgd, *begin = NULL;
+	struct gfs2_rgrpd *begin = NULL;
 	struct gfs2_blkreserv *rs = ip->i_res;
 	int error = 0, rg_locked, flags = LM_FLAG_TRY;
 	u64 last_unlinked = NO_BLOCK;
@@ -1225,32 +1567,40 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 
 	if (sdp->sd_args.ar_rgrplvb)
 		flags |= GL_SKIP;
-	rs = ip->i_res;
 	rs->rs_requested = requested;
 	if (gfs2_assert_warn(sdp, requested)) {
 		error = -EINVAL;
 		goto out;
 	}
-
-	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
-		rgd = begin = ip->i_rgd;
-	else
-		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
-
-	if (rgd == NULL)
+	if (gfs2_rs_active(rs)) {
+		begin = rs->rs_rgd;
+		flags = 0; /* Yoda: Do or do not. There is no try */
+	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
+		rs->rs_rgd = begin = ip->i_rgd;
+	} else {
+		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
+	}
+	if (rs->rs_rgd == NULL)
 		return -EBADSLT;
 
 	while (loops < 3) {
 		rg_locked = 0;
 
-		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
+		if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
 			rg_locked = 1;
 			error = 0;
+		} else if (!loops && !gfs2_rs_active(rs) &&
+			   rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
+			/* If the rgrp already is maxed out for contenders,
+			   we can eliminate it as a "first pass" without even
+			   requesting the rgrp glock. */
+			error = GLR_TRYFAILED;
 		} else {
-			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
-						   flags, &rs->rs_rgd_gh);
+			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
+						   LM_ST_EXCLUSIVE, flags,
+						   &rs->rs_rgd_gh);
 			if (!error && sdp->sd_args.ar_rgrplvb) {
-				error = update_rgrp_lvb(rgd);
+				error = update_rgrp_lvb(rs->rs_rgd);
 				if (error) {
 					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 					return error;
@@ -1259,24 +1609,36 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 		}
 		switch (error) {
 		case 0:
-			if (try_rgrp_fit(rgd, ip)) {
+			if (gfs2_rs_active(rs)) {
+				if (unclaimed_blocks(rs->rs_rgd) +
+				    rs->rs_free >= rs->rs_requested) {
+					ip->i_rgd = rs->rs_rgd;
+					return 0;
+				}
+				/* We have a multi-block reservation, but the
+				   rgrp doesn't have enough free blocks to
+				   satisfy the request. Free the reservation
+				   and look for a suitable rgrp. */
+				gfs2_rs_deltree(rs);
+			}
+			if (try_rgrp_fit(rs->rs_rgd, ip)) {
 				if (sdp->sd_args.ar_rgrplvb)
-					gfs2_rgrp_bh_get(rgd);
-				ip->i_rgd = rgd;
+					gfs2_rgrp_bh_get(rs->rs_rgd);
+				ip->i_rgd = rs->rs_rgd;
 				return 0;
 			}
-			if (rgd->rd_flags & GFS2_RDF_CHECK) {
+			if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
 				if (sdp->sd_args.ar_rgrplvb)
-					gfs2_rgrp_bh_get(rgd);
-				try_rgrp_unlink(rgd, &last_unlinked,
+					gfs2_rgrp_bh_get(rs->rs_rgd);
+				try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
 						ip->i_no_addr);
 			}
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 			/* fall through */
 		case GLR_TRYFAILED:
-			rgd = gfs2_rgrpd_get_next(rgd);
-			if (rgd != begin) /* If we didn't wrap */
+			rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
+			if (rs->rs_rgd != begin) /* If we didn't wrap */
 				break;
 
 			flags &= ~LM_FLAG_TRY;
@@ -1314,6 +1676,12 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 {
 	struct gfs2_blkreserv *rs = ip->i_res;
 
+	if (!rs)
+		return;
+
+	if (!rs->rs_free)
+		gfs2_rs_deltree(rs);
+
 	if (rs->rs_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 	rs->rs_requested = 0;
@@ -1412,7 +1780,27 @@ do_search:
 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
 			buffer = bi->bi_clone + bi->bi_offset;
 
-		biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
+		while (1) {
+			struct gfs2_blkreserv *rs;
+			u32 rgblk;
+
+			biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
+			if (biblk == BFITNOENT)
+				break;
+			/* Check if this block is reserved() */
+			rgblk = gfs2_bi2rgd_blk(bi, biblk);
+			rs = rs_find(rgd, rgblk);
+			if (rs == NULL)
+				break;
+
+			BUG_ON(rs->rs_bi != bi);
+			biblk = BFITNOENT;
+			/* This should jump to the first block after the
+			   reservation. */
+			goal = rs->rs_biblk + rs->rs_free;
+			if (goal >= bi->bi_len * GFS2_NBBY)
+				break;
+		}
 		if (biblk != BFITNOENT)
 			break;
 
@@ -1448,8 +1836,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
 			     u32 blk, bool dinode, unsigned int *n)
 {
 	const unsigned int elen = *n;
-	u32 goal;
+	u32 goal, rgblk;
 	const u8 *buffer = NULL;
+	struct gfs2_blkreserv *rs;
 
 	*n = 0;
 	buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1462,6 +1851,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
 		goal++;
 		if (goal >= (bi->bi_len * GFS2_NBBY))
 			break;
+		rgblk = gfs2_bi2rgd_blk(bi, goal);
+		rs = rs_find(rgd, rgblk);
+		if (rs) /* Oops, we bumped into someone's reservation */
+			break;
 		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
 		    GFS2_BLKST_FREE)
 			break;
@@ -1537,12 +1930,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 
 int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 {
-	const struct gfs2_rgrpd *rgd = gl->gl_object;
+	struct gfs2_rgrpd *rgd = gl->gl_object;
+	struct gfs2_blkreserv *trs;
+	const struct rb_node *n;
+
 	if (rgd == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
-		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
+		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
+		       rgd->rd_reserved);
+	spin_lock(&rgd->rd_rsspin);
+	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
+		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		dump_rs(seq, trs);
+	}
+	spin_unlock(&rgd->rd_rsspin);
 	return 0;
 }
 
@@ -1557,10 +1960,75 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
 }
 
 /**
+ * claim_reserved_blks - Claim previously reserved blocks
+ * @ip: the inode that's claiming the reservation
+ * @dinode: 1 if this block is a dinode block, otherwise data block
+ * @nblocks: desired extent length
+ *
+ * Lay claim to previously allocated block reservation blocks.
+ * Returns: Starting block number of the blocks claimed.
+ * Sets *nblocks to the actual extent length allocated.
+ */
+static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
+			       unsigned int *nblocks)
+{
+	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_rgrpd *rgd = rs->rs_rgd;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_bitmap *bi;
+	u32 biblk;
+	u64 start_block = 0, fsblock;
+	const unsigned int elen = *nblocks;
+
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
+	gfs2_assert_withdraw(sdp, rgd);
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
+	bi = rs->rs_bi;
+	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+
+	for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
+		biblk = rs->rs_biblk;
+
+		fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
+		/* Check if the next free block is not contiguous with
+		   the others in the extent. */
+		if (start_block && fsblock != start_block + (*nblocks))
+			break;
+
+		rs->rs_biblk++;
+		rs->rs_free--;
+		/* Make sure the bitmap hasn't changed */
+		gfs2_setbit(rgd, bi->bi_clone, bi, biblk,
+			    dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
+
+		BUG_ON(!rgd->rd_reserved);
+		rgd->rd_reserved--;
+		if (!start_block) {
+			start_block = fsblock;
+			dinode = false;
+		}
+		trace_gfs2_rs(ip, rs, fsblock, TRACE_RS_CLAIM);
+	}
+
+	if (!rs->rs_free) {
+		struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
+
+		gfs2_rs_deltree(rs);
+		/* -nblocks because we haven't returned to do the math yet.
+		   I'm doing the math backwards to prevent negative numbers,
+		   but think of it as:
+		   if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */
+		if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
+			rg_mblk_search(rgd, ip);
+	}
+	return start_block;
+}
+
+/**
  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
  * @ip: the inode to allocate the block for
  * @bn: Used to return the starting block number
- * @ndata: requested number of blocks/extent length (value/result)
+ * @nblocks: requested number of blocks/extent length (value/result)
  * @dinode: 1 if we're allocating a dinode block, else 0
  * @generation: the generation number of the inode
  *
@@ -1585,20 +2053,34 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 	if (ip->i_res->rs_requested == 0)
 		return -ECANCELED;
 
-	rgd = ip->i_rgd;
-
-	if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
-		goal = ip->i_goal - rgd->rd_data0;
-	else
-		goal = rgd->rd_last_alloc;
-
-	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
+	/* Check if we have a multi-block reservation, and if so, claim the
+	   next free block from it. */
+	if (gfs2_rs_active(ip->i_res)) {
+		BUG_ON(!ip->i_res->rs_free);
+		rgd = ip->i_res->rs_rgd;
+		block = claim_reserved_blks(ip, dinode, nblocks);
+	} else {
+		rgd = ip->i_rgd;
 
-	/* Since all blocks are reserved in advance, this shouldn't happen */
-	if (blk == BFITNOENT)
-		goto rgrp_error;
+		if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
+			goal = ip->i_goal - rgd->rd_data0;
+		else
+			goal = rgd->rd_last_alloc;
+
+		blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
+
+		/* Since all blocks are reserved in advance, this shouldn't
+		   happen */
+		if (blk == BFITNOENT) {
+			printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
+			       *nblocks);
+			printk(KERN_WARNING "FULL=%d\n",
+			       test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
+			goto rgrp_error;
+		}
 
-	block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
+		block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
+	}
 	ndata = *nblocks;
 	if (dinode)
 		ndata--;
@@ -1615,8 +2097,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 			brelse(dibh);
 		}
 	}
-	if (rgd->rd_free < *nblocks)
+	if (rgd->rd_free < *nblocks) {
+		printk(KERN_WARNING "nblocks=%u\n", *nblocks);
 		goto rgrp_error;
+	}
 
 	rgd->rd_free -= *nblocks;
 	if (dinode) {
@@ -1672,6 +2156,9 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
 		return;
 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
 	rgd->rd_free += blen;
+	if (bstart < rgd->rd_next_rsrv)
+		rgd->rd_next_rsrv = bstart;
+
 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5d8314d..95b00d5 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -13,6 +13,14 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 
+/* Since each block in the file system is represented by two bits in the
+ * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
+ * By reserving 32 blocks at a time, we can optimize / shortcut how we search
+ * through the bitmaps by looking a word at a time.
+ */
+#define RGRP_RSRV_MINBYTES 8
+#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
+
 struct gfs2_rgrpd;
 struct gfs2_sbd;
 struct gfs2_holder;
@@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
 extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
 extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
 
+extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+
 extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
@@ -36,6 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
 			     bool dinode, u64 *generation);
 
 extern int gfs2_rs_alloc(struct gfs2_inode *ip);
+extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
 extern void gfs2_rs_delete(struct gfs2_inode *ip);
 extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
@@ -62,7 +73,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
 extern int gfs2_fitrim(struct file *filp, void __user *argp);
 
-/* This is how to tell if a reservation is "inplace" reserved: */
+/* This is how to tell if a multi-block reservation is "inplace" reserved: */
 static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
 {
 	if (ip->i_res && ip->i_res->rs_requested)
@@ -70,4 +81,12 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
 	return 0;
 }
 
+/* This is how to tell if a multi-block reservation is in the rgrp tree: */
+static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
+{
+	if (rs && rs->rs_start)
+		return 1;
+	return 0;
+}
+
 #endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7880687..b1502c4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 		return -EIO;
 	}
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		return error;
@@ -1550,6 +1554,9 @@ out_truncate:
 
 out_unlock:
 	/* Error path for case 1 */
+	if (gfs2_rs_active(ip->i_res))
+		gfs2_rs_deltree(ip->i_res);
+
 	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
 		gfs2_glock_dq(&ip->i_iopen_gh);
 	gfs2_holder_uninit(&ip->i_iopen_gh);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 1b8b815..98fd480 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -31,6 +31,17 @@
 			    { GFS2_BLKST_DINODE, "dinode" },	\
 			    { GFS2_BLKST_UNLINKED, "unlinked" })
 
+#define TRACE_RS_DELETE  0
+#define TRACE_RS_TREEDEL 1
+#define TRACE_RS_INSERT  2
+#define TRACE_RS_CLAIM   3
+
+#define rs_func_name(x) __print_symbolic(x,	\
+					 { 0, "del " },	\
+					 { 1, "tdel" },	\
+					 { 2, "ins " },	\
+					 { 3, "clm " })
+
 #define show_glock_flags(flags) __print_flags(flags, "",	\
 	{(1UL << GLF_LOCK),			"l" },		\
 	{(1UL << GLF_DEMOTE),			"D" },		\
@@ -470,6 +481,7 @@ TRACE_EVENT(gfs2_block_alloc,
 		__field(	u8,	block_state		)
 		__field(        u64,	rd_addr			)
 		__field(        u32,	rd_free_clone		)
+		__field(	u32,	rd_reserved		)
 	),
 
 	TP_fast_assign(
@@ -480,16 +492,64 @@ TRACE_EVENT(gfs2_block_alloc,
 		__entry->block_state	= block_state;
 		__entry->rd_addr	= rgd->rd_addr;
 		__entry->rd_free_clone	= rgd->rd_free_clone;
+		__entry->rd_reserved	= rgd->rd_reserved;
 	),
 
-	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u",
+	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long long)__entry->inum,
 		  (unsigned long long)__entry->start,
 		  (unsigned long)__entry->len,
 		  block_state_name(__entry->block_state),
 		  (unsigned long long)__entry->rd_addr,
-		  __entry->rd_free_clone)
+		  __entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
+);
+
+/* Keep track of multi-block reservations as they are allocated/freed */
+TRACE_EVENT(gfs2_rs,
+
+	TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs,
+		 u64 claimed, u8 func),
+
+	TP_ARGS(ip, rs, claimed, func),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	u64,	rd_addr			)
+		__field(	u32,	rd_free_clone		)
+		__field(	u32,	rd_reserved		)
+		__field(	u64,	inum			)
+		__field(	u64,	start			)
+		__field(	u32,	len			)
+		__field(	u64,	claimed			)
+		__field(	u32,	free			)
+		__field(	u8,	func			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0;
+		__entry->rd_addr	= rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
+		__entry->rd_free_clone	= rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
+		__entry->rd_reserved	= rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
+		__entry->inum		= ip ? ip->i_no_addr : 0;
+		__entry->start		= rs->rs_start;
+		__entry->len		= rs->rs_len;
+		__entry->claimed	= claimed;
+		__entry->free		= rs->rs_free;
+		__entry->func		= func;
+	),
+
+	TP_printk("%u,%u bmap %llu resrv %llu/%lu rg:%llu rf:%lu rr:%lu %s "
+		  "c:%llu f:%lu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->inum,
+		  (unsigned long long)__entry->start,
+		  (unsigned long)__entry->len,
+		  (unsigned long long)__entry->rd_addr,
+		  (unsigned long)__entry->rd_free_clone,
+		  (unsigned long)__entry->rd_reserved,
+		  rs_func_name(__entry->func),
+		  __entry->claimed, (unsigned long)__entry->free)
 );
 
 #endif /* _TRACE_GFS2_H */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 523c0de..27a0b4a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -327,6 +327,10 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 {
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		goto out_alloc;
@@ -710,6 +714,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	struct buffer_head *dibh;
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_lock_check(ip);
 	if (error)
 		return error;
@@ -1483,6 +1491,10 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
 {
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		return error;



^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 4] GFS2: Reduce file fragmentation
  2012-07-12 19:28         ` [Cluster-devel] [GFS2 Patch] [Try 4] " Bob Peterson
@ 2012-07-13  9:36           ` Steven Whitehouse
  2012-07-13 12:55             ` Bob Peterson
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Whitehouse @ 2012-07-13  9:36 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

On Thu, 2012-07-12 at 15:28 -0400, Bob Peterson wrote:
> ----- Original Message -----
> | I've found a workload that seems to do something a bit odd...
> | 
> | 
> | [root at chywoon /]# for i in `seq 0 9`; do dd if=/dev/zero of=file.${i}
> | bs=4096 count=1; done
> | 
> | (i.e. create 10 single block files, sequentially)
> | 
> | Then I looked at the allocation pattern using stat & filefrag. We get
> | 10
> | inodes, at positions: 33150, 33151, 33152, 33153, 33154, 33155,
> | 33156,
> | 33157, 33158, 33159 (so sequential with each other)
> | 
> | The lets look at where the data lands up: 33302, 33462, 33622, 33782,
> | 33942, 34102, 34262, 34422, 34582, 34742 (respectively for each
> | inode)
> | 
> | What I'd hope to see is that the data blocks are being allocated
> | contiguously with the inodes, at least in most cases. The data blocks
> | appear to be spaced out with gaps of 160 blocks between them. This
> | was
> | on newly created filesystem, btw, so there should be nothing to get
> | in
> | the way of the allocations.
> | 
> | Given that directories don't have reservations, and the reservations
> | relating to the individual inodes that are being created should be
> | cleared on close (i.e. before the subsequent creation) this should
> | not
> | have changed in behaviour from the current kernels,
> | 
> | Steve.
> 
> Hi,
> 
> This was not unexpected behavior, but I see the merits in making the
> allocations for the dinode follow the dinode when it's created.
> 
I have to say that worries me when you say that a patch which is
designed to reduce fragmentation is not unexpected to produce results
like that. We've got to be very careful here not to make things worse in
such cases, otherwise we are not making progress.

> This fourth version of the patch addresses the problem by allowing
> directories to have allocations, like files, but when a file is
> created, the allocation is moved to the new gfs2_inode. That solves
> the problem of preventing "holes" in the directory's reservation, plus
> allows future block allocations to follow in line. This continuity
> may allow it to be somewhat faster than the previous version. Thanks,
> Steve!
> 
Yes, that would be interesting to know. I'll have a look in more detail
a bit later, but some comments follow....

[snip]

> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 2b035e0..d4e6882 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
>  	int error;
>  
>  	munge_mode_uid_gid(dip, &mode, &uid, &gid);
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
>  
>  	error = gfs2_quota_lock(dip, uid, gid);
>  	if (error)
> @@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
>  	struct buffer_head *dibh;
>  	int error;
>  
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		goto fail;
> @@ -596,7 +603,8 @@ fail_end_trans:
>  	gfs2_trans_end(sdp);
>  
>  fail_ipreserv:
> -	gfs2_inplace_release(dip);
> +	if (alloc_required)
> +		gfs2_inplace_release(dip);
>  
>  fail_quota_locks:
>  	gfs2_quota_unlock(dip);
> @@ -647,7 +655,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
>  	const struct qstr *name = &dentry->d_name;
>  	struct gfs2_holder ghs[2];
>  	struct inode *inode = NULL;
> -	struct gfs2_inode *dip = GFS2_I(dir);
> +	struct gfs2_inode *dip = GFS2_I(dir), *ip;
>  	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
>  	struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
>  	int error;
> @@ -694,24 +702,35 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
>  	if (IS_ERR(inode))
>  		goto fail_gunlock2;
>  
> -	error = gfs2_inode_refresh(GFS2_I(inode));
> +	ip = GFS2_I(inode);
> +	error = gfs2_inode_refresh(ip);
>  	if (error)
>  		goto fail_gunlock2;
>  
> -	/* the new inode needs a reservation so it can allocate xattrs. */
> -	error = gfs2_rs_alloc(GFS2_I(inode));
> -	if (error)
> -		goto fail_gunlock2;
> +	/* Tricky: The newly created inode needs a reservation so it can
> +	   allocate xattrs. At the same time, we don't want the directory
> +	   to retain its reservation, and here's why: With directories, items
> +	   are often created and deleted in the directory in the same breath,
> +	   which can create "holes" in the reservation. By holes I mean that
> +	   your next "claim" may not be the next free block in the reservation.
> +	   In other words, we could get into situations where two or more
> +	   blocks are reserved, then used, then one or more of the earlier
> +	   blocks is freed. When we delete the reservation, the rs_free
> +	   will be off due to the hole, so the rgrp's rg_free count can get
> +	   off. The solution is that we transfer ownership of the reservation
> +	   from the directory to the new inode. */
> +	ip->i_res = dip->i_res;
> +	dip->i_res = NULL;

This comment still doesn't make sense to me. What are these operations
that are freeing up blocks in the directory? There should be no blocks
freed in a directory unless we deallocate the entire directory at the
moment.

This is really just a bug. We need to create the new inodes earlier so
that we can use their reservations when allocating them "on disk". Then
the directory and new inode reservations can be entirely separate.

The current problem is that we are using the directory's allocation
state to allocate new inodes, which is wrong and causes other problems
too. It has been on my list to fix, but it is complicated - we will have
to address it at some stage in order to make the Orlov allocator work
though,

Steve.




^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 4] GFS2: Reduce file fragmentation
  2012-07-13  9:36           ` Steven Whitehouse
@ 2012-07-13 12:55             ` Bob Peterson
  2012-07-13 13:49               ` Steven Whitehouse
  0 siblings, 1 reply; 13+ messages in thread
From: Bob Peterson @ 2012-07-13 12:55 UTC (permalink / raw)
  To: cluster-devel.redhat.com

----- Original Message -----
| > allows future block allocations to follow in line. This continuity
| > may allow it to be somewhat faster than the previous version.
| > Thanks,
| > Steve!
| > 
| Yes, that would be interesting to know. I'll have a look in more
| detail
| a bit later, but some comments follow....

Preliminary results are in: In the hour-long test I've been using,
this improvement shaves about 4 minutes off (just under 1 percent).

| > +	/* Tricky: The newly created inode needs a reservation so it can
| > +	   allocate xattrs. At the same time, we don't want the directory
| > +	   to retain its reservation, and here's why: With directories,
| > items
| > +	   are often created and deleted in the directory in the same
| > breath,
| > +	   which can create "holes" in the reservation. By holes I mean
| > that
| > +	   your next "claim" may not be the next free block in the
| > reservation.
| > +	   In other words, we could get into situations where two or more
| > +	   blocks are reserved, then used, then one or more of the
| > earlier
| > +	   blocks is freed. When we delete the reservation, the rs_free
| > +	   will be off due to the hole, so the rgrp's rg_free count can
| > get
| > +	   off. The solution is that we transfer ownership of the
| > reservation
| > +	   from the directory to the new inode. */
| 
| This comment still doesn't make sense to me. What are these
| operations
| that are freeing up blocks in the directory? There should be no
| blocks
| freed in a directory unless we deallocate the entire directory at the
| moment.

Again, the "holes" I'm talking about are in the reservation, not in the
directory. Suppose you have an empty directory and "touch" files
a,b,c,d and e. Suppose the directory gets a multi-block reservation of
8 blocks for those allocations. After the 5 files are created, the
directory's reservation has rs_start=S, rs_len=8, and rs_free=3.
The bitmap representing those dinodes, which corresponds to the
reservation, looks something like this: 11 11 11 11 11 00 00 00.

Now suppose you delete file "b". The directory's blocks won't change,
nor will its hash table. However, the dinode for "b" will be deleted
and the corresponding bitmap for the dinodes will then look something like:
11 00 11 11 11 00 00 00. The corresponding reservation will have:
rs_start=S, rs_len=8 and rs_free=4.

The problem is if you now create file f in that directory,
it essentially "claims" the blocks at rs_start + rs_len - rs_free,
but S + 8 - 4 = S + 4, and that block is already claimed by file "e".

The alternative, as I stated in an earlier email, is to make the
starting block, S, a moving target, adjusting it with each allocation.
In that case, block S + 1, which was freed when file b was deleted,
will be "left behind" and add to the fragmentation.

We can't just keep marching rs_free forward because then we get into
rgrp accounting problems if/when the reservation is freed and has
unclaimed blocks that we need to return to the pool of blocks in the rgrp.
 
| This is really just a bug. We need to create the new inodes earlier
| so
| that we can use their reservations when allocating them "on disk".
| Then
| the directory and new inode reservations can be entirely separate.
| 
| The current problem is that we are using the directory's allocation
| state to allocate new inodes, which is wrong and causes other
| problems
| too. It has been on my list to fix, but it is complicated - we will
| have
| to address it at some stage in order to make the Orlov allocator work
| though,

I don't understand what you're saying about this being a bug,
nor what needs to be fixed. Can you elaborate?

Regards,

Bob Peterson
Red Hat File Systems



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 4] GFS2: Reduce file fragmentation
  2012-07-13 12:55             ` Bob Peterson
@ 2012-07-13 13:49               ` Steven Whitehouse
  2012-07-18 16:26                 ` [Cluster-devel] [GFS2 Patch] [Try 5] " Bob Peterson
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Whitehouse @ 2012-07-13 13:49 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

On Fri, 2012-07-13 at 08:55 -0400, Bob Peterson wrote:
> ----- Original Message -----
> | > allows future block allocations to follow in line. This continuity
> | > may allow it to be somewhat faster than the previous version.
> | > Thanks,
> | > Steve!
> | > 
> | Yes, that would be interesting to know. I'll have a look in more
> | detail
> | a bit later, but some comments follow....
> 
> Preliminary results are in: In the hour-long test I've been using,
> this improvement shaves about 4 minutes off (just under 1 percent).
> 
Sounds good

> | > +	/* Tricky: The newly created inode needs a reservation so it can
> | > +	   allocate xattrs. At the same time, we don't want the directory
> | > +	   to retain its reservation, and here's why: With directories,
> | > items
> | > +	   are often created and deleted in the directory in the same
> | > breath,
> | > +	   which can create "holes" in the reservation. By holes I mean
> | > that
> | > +	   your next "claim" may not be the next free block in the
> | > reservation.
> | > +	   In other words, we could get into situations where two or more
> | > +	   blocks are reserved, then used, then one or more of the
> | > earlier
> | > +	   blocks is freed. When we delete the reservation, the rs_free
> | > +	   will be off due to the hole, so the rgrp's rg_free count can
> | > get
> | > +	   off. The solution is that we transfer ownership of the
> | > reservation
> | > +	   from the directory to the new inode. */
> | 
> | This comment still doesn't make sense to me. What are these
> | operations
> | that are freeing up blocks in the directory? There should be no
> | blocks
> | freed in a directory unless we deallocate the entire directory at the
> | moment.
> 
> Again, the "holes" I'm talking about are in the reservation, not in the
> directory. Suppose you have an empty directory and "touch" files
> a,b,c,d and e. Suppose the directory gets a multi-block reservation of
> 8 blocks for those allocations. After the 5 files are created, the
> directory's reservation has rs_start=S, rs_len=8, and rs_free=3.
> The bitmap representing those dinodes, which corresponds to the
> reservation, looks something like this: 11 11 11 11 11 00 00 00.
> 
> Now suppose you delete file "b". The directory's blocks won't change,
> nor will its hash table. However, the dinode for "b" will be deleted
> and the corresponding bitmap for the dinodes will then look something like:
> 11 00 11 11 11 00 00 00. The corresponding reservation will have:
> rs_start=S, rs_len=8 and rs_free=4.
> 
This is where the problem lies.... If the reservation for the directory
is of length 8, then the dinode "b" has no business being anywhere
within those 8 blocks, since only directory blocks should be there. It
is a different inode and should have its own reservation and its own
allocation state. The directory's reservation should only be dealing
with blocks belonging to the directory.

The fact that the child inode is currently allocated using the parent
directory's allocation state is the bug I referred to in the last email.

Another problem is that we don't have a good way to predict how many
blocks a directory might grow to ahead of time, so it is a very tricky
task to create a sensibly sized reservation at directory creation time.
We don't want big gaps between the directory blocks and its contents if
the contents are just a few small files. If on the other hand, we have a
large directory, then we want to keep the directory blocks together in
large chunks as much as possible, and the child inodes together with
their indirect and data blocks following (or not too far, at least) from
the directory.

As the directory grows, we get more information about the likelihood of
further entries being added, so we can make larger reservations more
confidently. In fact making a reservation based upon the current size of
the directory might not be a bad plan, since we have little else to go
on.

Also, I don't mind leaving directory reservations until later - just so
long as we don't regress in the mean time. Whatever is easiest really I
guess.

For the Orlov allocator, we need to be able to select a random rgrp when
creating a new directory if the parent has the T flag set. So we must
have a separate allocation state for the child directory in order to be
able to do that.

> The problem is if you now create file f in that directory,
> it essentially "claims" the blocks at rs_start + rs_len - rs_free,
> but S + 8 - 4 = S + 4, and that block is already claimed by file "e".
> 
> The alternative, as I stated in an earlier email, is to make the
> starting block, S, a moving target, adjusting it with each allocation.
> In that case, block S + 1, which was freed when file b was deleted,
> will be "left behind" and add to the fragmentation.
> 
> We can't just keep marching rs_free forward because then we get into
> rgrp accounting problems if/when the reservation is freed and has
> unclaimed blocks that we need to return to the pool of blocks in the rgrp.

The reason that this happened is because the search started from the
wrong place. When looking for blocks to make a reservation, the starting
point always needs to be based on the existing location of the inode's
blocks (if we are extending the file, then the last block in the file).

Using an algorithm which marches forward in the rgrp will fail as soon
as we have a filesystem which has been in use for some period of time,
and thus this particular location could be pointing almost anywhere.
When the next request for allocation is made, we ought to be trying to
do it as close as possible to the existing inode's blocks, and not using
this point in the rgrp.

Your algorithm will work very well while you have a filesystem that is
being filled up from new, but it is not optimal on a filesystem that has
lots of obstacles already in place, due to previous allocations, and
where the rgrps current position pointer may be fairly randomly placed
within the rgrp.

That was why I mentioned the need to use i_goal as the starting point
rather than using a pointer in the rgrp in my previous comments, as it
tends to keep blocks for each inode together where possible. I know that
i_goal is not ideal (since it will not reflect whether the write is
extending the file or is at some random point inside it) but it is at
least in the right ball park. It would be better still to calculate the
goal block depending on the requested file offset, but that kind of
thing can come later.

I suspect that you are worried about the situation of filling up a
directory with files and then having to search through the reservations
and/or allocations of previous files for each new file to find some
empty space. That can be a problem - there are various ways to resolve
it. We could take a leaf from OCFS2's book and keep an extent cache
(your rbtree is already half way there) which would make it much quicker
to skip over the unuseable space. Another alternative is to keep the
rgrp pointer which moves forward, but to be prepared to reset it to the
goal block of the current inode if there is some event which might have
created a hole in the rgrp since its last use (i.e. glock has been
dropped on the rgrp or some deallocation has taken place). There may be
other solutions too.

Does that sound ok? I'd just like to try and ensure that we maintain
good performance even after the initial fill of the filesystem as much
as we possibly can,

Steve.




^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 5] GFS2: Reduce file fragmentation
  2012-07-13 13:49               ` Steven Whitehouse
@ 2012-07-18 16:26                 ` Bob Peterson
  2012-07-19 14:36                   ` Steven Whitehouse
  0 siblings, 1 reply; 13+ messages in thread
From: Bob Peterson @ 2012-07-18 16:26 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

This is my fifth attempt at posting my file defragmentation patch
for GFS2. This patch incorporates many of the things suggested by
Steve Whitehouse in past emails. Notes:

1. The reservations now keep track of starting point and free space,
   and the starting point advances as blocks are claimed. This eliminates
   a couple variables from the reservations structure and makes it
   shorter, and the code smaller.
2. I'm using a spin_lock now to protect the reservation tree rather
   than the mutex.
3. I'm not trying to implement an Orlov allocator for the directories
   for this iteration. I'm now keeping the inode blocks closer together
   by having the inode create function grab an allocation on behalf of
   the new inode, using the parent directory.

Description:

This patch reduces GFS2 file fragmentation by pre-reserving blocks.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson <rpeterso@redhat.com> 
---
 fs/gfs2/bmap.c       |    3 +
 fs/gfs2/file.c       |   24 +--
 fs/gfs2/incore.h     |   49 ++++-
 fs/gfs2/inode.c      |   37 +++-
 fs/gfs2/rgrp.c       |  576 +++++++++++++++++++++++++++++++++++++++++++++-----
 fs/gfs2/rgrp.h       |   31 +++-
 fs/gfs2/super.c      |    7 +
 fs/gfs2/trace_gfs2.h |   59 +++++-
 fs/gfs2/xattr.c      |   12 +
 9 files changed, 707 insertions(+), 91 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d957a8..49cd7dd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	if (error)
 		goto out_rlist;
 
+	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
+		gfs2_rs_deltree(ip->i_res);
+
 	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
 				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
 				 revokes);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6fbf3cb..9f94832 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		return ret;
 
+	atomic_set(&ip->i_res->rs_sizehint,
+		   PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
+
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
@@ -571,22 +574,15 @@ fail:
 
 static int gfs2_release(struct inode *inode, struct file *file)
 {
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-	struct gfs2_file *fp;
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	fp = file->private_data;
+	kfree(file->private_data);
 	file->private_data = NULL;
 
-	if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
+	if ((file->f_mode & FMODE_WRITE) &&
 	    (atomic_read(&inode->i_writecount) == 1))
 		gfs2_rs_delete(ip);
 
-	if (gfs2_assert_warn(sdp, fp))
-		return -EIO;
-
-	kfree(fp);
-
 	return 0;
 }
 
@@ -662,14 +658,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				   unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	size_t writesize = iov_length(iov, nr_segs);
 	struct dentry *dentry = file->f_dentry;
 	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_sbd *sdp;
 	int ret;
 
+	sdp = GFS2_SB(file->f_mapping->host);
 	ret = gfs2_rs_alloc(ip);
 	if (ret)
 		return ret;
 
+	atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
 	if (file->f_flags & O_APPEND) {
 		struct gfs2_holder gh;
 
@@ -795,6 +795,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	if (unlikely(error))
 		goto out_uninit;
 
+	atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
+
 	while (len > 0) {
 		if (len < bytes)
 			bytes = len;
@@ -803,10 +805,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 			offset += bytes;
 			continue;
 		}
-		error = gfs2_rindex_update(sdp);
-		if (error)
-			goto out_unlock;
-
 		error = gfs2_quota_lock_check(ip);
 		if (error)
 			goto out_unlock;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index dc73070..aaecc80 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -84,6 +84,7 @@ struct gfs2_rgrpd {
 	u32 rd_data;			/* num of data blocks in rgrp */
 	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
 	u32 rd_free;
+	u32 rd_reserved;                /* number of blocks reserved */
 	u32 rd_free_clone;
 	u32 rd_dinodes;
 	u64 rd_igeneration;
@@ -96,6 +97,9 @@ struct gfs2_rgrpd {
 #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
 #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
 #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
+	spinlock_t rd_rsspin;           /* protects reservation related vars */
+	struct rb_root rd_rstree;       /* multi-block reservation tree */
+	u32 rd_rs_cnt;                  /* count of current reservations */
 };
 
 enum gfs2_state_bits {
@@ -233,6 +237,38 @@ struct gfs2_holder {
 	unsigned long gh_ip;
 };
 
+/* Resource group multi-block reservation, in order of appearance:
+
+   Step 1. Function prepares to write, allocates a mb, sets the size hint.
+   Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
+   Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
+   Step 4. Bits are assigned from the rgrp based on either the reservation
+           or wherever it can.
+*/
+
+struct gfs2_blkreserv {
+	/* components used during write (step 1): */
+	atomic_t rs_sizehint;         /* hint of the write size */
+
+	/* components used during inplace_reserve (step 2): */
+	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
+
+	/* components used during get_local_rgrp (step 3): */
+	struct gfs2_rgrpd *rs_rgd;    /* pointer to the gfs2_rgrpd */
+	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
+	struct rb_node rs_node;       /* link to other block reservations */
+
+	/* components used during block searches and assignments (step 4): */
+	struct gfs2_bitmap *rs_bi;    /* bitmap for the current allocation */
+	u32 rs_biblk;                 /* start block relative to the bi */
+	u32 rs_free;                  /* how many blocks are still free */
+
+	/* ancillary quota stuff */
+	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
+	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
+	unsigned int rs_qa_qd_num;
+};
+
 enum {
 	GLF_LOCK			= 1,
 	GLF_DEMOTE			= 3,
@@ -290,16 +326,6 @@ struct gfs2_glock {
 
 #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
 
-struct gfs2_blkreserv {
-	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
-	struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
-
-	/* ancillary quota stuff */
-	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
-	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
-	unsigned int rs_qa_qd_num;
-};
-
 enum {
 	GIF_INVALID		= 0,
 	GIF_QD_LOCKED		= 1,
@@ -307,7 +333,6 @@ enum {
 	GIF_SW_PAGED		= 3,
 };
 
-
 struct gfs2_inode {
 	struct inode i_inode;
 	u64 i_no_addr;
@@ -318,7 +343,7 @@ struct gfs2_inode {
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
-	struct gfs2_blkreserv *i_res; /* resource group block reservation */
+	struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
 	struct gfs2_rgrpd *i_rgd;
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2b035e0..c53c67e 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	int error;
 
 	munge_mode_uid_gid(dip, &mode, &uid, &gid);
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
 
 	error = gfs2_quota_lock(dip, uid, gid);
 	if (error)
@@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 	struct buffer_head *dibh;
 	int error;
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		goto fail;
@@ -596,7 +603,8 @@ fail_end_trans:
 	gfs2_trans_end(sdp);
 
 fail_ipreserv:
-	gfs2_inplace_release(dip);
+	if (alloc_required)
+		gfs2_inplace_release(dip);
 
 fail_quota_locks:
 	gfs2_quota_unlock(dip);
@@ -647,7 +655,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	const struct qstr *name = &dentry->d_name;
 	struct gfs2_holder ghs[2];
 	struct inode *inode = NULL;
-	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_inode *dip = GFS2_I(dir), *ip;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
 	int error;
@@ -657,6 +665,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (!name->len || name->len > GFS2_FNAMESIZE)
 		return -ENAMETOOLONG;
 
+	/* We need a reservation to allocate the new dinode block. The
+	   directory ip temporarily points to the reservation, but this is
+	   being done to get a set of contiguous blocks for the new dinode.
+	   Since this is a create, we don't have a sizehint yet, so it will
+	   have to use the minimum reservation size. */
 	error = gfs2_rs_alloc(dip);
 	if (error)
 		return error;
@@ -694,24 +707,29 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
-	error = gfs2_inode_refresh(GFS2_I(inode));
+	ip = GFS2_I(inode);
+	error = gfs2_inode_refresh(ip);
 	if (error)
 		goto fail_gunlock2;
 
-	/* the new inode needs a reservation so it can allocate xattrs. */
-	error = gfs2_rs_alloc(GFS2_I(inode));
-	if (error)
-		goto fail_gunlock2;
+	/* The newly created inode needs a reservation so it can allocate
+	   xattrs. At the same time, we want new blocks allocated to the new
+	   dinode to be as contiguous as possible. Since we allocated the
+	   dinode block under the directory's reservation, we transfer
+	   ownership of that reservation to the new inode. The directory
+	   doesn't need a reservation unless it needs a new allocation. */
+	ip->i_res = dip->i_res;
+	dip->i_res = NULL;
 
 	error = gfs2_acl_create(dip, inode);
 	if (error)
 		goto fail_gunlock2;
 
-	error = gfs2_security_init(dip, GFS2_I(inode), name);
+	error = gfs2_security_init(dip, ip, name);
 	if (error)
 		goto fail_gunlock2;
 
-	error = link_dinode(dip, name, GFS2_I(inode));
+	error = link_dinode(dip, name, ip);
 	if (error)
 		goto fail_gunlock2;
 
@@ -738,6 +756,7 @@ fail_gunlock:
 		iput(inode);
 	}
 fail:
+	gfs2_rs_delete(dip);
 	if (bh)
 		brelse(bh);
 	return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index e53d0a1..ad37fe2 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -35,6 +35,9 @@
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
 
+#define RSRV_CONTENTION_FACTOR 4
+#define RGRP_RSRV_MAX_CONTENDERS 2
+
 #if BITS_PER_LONG == 32
 #define LBITMASK   (0x55555555UL)
 #define LBITSKIP55 (0x55555555UL)
@@ -178,6 +181,57 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
 }
 
 /**
+ * rs_cmp - multi-block reservation range compare
+ * @blk: absolute file system block number of the new reservation
+ * @len: number of blocks in the new reservation
+ * @rs: existing reservation to compare against
+ *
+ * returns: 1 if the block range is beyond the reach of the reservation
+ *         -1 if the block range is before the start of the reservation
+ *          0 if the block range overlaps with the reservation
+ */
+static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
+{
+	u64 startblk = gfs2_rs_startblk(rs);
+
+	if (blk >= startblk + rs->rs_free)
+		return 1;
+	if (blk + len - 1 < startblk)
+		return -1;
+	return 0;
+}
+
+/**
+ * rs_find - Find a rgrp multi-block reservation that contains a given block
+ * @rgd: The rgrp
+ * @rgblk: The block we're looking for, relative to the rgrp
+ */
+static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
+{
+	struct rb_node **newn;
+	int rc;
+	u64 fsblk = rgblk + rgd->rd_data0;
+
+	spin_lock(&rgd->rd_rsspin);
+	newn = &rgd->rd_rstree.rb_node;
+	while (*newn) {
+		struct gfs2_blkreserv *cur =
+			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+		rc = rs_cmp(fsblk, 1, cur);
+		if (rc < 0)
+			newn = &((*newn)->rb_left);
+		else if (rc > 0)
+			newn = &((*newn)->rb_right);
+		else {
+			spin_unlock(&rgd->rd_rsspin);
+			return cur;
+		}
+	}
+	spin_unlock(&rgd->rd_rsspin);
+	return NULL;
+}
+
+/**
  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
  *       a block in a given allocation state.
  * @buf: the buffer that holds the bitmaps
@@ -424,19 +478,93 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
 int gfs2_rs_alloc(struct gfs2_inode *ip)
 {
 	int error = 0;
+	struct gfs2_blkreserv *res;
+
+	if (ip->i_res)
+		return 0;
+
+	res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
+	if (!res)
+		error = -ENOMEM;
 
 	down_write(&ip->i_rw_mutex);
-	if (!ip->i_res) {
-		ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
-		if (!ip->i_res)
-			error = -ENOMEM;
-	}
+	if (ip->i_res)
+		kmem_cache_free(gfs2_rsrv_cachep, res);
+	else
+		ip->i_res = res;
 	up_write(&ip->i_rw_mutex);
 	return error;
 }
 
+static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
+{
+	gfs2_print_dbg(seq, "  r: %llu s:%llu b:%u f:%u\n",
+		       rs->rs_rgd->rd_addr, gfs2_rs_startblk(rs), rs->rs_biblk,
+		       rs->rs_free);
+}
+
 /**
- * gfs2_rs_delete - delete a reservation
+ * __rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+static void __rs_deltree(struct gfs2_blkreserv *rs)
+{
+	struct gfs2_rgrpd *rgd;
+
+	if (!gfs2_rs_active(rs))
+		return;
+
+	rgd = rs->rs_rgd;
+	/* We can't do this: The reason is that when the rgrp is invalidated,
+	   it's in the "middle" of acquiring the glock, but the HOLDER bit
+	   isn't set yet:
+	   BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
+	trace_gfs2_rs(NULL, rs, TRACE_RS_TREEDEL);
+
+	if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
+		rb_erase(&rs->rs_node, &rgd->rd_rstree);
+	BUG_ON(!rgd->rd_rs_cnt);
+	rgd->rd_rs_cnt--;
+
+	if (rs->rs_free) {
+		/* return reserved blocks to the rgrp and the ip */
+		BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
+		rs->rs_rgd->rd_reserved -= rs->rs_free;
+		rs->rs_free = 0;
+		clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
+		smp_mb__after_clear_bit();
+	}
+	/* We can't change any of the step 1 or step 2 components of the rs.
+	   E.g. We can't set rs_rgd to NULL because the rgd glock is held and
+	   dequeued through this pointer.
+	   Can't: atomic_set(&rs->rs_sizehint, 0);
+	   Can't: rs->rs_requested = 0;
+	   Can't: rs->rs_rgd = NULL;*/
+	rs->rs_bi = NULL;
+	rs->rs_biblk = 0;
+}
+
+/**
+ * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
+{
+	struct gfs2_rgrpd *rgd;
+
+	if (!gfs2_rs_active(rs))
+		return;
+
+	rgd = rs->rs_rgd;
+	spin_lock(&rgd->rd_rsspin);
+	__rs_deltree(rs);
+	spin_unlock(&rgd->rd_rsspin);
+}
+
+/**
+ * gfs2_rs_delete - delete a multi-block reservation
  * @ip: The inode for this reservation
  *
  */
@@ -444,12 +572,36 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
 {
 	down_write(&ip->i_rw_mutex);
 	if (ip->i_res) {
+		gfs2_rs_deltree(ip->i_res);
+		trace_gfs2_rs(ip, ip->i_res, TRACE_RS_DELETE);
+		BUG_ON(ip->i_res->rs_free);
 		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
 		ip->i_res = NULL;
 	}
 	up_write(&ip->i_rw_mutex);
 }
 
+/**
+ * return_all_reservations - return all reserved blocks back to the rgrp.
+ * @rgd: the rgrp that needs its space back
+ *
+ * We previously reserved a bunch of blocks for allocation. Now we need to
+ * give them back. This leave the reservation structures in tact, but removes
+ * all of their corresponding "no-fly zones".
+ */
+static void return_all_reservations(struct gfs2_rgrpd *rgd)
+{
+	struct rb_node *n;
+	struct gfs2_blkreserv *rs;
+
+	spin_lock(&rgd->rd_rsspin);
+	while ((n = rb_first(&rgd->rd_rstree))) {
+		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		__rs_deltree(rs);
+	}
+	spin_unlock(&rgd->rd_rsspin);
+}
+
 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
 {
 	struct rb_node *n;
@@ -472,6 +624,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
 
 		gfs2_free_clones(rgd);
 		kfree(rgd->rd_bits);
+		return_all_reservations(rgd);
 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
 	}
 }
@@ -649,6 +802,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
 	rgd->rd_data = be32_to_cpu(buf.ri_data);
 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
+	spin_lock_init(&rgd->rd_rsspin);
 
 	error = compute_bitstructs(rgd);
 	if (error)
@@ -1115,29 +1269,212 @@ out:
 }
 
 /**
+ * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
+ * @bi: the bitmap with the blocks
+ * @ip: the inode structure
+ * @biblk: the 32-bit block number relative to the start of the bitmap
+ * @amount: the number of blocks to reserve
+ *
+ * Returns: NULL - reservation was already taken, so not inserted
+ *          pointer to the inserted reservation
+ */
+static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
+				       struct gfs2_inode *ip, u32 biblk,
+				       int amount)
+{
+	struct rb_node **newn, *parent = NULL;
+	int rc;
+	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_rgrpd *rgd = rs->rs_rgd;
+	u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
+
+	spin_lock(&rgd->rd_rsspin);
+	newn = &rgd->rd_rstree.rb_node;
+	BUG_ON(!ip->i_res);
+	BUG_ON(gfs2_rs_active(rs));
+	/* Figure out where to put new node */
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
+	while (*newn) {
+		struct gfs2_blkreserv *cur =
+			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+
+		parent = *newn;
+		rc = rs_cmp(fsblock, amount, cur);
+		if (rc > 0)
+			newn = &((*newn)->rb_right);
+		else if (rc < 0)
+			newn = &((*newn)->rb_left);
+		else {
+			spin_unlock(&rgd->rd_rsspin);
+			return NULL; /* reservation already in use */
+		}
+	}
+
+	/* Do our reservation work */
+	rs = ip->i_res;
+	rs->rs_free = amount;
+	rs->rs_biblk = biblk;
+	rs->rs_bi = bi;
+	rb_link_node(&rs->rs_node, parent, newn);
+	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
+
+	/* Do our inode accounting for the reservation */
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
+
+	/* Do our rgrp accounting for the reservation */
+	rgd->rd_reserved += amount; /* blocks reserved */
+	rgd->rd_rs_cnt++; /* number of in-tree reservations */
+	spin_unlock(&rgd->rd_rsspin);
+	trace_gfs2_rs(ip, rs, TRACE_RS_INSERT);
+	return rs;
+}
+
+/**
+ * unclaimed_blocks - return number of blocks that aren't spoken for
+ */
+static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
+{
+	return rgd->rd_free_clone - rgd->rd_reserved;
+}
+
+/**
+ * rg_mblk_search - find a group of multiple free blocks
+ * @rgd: the resource group descriptor
+ * @rs: the block reservation
+ * @ip: pointer to the inode for which we're reserving blocks
+ *
+ * This is very similar to rgblk_search, except we're looking for whole
+ * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
+ * on aligned dwords for speed's sake.
+ *
+ * Returns: 0 if successful or BFITNOENT if there isn't enough free space
+ */
+
+static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+{
+	struct gfs2_bitmap *bi = rgd->rd_bits;
+	const u32 length = rgd->rd_length;
+	u32 blk;
+	unsigned int buf, x, search_bytes;
+	u8 *buffer = NULL;
+	u8 *ptr, *end, *nonzero;
+	u32 goal, rsv_bytes;
+	struct gfs2_blkreserv *rs;
+	u32 best_rs_bytes, unclaimed;
+	int best_rs_blocks;
+
+	/* Find bitmap block that contains bits for goal block */
+	if (rgrp_contains_block(rgd, ip->i_goal))
+		goal = ip->i_goal - rgd->rd_data0;
+	else
+		goal = rgd->rd_last_alloc;
+	for (buf = 0; buf < length; buf++) {
+		bi = rgd->rd_bits + buf;
+		/* Convert scope of "goal" from rgrp-wide to within
+		   found bit block */
+		if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
+			goal -= bi->bi_start * GFS2_NBBY;
+			goto do_search;
+		}
+	}
+	buf = 0;
+	goal = 0;
+
+do_search:
+	best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint),
+			       (RGRP_RSRV_MINBLKS * rgd->rd_length));
+	best_rs_bytes = (best_rs_blocks *
+			 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
+		GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
+	best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
+	unclaimed = unclaimed_blocks(rgd);
+	if (best_rs_bytes * GFS2_NBBY > unclaimed)
+		best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
+
+	for (x = 0; x <= length; x++) {
+		bi = rgd->rd_bits + buf;
+
+		if (test_bit(GBF_FULL, &bi->bi_flags))
+			goto skip;
+
+		WARN_ON(!buffer_uptodate(bi->bi_bh));
+		if (bi->bi_clone)
+			buffer = bi->bi_clone + bi->bi_offset;
+		else
+			buffer = bi->bi_bh->b_data + bi->bi_offset;
+
+		/* We have to keep the reservations aligned on u64 boundaries
+		   otherwise we could get situations where a byte can't be
+		   used because it's after a reservation, but a free bit still
+		   is within the reservation's area. */
+		ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
+		end = (buffer + bi->bi_len);
+		while (ptr < end) {
+			rsv_bytes = 0;
+			if ((ptr + best_rs_bytes) <= end)
+				search_bytes = best_rs_bytes;
+			else
+				search_bytes = end - ptr;
+			BUG_ON(!search_bytes);
+			nonzero = memchr_inv(ptr, 0, search_bytes);
+			/* If the lot is all zeroes, reserve the whole size. If
+			   there's enough zeroes to satisfy the request, use
+			   what we can. If there's not enough, keep looking. */
+			if (nonzero == NULL)
+				rsv_bytes = search_bytes;
+			else if ((nonzero - ptr) * GFS2_NBBY >=
+				 ip->i_res->rs_requested)
+				rsv_bytes = (nonzero - ptr);
+
+			if (rsv_bytes) {
+				blk = ((ptr - buffer) * GFS2_NBBY);
+				BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
+				rs = rs_insert(bi, ip, blk,
+					       rsv_bytes * GFS2_NBBY);
+				if (IS_ERR(rs))
+					return PTR_ERR(rs);
+				if (rs)
+					return 0;
+			}
+			ptr += ALIGN(search_bytes, sizeof(u64));
+		}
+skip:
+		/* Try next bitmap block (wrap back to rgrp header
+		   if at end) */
+		buf++;
+		buf %= length;
+		goal = 0;
+	}
+
+	return BFITNOENT;
+}
+
+/**
  * try_rgrp_fit - See if a given reservation will fit in a given RG
  * @rgd: the RG data
  * @ip: the inode
  *
  * If there's room for the requested blocks to be allocated from the RG:
+ * This will try to get a multi-block reservation first, and if that doesn't
+ * fit, it will take what it can.
  *
  * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
  */
 
-static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
+static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 {
-	const struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_blkreserv *rs = ip->i_res;
 
 	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
 		return 0;
-	if (rgd->rd_free_clone >= rs->rs_requested)
+	/* Look for a multi-block reservation. */
+	if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS &&
+	    rg_mblk_search(rgd, ip) != BFITNOENT)
+		return 1;
+	if (unclaimed_blocks(rgd) >= rs->rs_requested)
 		return 1;
-	return 0;
-}
 
-static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
-{
-	return (bi->bi_start * GFS2_NBBY) + blk;
+	return 0;
 }
 
 /**
@@ -1217,7 +1554,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrpd *rgd, *begin = NULL;
+	struct gfs2_rgrpd *begin = NULL;
 	struct gfs2_blkreserv *rs = ip->i_res;
 	int error = 0, rg_locked, flags = LM_FLAG_TRY;
 	u64 last_unlinked = NO_BLOCK;
@@ -1225,32 +1562,40 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 
 	if (sdp->sd_args.ar_rgrplvb)
 		flags |= GL_SKIP;
-	rs = ip->i_res;
 	rs->rs_requested = requested;
 	if (gfs2_assert_warn(sdp, requested)) {
 		error = -EINVAL;
 		goto out;
 	}
-
-	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
-		rgd = begin = ip->i_rgd;
-	else
-		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
-
-	if (rgd == NULL)
+	if (gfs2_rs_active(rs)) {
+		begin = rs->rs_rgd;
+		flags = 0; /* Yoda: Do or do not. There is no try */
+	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
+		rs->rs_rgd = begin = ip->i_rgd;
+	} else {
+		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
+	}
+	if (rs->rs_rgd == NULL)
 		return -EBADSLT;
 
 	while (loops < 3) {
 		rg_locked = 0;
 
-		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
+		if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
 			rg_locked = 1;
 			error = 0;
+		} else if (!loops && !gfs2_rs_active(rs) &&
+			   rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
+			/* If the rgrp already is maxed out for contenders,
+			   we can eliminate it as a "first pass" without even
+			   requesting the rgrp glock. */
+			error = GLR_TRYFAILED;
 		} else {
-			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
-						   flags, &rs->rs_rgd_gh);
+			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
+						   LM_ST_EXCLUSIVE, flags,
+						   &rs->rs_rgd_gh);
 			if (!error && sdp->sd_args.ar_rgrplvb) {
-				error = update_rgrp_lvb(rgd);
+				error = update_rgrp_lvb(rs->rs_rgd);
 				if (error) {
 					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 					return error;
@@ -1259,24 +1604,36 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
 		}
 		switch (error) {
 		case 0:
-			if (try_rgrp_fit(rgd, ip)) {
+			if (gfs2_rs_active(rs)) {
+				if (unclaimed_blocks(rs->rs_rgd) +
+				    rs->rs_free >= rs->rs_requested) {
+					ip->i_rgd = rs->rs_rgd;
+					return 0;
+				}
+				/* We have a multi-block reservation, but the
+				   rgrp doesn't have enough free blocks to
+				   satisfy the request. Free the reservation
+				   and look for a suitable rgrp. */
+				gfs2_rs_deltree(rs);
+			}
+			if (try_rgrp_fit(rs->rs_rgd, ip)) {
 				if (sdp->sd_args.ar_rgrplvb)
-					gfs2_rgrp_bh_get(rgd);
-				ip->i_rgd = rgd;
+					gfs2_rgrp_bh_get(rs->rs_rgd);
+				ip->i_rgd = rs->rs_rgd;
 				return 0;
 			}
-			if (rgd->rd_flags & GFS2_RDF_CHECK) {
+			if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
 				if (sdp->sd_args.ar_rgrplvb)
-					gfs2_rgrp_bh_get(rgd);
-				try_rgrp_unlink(rgd, &last_unlinked,
+					gfs2_rgrp_bh_get(rs->rs_rgd);
+				try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
 						ip->i_no_addr);
 			}
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 			/* fall through */
 		case GLR_TRYFAILED:
-			rgd = gfs2_rgrpd_get_next(rgd);
-			if (rgd != begin) /* If we didn't wrap */
+			rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
+			if (rs->rs_rgd != begin) /* If we didn't wrap */
 				break;
 
 			flags &= ~LM_FLAG_TRY;
@@ -1314,6 +1671,12 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 {
 	struct gfs2_blkreserv *rs = ip->i_res;
 
+	if (!rs)
+		return;
+
+	if (!rs->rs_free)
+		gfs2_rs_deltree(rs);
+
 	if (rs->rs_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 	rs->rs_requested = 0;
@@ -1412,7 +1775,27 @@ do_search:
 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
 			buffer = bi->bi_clone + bi->bi_offset;
 
-		biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
+		while (1) {
+			struct gfs2_blkreserv *rs;
+			u32 rgblk;
+
+			biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
+			if (biblk == BFITNOENT)
+				break;
+			/* Check if this block is reserved() */
+			rgblk = gfs2_bi2rgd_blk(bi, biblk);
+			rs = rs_find(rgd, rgblk);
+			if (rs == NULL)
+				break;
+
+			BUG_ON(rs->rs_bi != bi);
+			biblk = BFITNOENT;
+			/* This should jump to the first block after the
+			   reservation. */
+			goal = rs->rs_biblk + rs->rs_free;
+			if (goal >= bi->bi_len * GFS2_NBBY)
+				break;
+		}
 		if (biblk != BFITNOENT)
 			break;
 
@@ -1448,8 +1831,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
 			     u32 blk, bool dinode, unsigned int *n)
 {
 	const unsigned int elen = *n;
-	u32 goal;
+	u32 goal, rgblk;
 	const u8 *buffer = NULL;
+	struct gfs2_blkreserv *rs;
 
 	*n = 0;
 	buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1462,6 +1846,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
 		goal++;
 		if (goal >= (bi->bi_len * GFS2_NBBY))
 			break;
+		rgblk = gfs2_bi2rgd_blk(bi, goal);
+		rs = rs_find(rgd, rgblk);
+		if (rs) /* Oops, we bumped into someone's reservation */
+			break;
 		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
 		    GFS2_BLKST_FREE)
 			break;
@@ -1537,12 +1925,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 
 int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 {
-	const struct gfs2_rgrpd *rgd = gl->gl_object;
+	struct gfs2_rgrpd *rgd = gl->gl_object;
+	struct gfs2_blkreserv *trs;
+	const struct rb_node *n;
+
 	if (rgd == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
-		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
+		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
+		       rgd->rd_reserved);
+	spin_lock(&rgd->rd_rsspin);
+	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
+		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+		dump_rs(seq, trs);
+	}
+	spin_unlock(&rgd->rd_rsspin);
 	return 0;
 }
 
@@ -1557,10 +1955,63 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
 }
 
 /**
+ * claim_reserved_blks - Claim previously reserved blocks
+ * @ip: the inode that's claiming the reservation
+ * @dinode: 1 if this block is a dinode block, otherwise data block
+ * @nblocks: desired extent length
+ *
+ * Lay claim to previously allocated block reservation blocks.
+ * Returns: Starting block number of the blocks claimed.
+ * Sets *nblocks to the actual extent length allocated.
+ */
+static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
+			       unsigned int *nblocks)
+{
+	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_rgrpd *rgd = rs->rs_rgd;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_bitmap *bi;
+	u64 start_block = gfs2_rs_startblk(rs);
+	const unsigned int elen = *nblocks;
+
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
+	gfs2_assert_withdraw(sdp, rgd);
+	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
+	bi = rs->rs_bi;
+	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+
+	for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
+		/* Make sure the bitmap hasn't changed */
+		gfs2_setbit(rgd, bi->bi_clone, bi, rs->rs_biblk,
+			    dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
+		rs->rs_biblk++;
+		rs->rs_free--;
+
+		BUG_ON(!rgd->rd_reserved);
+		rgd->rd_reserved--;
+		dinode = false;
+		trace_gfs2_rs(ip, rs, TRACE_RS_CLAIM);
+	}
+
+	if (!rs->rs_free) {
+		struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
+
+		gfs2_rs_deltree(rs);
+		/* -nblocks because we haven't returned to do the math yet.
+		   I'm doing the math backwards to prevent negative numbers,
+		   but think of it as:
+		   if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */
+		if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
+			rg_mblk_search(rgd, ip);
+	}
+	return start_block;
+}
+
+/**
  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
  * @ip: the inode to allocate the block for
  * @bn: Used to return the starting block number
- * @ndata: requested number of blocks/extent length (value/result)
+ * @nblocks: requested number of blocks/extent length (value/result)
  * @dinode: 1 if we're allocating a dinode block, else 0
  * @generation: the generation number of the inode
  *
@@ -1585,20 +2036,34 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 	if (ip->i_res->rs_requested == 0)
 		return -ECANCELED;
 
-	rgd = ip->i_rgd;
-
-	if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
-		goal = ip->i_goal - rgd->rd_data0;
-	else
-		goal = rgd->rd_last_alloc;
-
-	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
+	/* Check if we have a multi-block reservation, and if so, claim the
+	   next free block from it. */
+	if (gfs2_rs_active(ip->i_res)) {
+		BUG_ON(!ip->i_res->rs_free);
+		rgd = ip->i_res->rs_rgd;
+		block = claim_reserved_blks(ip, dinode, nblocks);
+	} else {
+		rgd = ip->i_rgd;
 
-	/* Since all blocks are reserved in advance, this shouldn't happen */
-	if (blk == BFITNOENT)
-		goto rgrp_error;
+		if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
+			goal = ip->i_goal - rgd->rd_data0;
+		else
+			goal = rgd->rd_last_alloc;
+
+		blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
+
+		/* Since all blocks are reserved in advance, this shouldn't
+		   happen */
+		if (blk == BFITNOENT) {
+			printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
+			       *nblocks);
+			printk(KERN_WARNING "FULL=%d\n",
+			       test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
+			goto rgrp_error;
+		}
 
-	block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
+		block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
+	}
 	ndata = *nblocks;
 	if (dinode)
 		ndata--;
@@ -1615,8 +2080,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 			brelse(dibh);
 		}
 	}
-	if (rgd->rd_free < *nblocks)
+	if (rgd->rd_free < *nblocks) {
+		printk(KERN_WARNING "nblocks=%u\n", *nblocks);
 		goto rgrp_error;
+	}
 
 	rgd->rd_free -= *nblocks;
 	if (dinode) {
@@ -1876,6 +2343,7 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
 		for (x = 0; x < rlist->rl_rgrps; x++)
 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
 		kfree(rlist->rl_ghs);
+		rlist->rl_ghs = NULL;
 	}
 }
 
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5d8314d..ca6e267 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -13,6 +13,14 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 
+/* Since each block in the file system is represented by two bits in the
+ * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
+ * By reserving 32 blocks at a time, we can optimize / shortcut how we search
+ * through the bitmaps by looking a word at a time.
+ */
+#define RGRP_RSRV_MINBYTES 8
+#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
+
 struct gfs2_rgrpd;
 struct gfs2_sbd;
 struct gfs2_holder;
@@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
 extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
 extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
 
+extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+
 extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
@@ -36,6 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
 			     bool dinode, u64 *generation);
 
 extern int gfs2_rs_alloc(struct gfs2_inode *ip);
+extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
 extern void gfs2_rs_delete(struct gfs2_inode *ip);
 extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
@@ -62,7 +73,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
 extern int gfs2_fitrim(struct file *filp, void __user *argp);
 
-/* This is how to tell if a reservation is "inplace" reserved: */
+/* This is how to tell if a multi-block reservation is "inplace" reserved: */
 static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
 {
 	if (ip->i_res && ip->i_res->rs_requested)
@@ -70,4 +81,22 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
 	return 0;
 }
 
+/* This is how to tell if a multi-block reservation is in the rgrp tree: */
+static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
+{
+	if (rs && rs->rs_bi)
+		return 1;
+	return 0;
+}
+
+static inline u32 gfs2_bi2rgd_blk(const struct gfs2_bitmap *bi, u32 blk)
+{
+	return (bi->bi_start * GFS2_NBBY) + blk;
+}
+
+static inline u64 gfs2_rs_startblk(const struct gfs2_blkreserv *rs)
+{
+	return gfs2_bi2rgd_blk(rs->rs_bi, rs->rs_biblk) + rs->rs_rgd->rd_data0;
+}
+
 #endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7880687..b1502c4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 		return -EIO;
 	}
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		return error;
@@ -1550,6 +1554,9 @@ out_truncate:
 
 out_unlock:
 	/* Error path for case 1 */
+	if (gfs2_rs_active(ip->i_res))
+		gfs2_rs_deltree(ip->i_res);
+
 	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
 		gfs2_glock_dq(&ip->i_iopen_gh);
 	gfs2_holder_uninit(&ip->i_iopen_gh);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 1b8b815..a25c252 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -14,6 +14,7 @@
 #include <linux/ktime.h>
 #include "incore.h"
 #include "glock.h"
+#include "rgrp.h"
 
 #define dlm_state_name(nn) { DLM_LOCK_##nn, #nn }
 #define glock_trace_name(x) __print_symbolic(x,		\
@@ -31,6 +32,17 @@
 			    { GFS2_BLKST_DINODE, "dinode" },	\
 			    { GFS2_BLKST_UNLINKED, "unlinked" })
 
+#define TRACE_RS_DELETE  0
+#define TRACE_RS_TREEDEL 1
+#define TRACE_RS_INSERT  2
+#define TRACE_RS_CLAIM   3
+
+#define rs_func_name(x) __print_symbolic(x,	\
+					 { 0, "del " },	\
+					 { 1, "tdel" },	\
+					 { 2, "ins " },	\
+					 { 3, "clm " })
+
 #define show_glock_flags(flags) __print_flags(flags, "",	\
 	{(1UL << GLF_LOCK),			"l" },		\
 	{(1UL << GLF_DEMOTE),			"D" },		\
@@ -470,6 +482,7 @@ TRACE_EVENT(gfs2_block_alloc,
 		__field(	u8,	block_state		)
 		__field(        u64,	rd_addr			)
 		__field(        u32,	rd_free_clone		)
+		__field(	u32,	rd_reserved		)
 	),
 
 	TP_fast_assign(
@@ -480,16 +493,58 @@ TRACE_EVENT(gfs2_block_alloc,
 		__entry->block_state	= block_state;
 		__entry->rd_addr	= rgd->rd_addr;
 		__entry->rd_free_clone	= rgd->rd_free_clone;
+		__entry->rd_reserved	= rgd->rd_reserved;
 	),
 
-	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u",
+	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long long)__entry->inum,
 		  (unsigned long long)__entry->start,
 		  (unsigned long)__entry->len,
 		  block_state_name(__entry->block_state),
 		  (unsigned long long)__entry->rd_addr,
-		  __entry->rd_free_clone)
+		  __entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
+);
+
+/* Keep track of multi-block reservations as they are allocated/freed */
+TRACE_EVENT(gfs2_rs,
+
+	TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs,
+		 u8 func),
+
+	TP_ARGS(ip, rs, func),
+
+	TP_STRUCT__entry(
+		__field(        dev_t,  dev                     )
+		__field(	u64,	rd_addr			)
+		__field(	u32,	rd_free_clone		)
+		__field(	u32,	rd_reserved		)
+		__field(	u64,	inum			)
+		__field(	u64,	start			)
+		__field(	u32,	free			)
+		__field(	u8,	func			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0;
+		__entry->rd_addr	= rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
+		__entry->rd_free_clone	= rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
+		__entry->rd_reserved	= rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
+		__entry->inum		= ip ? ip->i_no_addr : 0;
+		__entry->start		= gfs2_rs_startblk(rs);
+		__entry->free		= rs->rs_free;
+		__entry->func		= func;
+	),
+
+	TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s "
+		  "f:%lu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long long)__entry->inum,
+		  (unsigned long long)__entry->start,
+		  (unsigned long long)__entry->rd_addr,
+		  (unsigned long)__entry->rd_free_clone,
+		  (unsigned long)__entry->rd_reserved,
+		  rs_func_name(__entry->func), (unsigned long)__entry->free)
 );
 
 #endif /* _TRACE_GFS2_H */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 523c0de..27a0b4a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -327,6 +327,10 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 {
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		goto out_alloc;
@@ -710,6 +714,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	struct buffer_head *dibh;
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_lock_check(ip);
 	if (error)
 		return error;
@@ -1483,6 +1491,10 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
 {
 	int error;
 
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
 		return error;



^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Cluster-devel] [GFS2 Patch] [Try 5] GFS2: Reduce file fragmentation
  2012-07-18 16:26                 ` [Cluster-devel] [GFS2 Patch] [Try 5] " Bob Peterson
@ 2012-07-19 14:36                   ` Steven Whitehouse
  0 siblings, 0 replies; 13+ messages in thread
From: Steven Whitehouse @ 2012-07-19 14:36 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

With a slight tweek to take account of Abhi's patch, this is now in the
-nmw tree. A really good milestone to have reached. We should now take
the time to give this a really good test, and make sure that there are
no other corner cases lurking anywhere,

Steve.

On Wed, 2012-07-18 at 12:26 -0400, Bob Peterson wrote:
> Hi,
> 
> This is my fifth attempt at posting my file defragmentation patch
> for GFS2. This patch incorporates many of the things suggested by
> Steve Whitehouse in past emails. Notes:
> 
> 1. The reservations now keep track of starting point and free space,
>    and the starting point advances as blocks are claimed. This eliminates
>    a couple variables from the reservations structure and makes it
>    shorter, and the code smaller.
> 2. I'm using a spin_lock now to protect the reservation tree rather
>    than the mutex.
> 3. I'm not trying to implement an Orlov allocator for the directories
>    for this iteration. I'm now keeping the inode blocks closer together
>    by having the inode create function grab an allocation on behalf of
>    the new inode, using the parent directory.
> 
> Description:
> 
> This patch reduces GFS2 file fragmentation by pre-reserving blocks.
> 
> Regards,
> 
> Bob Peterson
> Red Hat File Systems
> 
> Signed-off-by: Bob Peterson <rpeterso@redhat.com> 
> ---
>  fs/gfs2/bmap.c       |    3 +
>  fs/gfs2/file.c       |   24 +--
>  fs/gfs2/incore.h     |   49 ++++-
>  fs/gfs2/inode.c      |   37 +++-
>  fs/gfs2/rgrp.c       |  576 +++++++++++++++++++++++++++++++++++++++++++++-----
>  fs/gfs2/rgrp.h       |   31 +++-
>  fs/gfs2/super.c      |    7 +
>  fs/gfs2/trace_gfs2.h |   59 +++++-
>  fs/gfs2/xattr.c      |   12 +
>  9 files changed, 707 insertions(+), 91 deletions(-)
> 
> diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
> index 6d957a8..49cd7dd 100644
> --- a/fs/gfs2/bmap.c
> +++ b/fs/gfs2/bmap.c
> @@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
>  	if (error)
>  		goto out_rlist;
>  
> +	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
> +		gfs2_rs_deltree(ip->i_res);
> +
>  	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
>  				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
>  				 revokes);
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index 6fbf3cb..9f94832 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  	if (ret)
>  		return ret;
>  
> +	atomic_set(&ip->i_res->rs_sizehint,
> +		   PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
> +
>  	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
>  	ret = gfs2_glock_nq(&gh);
>  	if (ret)
> @@ -571,22 +574,15 @@ fail:
>  
>  static int gfs2_release(struct inode *inode, struct file *file)
>  {
> -	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
> -	struct gfs2_file *fp;
>  	struct gfs2_inode *ip = GFS2_I(inode);
>  
> -	fp = file->private_data;
> +	kfree(file->private_data);
>  	file->private_data = NULL;
>  
> -	if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
> +	if ((file->f_mode & FMODE_WRITE) &&
>  	    (atomic_read(&inode->i_writecount) == 1))
>  		gfs2_rs_delete(ip);
>  
> -	if (gfs2_assert_warn(sdp, fp))
> -		return -EIO;
> -
> -	kfree(fp);
> -
>  	return 0;
>  }
>  
> @@ -662,14 +658,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
>  				   unsigned long nr_segs, loff_t pos)
>  {
>  	struct file *file = iocb->ki_filp;
> +	size_t writesize = iov_length(iov, nr_segs);
>  	struct dentry *dentry = file->f_dentry;
>  	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
> +	struct gfs2_sbd *sdp;
>  	int ret;
>  
> +	sdp = GFS2_SB(file->f_mapping->host);
>  	ret = gfs2_rs_alloc(ip);
>  	if (ret)
>  		return ret;
>  
> +	atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
>  	if (file->f_flags & O_APPEND) {
>  		struct gfs2_holder gh;
>  
> @@ -795,6 +795,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
>  	if (unlikely(error))
>  		goto out_uninit;
>  
> +	atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
> +
>  	while (len > 0) {
>  		if (len < bytes)
>  			bytes = len;
> @@ -803,10 +805,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
>  			offset += bytes;
>  			continue;
>  		}
> -		error = gfs2_rindex_update(sdp);
> -		if (error)
> -			goto out_unlock;
> -
>  		error = gfs2_quota_lock_check(ip);
>  		if (error)
>  			goto out_unlock;
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index dc73070..aaecc80 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -84,6 +84,7 @@ struct gfs2_rgrpd {
>  	u32 rd_data;			/* num of data blocks in rgrp */
>  	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
>  	u32 rd_free;
> +	u32 rd_reserved;                /* number of blocks reserved */
>  	u32 rd_free_clone;
>  	u32 rd_dinodes;
>  	u64 rd_igeneration;
> @@ -96,6 +97,9 @@ struct gfs2_rgrpd {
>  #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
>  #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
>  #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
> +	spinlock_t rd_rsspin;           /* protects reservation related vars */
> +	struct rb_root rd_rstree;       /* multi-block reservation tree */
> +	u32 rd_rs_cnt;                  /* count of current reservations */
>  };
>  
>  enum gfs2_state_bits {
> @@ -233,6 +237,38 @@ struct gfs2_holder {
>  	unsigned long gh_ip;
>  };
>  
> +/* Resource group multi-block reservation, in order of appearance:
> +
> +   Step 1. Function prepares to write, allocates a mb, sets the size hint.
> +   Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
> +   Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
> +   Step 4. Bits are assigned from the rgrp based on either the reservation
> +           or wherever it can.
> +*/
> +
> +struct gfs2_blkreserv {
> +	/* components used during write (step 1): */
> +	atomic_t rs_sizehint;         /* hint of the write size */
> +
> +	/* components used during inplace_reserve (step 2): */
> +	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
> +
> +	/* components used during get_local_rgrp (step 3): */
> +	struct gfs2_rgrpd *rs_rgd;    /* pointer to the gfs2_rgrpd */
> +	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
> +	struct rb_node rs_node;       /* link to other block reservations */
> +
> +	/* components used during block searches and assignments (step 4): */
> +	struct gfs2_bitmap *rs_bi;    /* bitmap for the current allocation */
> +	u32 rs_biblk;                 /* start block relative to the bi */
> +	u32 rs_free;                  /* how many blocks are still free */
> +
> +	/* ancillary quota stuff */
> +	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
> +	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
> +	unsigned int rs_qa_qd_num;
> +};
> +
>  enum {
>  	GLF_LOCK			= 1,
>  	GLF_DEMOTE			= 3,
> @@ -290,16 +326,6 @@ struct gfs2_glock {
>  
>  #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
>  
> -struct gfs2_blkreserv {
> -	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
> -	struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
> -
> -	/* ancillary quota stuff */
> -	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
> -	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
> -	unsigned int rs_qa_qd_num;
> -};
> -
>  enum {
>  	GIF_INVALID		= 0,
>  	GIF_QD_LOCKED		= 1,
> @@ -307,7 +333,6 @@ enum {
>  	GIF_SW_PAGED		= 3,
>  };
>  
> -
>  struct gfs2_inode {
>  	struct inode i_inode;
>  	u64 i_no_addr;
> @@ -318,7 +343,7 @@ struct gfs2_inode {
>  	struct gfs2_glock *i_gl; /* Move into i_gh? */
>  	struct gfs2_holder i_iopen_gh;
>  	struct gfs2_holder i_gh; /* for prepare/commit_write only */
> -	struct gfs2_blkreserv *i_res; /* resource group block reservation */
> +	struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
>  	struct gfs2_rgrpd *i_rgd;
>  	u64 i_goal;	/* goal block for allocations */
>  	struct rw_semaphore i_rw_mutex;
> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 2b035e0..c53c67e 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
>  	int error;
>  
>  	munge_mode_uid_gid(dip, &mode, &uid, &gid);
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
>  
>  	error = gfs2_quota_lock(dip, uid, gid);
>  	if (error)
> @@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
>  	struct buffer_head *dibh;
>  	int error;
>  
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		goto fail;
> @@ -596,7 +603,8 @@ fail_end_trans:
>  	gfs2_trans_end(sdp);
>  
>  fail_ipreserv:
> -	gfs2_inplace_release(dip);
> +	if (alloc_required)
> +		gfs2_inplace_release(dip);
>  
>  fail_quota_locks:
>  	gfs2_quota_unlock(dip);
> @@ -647,7 +655,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
>  	const struct qstr *name = &dentry->d_name;
>  	struct gfs2_holder ghs[2];
>  	struct inode *inode = NULL;
> -	struct gfs2_inode *dip = GFS2_I(dir);
> +	struct gfs2_inode *dip = GFS2_I(dir), *ip;
>  	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
>  	struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
>  	int error;
> @@ -657,6 +665,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
>  	if (!name->len || name->len > GFS2_FNAMESIZE)
>  		return -ENAMETOOLONG;
>  
> +	/* We need a reservation to allocate the new dinode block. The
> +	   directory ip temporarily points to the reservation, but this is
> +	   being done to get a set of contiguous blocks for the new dinode.
> +	   Since this is a create, we don't have a sizehint yet, so it will
> +	   have to use the minimum reservation size. */
>  	error = gfs2_rs_alloc(dip);
>  	if (error)
>  		return error;
> @@ -694,24 +707,29 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
>  	if (IS_ERR(inode))
>  		goto fail_gunlock2;
>  
> -	error = gfs2_inode_refresh(GFS2_I(inode));
> +	ip = GFS2_I(inode);
> +	error = gfs2_inode_refresh(ip);
>  	if (error)
>  		goto fail_gunlock2;
>  
> -	/* the new inode needs a reservation so it can allocate xattrs. */
> -	error = gfs2_rs_alloc(GFS2_I(inode));
> -	if (error)
> -		goto fail_gunlock2;
> +	/* The newly created inode needs a reservation so it can allocate
> +	   xattrs. At the same time, we want new blocks allocated to the new
> +	   dinode to be as contiguous as possible. Since we allocated the
> +	   dinode block under the directory's reservation, we transfer
> +	   ownership of that reservation to the new inode. The directory
> +	   doesn't need a reservation unless it needs a new allocation. */
> +	ip->i_res = dip->i_res;
> +	dip->i_res = NULL;
>  
>  	error = gfs2_acl_create(dip, inode);
>  	if (error)
>  		goto fail_gunlock2;
>  
> -	error = gfs2_security_init(dip, GFS2_I(inode), name);
> +	error = gfs2_security_init(dip, ip, name);
>  	if (error)
>  		goto fail_gunlock2;
>  
> -	error = link_dinode(dip, name, GFS2_I(inode));
> +	error = link_dinode(dip, name, ip);
>  	if (error)
>  		goto fail_gunlock2;
>  
> @@ -738,6 +756,7 @@ fail_gunlock:
>  		iput(inode);
>  	}
>  fail:
> +	gfs2_rs_delete(dip);
>  	if (bh)
>  		brelse(bh);
>  	return error;
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index e53d0a1..ad37fe2 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -35,6 +35,9 @@
>  #define BFITNOENT ((u32)~0)
>  #define NO_BLOCK ((u64)~0)
>  
> +#define RSRV_CONTENTION_FACTOR 4
> +#define RGRP_RSRV_MAX_CONTENDERS 2
> +
>  #if BITS_PER_LONG == 32
>  #define LBITMASK   (0x55555555UL)
>  #define LBITSKIP55 (0x55555555UL)
> @@ -178,6 +181,57 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
>  }
>  
>  /**
> + * rs_cmp - multi-block reservation range compare
> + * @blk: absolute file system block number of the new reservation
> + * @len: number of blocks in the new reservation
> + * @rs: existing reservation to compare against
> + *
> + * returns: 1 if the block range is beyond the reach of the reservation
> + *         -1 if the block range is before the start of the reservation
> + *          0 if the block range overlaps with the reservation
> + */
> +static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
> +{
> +	u64 startblk = gfs2_rs_startblk(rs);
> +
> +	if (blk >= startblk + rs->rs_free)
> +		return 1;
> +	if (blk + len - 1 < startblk)
> +		return -1;
> +	return 0;
> +}
> +
> +/**
> + * rs_find - Find a rgrp multi-block reservation that contains a given block
> + * @rgd: The rgrp
> + * @rgblk: The block we're looking for, relative to the rgrp
> + */
> +static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
> +{
> +	struct rb_node **newn;
> +	int rc;
> +	u64 fsblk = rgblk + rgd->rd_data0;
> +
> +	spin_lock(&rgd->rd_rsspin);
> +	newn = &rgd->rd_rstree.rb_node;
> +	while (*newn) {
> +		struct gfs2_blkreserv *cur =
> +			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
> +		rc = rs_cmp(fsblk, 1, cur);
> +		if (rc < 0)
> +			newn = &((*newn)->rb_left);
> +		else if (rc > 0)
> +			newn = &((*newn)->rb_right);
> +		else {
> +			spin_unlock(&rgd->rd_rsspin);
> +			return cur;
> +		}
> +	}
> +	spin_unlock(&rgd->rd_rsspin);
> +	return NULL;
> +}
> +
> +/**
>   * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
>   *       a block in a given allocation state.
>   * @buf: the buffer that holds the bitmaps
> @@ -424,19 +478,93 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
>  int gfs2_rs_alloc(struct gfs2_inode *ip)
>  {
>  	int error = 0;
> +	struct gfs2_blkreserv *res;
> +
> +	if (ip->i_res)
> +		return 0;
> +
> +	res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
> +	if (!res)
> +		error = -ENOMEM;
>  
>  	down_write(&ip->i_rw_mutex);
> -	if (!ip->i_res) {
> -		ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
> -		if (!ip->i_res)
> -			error = -ENOMEM;
> -	}
> +	if (ip->i_res)
> +		kmem_cache_free(gfs2_rsrv_cachep, res);
> +	else
> +		ip->i_res = res;
>  	up_write(&ip->i_rw_mutex);
>  	return error;
>  }
>  
> +static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
> +{
> +	gfs2_print_dbg(seq, "  r: %llu s:%llu b:%u f:%u\n",
> +		       rs->rs_rgd->rd_addr, gfs2_rs_startblk(rs), rs->rs_biblk,
> +		       rs->rs_free);
> +}
> +
>  /**
> - * gfs2_rs_delete - delete a reservation
> + * __rs_deltree - remove a multi-block reservation from the rgd tree
> + * @rs: The reservation to remove
> + *
> + */
> +static void __rs_deltree(struct gfs2_blkreserv *rs)
> +{
> +	struct gfs2_rgrpd *rgd;
> +
> +	if (!gfs2_rs_active(rs))
> +		return;
> +
> +	rgd = rs->rs_rgd;
> +	/* We can't do this: The reason is that when the rgrp is invalidated,
> +	   it's in the "middle" of acquiring the glock, but the HOLDER bit
> +	   isn't set yet:
> +	   BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
> +	trace_gfs2_rs(NULL, rs, TRACE_RS_TREEDEL);
> +
> +	if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
> +		rb_erase(&rs->rs_node, &rgd->rd_rstree);
> +	BUG_ON(!rgd->rd_rs_cnt);
> +	rgd->rd_rs_cnt--;
> +
> +	if (rs->rs_free) {
> +		/* return reserved blocks to the rgrp and the ip */
> +		BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
> +		rs->rs_rgd->rd_reserved -= rs->rs_free;
> +		rs->rs_free = 0;
> +		clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
> +		smp_mb__after_clear_bit();
> +	}
> +	/* We can't change any of the step 1 or step 2 components of the rs.
> +	   E.g. We can't set rs_rgd to NULL because the rgd glock is held and
> +	   dequeued through this pointer.
> +	   Can't: atomic_set(&rs->rs_sizehint, 0);
> +	   Can't: rs->rs_requested = 0;
> +	   Can't: rs->rs_rgd = NULL;*/
> +	rs->rs_bi = NULL;
> +	rs->rs_biblk = 0;
> +}
> +
> +/**
> + * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
> + * @rs: The reservation to remove
> + *
> + */
> +void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
> +{
> +	struct gfs2_rgrpd *rgd;
> +
> +	if (!gfs2_rs_active(rs))
> +		return;
> +
> +	rgd = rs->rs_rgd;
> +	spin_lock(&rgd->rd_rsspin);
> +	__rs_deltree(rs);
> +	spin_unlock(&rgd->rd_rsspin);
> +}
> +
> +/**
> + * gfs2_rs_delete - delete a multi-block reservation
>   * @ip: The inode for this reservation
>   *
>   */
> @@ -444,12 +572,36 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
>  {
>  	down_write(&ip->i_rw_mutex);
>  	if (ip->i_res) {
> +		gfs2_rs_deltree(ip->i_res);
> +		trace_gfs2_rs(ip, ip->i_res, TRACE_RS_DELETE);
> +		BUG_ON(ip->i_res->rs_free);
>  		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
>  		ip->i_res = NULL;
>  	}
>  	up_write(&ip->i_rw_mutex);
>  }
>  
> +/**
> + * return_all_reservations - return all reserved blocks back to the rgrp.
> + * @rgd: the rgrp that needs its space back
> + *
> + * We previously reserved a bunch of blocks for allocation. Now we need to
> + * give them back. This leave the reservation structures in tact, but removes
> + * all of their corresponding "no-fly zones".
> + */
> +static void return_all_reservations(struct gfs2_rgrpd *rgd)
> +{
> +	struct rb_node *n;
> +	struct gfs2_blkreserv *rs;
> +
> +	spin_lock(&rgd->rd_rsspin);
> +	while ((n = rb_first(&rgd->rd_rstree))) {
> +		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
> +		__rs_deltree(rs);
> +	}
> +	spin_unlock(&rgd->rd_rsspin);
> +}
> +
>  void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
>  {
>  	struct rb_node *n;
> @@ -472,6 +624,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
>  
>  		gfs2_free_clones(rgd);
>  		kfree(rgd->rd_bits);
> +		return_all_reservations(rgd);
>  		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
>  	}
>  }
> @@ -649,6 +802,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
>  	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
>  	rgd->rd_data = be32_to_cpu(buf.ri_data);
>  	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
> +	spin_lock_init(&rgd->rd_rsspin);
>  
>  	error = compute_bitstructs(rgd);
>  	if (error)
> @@ -1115,29 +1269,212 @@ out:
>  }
>  
>  /**
> + * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
> + * @bi: the bitmap with the blocks
> + * @ip: the inode structure
> + * @biblk: the 32-bit block number relative to the start of the bitmap
> + * @amount: the number of blocks to reserve
> + *
> + * Returns: NULL - reservation was already taken, so not inserted
> + *          pointer to the inserted reservation
> + */
> +static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
> +				       struct gfs2_inode *ip, u32 biblk,
> +				       int amount)
> +{
> +	struct rb_node **newn, *parent = NULL;
> +	int rc;
> +	struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_rgrpd *rgd = rs->rs_rgd;
> +	u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
> +
> +	spin_lock(&rgd->rd_rsspin);
> +	newn = &rgd->rd_rstree.rb_node;
> +	BUG_ON(!ip->i_res);
> +	BUG_ON(gfs2_rs_active(rs));
> +	/* Figure out where to put new node */
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
> +	while (*newn) {
> +		struct gfs2_blkreserv *cur =
> +			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
> +
> +		parent = *newn;
> +		rc = rs_cmp(fsblock, amount, cur);
> +		if (rc > 0)
> +			newn = &((*newn)->rb_right);
> +		else if (rc < 0)
> +			newn = &((*newn)->rb_left);
> +		else {
> +			spin_unlock(&rgd->rd_rsspin);
> +			return NULL; /* reservation already in use */
> +		}
> +	}
> +
> +	/* Do our reservation work */
> +	rs = ip->i_res;
> +	rs->rs_free = amount;
> +	rs->rs_biblk = biblk;
> +	rs->rs_bi = bi;
> +	rb_link_node(&rs->rs_node, parent, newn);
> +	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
> +
> +	/* Do our inode accounting for the reservation */
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
> +
> +	/* Do our rgrp accounting for the reservation */
> +	rgd->rd_reserved += amount; /* blocks reserved */
> +	rgd->rd_rs_cnt++; /* number of in-tree reservations */
> +	spin_unlock(&rgd->rd_rsspin);
> +	trace_gfs2_rs(ip, rs, TRACE_RS_INSERT);
> +	return rs;
> +}
> +
> +/**
> + * unclaimed_blocks - return number of blocks that aren't spoken for
> + */
> +static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
> +{
> +	return rgd->rd_free_clone - rgd->rd_reserved;
> +}
> +
> +/**
> + * rg_mblk_search - find a group of multiple free blocks
> + * @rgd: the resource group descriptor
> + * @rs: the block reservation
> + * @ip: pointer to the inode for which we're reserving blocks
> + *
> + * This is very similar to rgblk_search, except we're looking for whole
> + * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
> + * on aligned dwords for speed's sake.
> + *
> + * Returns: 0 if successful or BFITNOENT if there isn't enough free space
> + */
> +
> +static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
> +{
> +	struct gfs2_bitmap *bi = rgd->rd_bits;
> +	const u32 length = rgd->rd_length;
> +	u32 blk;
> +	unsigned int buf, x, search_bytes;
> +	u8 *buffer = NULL;
> +	u8 *ptr, *end, *nonzero;
> +	u32 goal, rsv_bytes;
> +	struct gfs2_blkreserv *rs;
> +	u32 best_rs_bytes, unclaimed;
> +	int best_rs_blocks;
> +
> +	/* Find bitmap block that contains bits for goal block */
> +	if (rgrp_contains_block(rgd, ip->i_goal))
> +		goal = ip->i_goal - rgd->rd_data0;
> +	else
> +		goal = rgd->rd_last_alloc;
> +	for (buf = 0; buf < length; buf++) {
> +		bi = rgd->rd_bits + buf;
> +		/* Convert scope of "goal" from rgrp-wide to within
> +		   found bit block */
> +		if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
> +			goal -= bi->bi_start * GFS2_NBBY;
> +			goto do_search;
> +		}
> +	}
> +	buf = 0;
> +	goal = 0;
> +
> +do_search:
> +	best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint),
> +			       (RGRP_RSRV_MINBLKS * rgd->rd_length));
> +	best_rs_bytes = (best_rs_blocks *
> +			 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
> +		GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
> +	best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
> +	unclaimed = unclaimed_blocks(rgd);
> +	if (best_rs_bytes * GFS2_NBBY > unclaimed)
> +		best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
> +
> +	for (x = 0; x <= length; x++) {
> +		bi = rgd->rd_bits + buf;
> +
> +		if (test_bit(GBF_FULL, &bi->bi_flags))
> +			goto skip;
> +
> +		WARN_ON(!buffer_uptodate(bi->bi_bh));
> +		if (bi->bi_clone)
> +			buffer = bi->bi_clone + bi->bi_offset;
> +		else
> +			buffer = bi->bi_bh->b_data + bi->bi_offset;
> +
> +		/* We have to keep the reservations aligned on u64 boundaries
> +		   otherwise we could get situations where a byte can't be
> +		   used because it's after a reservation, but a free bit still
> +		   is within the reservation's area. */
> +		ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
> +		end = (buffer + bi->bi_len);
> +		while (ptr < end) {
> +			rsv_bytes = 0;
> +			if ((ptr + best_rs_bytes) <= end)
> +				search_bytes = best_rs_bytes;
> +			else
> +				search_bytes = end - ptr;
> +			BUG_ON(!search_bytes);
> +			nonzero = memchr_inv(ptr, 0, search_bytes);
> +			/* If the lot is all zeroes, reserve the whole size. If
> +			   there's enough zeroes to satisfy the request, use
> +			   what we can. If there's not enough, keep looking. */
> +			if (nonzero == NULL)
> +				rsv_bytes = search_bytes;
> +			else if ((nonzero - ptr) * GFS2_NBBY >=
> +				 ip->i_res->rs_requested)
> +				rsv_bytes = (nonzero - ptr);
> +
> +			if (rsv_bytes) {
> +				blk = ((ptr - buffer) * GFS2_NBBY);
> +				BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
> +				rs = rs_insert(bi, ip, blk,
> +					       rsv_bytes * GFS2_NBBY);
> +				if (IS_ERR(rs))
> +					return PTR_ERR(rs);
> +				if (rs)
> +					return 0;
> +			}
> +			ptr += ALIGN(search_bytes, sizeof(u64));
> +		}
> +skip:
> +		/* Try next bitmap block (wrap back to rgrp header
> +		   if at end) */
> +		buf++;
> +		buf %= length;
> +		goal = 0;
> +	}
> +
> +	return BFITNOENT;
> +}
> +
> +/**
>   * try_rgrp_fit - See if a given reservation will fit in a given RG
>   * @rgd: the RG data
>   * @ip: the inode
>   *
>   * If there's room for the requested blocks to be allocated from the RG:
> + * This will try to get a multi-block reservation first, and if that doesn't
> + * fit, it will take what it can.
>   *
>   * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
>   */
>  
> -static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
> +static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
>  {
> -	const struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_blkreserv *rs = ip->i_res;
>  
>  	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
>  		return 0;
> -	if (rgd->rd_free_clone >= rs->rs_requested)
> +	/* Look for a multi-block reservation. */
> +	if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS &&
> +	    rg_mblk_search(rgd, ip) != BFITNOENT)
> +		return 1;
> +	if (unclaimed_blocks(rgd) >= rs->rs_requested)
>  		return 1;
> -	return 0;
> -}
>  
> -static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
> -{
> -	return (bi->bi_start * GFS2_NBBY) + blk;
> +	return 0;
>  }
>  
>  /**
> @@ -1217,7 +1554,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
>  int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  {
>  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> -	struct gfs2_rgrpd *rgd, *begin = NULL;
> +	struct gfs2_rgrpd *begin = NULL;
>  	struct gfs2_blkreserv *rs = ip->i_res;
>  	int error = 0, rg_locked, flags = LM_FLAG_TRY;
>  	u64 last_unlinked = NO_BLOCK;
> @@ -1225,32 +1562,40 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  
>  	if (sdp->sd_args.ar_rgrplvb)
>  		flags |= GL_SKIP;
> -	rs = ip->i_res;
>  	rs->rs_requested = requested;
>  	if (gfs2_assert_warn(sdp, requested)) {
>  		error = -EINVAL;
>  		goto out;
>  	}
> -
> -	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
> -		rgd = begin = ip->i_rgd;
> -	else
> -		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
> -
> -	if (rgd == NULL)
> +	if (gfs2_rs_active(rs)) {
> +		begin = rs->rs_rgd;
> +		flags = 0; /* Yoda: Do or do not. There is no try */
> +	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
> +		rs->rs_rgd = begin = ip->i_rgd;
> +	} else {
> +		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
> +	}
> +	if (rs->rs_rgd == NULL)
>  		return -EBADSLT;
>  
>  	while (loops < 3) {
>  		rg_locked = 0;
>  
> -		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
> +		if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
>  			rg_locked = 1;
>  			error = 0;
> +		} else if (!loops && !gfs2_rs_active(rs) &&
> +			   rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
> +			/* If the rgrp already is maxed out for contenders,
> +			   we can eliminate it as a "first pass" without even
> +			   requesting the rgrp glock. */
> +			error = GLR_TRYFAILED;
>  		} else {
> -			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
> -						   flags, &rs->rs_rgd_gh);
> +			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
> +						   LM_ST_EXCLUSIVE, flags,
> +						   &rs->rs_rgd_gh);
>  			if (!error && sdp->sd_args.ar_rgrplvb) {
> -				error = update_rgrp_lvb(rgd);
> +				error = update_rgrp_lvb(rs->rs_rgd);
>  				if (error) {
>  					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  					return error;
> @@ -1259,24 +1604,36 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  		}
>  		switch (error) {
>  		case 0:
> -			if (try_rgrp_fit(rgd, ip)) {
> +			if (gfs2_rs_active(rs)) {
> +				if (unclaimed_blocks(rs->rs_rgd) +
> +				    rs->rs_free >= rs->rs_requested) {
> +					ip->i_rgd = rs->rs_rgd;
> +					return 0;
> +				}
> +				/* We have a multi-block reservation, but the
> +				   rgrp doesn't have enough free blocks to
> +				   satisfy the request. Free the reservation
> +				   and look for a suitable rgrp. */
> +				gfs2_rs_deltree(rs);
> +			}
> +			if (try_rgrp_fit(rs->rs_rgd, ip)) {
>  				if (sdp->sd_args.ar_rgrplvb)
> -					gfs2_rgrp_bh_get(rgd);
> -				ip->i_rgd = rgd;
> +					gfs2_rgrp_bh_get(rs->rs_rgd);
> +				ip->i_rgd = rs->rs_rgd;
>  				return 0;
>  			}
> -			if (rgd->rd_flags & GFS2_RDF_CHECK) {
> +			if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
>  				if (sdp->sd_args.ar_rgrplvb)
> -					gfs2_rgrp_bh_get(rgd);
> -				try_rgrp_unlink(rgd, &last_unlinked,
> +					gfs2_rgrp_bh_get(rs->rs_rgd);
> +				try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
>  						ip->i_no_addr);
>  			}
>  			if (!rg_locked)
>  				gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  			/* fall through */
>  		case GLR_TRYFAILED:
> -			rgd = gfs2_rgrpd_get_next(rgd);
> -			if (rgd != begin) /* If we didn't wrap */
> +			rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
> +			if (rs->rs_rgd != begin) /* If we didn't wrap */
>  				break;
>  
>  			flags &= ~LM_FLAG_TRY;
> @@ -1314,6 +1671,12 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
>  {
>  	struct gfs2_blkreserv *rs = ip->i_res;
>  
> +	if (!rs)
> +		return;
> +
> +	if (!rs->rs_free)
> +		gfs2_rs_deltree(rs);
> +
>  	if (rs->rs_rgd_gh.gh_gl)
>  		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  	rs->rs_requested = 0;
> @@ -1412,7 +1775,27 @@ do_search:
>  		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
>  			buffer = bi->bi_clone + bi->bi_offset;
>  
> -		biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
> +		while (1) {
> +			struct gfs2_blkreserv *rs;
> +			u32 rgblk;
> +
> +			biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
> +			if (biblk == BFITNOENT)
> +				break;
> +			/* Check if this block is reserved() */
> +			rgblk = gfs2_bi2rgd_blk(bi, biblk);
> +			rs = rs_find(rgd, rgblk);
> +			if (rs == NULL)
> +				break;
> +
> +			BUG_ON(rs->rs_bi != bi);
> +			biblk = BFITNOENT;
> +			/* This should jump to the first block after the
> +			   reservation. */
> +			goal = rs->rs_biblk + rs->rs_free;
> +			if (goal >= bi->bi_len * GFS2_NBBY)
> +				break;
> +		}
>  		if (biblk != BFITNOENT)
>  			break;
>  
> @@ -1448,8 +1831,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
>  			     u32 blk, bool dinode, unsigned int *n)
>  {
>  	const unsigned int elen = *n;
> -	u32 goal;
> +	u32 goal, rgblk;
>  	const u8 *buffer = NULL;
> +	struct gfs2_blkreserv *rs;
>  
>  	*n = 0;
>  	buffer = bi->bi_bh->b_data + bi->bi_offset;
> @@ -1462,6 +1846,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
>  		goal++;
>  		if (goal >= (bi->bi_len * GFS2_NBBY))
>  			break;
> +		rgblk = gfs2_bi2rgd_blk(bi, goal);
> +		rs = rs_find(rgd, rgblk);
> +		if (rs) /* Oops, we bumped into someone's reservation */
> +			break;
>  		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
>  		    GFS2_BLKST_FREE)
>  			break;
> @@ -1537,12 +1925,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
>  
>  int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
>  {
> -	const struct gfs2_rgrpd *rgd = gl->gl_object;
> +	struct gfs2_rgrpd *rgd = gl->gl_object;
> +	struct gfs2_blkreserv *trs;
> +	const struct rb_node *n;
> +
>  	if (rgd == NULL)
>  		return 0;
> -	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
> +	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
>  		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
> -		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
> +		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
> +		       rgd->rd_reserved);
> +	spin_lock(&rgd->rd_rsspin);
> +	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
> +		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
> +		dump_rs(seq, trs);
> +	}
> +	spin_unlock(&rgd->rd_rsspin);
>  	return 0;
>  }
>  
> @@ -1557,10 +1955,63 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
>  }
>  
>  /**
> + * claim_reserved_blks - Claim previously reserved blocks
> + * @ip: the inode that's claiming the reservation
> + * @dinode: 1 if this block is a dinode block, otherwise data block
> + * @nblocks: desired extent length
> + *
> + * Lay claim to previously allocated block reservation blocks.
> + * Returns: Starting block number of the blocks claimed.
> + * Sets *nblocks to the actual extent length allocated.
> + */
> +static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
> +			       unsigned int *nblocks)
> +{
> +	struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_rgrpd *rgd = rs->rs_rgd;
> +	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> +	struct gfs2_bitmap *bi;
> +	u64 start_block = gfs2_rs_startblk(rs);
> +	const unsigned int elen = *nblocks;
> +
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
> +	gfs2_assert_withdraw(sdp, rgd);
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
> +	bi = rs->rs_bi;
> +	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
> +
> +	for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
> +		/* Make sure the bitmap hasn't changed */
> +		gfs2_setbit(rgd, bi->bi_clone, bi, rs->rs_biblk,
> +			    dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
> +		rs->rs_biblk++;
> +		rs->rs_free--;
> +
> +		BUG_ON(!rgd->rd_reserved);
> +		rgd->rd_reserved--;
> +		dinode = false;
> +		trace_gfs2_rs(ip, rs, TRACE_RS_CLAIM);
> +	}
> +
> +	if (!rs->rs_free) {
> +		struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
> +
> +		gfs2_rs_deltree(rs);
> +		/* -nblocks because we haven't returned to do the math yet.
> +		   I'm doing the math backwards to prevent negative numbers,
> +		   but think of it as:
> +		   if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */
> +		if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
> +			rg_mblk_search(rgd, ip);
> +	}
> +	return start_block;
> +}
> +
> +/**
>   * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
>   * @ip: the inode to allocate the block for
>   * @bn: Used to return the starting block number
> - * @ndata: requested number of blocks/extent length (value/result)
> + * @nblocks: requested number of blocks/extent length (value/result)
>   * @dinode: 1 if we're allocating a dinode block, else 0
>   * @generation: the generation number of the inode
>   *
> @@ -1585,20 +2036,34 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>  	if (ip->i_res->rs_requested == 0)
>  		return -ECANCELED;
>  
> -	rgd = ip->i_rgd;
> -
> -	if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
> -		goal = ip->i_goal - rgd->rd_data0;
> -	else
> -		goal = rgd->rd_last_alloc;
> -
> -	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
> +	/* Check if we have a multi-block reservation, and if so, claim the
> +	   next free block from it. */
> +	if (gfs2_rs_active(ip->i_res)) {
> +		BUG_ON(!ip->i_res->rs_free);
> +		rgd = ip->i_res->rs_rgd;
> +		block = claim_reserved_blks(ip, dinode, nblocks);
> +	} else {
> +		rgd = ip->i_rgd;
>  
> -	/* Since all blocks are reserved in advance, this shouldn't happen */
> -	if (blk == BFITNOENT)
> -		goto rgrp_error;
> +		if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
> +			goal = ip->i_goal - rgd->rd_data0;
> +		else
> +			goal = rgd->rd_last_alloc;
> +
> +		blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
> +
> +		/* Since all blocks are reserved in advance, this shouldn't
> +		   happen */
> +		if (blk == BFITNOENT) {
> +			printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
> +			       *nblocks);
> +			printk(KERN_WARNING "FULL=%d\n",
> +			       test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
> +			goto rgrp_error;
> +		}
>  
> -	block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
> +		block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
> +	}
>  	ndata = *nblocks;
>  	if (dinode)
>  		ndata--;
> @@ -1615,8 +2080,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>  			brelse(dibh);
>  		}
>  	}
> -	if (rgd->rd_free < *nblocks)
> +	if (rgd->rd_free < *nblocks) {
> +		printk(KERN_WARNING "nblocks=%u\n", *nblocks);
>  		goto rgrp_error;
> +	}
>  
>  	rgd->rd_free -= *nblocks;
>  	if (dinode) {
> @@ -1876,6 +2343,7 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
>  		for (x = 0; x < rlist->rl_rgrps; x++)
>  			gfs2_holder_uninit(&rlist->rl_ghs[x]);
>  		kfree(rlist->rl_ghs);
> +		rlist->rl_ghs = NULL;
>  	}
>  }
>  
> diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
> index 5d8314d..ca6e267 100644
> --- a/fs/gfs2/rgrp.h
> +++ b/fs/gfs2/rgrp.h
> @@ -13,6 +13,14 @@
>  #include <linux/slab.h>
>  #include <linux/uaccess.h>
>  
> +/* Since each block in the file system is represented by two bits in the
> + * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
> + * By reserving 32 blocks at a time, we can optimize / shortcut how we search
> + * through the bitmaps by looking a word at a time.
> + */
> +#define RGRP_RSRV_MINBYTES 8
> +#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
> +
>  struct gfs2_rgrpd;
>  struct gfs2_sbd;
>  struct gfs2_holder;
> @@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
>  extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
>  extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
>  
> +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
> +
>  extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
>  extern void gfs2_inplace_release(struct gfs2_inode *ip);
>  
> @@ -36,6 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
>  			     bool dinode, u64 *generation);
>  
>  extern int gfs2_rs_alloc(struct gfs2_inode *ip);
> +extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
>  extern void gfs2_rs_delete(struct gfs2_inode *ip);
>  extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
>  extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
> @@ -62,7 +73,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
>  				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
>  extern int gfs2_fitrim(struct file *filp, void __user *argp);
>  
> -/* This is how to tell if a reservation is "inplace" reserved: */
> +/* This is how to tell if a multi-block reservation is "inplace" reserved: */
>  static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
>  {
>  	if (ip->i_res && ip->i_res->rs_requested)
> @@ -70,4 +81,22 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
>  	return 0;
>  }
>  
> +/* This is how to tell if a multi-block reservation is in the rgrp tree: */
> +static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
> +{
> +	if (rs && rs->rs_bi)
> +		return 1;
> +	return 0;
> +}
> +
> +static inline u32 gfs2_bi2rgd_blk(const struct gfs2_bitmap *bi, u32 blk)
> +{
> +	return (bi->bi_start * GFS2_NBBY) + blk;
> +}
> +
> +static inline u64 gfs2_rs_startblk(const struct gfs2_blkreserv *rs)
> +{
> +	return gfs2_bi2rgd_blk(rs->rs_bi, rs->rs_biblk) + rs->rs_rgd->rd_data0;
> +}
> +
>  #endif /* __RGRP_DOT_H__ */
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index 7880687..b1502c4 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
>  		return -EIO;
>  	}
>  
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		return error;
> @@ -1550,6 +1554,9 @@ out_truncate:
>  
>  out_unlock:
>  	/* Error path for case 1 */
> +	if (gfs2_rs_active(ip->i_res))
> +		gfs2_rs_deltree(ip->i_res);
> +
>  	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
>  		gfs2_glock_dq(&ip->i_iopen_gh);
>  	gfs2_holder_uninit(&ip->i_iopen_gh);
> diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
> index 1b8b815..a25c252 100644
> --- a/fs/gfs2/trace_gfs2.h
> +++ b/fs/gfs2/trace_gfs2.h
> @@ -14,6 +14,7 @@
>  #include <linux/ktime.h>
>  #include "incore.h"
>  #include "glock.h"
> +#include "rgrp.h"
>  
>  #define dlm_state_name(nn) { DLM_LOCK_##nn, #nn }
>  #define glock_trace_name(x) __print_symbolic(x,		\
> @@ -31,6 +32,17 @@
>  			    { GFS2_BLKST_DINODE, "dinode" },	\
>  			    { GFS2_BLKST_UNLINKED, "unlinked" })
>  
> +#define TRACE_RS_DELETE  0
> +#define TRACE_RS_TREEDEL 1
> +#define TRACE_RS_INSERT  2
> +#define TRACE_RS_CLAIM   3
> +
> +#define rs_func_name(x) __print_symbolic(x,	\
> +					 { 0, "del " },	\
> +					 { 1, "tdel" },	\
> +					 { 2, "ins " },	\
> +					 { 3, "clm " })
> +
>  #define show_glock_flags(flags) __print_flags(flags, "",	\
>  	{(1UL << GLF_LOCK),			"l" },		\
>  	{(1UL << GLF_DEMOTE),			"D" },		\
> @@ -470,6 +482,7 @@ TRACE_EVENT(gfs2_block_alloc,
>  		__field(	u8,	block_state		)
>  		__field(        u64,	rd_addr			)
>  		__field(        u32,	rd_free_clone		)
> +		__field(	u32,	rd_reserved		)
>  	),
>  
>  	TP_fast_assign(
> @@ -480,16 +493,58 @@ TRACE_EVENT(gfs2_block_alloc,
>  		__entry->block_state	= block_state;
>  		__entry->rd_addr	= rgd->rd_addr;
>  		__entry->rd_free_clone	= rgd->rd_free_clone;
> +		__entry->rd_reserved	= rgd->rd_reserved;
>  	),
>  
> -	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u",
> +	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  (unsigned long long)__entry->inum,
>  		  (unsigned long long)__entry->start,
>  		  (unsigned long)__entry->len,
>  		  block_state_name(__entry->block_state),
>  		  (unsigned long long)__entry->rd_addr,
> -		  __entry->rd_free_clone)
> +		  __entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
> +);
> +
> +/* Keep track of multi-block reservations as they are allocated/freed */
> +TRACE_EVENT(gfs2_rs,
> +
> +	TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs,
> +		 u8 func),
> +
> +	TP_ARGS(ip, rs, func),
> +
> +	TP_STRUCT__entry(
> +		__field(        dev_t,  dev                     )
> +		__field(	u64,	rd_addr			)
> +		__field(	u32,	rd_free_clone		)
> +		__field(	u32,	rd_reserved		)
> +		__field(	u64,	inum			)
> +		__field(	u64,	start			)
> +		__field(	u32,	free			)
> +		__field(	u8,	func			)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		= rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0;
> +		__entry->rd_addr	= rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
> +		__entry->rd_free_clone	= rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
> +		__entry->rd_reserved	= rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
> +		__entry->inum		= ip ? ip->i_no_addr : 0;
> +		__entry->start		= gfs2_rs_startblk(rs);
> +		__entry->free		= rs->rs_free;
> +		__entry->func		= func;
> +	),
> +
> +	TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s "
> +		  "f:%lu",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  (unsigned long long)__entry->inum,
> +		  (unsigned long long)__entry->start,
> +		  (unsigned long long)__entry->rd_addr,
> +		  (unsigned long)__entry->rd_free_clone,
> +		  (unsigned long)__entry->rd_reserved,
> +		  rs_func_name(__entry->func), (unsigned long)__entry->free)
>  );
>  
>  #endif /* _TRACE_GFS2_H */
> diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
> index 523c0de..27a0b4a 100644
> --- a/fs/gfs2/xattr.c
> +++ b/fs/gfs2/xattr.c
> @@ -327,6 +327,10 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
>  {
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		goto out_alloc;
> @@ -710,6 +714,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
>  	struct buffer_head *dibh;
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_lock_check(ip);
>  	if (error)
>  		return error;
> @@ -1483,6 +1491,10 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
>  {
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		return error;




^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2012-07-19 14:36 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <785c19c3-631a-4976-84fd-048876691678@zmail12.collab.prod.int.phx2.redhat.com>
2012-07-05 21:58 ` [Cluster-devel] [GFS2 Patch] [Try 2] GFS2: Reduce file fragmentation Bob Peterson
2012-07-10 14:23   ` Steven Whitehouse
2012-07-11 18:07     ` Bob Peterson
2012-07-11 18:14       ` Steven Whitehouse
2012-07-11 18:51         ` Bob Peterson
2012-07-11 18:10     ` [Cluster-devel] [GFS2 Patch] [Try 3] " Bob Peterson
2012-07-12 10:01       ` Steven Whitehouse
2012-07-12 19:28         ` [Cluster-devel] [GFS2 Patch] [Try 4] " Bob Peterson
2012-07-13  9:36           ` Steven Whitehouse
2012-07-13 12:55             ` Bob Peterson
2012-07-13 13:49               ` Steven Whitehouse
2012-07-18 16:26                 ` [Cluster-devel] [GFS2 Patch] [Try 5] " Bob Peterson
2012-07-19 14:36                   ` Steven Whitehouse

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.