[Cluster-devel] [GFS2 PATCH] [take 2] GFS2: Non-recursive delete

* [Cluster-devel] [GFS2 PATCH] [take 2] GFS2: Non-recursive delete
       [not found] <34819588.22701467.1487021225129.JavaMail.zimbra@redhat.com>
@ 2017-02-13 21:44 ` Bob Peterson
  2017-02-15 15:53   ` Andreas Gruenbacher
  0 siblings, 1 reply; 3+ messages in thread
From: Bob Peterson @ 2017-02-13 21:44 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

On 30 January 2017, I posted a non-recursive algorithm for truncate
and delete. Steve Whitehouse suggested I should be able to rewrite
the algorithm without the need for buffer lists that eat memory.
He suggested the algorithm could be implemented as a kind of
inverse of the block map allocator, gfs2_bmap_alloc.

This new patch is my first attempt to implement his suggestion.

Half the patch is wiping out the old recursive method. The other
half is implementing the new non-recursive one.

I could probably stand to tidy the code up a bit, but for this
initial pass I wanted to code for clarity (both for my sanity and
the sanity of those reviewing). Thus, there are places in the
state machine which could easily "fall through" to the next state,
but I've chosen not to for readability, at least for now.

Also note that I've deliberately added cond_resched to allow
other users to run better while deletes are ongoing. Also, note
that the deletes are implemented one resource group at a time,
which was also done for performance. Many blocks may be freed
from the same resource group in a single transaction because the
resource group is only changed when absolutely necessary.
This can occur when files become fragmented across rgrps.

This version isn't as fast as the version I posted on 30 January,
but it's not as bad as I had feared. On my hardware, a 500GB
file took 27s to truncate with the previous (30 Jan) algorithm.
This version takes 33s, which seems acceptable.

Again, I don't expect this to be the final form, but it seems to work.
I've tested it with large (half-terabyte) dense files and huge
(1PB) sparse files, both of which seem to now work properly.
I've also tested both truncates and deletes.

The patch could probably stand a lot more testing, which I plan
to do.
---
Patch description:

Implement truncate/delete as a non-recursive algorithm. The older
algorithm was implemented with recursion to strip off each layer
at a time (going by height, starting with the maximum height).
This version tries to do the same thing, but without recursion,
and without needing to allocate new structures or lists in memory.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index ad1971d..3af3861 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -298,6 +298,7 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
  * lookup_metapath - Walk the metadata tree to a specific point
  * @ip: The inode
  * @mp: The metapath
+ * @h: The height to which it should be mapped
  *
  * Assumes that the inode's buffer has already been looked up and
  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
@@ -311,20 +312,21 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
  * Returns: error or height of metadata tree
  */
 
-static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
+static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 {
-	unsigned int end_of_metadata = ip->i_height - 1;
 	unsigned int x;
 	__be64 *ptr;
 	u64 dblock;
 	int ret;
 
-	for (x = 0; x < end_of_metadata; x++) {
+	for (x = 0; x < h; x++) {
 		ptr = metapointer(x, mp);
 		dblock = be64_to_cpu(*ptr);
 		if (!dblock)
 			return x + 1;
 
+		if (mp->mp_bh[x + 1])
+			continue;
 		ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]);
 		if (ret)
 			return ret;
@@ -619,7 +621,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 
 	BUG_ON(maxlen == 0);
 
-	memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
+	memset(&mp, 0, sizeof(mp));
 	bmap_lock(ip, create);
 	clear_buffer_mapped(bh_map);
 	clear_buffer_new(bh_map);
@@ -642,7 +644,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	ret = 1;
 	if (height > ip->i_height || gfs2_is_stuffed(ip))
 		goto do_alloc;
-	ret = lookup_metapath(ip, &mp);
+	ret = lookup_metapath(ip, &mp, ip->i_height - 1);
 	if (ret < 0)
 		goto out;
 	if (ret != ip->i_height)
@@ -701,252 +703,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 }
 
 /**
- * do_strip - Look for a layer a particular layer of the file and strip it off
- * @ip: the inode
- * @dibh: the dinode buffer
- * @bh: A buffer of pointers
- * @top: The first pointer in the buffer
- * @bottom: One more than the last pointer
- * @height: the height this buffer is at
- * @data: a pointer to a struct strip_mine
- *
- * Returns: errno
- */
-
-static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
-		    struct buffer_head *bh, __be64 *top, __be64 *bottom,
-		    unsigned int height, struct strip_mine *sm)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrp_list rlist;
-	struct gfs2_trans *tr;
-	u64 bn, bstart;
-	u32 blen, btotal;
-	__be64 *p;
-	unsigned int rg_blocks = 0;
-	int metadata;
-	unsigned int revokes = 0;
-	int x;
-	int error;
-	int jblocks_rqsted;
-
-	error = gfs2_rindex_update(sdp);
-	if (error)
-		return error;
-
-	if (!*top)
-		sm->sm_first = 0;
-
-	if (height != sm->sm_height)
-		return 0;
-
-	if (sm->sm_first) {
-		top++;
-		sm->sm_first = 0;
-	}
-
-	metadata = (height != ip->i_height - 1);
-	if (metadata)
-		revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
-	else if (ip->i_depth)
-		revokes = sdp->sd_inptrs;
-
-	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
-	bstart = 0;
-	blen = 0;
-
-	for (p = top; p < bottom; p++) {
-		if (!*p)
-			continue;
-
-		bn = be64_to_cpu(*p);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart)
-				gfs2_rlist_add(ip, &rlist, bstart);
-
-			bstart = bn;
-			blen = 1;
-		}
-	}
-
-	if (bstart)
-		gfs2_rlist_add(ip, &rlist, bstart);
-	else
-		goto out; /* Nothing to do */
-
-	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
-
-	for (x = 0; x < rlist.rl_rgrps; x++) {
-		struct gfs2_rgrpd *rgd;
-		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
-		rg_blocks += rgd->rd_length;
-	}
-
-	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
-	if (error)
-		goto out_rlist;
-
-	if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
-		gfs2_rs_deltree(&ip->i_res);
-
-restart:
-	jblocks_rqsted = rg_blocks + RES_DINODE +
-		RES_INDIRECT + RES_STATFS + RES_QUOTA +
-		gfs2_struct2blk(sdp, revokes, sizeof(u64));
-	if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2))
-		jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2);
-	error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
-	if (error)
-		goto out_rg_gunlock;
-
-	tr = current->journal_info;
-	down_write(&ip->i_rw_mutex);
-
-	gfs2_trans_add_meta(ip->i_gl, dibh);
-	gfs2_trans_add_meta(ip->i_gl, bh);
-
-	bstart = 0;
-	blen = 0;
-	btotal = 0;
-
-	for (p = top; p < bottom; p++) {
-		if (!*p)
-			continue;
-
-		/* check for max reasonable journal transaction blocks */
-		if (tr->tr_num_buf_new + RES_STATFS +
-		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
-			if (rg_blocks >= tr->tr_num_buf_new)
-				rg_blocks -= tr->tr_num_buf_new;
-			else
-				rg_blocks = 0;
-			break;
-		}
-
-		bn = be64_to_cpu(*p);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart) {
-				__gfs2_free_blocks(ip, bstart, blen, metadata);
-				btotal += blen;
-			}
-
-			bstart = bn;
-			blen = 1;
-		}
-
-		*p = 0;
-		gfs2_add_inode_blocks(&ip->i_inode, -1);
-	}
-	if (p == bottom)
-		rg_blocks = 0;
-
-	if (bstart) {
-		__gfs2_free_blocks(ip, bstart, blen, metadata);
-		btotal += blen;
-	}
-
-	gfs2_statfs_change(sdp, 0, +btotal, 0);
-	gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
-			  ip->i_inode.i_gid);
-
-	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-
-	gfs2_dinode_out(ip, dibh->b_data);
-
-	up_write(&ip->i_rw_mutex);
-
-	gfs2_trans_end(sdp);
-
-	if (rg_blocks)
-		goto restart;
-
-out_rg_gunlock:
-	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
-out_rlist:
-	gfs2_rlist_free(&rlist);
-out:
-	return error;
-}
-
-/**
- * recursive_scan - recursively scan through the end of a file
- * @ip: the inode
- * @dibh: the dinode buffer
- * @mp: the path through the metadata to the point to start
- * @height: the height the recursion is at
- * @block: the indirect block to look at
- * @first: 1 if this is the first block
- * @sm: data opaque to this function to pass to @bc
- *
- * When this is first called @height and @block should be zero and
- * @first should be 1.
- *
- * Returns: errno
- */
-
-static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
-			  struct metapath *mp, unsigned int height,
-			  u64 block, int first, struct strip_mine *sm)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head *bh = NULL;
-	__be64 *top, *bottom;
-	u64 bn;
-	int error;
-	int mh_size = sizeof(struct gfs2_meta_header);
-
-	if (!height) {
-		error = gfs2_meta_inode_buffer(ip, &bh);
-		if (error)
-			return error;
-		dibh = bh;
-
-		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
-		bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
-	} else {
-		error = gfs2_meta_indirect_buffer(ip, height, block, &bh);
-		if (error)
-			return error;
-
-		top = (__be64 *)(bh->b_data + mh_size) +
-				  (first ? mp->mp_list[height] : 0);
-
-		bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
-	}
-
-	error = do_strip(ip, dibh, bh, top, bottom, height, sm);
-	if (error)
-		goto out;
-
-	if (height < ip->i_height - 1) {
-
-		gfs2_metapath_ra(ip->i_gl, bh, top);
-
-		for (; top < bottom; top++, first = 0) {
-			if (!*top)
-				continue;
-
-			bn = be64_to_cpu(*top);
-
-			error = recursive_scan(ip, dibh, mp, height + 1, bn,
-					       first, sm);
-			if (error)
-				break;
-		}
-	}
-out:
-	brelse(bh);
-	return error;
-}
-
-
-/**
  * gfs2_block_truncate_page - Deal with zeroing out data for truncate
  *
  * This is partly borrowed from ext3.
@@ -1105,41 +861,387 @@ out:
 	return error;
 }
 
-static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+/*
+ * sweep_bh_for_rgrps - find an rgrp in a buffer and free blocks therein
+ * ip: inode
+ * bh: buffer_head of current metadata buffer to sweep
+ * btotal: place to keep count of total blocks freed
+ * hgt: height we're processing
+ * off: Starting metapath offset, for the first block of the truncate
+ * rd_gh: holder of resource group glock
+ *
+ * We sweep a metadata buffer for blocks we need to free, and free them all.
+ * However, we do it one rgrp at a time. If this block has references to
+ * multiple rgrps, we break it into individual transactions. This allows
+ * other processes to use the rgrps while we're focused on a single one.
+ * At every transaction boundary, we rewrite the inode into the journal.
+ * That way the bitmaps are kept consistent with the inode and we can recover
+ * if we're somehow interrupted by power-outages.
+ *
+ * Returns: ret = 0, or return code if an error occurred.
+ *          *btotal has the total number of blocks freed
+ */
+static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct buffer_head *bh,
+			      struct buffer_head *dibh, u32 *btotal, int hgt,
+			      int off, struct gfs2_holder *rd_gh)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	unsigned int height = ip->i_height;
-	u64 lblock;
-	struct metapath mp;
-	int error;
+	struct gfs2_rgrpd *rgd = NULL;
+	struct gfs2_trans *tr;
+	__be64 *top, *bottom, *p;
+	int blks_outside_rgrp;
+	u64 bn, bstart, isize_blks;
+	s64 blen;
+	int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
+	int ret = 0;
+	static u64 last_bh_blk = 0;
+
+	gfs2_assert_withdraw(sdp, last_bh_blk != bh->b_blocknr);
+	last_bh_blk = bh->b_blocknr;
+more_rgrps:
+	blks_outside_rgrp = 0;
+	bstart = 0;
+	blen = 0;
+	if (gfs2_holder_initialized(rd_gh))
+		rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object;
+	if (hgt == 0) {
+		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode));
+		bottom = (__be64 *)top + sdp->sd_diptrs;
+	} else {
+		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
+		bottom = top + sdp->sd_inptrs;
+	}
+	top += off;
+
+	for (p = top; p < bottom; p++) {
+		if (!*p)
+			continue;
+		bn = be64_to_cpu(*p);
+		if (rgd == NULL) {
+			rgd = gfs2_blk2rgrpd(sdp, bn, false);
+			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+						 0, rd_gh);
+			if (ret)
+				goto out;
+		}
+
+		if (!rgrp_contains_block(rgd, bn)) {
+			blks_outside_rgrp++;
+			continue;
+		}
+
+		/* The size of our transactions will be unknown until we
+		   actually process all the metadata blocks that relate to
+		   the rgrp. So we estimate. We know it can't be more than
+		   the dinode's i_blocks and we don't want to exceed the
+		   journal flush threshold, sd_log_thresh2. */
+		if (current->journal_info == NULL) {
+			unsigned int jblocks_rqsted, revokes;
+
+			jblocks_rqsted = rgd->rd_length + RES_DINODE +
+				RES_INDIRECT;
+			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
+			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
+				jblocks_rqsted +=
+					atomic_read(&sdp->sd_log_thresh2);
+			else
+				jblocks_rqsted += isize_blks;
+			revokes = jblocks_rqsted;
+			if (meta)
+				revokes += hgt ? sdp->sd_inptrs :
+					sdp->sd_diptrs;
+			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
+			if (ret)
+				goto out_unlock;
+		}
+		/* check if we will exceed the transaction blocks requested */
+		tr = current->journal_info;
+		if (tr->tr_num_buf_new + RES_STATFS +
+		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2))
+			goto out_unlock;
 
-	if (!size)
+		gfs2_trans_add_meta(ip->i_gl, bh);
+		*p = 0;
+		if (bstart + blen == bn) {
+			blen++;
+			continue;
+		}
+		if (bstart) {
+			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+			(*btotal) += blen;
+			gfs2_add_inode_blocks(&ip->i_inode, -blen);
+		}
+		bstart = bn;
+		blen = 1;
+	}
+	if (bstart) {
+		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+		(*btotal) += blen;
+		gfs2_add_inode_blocks(&ip->i_inode, -blen);
+	}
+out_unlock:
+	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
+					    outside the rgrp we just processed,
+					    do it all over again. */
+		gfs2_glock_dq_uninit(rd_gh);
+		if (current->journal_info) {
+			/* Every transaction boundary, we rewrite the dinode
+			   to keep its di_blocks current in case of failure. */
+			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
+				CURRENT_TIME;
+			gfs2_trans_add_meta(ip->i_gl, dibh);
+			gfs2_dinode_out(ip, dibh->b_data);
+			gfs2_trans_end(sdp);
+		}
+		rgd = NULL; /* force a new rgrp */
+		goto more_rgrps;
+	}
+out:
+	cond_resched(); /* Give other processes a chance to run */
+	return ret;
+}
+
+/* assumes the metapath is valid (with buffers) out to height h */
+static bool find_nonnull_ptr(struct metapath *mp_eof, struct metapath *mp,
+			     unsigned int h)
+{
+	__be64 *ptr;
+
+	while (true) {
+		ptr = metapointer(h, mp_eof);
+		if (*ptr) /* if we have a non-null pointer */
+			return true;
+
+		if (mp_eof->mp_list[h] > mp->mp_list[h])
+			mp_eof->mp_list[h]--;
+		else
+			return false; /* no more pointers this buffer */
+	}
+}
+
+enum dealloc_states {
+	DEALLOC_LOOKUP_MP = 0,
+	DEALLOC_MP_FULL = 1,
+	DEALLOC_MP_FINDPREV = 2,
+	DEALLOC_MP_LOWER = 3,
+	DEALLOC_MP_INCOMPLETE = 4,
+	DEALLOC_DONE = 5,
+};
+
+/* trunc_dealloc - truncate a file down to a desired size
+ * @ip: inode to truncate
+ * @oldsize: The previous size of the file (can't get this from ip anymore)
+ * @newsize: The desired size of the file
+ *
+ * This function truncates a file from oldsize to newsize. It works from the
+ * bottom up, and from the right to the left. In other words, it strips off
+ * the highest layer (data) before stripping any of the metadata. Doing it
+ * this way is best in case the operation is interrupted by power failure, etc.
+ * The dinode is rewritten in every transaction to guarantee integrity.
+ */
+static int trunc_dealloc(struct gfs2_inode *ip, u64 oldsize, u64 newsize)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	u64 lblock, newblks;
+	struct metapath mp, mp_eof;
+	struct inode *inode = &ip->i_inode;
+	struct buffer_head *dibh, *bh;
+	struct gfs2_holder rd_gh;
+	__u16 eof[GFS2_MAX_META_HEIGHT];
+	unsigned int strip_h = ip->i_height - 1;
+	u32 btotal = 0, off;
+	int ret, state;
+	int mp_h; /* metapath buffers are read in to this height */
+	sector_t last_ra = 0;
+
+	if (!newsize) {
 		lblock = 0;
-	else
-		lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
+		newblks = 0;
+	} else {
+		lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
+		newblks = (newsize + sdp->sd_sb.sb_bsize - 1) >>
+			sdp->sd_sb.sb_bsize_shift;
+	}
 
+	memset(&mp, 0, sizeof(mp));
 	find_metapath(sdp, lblock, &mp, ip->i_height);
-	error = gfs2_rindex_update(sdp);
-	if (error)
-		return error;
 
-	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
-	if (error)
-		return error;
+	/* find our ending index for each height */
+	memset(&mp_eof, 0, sizeof(mp_eof));
+	ret = gfs2_meta_inode_buffer(ip, &mp_eof.mp_bh[0]);
+	if (ret)
+		return ret;
 
-	while (height--) {
-		struct strip_mine sm;
-		sm.sm_first = !!size;
-		sm.sm_height = height;
+	dibh = mp_eof.mp_bh[0];
+	find_metapath(sdp, (oldsize >> sdp->sd_sb.sb_bsize_shift) - 1,
+		      &mp_eof, ip->i_height);
+	memcpy(&eof, &mp_eof.mp_list, sizeof(eof));
 
-		error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
-		if (error)
+	ret = gfs2_rindex_update(sdp);
+	if (ret)
+		goto out;
+
+	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (ret)
+		goto out;
+	gfs2_holder_mark_uninitialized(&rd_gh);
+
+	mp_h = strip_h;
+	state = DEALLOC_LOOKUP_MP;
+
+	while (state != DEALLOC_DONE) {
+		switch (state) {
+		/* Look up the metapath to the given height. */
+		case DEALLOC_LOOKUP_MP:
+			/* Fill the buffers out to the current height. */
+			ret = lookup_metapath(ip, &mp_eof, mp_h);
+			if (ret < 0)
+				goto out_quota;
+
+			/* if we found a partial height */
+			if (ret < ip->i_height) {
+				mp_h = ret - 1;
+				state = DEALLOC_MP_INCOMPLETE;
+				break;
+			}
+			/* If buffers found for the entire requested height. */
+			if (mp_h == strip_h)
+				state = DEALLOC_MP_FULL;
+			else
+				state = DEALLOC_MP_INCOMPLETE;
+			break;
+
+		/* Truncate a now full metapath at the given strip height */
+		case DEALLOC_MP_FULL:
+			off = 0;
+			/* If this is a truncate to a non-zero size and we're
+			   at the top of the buffer, we need to add an offset
+			   so we don't destroy our metadata tree. */
+			if (mp_eof.mp_list[strip_h] == mp.mp_list[strip_h] &&
+			    (strip_h == 0 || mp_eof.mp_list[strip_h - 1] ==
+			     mp.mp_list[strip_h - 1])) {
+				off = mp.mp_list[strip_h];
+				if (newsize)
+					off++;
+			}
+
+			if (mp_h > 0) { /* issue read-ahead on metadata */
+				__be64 *top;
+
+				bh = mp_eof.mp_bh[mp_h - 1];
+				if (bh->b_blocknr != last_ra) {
+					last_ra = bh->b_blocknr;
+					top = (__be64 *)bh->b_data;
+					if (mp_h - 1 == 0)
+						top += sizeof(struct gfs2_dinode);
+					else
+						top += sizeof(struct gfs2_meta_header);
+					gfs2_metapath_ra(ip->i_gl, bh, top);
+				}
+			}
+			bh = mp_eof.mp_bh[strip_h];
+			gfs2_assert_withdraw(sdp, bh);
+
+			ret = sweep_bh_for_rgrps(ip, bh, dibh, &btotal,
+						 strip_h, off, &rd_gh);
+			/* If we hit an error or just swept dinode buffer,
+			   just exit. */
+			if (ret || gfs2_get_inode_blocks(inode) <= newblks ||
+			    !strip_h) {
+				state = DEALLOC_DONE;
+				break;
+			}
+			state = DEALLOC_MP_LOWER;
 			break;
+
+		/* lower the metapath strip height */
+		case DEALLOC_MP_LOWER:
+			/* We're done with the current buffer, so release it,
+			   unless it's the dinode buffer. Then back up to the
+			   previous pointer. */
+			bh = mp_eof.mp_bh[mp_h];
+			if (mp_eof.mp_bh[mp_h] != dibh) {
+				brelse(mp_eof.mp_bh[mp_h]);
+				mp_eof.mp_bh[mp_h] = NULL;
+			}
+			if (mp_eof.mp_list[mp_h - 1] == mp.mp_list[mp_h - 1])
+				mp_eof.mp_list[mp_h] = mp.mp_list[mp_h];
+			else
+				mp_eof.mp_list[mp_h] = sdp->sd_inptrs - 1;
+			/* If we can't get any lower in height, we've stripped
+			   off all we can. Next step is to back up and start
+			   stripping the previous level of metadata. */
+			if (mp_h == 0) {
+				strip_h--;
+				memcpy(&mp_eof.mp_list, &eof, sizeof(eof));
+				mp_h = strip_h;
+				state = DEALLOC_LOOKUP_MP;
+				break;
+			}
+			mp_h--; /* search one metadata height down */
+			state = DEALLOC_MP_FINDPREV;
+			break;
+
+		/* Find previous metadata pointer */
+		case DEALLOC_MP_FINDPREV:
+			if (mp_eof.mp_list[mp_h] <= mp.mp_list[mp_h]) {
+				state = DEALLOC_MP_LOWER;
+				break;
+			}
+			mp_eof.mp_list[mp_h]--;
+			/* Here we've found a part of the metapath that is not
+			 * allocated. We need to search (backward) at that
+			 * height for a previous non-null pointer. */
+			if (find_nonnull_ptr(&mp_eof, &mp, mp_h)) {
+				state = DEALLOC_LOOKUP_MP;
+				mp_h++;
+				break;
+			}
+			/* No more non-null pointers at this height. Back up
+			   to the previous height and try again. */
+			state = DEALLOC_MP_LOWER;
+			break;
+
+		/* Our metatree is incomplete. Backward search the meta buffer
+		   at the highest known height for more metadata pointers */
+		case DEALLOC_MP_INCOMPLETE:
+			/* If we find a non-null block pointer, crawl a bit
+			   higher up in the metapath. */
+			if (find_nonnull_ptr(&mp_eof, &mp, mp_h)) {
+				state = DEALLOC_LOOKUP_MP;
+				mp_h++;
+			} else {
+				state = DEALLOC_MP_LOWER;
+			}
+			break;
+		}
 	}
 
-	gfs2_quota_unhold(ip);
+	if (btotal) {
+		if (current->journal_info == NULL) {
+			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
+					       RES_QUOTA, 0);
+			if (ret)
+				goto out_quota;
+		}
+		gfs2_statfs_change(sdp, 0, +btotal, 0);
+		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
+				  ip->i_inode.i_gid);
+		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		gfs2_dinode_out(ip, dibh->b_data);
+		gfs2_trans_end(sdp);
+	}
 
-	return error;
+out_quota:
+	gfs2_quota_unhold(ip);
+out:
+	if (gfs2_holder_initialized(&rd_gh))
+		gfs2_glock_dq_uninit(&rd_gh);
+	if (current->journal_info)
+		gfs2_trans_end(sdp);
+	release_metapath(&mp_eof);
+	return ret;
 }
 
 static int trunc_end(struct gfs2_inode *ip)
@@ -1200,7 +1302,7 @@ static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
 	if (gfs2_is_stuffed(ip))
 		return 0;
 
-	error = trunc_dealloc(ip, newsize);
+	error = trunc_dealloc(ip, oldsize, newsize);
 	if (error == 0)
 		error = trunc_end(ip);
 
@@ -1335,7 +1437,8 @@ out:
 int gfs2_truncatei_resume(struct gfs2_inode *ip)
 {
 	int error;
-	error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
+	error = trunc_dealloc(ip, i_size_read(&ip->i_inode),
+			      i_size_read(&ip->i_inode));
 	if (!error)
 		error = trunc_end(ip);
 	return error;
@@ -1343,7 +1446,7 @@ int gfs2_truncatei_resume(struct gfs2_inode *ip)
 
 int gfs2_file_dealloc(struct gfs2_inode *ip)
 {
-	return trunc_dealloc(ip, 0);
+	return trunc_dealloc(ip, i_size_read(&ip->i_inode), 0);
 }
 
 /**
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 1e6a443..cdac201 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -457,13 +457,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 	}
 }
 
-static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
-{
-	u64 first = rgd->rd_data0;
-	u64 last = first + rgd->rd_data;
-	return first <= block && block < last;
-}
-
 /**
  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
  * @sdp: The GFS2 superblock
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 817b58f..5b363f8 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -83,5 +83,12 @@ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
 	return rs && !RB_EMPTY_NODE(&rs->rs_node);
 }
 
+static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
+{
+	u64 first = rgd->rd_data0;
+	u64 last = first + rgd->rd_data;
+	return first <= block && block < last;
+}
+
 extern void check_and_update_goal(struct gfs2_inode *ip);
 #endif /* __RGRP_DOT_H__ */



^ permalink raw reply related	[flat|nested] 3+ messages in thread