Subject: [Cluster-devel] [GFS2 PATCH] [take 3] GFS2: Non-recursive delete
From: Bob Peterson
Date: 2017-03-15 13:14 UTC
To: cluster-devel.redhat.com
In-Reply-To: <1583927707.2546800.1489582882203.JavaMail.zimbra@redhat.com>

Hi,

On 13 February, I sent out a "take 2" patch for non-recursive delete.

Since that time I've been testing, finding problems, and fixing them.
This is my "take 3" patch, and it seems to be working just fine,
even under stress, and in a clustered environment.
It still needs more testing, and it should still be considered
highly experimental.

NOTE: As before, this is a RHEL7 port.

Rationale: We need this for several reasons:

First, for performance. The old algorithm locked multiple resource
groups at once for large (or fragmented) files, which blocked other
processes from allocating blocks in any of those resource groups.

Second, if the file was big enough, the old algorithm would request
a huge chunk of kernel memory to juggle all that locking. This resulted
in errors like "WARNING: at mm/page_alloc.c:1338 get_page_from_freelist"
and potentially worse outcomes if the memory request could not be
satisfied. It could also push the system into swapping.

Third, if the file was big enough, the delete could take an enormous
amount of CPU time, which could trigger kernel lockup warnings or, in
the worst case, cause small virtual machines to be fenced because they
appeared to be CPU-spinning out of control.
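
For a rough idea of the shape of the new code, here is a toy
illustration (emphatically not the actual GFS2 code; see the patch
below for that): instead of recursing through the metadata tree,
trunc_dealloc() now sweeps one height at a time, from the height
nearest the data back toward the dinode, holding only one small,
fixed-size metapath in memory. The height and fan-out values here
are made up purely for the demo:

#include <stdio.h>

#define HEIGHT 3	/* made-up metadata tree height, for illustration */
#define FANOUT 4	/* made-up number of pointers per indirect block */

/*
 * Toy model of the sweep order only: free everything in a perfectly
 * full tree, one height at a time, highest (closest to the data)
 * first, sweeping each height from right to left.  The real patch
 * drives this with a small state machine (DEALLOC_LOOKUP_MP,
 * DEALLOC_MP_FULL, ...) and, of course, takes rgrp glocks and
 * journals each step along the way.
 */
int main(void)
{
	int strip_h, h;
	long nblocks, b;

	for (strip_h = HEIGHT - 1; strip_h >= 0; strip_h--) {
		nblocks = 1;	/* buffers at this height in a full tree */
		for (h = 0; h < strip_h; h++)
			nblocks *= FANOUT;

		for (b = nblocks - 1; b >= 0; b--)
			printf("strip height %d: sweep buffer %ld, free its %d pointers\n",
			       strip_h, b, FANOUT);
	}
	return 0;
}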

Regards,

Bob Peterson
Red Hat File Systems
---
GFS2: Non-recursive delete

Implement truncate/delete as a non-recursive algorithm. The older
algorithm was implemented with recursion to strip off one layer
at a time (going by height, starting with the maximum height).
This version tries to do the same thing, but without recursion
and without needing to allocate new structures or lists in memory.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/bmap.c | 708 ++++++++++++++++++++++++++++++++++-----------------------
 fs/gfs2/rgrp.c |   7 -
 fs/gfs2/rgrp.h |   7 +
 3 files changed, 435 insertions(+), 287 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index ad1971d..7565b63 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -38,11 +38,6 @@ struct metapath {
 	__u16 mp_list[GFS2_MAX_META_HEIGHT];
 };
 
-struct strip_mine {
-	int sm_first;
-	unsigned int sm_height;
-};
-
 /**
  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
  * @ip: the inode
@@ -298,6 +293,7 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
  * lookup_metapath - Walk the metadata tree to a specific point
  * @ip: The inode
  * @mp: The metapath
+ * @h: The height to which it should be mapped
  *
  * Assumes that the inode's buffer has already been looked up and
  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
@@ -311,20 +307,21 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
  * Returns: error or height of metadata tree
  */
 
-static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
+static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 {
-	unsigned int end_of_metadata = ip->i_height - 1;
 	unsigned int x;
 	__be64 *ptr;
 	u64 dblock;
 	int ret;
 
-	for (x = 0; x < end_of_metadata; x++) {
+	for (x = 0; x < h; x++) {
 		ptr = metapointer(x, mp);
 		dblock = be64_to_cpu(*ptr);
 		if (!dblock)
 			return x + 1;
 
+		if (mp->mp_bh[x + 1])
+			continue;
 		ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]);
 		if (ret)
 			return ret;
@@ -619,7 +616,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 
 	BUG_ON(maxlen == 0);
 
-	memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
+	memset(&mp, 0, sizeof(mp));
 	bmap_lock(ip, create);
 	clear_buffer_mapped(bh_map);
 	clear_buffer_new(bh_map);
@@ -642,7 +639,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	ret = 1;
 	if (height > ip->i_height || gfs2_is_stuffed(ip))
 		goto do_alloc;
-	ret = lookup_metapath(ip, &mp);
+	ret = lookup_metapath(ip, &mp, ip->i_height - 1);
 	if (ret < 0)
 		goto out;
 	if (ret != ip->i_height)
@@ -701,252 +698,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 }
 
 /**
- * do_strip - Look for a layer a particular layer of the file and strip it off
- * @ip: the inode
- * @dibh: the dinode buffer
- * @bh: A buffer of pointers
- * @top: The first pointer in the buffer
- * @bottom: One more than the last pointer
- * @height: the height this buffer is at
- * @data: a pointer to a struct strip_mine
- *
- * Returns: errno
- */
-
-static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
-		    struct buffer_head *bh, __be64 *top, __be64 *bottom,
-		    unsigned int height, struct strip_mine *sm)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrp_list rlist;
-	struct gfs2_trans *tr;
-	u64 bn, bstart;
-	u32 blen, btotal;
-	__be64 *p;
-	unsigned int rg_blocks = 0;
-	int metadata;
-	unsigned int revokes = 0;
-	int x;
-	int error;
-	int jblocks_rqsted;
-
-	error = gfs2_rindex_update(sdp);
-	if (error)
-		return error;
-
-	if (!*top)
-		sm->sm_first = 0;
-
-	if (height != sm->sm_height)
-		return 0;
-
-	if (sm->sm_first) {
-		top++;
-		sm->sm_first = 0;
-	}
-
-	metadata = (height != ip->i_height - 1);
-	if (metadata)
-		revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
-	else if (ip->i_depth)
-		revokes = sdp->sd_inptrs;
-
-	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
-	bstart = 0;
-	blen = 0;
-
-	for (p = top; p < bottom; p++) {
-		if (!*p)
-			continue;
-
-		bn = be64_to_cpu(*p);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart)
-				gfs2_rlist_add(ip, &rlist, bstart);
-
-			bstart = bn;
-			blen = 1;
-		}
-	}
-
-	if (bstart)
-		gfs2_rlist_add(ip, &rlist, bstart);
-	else
-		goto out; /* Nothing to do */
-
-	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
-
-	for (x = 0; x < rlist.rl_rgrps; x++) {
-		struct gfs2_rgrpd *rgd;
-		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
-		rg_blocks += rgd->rd_length;
-	}
-
-	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
-	if (error)
-		goto out_rlist;
-
-	if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
-		gfs2_rs_deltree(&ip->i_res);
-
-restart:
-	jblocks_rqsted = rg_blocks + RES_DINODE +
-		RES_INDIRECT + RES_STATFS + RES_QUOTA +
-		gfs2_struct2blk(sdp, revokes, sizeof(u64));
-	if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2))
-		jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2);
-	error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
-	if (error)
-		goto out_rg_gunlock;
-
-	tr = current->journal_info;
-	down_write(&ip->i_rw_mutex);
-
-	gfs2_trans_add_meta(ip->i_gl, dibh);
-	gfs2_trans_add_meta(ip->i_gl, bh);
-
-	bstart = 0;
-	blen = 0;
-	btotal = 0;
-
-	for (p = top; p < bottom; p++) {
-		if (!*p)
-			continue;
-
-		/* check for max reasonable journal transaction blocks */
-		if (tr->tr_num_buf_new + RES_STATFS +
-		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
-			if (rg_blocks >= tr->tr_num_buf_new)
-				rg_blocks -= tr->tr_num_buf_new;
-			else
-				rg_blocks = 0;
-			break;
-		}
-
-		bn = be64_to_cpu(*p);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart) {
-				__gfs2_free_blocks(ip, bstart, blen, metadata);
-				btotal += blen;
-			}
-
-			bstart = bn;
-			blen = 1;
-		}
-
-		*p = 0;
-		gfs2_add_inode_blocks(&ip->i_inode, -1);
-	}
-	if (p == bottom)
-		rg_blocks = 0;
-
-	if (bstart) {
-		__gfs2_free_blocks(ip, bstart, blen, metadata);
-		btotal += blen;
-	}
-
-	gfs2_statfs_change(sdp, 0, +btotal, 0);
-	gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
-			  ip->i_inode.i_gid);
-
-	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-
-	gfs2_dinode_out(ip, dibh->b_data);
-
-	up_write(&ip->i_rw_mutex);
-
-	gfs2_trans_end(sdp);
-
-	if (rg_blocks)
-		goto restart;
-
-out_rg_gunlock:
-	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
-out_rlist:
-	gfs2_rlist_free(&rlist);
-out:
-	return error;
-}
-
-/**
- * recursive_scan - recursively scan through the end of a file
- * @ip: the inode
- * @dibh: the dinode buffer
- * @mp: the path through the metadata to the point to start
- * @height: the height the recursion is at
- * @block: the indirect block to look at
- * @first: 1 if this is the first block
- * @sm: data opaque to this function to pass to @bc
- *
- * When this is first called @height and @block should be zero and
- * @first should be 1.
- *
- * Returns: errno
- */
-
-static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
-			  struct metapath *mp, unsigned int height,
-			  u64 block, int first, struct strip_mine *sm)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head *bh = NULL;
-	__be64 *top, *bottom;
-	u64 bn;
-	int error;
-	int mh_size = sizeof(struct gfs2_meta_header);
-
-	if (!height) {
-		error = gfs2_meta_inode_buffer(ip, &bh);
-		if (error)
-			return error;
-		dibh = bh;
-
-		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
-		bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
-	} else {
-		error = gfs2_meta_indirect_buffer(ip, height, block, &bh);
-		if (error)
-			return error;
-
-		top = (__be64 *)(bh->b_data + mh_size) +
-				  (first ? mp->mp_list[height] : 0);
-
-		bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
-	}
-
-	error = do_strip(ip, dibh, bh, top, bottom, height, sm);
-	if (error)
-		goto out;
-
-	if (height < ip->i_height - 1) {
-
-		gfs2_metapath_ra(ip->i_gl, bh, top);
-
-		for (; top < bottom; top++, first = 0) {
-			if (!*top)
-				continue;
-
-			bn = be64_to_cpu(*top);
-
-			error = recursive_scan(ip, dibh, mp, height + 1, bn,
-					       first, sm);
-			if (error)
-				break;
-		}
-	}
-out:
-	brelse(bh);
-	return error;
-}
-
-
-/**
  * gfs2_block_truncate_page - Deal with zeroing out data for truncate
  *
  * This is partly borrowed from ext3.
@@ -1105,41 +856,437 @@ out:
 	return error;
 }
 
-static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+/**
+ * sweep_bh_for_rgrps - find an rgrp in a buffer and free blocks therein
+ * ip: inode
+ * bh: buffer_head of current metadata buffer to sweep
+ * dibh: inode's buffer_head
+ * btotal: place to keep count of total blocks freed
+ * hgt: height we're processing
+ * off: Starting metapath offset, for the first block of the truncate
+ * rg_gh: holder of resource group glock
+ * newsize: new size of the file
+ *
+ * We sweep a metadata buffer for blocks we need to free, and free them all.
+ * However, we do it one rgrp@a time. If this block has references to
+ * multiple rgrps, we break it into individual transactions. This allows
+ * other processes to use the rgrps while we're focused on a single one.
+ * At every transaction boundary, we rewrite the inode into the journal.
+ * That way the bitmaps are kept consistent with the inode and we can recover
+ * if we're interrupted by power-outages.
+ *
+ * Returns: ret = 0, or return code if an error occurred.
+ *          *btotal has the total number of blocks freed
+ */
+static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct buffer_head *bh,
+			      struct buffer_head *dibh, u32 *btotal, int hgt,
+			      int off, struct gfs2_holder *rd_gh,
+			      u64 newsize)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	unsigned int height = ip->i_height;
-	u64 lblock;
-	struct metapath mp;
-	int error;
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_trans *tr;
+	__be64 *top, *bottom, *p;
+	int blks_outside_rgrp;
+	u64 bn, bstart, isize_blks;
+	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
+	int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
+	int ret = 0;
+
+more_rgrps:
+	blks_outside_rgrp = 0;
+	bstart = 0;
+	blen = 0;
+	if (hgt == 0) {
+		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode));
+		bottom = (__be64 *)top + sdp->sd_diptrs;
+		if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_DI))
+			return -EIO;
+	} else {
+		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
+		bottom = top + sdp->sd_inptrs;
+		if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_IN))
+			return -EIO;
+	}
+	top += off;
+
+	for (p = top; p < bottom; p++) {
+		if (!*p)
+			continue;
+		bn = be64_to_cpu(*p);
+		if (gfs2_holder_initialized(rd_gh)) {
+			rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object;
+			gfs2_assert_withdraw(sdp,
+				     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
+		} else {
+			rgd = gfs2_blk2rgrpd(sdp, bn, false);
+			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+						 0, rd_gh);
+			if (ret)
+				goto out;
+
+			/* Must be done with the rgrp glock held: */
+			if (gfs2_rs_active(&ip->i_res) &&
+			    rgd == ip->i_res.rs_rbm.rgd)
+				gfs2_rs_deltree(&ip->i_res);
+		}
+
+		if (!rgrp_contains_block(rgd, bn)) {
+			blks_outside_rgrp++;
+			continue;
+		}
+
+		/* The size of our transactions will be unknown until we
+		   actually process all the metadata blocks that relate to
+		   the rgrp. So we estimate. We know it can't be more than
+		   the dinode's i_blocks and we don't want to exceed the
+		   journal flush threshold, sd_log_thresh2. */
+		if (current->journal_info == NULL) {
+			unsigned int jblocks_rqsted, revokes;
+
+			jblocks_rqsted = rgd->rd_length + RES_DINODE +
+				RES_INDIRECT;
+			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
+			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
+				jblocks_rqsted +=
+					atomic_read(&sdp->sd_log_thresh2);
+			else
+				jblocks_rqsted += isize_blks;
+			revokes = jblocks_rqsted;
+			if (meta)
+				revokes += hgt ? sdp->sd_inptrs :
+					sdp->sd_diptrs;
+			else if (ip->i_depth)
+				revokes += sdp->sd_inptrs;
+			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
+			if (ret)
+				goto out_unlock;
+			down_write(&ip->i_rw_mutex);
+		}
+		/* check if we will exceed the transaction blocks requested */
+		tr = current->journal_info;
+		if (tr->tr_num_buf_new + RES_STATFS +
+		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
+			/* We set blks_outside_rgrp to ensure the loop will
+			   be repeated for the same rgrp, but with a new
+			   transaction. */
+			blks_outside_rgrp++;
+			/* This next part is tricky. If the buffer is not
+			   pinned, we haven't added it to the transaction, and
+			   doing so would exceed our transaction, so we need to
+			   end the transaction and start a new one (so goto).
+
+			   If the buffer is pinned, it means we've already
+			   added it to the transaction, in which case we've
+			   already set some block pointers to 0, so we
+			   better follow through and free those, or we will
+			   introduce corruption (so break). This may be
+			   impossible, or at least rare, but I decided to cover
+			   the case regardless. */
+			if (!buffer_pinned(bh))
+				goto out_unlock;
+			break;
+		}
+
+		gfs2_trans_add_meta(ip->i_gl, bh);
+		*p = 0;
+		if (bstart + blen == bn) {
+			blen++;
+			continue;
+		}
+		if (bstart) {
+			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+			(*btotal) += blen;
+			gfs2_add_inode_blocks(&ip->i_inode, -blen);
+		}
+		bstart = bn;
+		blen = 1;
+	}
+	if (bstart) {
+		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+		(*btotal) += blen;
+		gfs2_add_inode_blocks(&ip->i_inode, -blen);
+	}
+out_unlock:
+	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
+					    outside the rgrp we just processed,
+					    do it all over again. */
+		if (current->journal_info) {
+			/* Every transaction boundary, we rewrite the dinode
+			   to keep its di_blocks current in case of failure. */
+			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
+				CURRENT_TIME;
+			gfs2_trans_add_meta(ip->i_gl, dibh);
+			gfs2_dinode_out(ip, dibh->b_data);
+			up_write(&ip->i_rw_mutex);
+			gfs2_trans_end(sdp);
+		}
+		gfs2_glock_dq_uninit(rd_gh);
+		goto more_rgrps;
+	}
+out:
+	cond_resched(); /* Give other processes a chance to run */
+	return ret;
+}
+
+/* assumes the metapath is valid (with buffers) out to height h */
+static bool find_nonnull_ptr(struct metapath *mp_eof, struct metapath *mp,
+			     unsigned int h)
+{
+	__be64 *ptr;
+
+	while (true) {
+		ptr = metapointer(h, mp_eof);
+		if (*ptr) /* if we have a non-null pointer */
+			return true;
+
+		if (mp_eof->mp_list[h] > mp->mp_list[h])
+			mp_eof->mp_list[h]--;
+		else
+			return false; /* no more pointers in this buffer */
+	}
+}
+
+enum dealloc_states {
+	DEALLOC_LOOKUP_MP = 0,  /* Look up the metapath to the given height. */
+	DEALLOC_MP_FULL = 1,    /* Strip a metapath with all buffers read in */
+	DEALLOC_MP_FINDPREV = 2, /* Find previous metadata pointer */
+	DEALLOC_MP_LOWER = 3,   /* lower the metapath strip height */
+	DEALLOC_MP_INCOMPLETE = 4,/* deal with an incomplete metapath */
+	DEALLOC_DONE = 5,       /* process complete */
+};
 
-	if (!size)
+/**
+ * trunc_dealloc - truncate a file down to a desired size
+ * @ip: inode to truncate
+ * @oldsize: The previous size of the file (can't get this from ip anymore)
+ * @newsize: The desired size of the file
+ *
+ * This function truncates a file from oldsize to newsize. It works from the
+ * bottom up, and from the right to the left. In other words, it strips off
+ * the highest layer (data) before stripping any of the metadata. Doing it
+ * this way is best in case the operation is interrupted by power failure, etc.
+ * The dinode is rewritten in every transaction to guarantee integrity.
+ */
+static int trunc_dealloc(struct gfs2_inode *ip, u64 oldsize, u64 newsize)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	u64 lblock;
+	struct metapath mp, mp_eof;
+	struct buffer_head *dibh, *bh;
+	struct gfs2_holder rd_gh;
+	__u16 eof[GFS2_MAX_META_HEIGHT];
+	unsigned int strip_h = ip->i_height - 1;
+	u32 btotal = 0, off;
+	int ret, state;
+	int mp_h; /* metapath buffers are read in to this height */
+	sector_t last_ra = 0;
+	u64 prev_bnr = 0;
+
+	if (!newsize)
 		lblock = 0;
 	else
-		lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
+		lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
 
+	memset(&mp, 0, sizeof(mp));
 	find_metapath(sdp, lblock, &mp, ip->i_height);
-	error = gfs2_rindex_update(sdp);
-	if (error)
-		return error;
 
-	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
-	if (error)
-		return error;
+	/* find our ending index for each height */
+	memset(&mp_eof, 0, sizeof(mp_eof));
+	ret = gfs2_meta_inode_buffer(ip, &mp_eof.mp_bh[0]);
+	if (ret)
+		return ret;
 
-	while (height--) {
-		struct strip_mine sm;
-		sm.sm_first = !!size;
-		sm.sm_height = height;
+	dibh = mp_eof.mp_bh[0];
+	find_metapath(sdp, (oldsize - 1) >> sdp->sd_sb.sb_bsize_shift,
+		      &mp_eof, ip->i_height);
+	memcpy(&eof, &mp_eof.mp_list, sizeof(eof));
 
-		error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
-		if (error)
+	ret = gfs2_rindex_update(sdp);
+	if (ret)
+		goto out_metapath;
+
+	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (ret)
+		goto out_metapath;
+	gfs2_holder_mark_uninitialized(&rd_gh);
+
+	mp_h = strip_h;
+	state = DEALLOC_LOOKUP_MP;
+
+	while (state != DEALLOC_DONE) {
+		switch (state) {
+		/* Look up the metapath to the given height. */
+		case DEALLOC_LOOKUP_MP:
+			/* Fill the buffers out to the current height. */
+			ret = lookup_metapath(ip, &mp_eof, mp_h);
+			if (ret < 0)
+				goto out;
+
+			/* if we found a partial height */
+			if (ret < ip->i_height) {
+				mp_h = ret - 1;
+				state = DEALLOC_MP_INCOMPLETE;
+				break;
+			}
+			/* If buffers found for the entire requested height. */
+			if (mp_h == strip_h)
+				state = DEALLOC_MP_FULL;
+			else
+				state = DEALLOC_MP_INCOMPLETE;
+			break;
+
+		/* Truncate a now full metapath at the given strip height.
+		 * Note that strip_h == mp_h in order to be in this state. */
+		case DEALLOC_MP_FULL:
+			off = 0;
+			/* If this is a truncate to a non-zero size and we're
+			   at the top of the buffer, we need to add an offset
+			   so we don't destroy our metadata tree. */
+			if (mp_h == 0 || (mp_eof.mp_list[mp_h - 1] ==
+					  mp.mp_list[mp_h - 1])) {
+				off = mp.mp_list[mp_h];
+				if (newsize)
+					off++;
+			}
+
+			if (mp_h > 0) { /* issue read-ahead on metadata */
+				__be64 *top;
+
+				bh = mp_eof.mp_bh[mp_h - 1];
+				if (bh->b_blocknr != last_ra) {
+					last_ra = bh->b_blocknr;
+					top = (__be64 *)bh->b_data;
+					if (mp_h - 1 == 0)
+						top += sizeof(struct gfs2_dinode);
+					else
+						top += sizeof(struct gfs2_meta_header);
+					gfs2_metapath_ra(ip->i_gl, bh, top);
+				}
+			}
+			bh = mp_eof.mp_bh[mp_h];
+			gfs2_assert_withdraw(sdp, bh);
+			if (gfs2_metatype_check(sdp, bh, mp_h ?
+						GFS2_METATYPE_IN :
+						GFS2_METATYPE_DI))
+				return -EIO;
+
+			if (gfs2_assert_withdraw(sdp,
+						 prev_bnr != bh->b_blocknr)) {
+				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
+				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
+				       sdp->sd_fsname,
+				       (unsigned long long)ip->i_no_addr,
+				       prev_bnr, ip->i_height, strip_h, mp_h);
+			}
+			prev_bnr = bh->b_blocknr;
+			ret = sweep_bh_for_rgrps(ip, bh, dibh, &btotal, mp_h,
+						 off, &rd_gh, newsize);
+			/* If we hit an error or just swept dinode buffer,
+			   just exit. */
+			if (ret || !mp_h) {
+				state = DEALLOC_DONE;
+				break;
+			}
+			state = DEALLOC_MP_LOWER;
+			break;
+
+		/* lower the metapath strip height */
+		case DEALLOC_MP_LOWER:
+			/* We're done with the current buffer, so release it,
+			   unless it's the dinode buffer. Then back up to the
+			   previous pointer. */
+			bh = mp_eof.mp_bh[mp_h];
+			if (bh != dibh) {
+				brelse(bh);
+				mp_eof.mp_bh[mp_h] = NULL;
+			}
+			/* If we can't get any lower in height, we've stripped
+			   off all we can. Next step is to back up and start
+			   stripping the previous level of metadata. */
+			if (mp_h == 0) {
+				strip_h--;
+				memcpy(&mp_eof.mp_list, &eof, sizeof(eof));
+				for (mp_h = 1; mp_h < GFS2_MAX_META_HEIGHT;
+				     mp_h++)
+					gfs2_assert_withdraw(sdp,
+							     mp_eof.mp_bh[mp_h] == NULL);
+				mp_h = strip_h;
+				state = DEALLOC_LOOKUP_MP;
+				break;
+			}
+			if (mp_eof.mp_list[mp_h - 1] == mp.mp_list[mp_h - 1])
+				mp_eof.mp_list[mp_h] = mp.mp_list[mp_h];
+			else
+				mp_eof.mp_list[mp_h] = sdp->sd_inptrs - 1;
+			mp_h--; /* search one metadata height down */
+			state = DEALLOC_MP_FINDPREV;
+			break;
+
+		/* Find previous metadata pointer */
+		case DEALLOC_MP_FINDPREV:
+			if (mp_eof.mp_list[mp_h] <= mp.mp_list[mp_h]) {
+				state = DEALLOC_MP_LOWER;
+				break;
+			}
+			mp_eof.mp_list[mp_h]--;
+			/* Here we've found a part of the metapath that is not
+			 * allocated. We need to search (backward) at that
+			 * height for a previous non-null pointer. */
+			if (find_nonnull_ptr(&mp_eof, &mp, mp_h)) {
+				state = DEALLOC_LOOKUP_MP;
+				mp_h++;
+				break;
+			}
+			/* No more non-null pointers at this height. Back up
+			   to the previous height and try again. */
+			state = DEALLOC_MP_LOWER;
+			break;
+
+		/* Our metatree is incomplete. Backward search the meta buffer
+		   at the highest known height for more metadata pointers */
+		case DEALLOC_MP_INCOMPLETE:
+			/* If we find a non-null block pointer, crawl a bit
+			   higher up in the metapath. */
+			if (find_nonnull_ptr(&mp_eof, &mp, mp_h)) {
+				state = DEALLOC_LOOKUP_MP;
+				mp_h++;
+			} else {
+				state = DEALLOC_MP_LOWER;
+			}
 			break;
+		}
 	}
 
-	gfs2_quota_unhold(ip);
+	if (btotal) {
+		if (current->journal_info == NULL) {
+			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
+					       RES_QUOTA, 0);
+			if (ret)
+				goto out;
+			down_write(&ip->i_rw_mutex);
+		}
+		gfs2_statfs_change(sdp, 0, +btotal, 0);
+		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
+				  ip->i_inode.i_gid);
+		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		gfs2_dinode_out(ip, dibh->b_data);
+		up_write(&ip->i_rw_mutex);
+		gfs2_trans_end(sdp);
+	}
 
-	return error;
+out:
+	if (gfs2_holder_initialized(&rd_gh))
+		gfs2_glock_dq_uninit(&rd_gh);
+	if (current->journal_info) {
+		up_write(&ip->i_rw_mutex);
+		gfs2_trans_end(sdp);
+	}
+	gfs2_quota_unhold(ip);
+out_metapath:
+	release_metapath(&mp_eof);
+	return ret;
 }
 
 static int trunc_end(struct gfs2_inode *ip)
@@ -1200,7 +1347,7 @@ static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
 	if (gfs2_is_stuffed(ip))
 		return 0;
 
-	error = trunc_dealloc(ip, newsize);
+	error = trunc_dealloc(ip, oldsize, newsize);
 	if (error == 0)
 		error = trunc_end(ip);
 
@@ -1335,7 +1482,8 @@ out:
 int gfs2_truncatei_resume(struct gfs2_inode *ip)
 {
 	int error;
-	error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
+	error = trunc_dealloc(ip, i_size_read(&ip->i_inode),
+			      i_size_read(&ip->i_inode));
 	if (!error)
 		error = trunc_end(ip);
 	return error;
@@ -1343,7 +1491,7 @@ int gfs2_truncatei_resume(struct gfs2_inode *ip)
 
 int gfs2_file_dealloc(struct gfs2_inode *ip)
 {
-	return trunc_dealloc(ip, 0);
+	return trunc_dealloc(ip, i_size_read(&ip->i_inode), 0);
 }
 
 /**
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 1e6a443..cdac201 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -457,13 +457,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 	}
 }
 
-static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
-{
-	u64 first = rgd->rd_data0;
-	u64 last = first + rgd->rd_data;
-	return first <= block && block < last;
-}
-
 /**
  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
  * @sdp: The GFS2 superblock
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 817b58f..5b363f8 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -83,5 +83,12 @@ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
 	return rs && !RB_EMPTY_NODE(&rs->rs_node);
 }
 
+static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
+{
+	u64 first = rgd->rd_data0;
+	u64 last = first + rgd->rd_data;
+	return first <= block && block < last;
+}
+
 extern void check_and_update_goal(struct gfs2_inode *ip);
 #endif /* __RGRP_DOT_H__ */


