* [PATCH 00/14] repair memory usage reductions
From: Christoph Hellwig @ 2009-09-02 17:55 UTC
  To: xfs

This is a respin of the patches Barry Naujok wrote at SGI for reducing
the memory usage in repair.  I've split the series up, fixed a few small
bugs and added two preparatory cleanups - but all the real work is
Barry's.  Barry did lots of heavy testing of the original patches on
large filesystems, and I've done quite a lot of testing on slightly
smaller filesystems.  These were all ad-hoc tests, as XFSQA coverage of
repair is rather low.  My plan is to add various additional test cases
to XFSQA, both for intentional corruptions and for reproducing past
reported bugs, before we release these patches in xfsprogs.  But I think
it would be good to get them into the development git tree now, to get
wider coverage in the meantime.

* [PATCH 01/14] repair: merge scanfunc_bno and scanfunc_cnt
From: Christoph Hellwig @ 2009-09-02 17:55 UTC
  To: xfs

These two functions are almost identical.  The big difference is that we
only move blocks from the XR_E_FREE1 to the XR_E_FREE state when
processing the cnt btree.

Besides that, we print "bno" vs "cnt" in the messages and, obviously,
validate a slightly different magic number in the header.
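
The state transition that differs between the two trees is small enough
to sketch stand-alone.  The following is illustrative only (the local
enum and printf stand in for repair's incore state array and do_warn),
and it assumes, as repair does, that the bno tree is scanned before the
cnt tree:

#include <stdio.h>

enum xr_state { XR_E_UNKNOWN, XR_E_FREE1, XR_E_FREE };

/*
 * A free extent must appear in both freespace btrees: the first (bno)
 * scan marks its blocks XR_E_FREE1, and only the cnt scan promotes
 * them to XR_E_FREE.  Any other claim is a duplicate.
 */
static void
mark_free_block(enum xr_state *state, int is_cnt_scan)
{
	switch (*state) {
	case XR_E_UNKNOWN:
		*state = XR_E_FREE1;
		break;
	case XR_E_FREE1:
		if (is_cnt_scan) {
			*state = XR_E_FREE;
			break;
		}
		/* FALLTHROUGH - duplicate claim within the bno tree */
	default:
		printf("block multiply claimed (state %d)\n", *state);
		break;
	}
}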


Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/scan.c
===================================================================
--- xfsprogs-dev.orig/repair/scan.c	2009-08-21 18:24:26.000000000 +0000
+++ xfsprogs-dev/repair/scan.c	2009-08-21 18:40:59.000000000 +0000
@@ -439,15 +439,16 @@ _("out-of-order bmap key (file offset) i
 }
 
 void
-scanfunc_bno(
+scanfunc_allocbt(
 	struct xfs_btree_block	*block,
 	int			level,
 	xfs_agblock_t		bno,
 	xfs_agnumber_t		agno,
 	int			suspect,
-	int			isroot
-	)
+	int			isroot,
+	__uint32_t		magic)
 {
+	const char 		*name;
 	xfs_agblock_t		b, e;
 	int			i;
 	xfs_alloc_ptr_t		*pp;
@@ -456,16 +457,18 @@ scanfunc_bno(
 	int			numrecs;
 	int			state;
 
-	if (be32_to_cpu(block->bb_magic) != XFS_ABTB_MAGIC) {
-		do_warn(_("bad magic # %#x in btbno block %d/%d\n"),
-			be32_to_cpu(block->bb_magic), agno, bno);
+	name = (magic == XFS_ABTB_MAGIC) ? "bno" : "cnt";
+
+	if (be32_to_cpu(block->bb_magic) != magic) {
+		do_warn(_("bad magic # %#x in bt%s block %d/%d\n"),
+			be32_to_cpu(block->bb_magic), name, agno, bno);
 		hdr_errors++;
 		if (suspect)
 			return;
 	}
 	if (be16_to_cpu(block->bb_level) != level) {
-		do_warn(_("expected level %d got %d in btbno block %d/%d\n"),
-			level, be16_to_cpu(block->bb_level), agno, bno);
+		do_warn(_("expected level %d got %d in bt%s block %d/%d\n"),
+			level, be16_to_cpu(block->bb_level), name, agno, bno);
 		hdr_errors++;
 		if (suspect)
 			return;
@@ -483,8 +486,8 @@ scanfunc_bno(
 	default:
 		set_agbno_state(mp, agno, bno, XR_E_MULT);
 		do_warn(
-_("bno freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
-				state, agno, bno, suspect);
+_("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
+				name, state, agno, bno, suspect);
 		return;
 	}
 
@@ -520,15 +523,27 @@ _("bno freespace btree block claimed (st
 				continue;
 			for (b = be32_to_cpu(rp[i].ar_startblock);
 			     b < e; b++)  {
-				if (get_agbno_state(mp, agno, b)
-							== XR_E_UNKNOWN)
+				state = get_agbno_state(mp, agno, b);
+				switch (state) {
+				case XR_E_UNKNOWN:
 					set_agbno_state(mp, agno, b,
 							XR_E_FREE1);
-				else  {
+					break;
+				case XR_E_FREE1:
+					/*
+					 * no warning messages -- we'll catch
+					 * FREE1 blocks later
+					 */
+					if (magic != XFS_ABTB_MAGIC) {
+						set_agbno_state(mp, agno, b,
+								XR_E_FREE);
+						break;
+					}
+				default:
 					do_warn(
-	_("block (%d,%d) multiply claimed by bno space tree, state - %d\n"),
-						agno, b,
-						get_agbno_state(mp, agno, b));
+	_("block (%d,%d) multiply claimed by %s space tree, state - %d\n"),
+						agno, b, name, state);
+					break;
 				}
 			}
 		}
@@ -575,12 +590,26 @@ _("bno freespace btree block claimed (st
 		 */
 		if (be32_to_cpu(pp[i]) != 0 && verify_agbno(mp, agno,
 							be32_to_cpu(pp[i])))
-			scan_sbtree(be32_to_cpu(pp[i]), level, agno,
-					suspect, scanfunc_bno, 0);
+			scan_sbtree(be32_to_cpu(pp[i]), level, agno, suspect,
+				    (magic == XFS_ABTB_MAGIC) ?
+				    	scanfunc_bno : scanfunc_cnt, 0);
 	}
 }
 
 void
+scanfunc_bno(
+	struct xfs_btree_block	*block,
+	int			level,
+	xfs_agblock_t		bno,
+	xfs_agnumber_t		agno,
+	int			suspect,
+	int			isroot)
+{
+	return scanfunc_allocbt(block, level, bno, agno,
+				suspect, isroot, XFS_ABTB_MAGIC);
+}
+
+void
 scanfunc_cnt(
 	struct xfs_btree_block	*block,
 	int			level,
@@ -590,136 +619,8 @@ scanfunc_cnt(
 	int			isroot
 	)
 {
-	xfs_alloc_ptr_t		*pp;
-	xfs_alloc_rec_t		*rp;
-	xfs_agblock_t		b, e;
-	int			i;
-	int			hdr_errors;
-	int			numrecs;
-	int			state;
-
-	hdr_errors = 0;
-
-	if (be32_to_cpu(block->bb_magic) != XFS_ABTC_MAGIC) {
-		do_warn(_("bad magic # %#x in btcnt block %d/%d\n"),
-			be32_to_cpu(block->bb_magic), agno, bno);
-		hdr_errors++;
-		if (suspect)
-			return;
-	}
-	if (be16_to_cpu(block->bb_level) != level) {
-		do_warn(_("expected level %d got %d in btcnt block %d/%d\n"),
-			level, be16_to_cpu(block->bb_level), agno, bno);
-		hdr_errors++;
-		if (suspect)
-			return;
-	}
-
-	/*
-	 * check for btree blocks multiply claimed
-	 */
-	state = get_agbno_state(mp, agno, bno);
-
-	switch (state)  {
-	case XR_E_UNKNOWN:
-		set_agbno_state(mp, agno, bno, XR_E_FS_MAP);
-		break;
-	default:
-		set_agbno_state(mp, agno, bno, XR_E_MULT);
-		do_warn(
-_("bcnt freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
-			state, agno, bno, suspect);
-		return;
-	}
-
-	numrecs = be16_to_cpu(block->bb_numrecs);
-
-	if (level == 0) {
-		if (numrecs > mp->m_alloc_mxr[0])  {
-			numrecs = mp->m_alloc_mxr[0];
-			hdr_errors++;
-		}
-		if (isroot == 0 && numrecs < mp->m_alloc_mnr[0])  {
-			numrecs = mp->m_alloc_mnr[0];
-			hdr_errors++;
-		}
-
-		if (hdr_errors)
-			suspect++;
-
-		rp = XFS_ALLOC_REC_ADDR(mp, block, 1);
-		for (i = 0; i < numrecs; i++) {
-			if (be32_to_cpu(rp[i].ar_blockcount) == 0 ||
-					be32_to_cpu(rp[i].ar_startblock) == 0 ||
-					!verify_agbno(mp, agno, be32_to_cpu(
-							rp[i].ar_startblock)) ||
-			    		be32_to_cpu(rp[i].ar_blockcount) >
-							MAXEXTLEN)
-				continue;
-
-			e = be32_to_cpu(rp[i].ar_startblock) +
-				be32_to_cpu(rp[i].ar_blockcount);
-			if (!verify_agbno(mp, agno, e - 1))
-				continue;
-			for (b = be32_to_cpu(rp[i].ar_startblock); b < e; b++) {
-				state = get_agbno_state(mp, agno, b);
-				/*
-				 * no warning messages -- we'll catch
-				 * FREE1 blocks later
-				 */
-				switch (state)  {
-				case XR_E_FREE1:
-					set_agbno_state(mp, agno, b, XR_E_FREE);
-					break;
-				case XR_E_UNKNOWN:
-					set_agbno_state(mp, agno, b,
-							XR_E_FREE1);
-					break;
-				default:
-					do_warn(
-				_("block (%d,%d) already used, state %d\n"),
-						agno, b, state);
-					break;
-				}
-			}
-		}
-		return;
-	}
-
-	/*
-	 * interior record
-	 */
-	pp = XFS_ALLOC_PTR_ADDR(mp, block, 1, mp->m_alloc_mxr[1]);
-
-	if (numrecs > mp->m_alloc_mxr[1])  {
-		numrecs = mp->m_alloc_mxr[1];
-		hdr_errors++;
-	}
-	if (isroot == 0 && numrecs < mp->m_alloc_mnr[1])  {
-		numrecs = mp->m_alloc_mnr[1];
-		hdr_errors++;
-	}
-
-	/*
-	 * don't pass bogus tree flag down further if this block
-	 * looked ok.  bail out if two levels in a row look bad.
-	 */
-
-	if (suspect && !hdr_errors)
-		suspect = 0;
-
-	if (hdr_errors)  {
-		if (suspect)
-			return;
-		else suspect++;
-	}
-
-	for (i = 0; i < numrecs; i++) {
-		if (be32_to_cpu(pp[i]) != 0 && verify_agbno(mp, agno,
-							be32_to_cpu(pp[i])))
-			scan_sbtree(be32_to_cpu(pp[i]), level, agno,
-					suspect, scanfunc_cnt, 0);
-	}
+	return scanfunc_allocbt(block, level, bno, agno,
+				suspect, isroot, XFS_ABTC_MAGIC);
 }
 
 /*

* [PATCH 02/14] repair: reduce byte swap operations in scanfunc_allocbt
From: Christoph Hellwig @ 2009-09-02 17:55 UTC
  To: xfs; +Cc: Barry Naujok

Store native-endian versions of the extent start block and length in
local variables instead of converting them over and over again.
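
A condensed before/after illustration of the pattern, as a hypothetical
little harness with ntohl() standing in for be32_to_cpu():

#include <stdint.h>
#include <arpa/inet.h>

struct rec { uint32_t ar_startblock; uint32_t ar_blockcount; };

/* before: the loop condition byte-swaps both fields on every pass */
static uint32_t
count_before(struct rec *rp)
{
	uint32_t	n = 0, b;

	for (b = ntohl(rp->ar_startblock);
	     b < ntohl(rp->ar_startblock) + ntohl(rp->ar_blockcount);
	     b++)
		n++;
	return n;
}

/* after: convert once into native-endian locals, then loop */
static uint32_t
count_after(struct rec *rp)
{
	uint32_t	start = ntohl(rp->ar_startblock);
	uint32_t	len = ntohl(rp->ar_blockcount);
	uint32_t	n = 0, b;

	for (b = start; b < start + len; b++)
		n++;
	return n;
}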


Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/scan.c
===================================================================
--- xfsprogs-dev.orig/repair/scan.c	2009-08-21 18:48:01.000000000 +0000
+++ xfsprogs-dev/repair/scan.c	2009-08-21 18:54:29.000000000 +0000
@@ -449,7 +449,6 @@ scanfunc_allocbt(
 	__uint32_t		magic)
 {
 	const char 		*name;
-	xfs_agblock_t		b, e;
 	int			i;
 	xfs_alloc_ptr_t		*pp;
 	xfs_alloc_rec_t		*rp;
@@ -509,20 +508,21 @@ _("%s freespace btree block claimed (sta
 
 		rp = XFS_ALLOC_REC_ADDR(mp, block, 1);
 		for (i = 0; i < numrecs; i++) {
-			if (be32_to_cpu(rp[i].ar_blockcount) == 0 ||
-			    be32_to_cpu(rp[i].ar_startblock) == 0 ||
-			    !verify_agbno(mp, agno,
-				be32_to_cpu(rp[i].ar_startblock)) ||
-			    be32_to_cpu(rp[i].ar_blockcount) >
-					MAXEXTLEN)
-				continue;
+			xfs_agblock_t		b, end;
+			xfs_extlen_t		len;
+
+			b = be32_to_cpu(rp[i].ar_startblock);
+			len = be32_to_cpu(rp[i].ar_blockcount);
+			end = b + len;
 
-			e = be32_to_cpu(rp[i].ar_startblock) +
-				be32_to_cpu(rp[i].ar_blockcount);
-			if (!verify_agbno(mp, agno, e - 1))
+			if (b == 0 || !verify_agbno(mp, agno, b))
+				continue;
+			if (len == 0 || len > MAXEXTLEN)
 				continue;
-			for (b = be32_to_cpu(rp[i].ar_startblock);
-			     b < e; b++)  {
+			if (!verify_agbno(mp, agno, end - 1))
+				continue;
+
+			for ( ; b < end; b++)  {
 				state = get_agbno_state(mp, agno, b);
 				switch (state) {
 				case XR_E_UNKNOWN:
@@ -579,6 +579,8 @@ _("%s freespace btree block claimed (sta
 	}
 
 	for (i = 0; i < numrecs; i++)  {
+		xfs_agblock_t		bno = be32_to_cpu(pp[i]);
+
 		/*
 		 * XXX - put sibling detection right here.
 		 * we know our sibling chain is good.  So as we go,
@@ -588,11 +590,11 @@ _("%s freespace btree block claimed (sta
 		 * pointer mismatch, try and extract as much data
 		 * as possible.
 		 */
-		if (be32_to_cpu(pp[i]) != 0 && verify_agbno(mp, agno,
-							be32_to_cpu(pp[i])))
-			scan_sbtree(be32_to_cpu(pp[i]), level, agno, suspect,
+		if (bno != 0 && verify_agbno(mp, agno, bno)) {
+			scan_sbtree(bno, level, agno, suspect,
 				    (magic == XFS_ABTB_MAGIC) ?
 				    	scanfunc_bno : scanfunc_cnt, 0);
+		}
 	}
 }
 

* [PATCH 03/14] repair: kill B_IS_META flag
From: Christoph Hellwig @ 2009-09-02 17:55 UTC
  To: xfs

B_IS_META is the inverse of B_IS_INODE, which is not really obvious
from its use.  So just use !B_IS_INODE to make the tests clearer.
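
For reference, the two macros are exact complements over the same mask,
so the substitution is behavior-preserving:

#define B_IS_INODE(f)	(((f) & 5) == 0)	/* kept */
#define B_IS_META(f)	(((f) & 5) != 0)	/* removed; == !B_IS_INODE(f) */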


Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/prefetch.c
===================================================================
--- xfsprogs-dev.orig/repair/prefetch.c	2009-08-20 00:02:25.000000000 +0000
+++ xfsprogs-dev/repair/prefetch.c	2009-08-20 00:05:36.000000000 +0000
@@ -64,7 +64,6 @@
  * the buffer is for an inode or other metadata.
  */
 #define B_IS_INODE(f)	(((f) & 5) == 0)
-#define B_IS_META(f)	(((f) & 5) != 0)
 
 #define DEF_BATCH_BYTES	0x10000
 
@@ -131,7 +130,7 @@
 
 	if (fsbno > args->last_bno_read) {
 		radix_tree_insert(&args->primary_io_queue, fsbno, bp);
-		if (B_IS_META(flag))
+		if (!B_IS_INODE(flag))
 			radix_tree_tag_set(&args->primary_io_queue, fsbno, 0);
 		else {
 			args->inode_bufs_queued++;
@@ -153,7 +152,7 @@
 			(long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
 			args->last_bno_read);
 #endif
-		ASSERT(B_IS_META(flag));
+		ASSERT(!B_IS_INODE(flag));
 		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
 		radix_tree_insert(&args->secondary_io_queue, fsbno, bp);
 	}

* [PATCH 04/14] repair: split up scanfunc_ino
From: Christoph Hellwig @ 2009-09-02 17:55 UTC
  To: xfs; +Cc: Barry Naujok

Split a helper that scans a single inode chunk for suspect inodes out
of scanfunc_ino, to make the latter more readable.
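
The one mechanical change is that paths which used to bump the suspect
count and continue now return the updated count, which the caller feeds
back in.  Condensed sketch from the hunks below:

	/* the caller */
	for (i = 0; i < numrecs; i++)
		suspect = scan_single_ino_chunk(agno, &rp[i], suspect);

	/* in the helper, "suspect++; continue;" becomes */
	if (verify_aginum(mp, agno, ino))
		return ++suspect;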


Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/scan.c
===================================================================
--- xfsprogs-dev.orig/repair/scan.c	2009-08-21 19:00:15.000000000 +0000
+++ xfsprogs-dev/repair/scan.c	2009-08-21 19:03:26.000000000 +0000
@@ -625,6 +625,167 @@ scanfunc_cnt(
 				suspect, isroot, XFS_ABTC_MAGIC);
 }
 
+static int
+scan_single_ino_chunk(
+	xfs_agnumber_t		agno,
+	xfs_inobt_rec_t		*rp,
+	int			suspect)
+{
+	xfs_ino_t		lino;
+	xfs_agino_t		ino;
+	xfs_agblock_t		agbno;
+	int			j;
+	int			nfree;
+	int			off;
+	int			state;
+	ino_tree_node_t		*ino_rec, *first_rec, *last_rec;
+
+	ino = be32_to_cpu(rp->ir_startino);
+	off = XFS_AGINO_TO_OFFSET(mp, ino);
+	agbno = XFS_AGINO_TO_AGBNO(mp, ino);
+	lino = XFS_AGINO_TO_INO(mp, agno, ino);
+
+	/*
+	 * on multi-block block chunks, all chunks start
+	 * at the beginning of the block.  with multi-chunk
+	 * blocks, all chunks must start on 64-inode boundaries
+	 * since each block can hold N complete chunks. if
+	 * fs has aligned inodes, all chunks must start
+	 * at a fs_ino_alignment*N'th agbno.  skip recs
+	 * with badly aligned starting inodes.
+	 */
+	if (ino == 0 ||
+	    (inodes_per_block <= XFS_INODES_PER_CHUNK && off !=  0) ||
+	    (inodes_per_block > XFS_INODES_PER_CHUNK &&
+	     off % XFS_INODES_PER_CHUNK != 0) ||
+	    (fs_aligned_inodes && agbno % fs_ino_alignment != 0))  {
+		do_warn(
+	_("badly aligned inode rec (starting inode = %llu)\n"),
+			lino);
+		suspect++;
+	}
+
+	/*
+	 * verify numeric validity of inode chunk first
+	 * before inserting into a tree.  don't have to
+	 * worry about the overflow case because the
+	 * starting ino number of a chunk can only get
+	 * within 255 inodes of max (NULLAGINO).  if it
+	 * gets closer, the agino number will be illegal
+	 * as the agbno will be too large.
+	 */
+	if (verify_aginum(mp, agno, ino))  {
+		do_warn(
+_("bad starting inode # (%llu (0x%x 0x%x)) in ino rec, skipping rec\n"),
+			lino, agno, ino);
+		return ++suspect;
+	}
+
+	if (verify_aginum(mp, agno,
+			ino + XFS_INODES_PER_CHUNK - 1))  {
+		do_warn(
+_("bad ending inode # (%llu (0x%x 0x%x)) in ino rec, skipping rec\n"),
+			lino + XFS_INODES_PER_CHUNK - 1,
+			agno, ino + XFS_INODES_PER_CHUNK - 1);
+		return ++suspect;
+	}
+
+	/*
+	 * set state of each block containing inodes
+	 */
+	if (off == 0 && !suspect)  {
+		for (j = 0;
+		     j < XFS_INODES_PER_CHUNK;
+		     j += mp->m_sb.sb_inopblock)  {
+			agbno = XFS_AGINO_TO_AGBNO(mp, ino + j);
+			state = get_agbno_state(mp, agno, agbno);
+			if (state == XR_E_UNKNOWN)  {
+				set_agbno_state(mp, agno, agbno, XR_E_INO);
+			} else if (state == XR_E_INUSE_FS && agno == 0 &&
+				   ino + j >= first_prealloc_ino &&
+				   ino + j < last_prealloc_ino)  {
+				set_agbno_state(mp, agno, agbno, XR_E_INO);
+			} else  {
+				do_warn(
+_("inode chunk claims used block, inobt block - agno %d, bno %d, inopb %d\n"),
+					agno, agbno,
+				mp->m_sb.sb_inopblock);
+				/*
+				 * XXX - maybe should mark
+				 * block a duplicate
+				 */
+				return ++suspect;
+			}
+		}
+	}
+
+	/*
+	 * ensure only one avl entry per chunk
+	 */
+	find_inode_rec_range(agno, ino, ino + XFS_INODES_PER_CHUNK,
+			     &first_rec, &last_rec);
+	if (first_rec != NULL)  {
+		/*
+		 * this chunk overlaps with one (or more)
+		 * already in the tree
+		 */
+		do_warn(
+_("inode rec for ino %llu (%d/%d) overlaps existing rec (start %d/%d)\n"),
+			lino, agno, ino, agno, first_rec->ino_startnum);
+		suspect++;
+
+		/*
+		 * if the 2 chunks start at the same place,
+		 * then we don't have to put this one
+		 * in the uncertain list.  go to the next one.
+		 */
+		if (first_rec->ino_startnum == ino)
+			return suspect;
+	}
+
+	nfree = 0;
+
+	/*
+	 * now mark all the inodes as existing and free or used.
+	 * if the tree is suspect, put them into the uncertain
+	 * inode tree.
+	 */
+	if (!suspect)  {
+		if (XFS_INOBT_IS_FREE_DISK(rp, 0)) {
+			nfree++;
+			ino_rec = set_inode_free_alloc(agno, ino);
+		} else  {
+			ino_rec = set_inode_used_alloc(agno, ino);
+		}
+		for (j = 1; j < XFS_INODES_PER_CHUNK; j++) {
+			if (XFS_INOBT_IS_FREE_DISK(rp, j)) {
+				nfree++;
+				set_inode_free(ino_rec, j);
+			} else  {
+				set_inode_used(ino_rec, j);
+			}
+		}
+	} else  {
+		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
+			if (XFS_INOBT_IS_FREE_DISK(rp, j)) {
+				nfree++;
+				add_aginode_uncertain(agno, ino + j, 1);
+			} else  {
+				add_aginode_uncertain(agno, ino + j, 0);
+			}
+		}
+	}
+
+	if (nfree != be32_to_cpu(rp->ir_freecount)) {
+		do_warn(_("ir_freecount/free mismatch, inode "
+			"chunk %d/%d, freecount %d nfree %d\n"),
+			agno, ino, be32_to_cpu(rp->ir_freecount), nfree);
+	}
+
+	return suspect;
+}
+
+
 /*
  * this one walks the inode btrees sucking the info there into
  * the incore avl tree.  We try and rescue corrupted btree records
@@ -651,18 +812,11 @@ scanfunc_ino(
 	int			isroot
 	)
 {
-	xfs_ino_t		lino;
 	int			i;
-	xfs_agino_t		ino;
-	xfs_agblock_t		agbno;
-	int			j;
-	int			nfree;
-	int			off;
 	int			numrecs;
 	int			state;
 	xfs_inobt_ptr_t		*pp;
 	xfs_inobt_rec_t		*rp;
-	ino_tree_node_t		*ino_rec, *first_rec, *last_rec;
 	int			hdr_errors;
 
 	hdr_errors = 0;
@@ -737,165 +891,8 @@ _("inode btree block claimed (state %d),
 		 * of INODES_PER_CHUNK (64) inodes.  off is the offset into
 		 * the block.  skip processing of bogus records.
 		 */
-		for (i = 0; i < numrecs; i++) {
-			ino = be32_to_cpu(rp[i].ir_startino);
-			off = XFS_AGINO_TO_OFFSET(mp, ino);
-			agbno = XFS_AGINO_TO_AGBNO(mp, ino);
-			lino = XFS_AGINO_TO_INO(mp, agno, ino);
-			/*
-			 * on multi-block block chunks, all chunks start
-			 * at the beginning of the block.  with multi-chunk
-			 * blocks, all chunks must start on 64-inode boundaries
-			 * since each block can hold N complete chunks. if
-			 * fs has aligned inodes, all chunks must start
-			 * at a fs_ino_alignment*N'th agbno.  skip recs
-			 * with badly aligned starting inodes.
-			 */
-			if (ino == 0 ||
-			    (inodes_per_block <= XFS_INODES_PER_CHUNK &&
-			     off !=  0) ||
-			    (inodes_per_block > XFS_INODES_PER_CHUNK &&
-			     off % XFS_INODES_PER_CHUNK != 0) ||
-			    (fs_aligned_inodes &&
-			     agbno % fs_ino_alignment != 0))  {
-				do_warn(
-			_("badly aligned inode rec (starting inode = %llu)\n"),
-					lino);
-				suspect++;
-			}
-
-			/*
-			 * verify numeric validity of inode chunk first
-			 * before inserting into a tree.  don't have to
-			 * worry about the overflow case because the
-			 * starting ino number of a chunk can only get
-			 * within 255 inodes of max (NULLAGINO).  if it
-			 * gets closer, the agino number will be illegal
-			 * as the agbno will be too large.
-			 */
-			if (verify_aginum(mp, agno, ino))  {
-				do_warn(
-_("bad starting inode # (%llu (0x%x 0x%x)) in ino rec, skipping rec\n"),
-					lino, agno, ino);
-				suspect++;
-				continue;
-			}
-
-			if (verify_aginum(mp, agno,
-					ino + XFS_INODES_PER_CHUNK - 1))  {
-				do_warn(
-_("bad ending inode # (%llu (0x%x 0x%x)) in ino rec, skipping rec\n"),
-					lino + XFS_INODES_PER_CHUNK - 1,
-					agno, ino + XFS_INODES_PER_CHUNK - 1);
-				suspect++;
-				continue;
-			}
-
-			/*
-			 * set state of each block containing inodes
-			 */
-			if (off == 0 && !suspect)  {
-				for (j = 0;
-				     j < XFS_INODES_PER_CHUNK;
-				     j += mp->m_sb.sb_inopblock)  {
-					agbno = XFS_AGINO_TO_AGBNO(mp, ino + j);
-					state = get_agbno_state(mp,
-							agno, agbno);
-
-					if (state == XR_E_UNKNOWN)  {
-						set_agbno_state(mp, agno,
-							agbno, XR_E_INO);
-					} else if (state == XR_E_INUSE_FS &&
-						agno == 0 &&
-						ino + j >= first_prealloc_ino &&
-						ino + j < last_prealloc_ino)  {
-						set_agbno_state(mp, agno,
-							agbno, XR_E_INO);
-					} else  {
-						do_warn(
-_("inode chunk claims used block, inobt block - agno %d, bno %d, inopb %d\n"),
-							agno, bno,
-							mp->m_sb.sb_inopblock);
-						suspect++;
-						/*
-						 * XXX - maybe should mark
-						 * block a duplicate
-						 */
-						continue;
-					}
-				}
-			}
-			/*
-			 * ensure only one avl entry per chunk
-			 */
-			find_inode_rec_range(agno, ino,
-					ino + XFS_INODES_PER_CHUNK,
-					&first_rec,
-					&last_rec);
-			if (first_rec != NULL)  {
-				/*
-				 * this chunk overlaps with one (or more)
-				 * already in the tree
-				 */
-				do_warn(
-_("inode rec for ino %llu (%d/%d) overlaps existing rec (start %d/%d)\n"),
-					lino, agno, ino,
-					agno, first_rec->ino_startnum);
-				suspect++;
-
-				/*
-				 * if the 2 chunks start at the same place,
-				 * then we don't have to put this one
-				 * in the uncertain list.  go to the next one.
-				 */
-				if (first_rec->ino_startnum == ino)
-					continue;
-			}
-
-			nfree = 0;
-
-			/*
-			 * now mark all the inodes as existing and free or used.
-			 * if the tree is suspect, put them into the uncertain
-			 * inode tree.
-			 */
-			if (!suspect)  {
-				if (XFS_INOBT_IS_FREE_DISK(&rp[i], 0)) {
-					nfree++;
-					ino_rec = set_inode_free_alloc(agno,
-									ino);
-				} else  {
-					ino_rec = set_inode_used_alloc(agno,
-									ino);
-				}
-				for (j = 1; j < XFS_INODES_PER_CHUNK; j++) {
-					if (XFS_INOBT_IS_FREE_DISK(&rp[i], j)) {
-						nfree++;
-						set_inode_free(ino_rec, j);
-					} else  {
-						set_inode_used(ino_rec, j);
-					}
-				}
-			} else  {
-				for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
-					if (XFS_INOBT_IS_FREE_DISK(&rp[i], j)) {
-						nfree++;
-						add_aginode_uncertain(agno,
-								ino + j, 1);
-					} else  {
-						add_aginode_uncertain(agno,
-								ino + j, 0);
-					}
-				}
-			}
-
-			if (nfree != be32_to_cpu(rp[i].ir_freecount)) {
-				do_warn(_("ir_freecount/free mismatch, inode "
-					"chunk %d/%d, freecount %d nfree %d\n"),
-					agno, ino,
-					be32_to_cpu(rp[i].ir_freecount), nfree);
-			}
-		}
+		for (i = 0; i < numrecs; i++)
+			suspect = scan_single_ino_chunk(agno, &rp[i], suspect);
 
 		if (suspect)
 			bad_ino_btree = 1;

* [PATCH 05/14] repair: reduce byte swapping in scan_freelist
From: Christoph Hellwig @ 2009-09-02 17:55 UTC
  To: xfs; +Cc: Barry Naujok

Store the AG number in a local native-endian variable to avoid
byte-swapping it over and over again.


Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/scan.c
===================================================================
--- xfsprogs-dev.orig/repair/scan.c	2009-08-21 19:03:26.000000000 +0000
+++ xfsprogs-dev/repair/scan.c	2009-08-21 19:05:32.000000000 +0000
@@ -943,23 +943,26 @@ scan_freelist(
 {
 	xfs_agfl_t	*agfl;
 	xfs_buf_t	*agflbuf;
+	xfs_agnumber_t	agno;
 	xfs_agblock_t	bno;
 	int		count;
 	int		i;
 
+	agno = be32_to_cpu(agf->agf_seqno);
+
 	if (XFS_SB_BLOCK(mp) != XFS_AGFL_BLOCK(mp) &&
-			XFS_AGF_BLOCK(mp) != XFS_AGFL_BLOCK(mp) &&
-			XFS_AGI_BLOCK(mp) != XFS_AGFL_BLOCK(mp))
-		set_agbno_state(mp, be32_to_cpu(agf->agf_seqno),
-				XFS_AGFL_BLOCK(mp), XR_E_FS_MAP);
+	    XFS_AGF_BLOCK(mp) != XFS_AGFL_BLOCK(mp) &&
+	    XFS_AGI_BLOCK(mp) != XFS_AGFL_BLOCK(mp))
+		set_agbno_state(mp, agno, XFS_AGFL_BLOCK(mp), XR_E_FS_MAP);
+
 	if (be32_to_cpu(agf->agf_flcount) == 0)
 		return;
-	agflbuf = libxfs_readbuf(mp->m_dev, XFS_AG_DADDR(mp,
-				be32_to_cpu(agf->agf_seqno),
-				XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0);
+
+	agflbuf = libxfs_readbuf(mp->m_dev,
+				 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+				 XFS_FSS_TO_BB(mp, 1), 0);
 	if (!agflbuf)  {
-		do_abort(_("can't read agfl block for ag %d\n"),
-			be32_to_cpu(agf->agf_seqno));
+		do_abort(_("can't read agfl block for ag %d\n"), agno);
 		return;
 	}
 	agfl = XFS_BUF_TO_AGFL(agflbuf);
@@ -967,12 +970,11 @@ scan_freelist(
 	count = 0;
 	for (;;) {
 		bno = be32_to_cpu(agfl->agfl_bno[i]);
-		if (verify_agbno(mp, be32_to_cpu(agf->agf_seqno), bno))
-			set_agbno_state(mp, be32_to_cpu(agf->agf_seqno),
-					bno, XR_E_FREE);
+		if (verify_agbno(mp, agno, bno))
+			set_agbno_state(mp, agno, bno, XR_E_FREE);
 		else
 			do_warn(_("bad agbno %u in agfl, agno %d\n"),
-				bno, be32_to_cpu(agf->agf_seqno));
+				bno, agno);
 		count++;
 		if (i == be32_to_cpu(agf->agf_fllast))
 			break;
@@ -981,8 +983,7 @@ scan_freelist(
 	}
 	if (count != be32_to_cpu(agf->agf_flcount)) {
 		do_warn(_("freeblk count %d != flcount %d in ag %d\n"), count,
-			be32_to_cpu(agf->agf_flcount),
-			be32_to_cpu(agf->agf_seqno));
+			be32_to_cpu(agf->agf_flcount), agno);
 	}
 	libxfs_putbuf(agflbuf);
 }

* [PATCH 06/14] repair: use a btree instead of a radix tree for the prefetch queue
From: Christoph Hellwig @ 2009-09-02 17:55 UTC
  To: xfs; +Cc: Barry Naujok

Currently xfs_repair manages its prefetch queue with a radix tree
implementation derived from the Linux kernel one.

The radix tree implementation is not very memory efficient for sparse
indices, so replace it with a btree implementation that is much more
efficient.  This is not that important for the prefetch queue itself,
but it will be very important for the following memory optimization
patches, which need a tree to store very sparse data such as the block
map - and we do not want to deal with two tree implementations (or
rather three, given that we still have avl.c around).
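
A quick usage sketch of the new interface (the functions are the ones
defined in btree.c below; the key and payload values are made up for
illustration):

#include <stdio.h>
#include "btree.h"

int
main(void)
{
	struct btree_root	*tree;
	unsigned long		key;
	void			*value;
	static int		payload = 42;

	btree_init(&tree);

	/* keys are unsigned longs, values are opaque non-NULL pointers */
	if (btree_insert(tree, 100, &payload))	/* errno-style return */
		return 1;

	value = btree_lookup(tree, 100);	/* exact-match lookup */

	/* ordered walk: find the first key >= 0, then step forward */
	for (value = btree_find(tree, 0, &key);
	     value != NULL;
	     value = btree_lookup_next(tree, &key))
		printf("key %lu -> %p\n", key, value);

	btree_destroy(tree);
	return 0;
}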

Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/Makefile
===================================================================
--- xfsprogs-dev.orig/repair/Makefile	2009-08-20 00:01:58.000000000 +0000
+++ xfsprogs-dev/repair/Makefile	2009-08-20 00:06:43.000000000 +0000
@@ -9,15 +9,15 @@
 
 LTCOMMAND = xfs_repair
 
-HFILES = agheader.h attr_repair.h avl.h avl64.h bmap.h dinode.h dir.h \
-	dir2.h err_protos.h globals.h incore.h protos.h rt.h \
-	progress.h scan.h versions.h prefetch.h radix-tree.h threads.h
+HFILES = agheader.h attr_repair.h avl.h avl64.h bmap.h btree.h \
+	dinode.h dir.h dir2.h err_protos.h globals.h incore.h protos.h rt.h \
+	progress.h scan.h versions.h prefetch.h threads.h
 
-CFILES = agheader.c attr_repair.c avl.c avl64.c bmap.c dino_chunks.c \
-	dinode.c dir.c dir2.c globals.c incore.c \
+CFILES = agheader.c attr_repair.c avl.c avl64.c bmap.c btree.c \
+	dino_chunks.c dinode.c dir.c dir2.c globals.c incore.c \
 	incore_bmc.c init.c incore_ext.c incore_ino.c phase1.c \
 	phase2.c phase3.c phase4.c phase5.c phase6.c phase7.c \
-	progress.c prefetch.c radix-tree.c rt.c sb.c scan.c threads.c \
+	progress.c prefetch.c rt.c sb.c scan.c threads.c \
 	versions.c xfs_repair.c
 
 LLDLIBS = $(LIBXFS) $(LIBXLOG) $(LIBUUID) $(LIBRT) $(LIBPTHREAD)
Index: xfsprogs-dev/repair/btree.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ xfsprogs-dev/repair/btree.c	2009-08-20 00:06:44.000000000 +0000
@@ -0,0 +1,1234 @@
+/*
+ * Copyright (c) 2007, Silicon Graphics, Inc. Barry Naujok <bnaujok@sgi.com>
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <libxfs.h>
+#include "btree.h"
+
+
+#define BTREE_KEY_MAX		7
+#define BTREE_KEY_MIN		(BTREE_KEY_MAX / 2)
+
+#define BTREE_PTR_MAX		(BTREE_KEY_MAX + 1)
+
+struct btree_node {
+	unsigned long		num_keys;
+	unsigned long		keys[BTREE_KEY_MAX];
+	struct btree_node *	ptrs[BTREE_PTR_MAX];
+};
+
+struct btree_cursor {
+	struct btree_node	*node;
+	int			index;
+};
+
+struct btree_root {
+	struct btree_node	*root_node;
+	struct btree_cursor	*cursor;	/* track path to end leaf */
+	int			height;
+	/* lookup cache */
+	int			keys_valid;	/* set if the cache is valid */
+	unsigned long		cur_key;
+	unsigned long		next_key;
+	void			*next_value;
+	unsigned long		prev_key;
+	void			*prev_value;
+#ifdef BTREE_STATS
+	struct btree_stats {
+		unsigned long	num_items;
+		unsigned long	max_items;
+		int		alloced;
+		int		cache_hits;
+		int		cache_misses;
+		int		lookup;
+		int		find;
+		int		key_update;
+		int		value_update;
+		int		insert;
+		int		delete;
+		int		inc_height;
+		int		dec_height;
+		int		shift_prev;
+		int		shift_next;
+		int		split;
+		int		merge_prev;
+		int		merge_next;
+		int		balance_prev;
+		int		balance_next;
+	} stats;
+#endif
+};
+
+
+static struct btree_node *
+btree_node_alloc(void)
+{
+	return calloc(1, sizeof(struct btree_node));
+}
+
+static void
+btree_node_free(
+	struct btree_node 	*node)
+{
+	free(node);
+}
+
+static void
+btree_free_nodes(
+	struct btree_node	*node,
+	int			level)
+{
+	int			i;
+
+	if (level)
+		for (i = 0; i <= node->num_keys; i++)
+			btree_free_nodes(node->ptrs[i], level - 1);
+	btree_node_free(node);
+}
+
+static void
+__btree_init(
+	struct btree_root	*root)
+{
+	memset(root, 0, sizeof(struct btree_root));
+	root->height = 1;
+	root->cursor = calloc(1, sizeof(struct btree_cursor));
+	root->root_node = btree_node_alloc();
+	ASSERT(root->root_node);
+#ifdef BTREE_STATS
+	root->stats.max_items = 1;
+	root->stats.alloced += 1;
+#endif
+}
+
+static void
+__btree_free(
+	struct btree_root	*root)
+{
+	btree_free_nodes(root->root_node, root->height - 1);
+	free(root->cursor);
+	root->height = 0;
+	root->cursor = NULL;
+	root->root_node = NULL;
+}
+
+void
+btree_init(
+	struct btree_root	**root)
+{
+	*root = calloc(1, sizeof(struct btree_root));
+	__btree_init(*root);
+}
+
+void
+btree_clear(
+	struct btree_root	*root)
+{
+	__btree_free(root);
+	__btree_init(root);
+}
+
+void
+btree_destroy(
+	struct btree_root	*root)
+{
+	__btree_free(root);
+	free(root);
+}
+
+int
+btree_is_empty(
+	struct btree_root	*root)
+{
+	return root->root_node->num_keys == 0;
+}
+
+static inline void
+btree_invalidate_cursor(
+	struct btree_root	*root)
+{
+	root->cursor[0].node = NULL;
+	root->keys_valid = 0;
+}
+
+static inline unsigned long
+btree_key_of_cursor(
+	struct btree_cursor	*cursor,
+	int			height)
+{
+	while (cursor->node->num_keys == cursor->index && --height > 0)
+		cursor++;
+	return cursor->node->keys[cursor->index];
+}
+
+static void *
+btree_get_prev(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	struct btree_cursor	*cur = root->cursor;
+	int			level = 0;
+	struct btree_node	*node;
+
+	if (cur->index > 0) {
+		if (key)
+			*key = cur->node->keys[cur->index - 1];
+		return cur->node->ptrs[cur->index - 1];
+	}
+
+	/* else need to go up and back down the tree to find the previous */
+
+	while (cur->index == 0) {
+		if (++level == root->height)
+			return NULL;
+		cur++;
+	}
+
+	/* the key is in the current level */
+	if (key)
+		*key = cur->node->keys[cur->index - 1];
+
+	/* descend back down the right side to get the pointer */
+	node = cur->node->ptrs[cur->index - 1];
+	while (level--)
+		node = node->ptrs[node->num_keys];
+	return node;
+}
+
+static void *
+btree_get_next(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	struct btree_cursor	*cur = root->cursor;
+	int			level = 0;
+	struct btree_node	*node;
+
+	while (cur->index == cur->node->num_keys) {
+		if (++level == root->height)
+			return NULL;
+		cur++;
+	}
+	if (level == 0) {
+		if (key) {
+			cur->index++;
+			*key = btree_key_of_cursor(cur, root->height);
+			cur->index--;
+		}
+		return cur->node->ptrs[cur->index + 1];
+	}
+
+	node = cur->node->ptrs[cur->index + 1];
+	while (--level > 0)
+		node = node->ptrs[0];
+	if (key)
+		*key = node->keys[0];
+	return node->ptrs[0];
+}
+
+/*
+ * Lookup/Search functions
+ */
+
+static int
+btree_do_search(
+	struct btree_root	*root,
+	unsigned long		key)
+{
+	unsigned long		k = 0;
+	struct btree_cursor	*cur = root->cursor + root->height;
+	struct btree_node	*node = root->root_node;
+	int			height = root->height;
+	int			key_found = 0;
+	int			i;
+
+	while (--height >= 0) {
+		cur--;
+		for (i = 0; i < node->num_keys; i++)
+			if (node->keys[i] >= key) {
+				k = node->keys[i];
+				key_found = 1;
+				break;
+			}
+		cur->node = node;
+		cur->index = i;
+		node = node->ptrs[i];
+	}
+	root->keys_valid = key_found;
+	if (!key_found)
+		return 0;
+
+	root->cur_key = k;
+	root->next_value = NULL;	/* do on-demand next value lookup */
+	root->prev_value = btree_get_prev(root, &root->prev_key);
+	return 1;
+}
+
+static int
+btree_search(
+	struct btree_root	*root,
+	unsigned long		key)
+{
+	if (root->keys_valid && key <= root->cur_key &&
+				(!root->prev_value || key > root->prev_key)) {
+#ifdef BTREE_STATS
+		root->stats.cache_hits++;
+#endif
+		return 1;
+	}
+#ifdef BTREE_STATS
+	root->stats.cache_misses++;
+#endif
+	return btree_do_search(root, key);
+}
+
+void *
+btree_find(
+	struct btree_root	*root,
+	unsigned long		key,
+	unsigned long		*actual_key)
+{
+#ifdef BTREE_STATS
+	root->stats.find += 1;
+#endif
+	if (!btree_search(root, key))
+		return NULL;
+
+	if (actual_key)
+		*actual_key = root->cur_key;
+	return root->cursor->node->ptrs[root->cursor->index];
+}
+
+void *
+btree_lookup(
+	struct btree_root	*root,
+	unsigned long		key)
+{
+#ifdef BTREE_STATS
+	root->stats.lookup += 1;
+#endif
+	if (!btree_search(root, key) || root->cur_key != key)
+		return NULL;
+	return root->cursor->node->ptrs[root->cursor->index];
+}
+
+void *
+btree_peek_prev(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	if (!root->keys_valid)
+		return NULL;
+	if (key)
+		*key = root->prev_key;
+	return root->prev_value;
+}
+
+void *
+btree_peek_next(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	if (!root->keys_valid)
+		return NULL;
+	if (!root->next_value)
+		root->next_value = btree_get_next(root, &root->next_key);
+	if (key)
+		*key = root->next_key;
+	return root->next_value;
+}
+
+static void *
+btree_move_cursor_to_next(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	struct btree_cursor	*cur = root->cursor;
+	int			level = 0;
+
+	while (cur->index == cur->node->num_keys) {
+		if (++level == root->height)
+			return NULL;
+		cur++;
+	}
+	cur->index++;
+	if (level == 0) {
+		if (key)
+			*key = btree_key_of_cursor(cur, root->height);
+		return cur->node->ptrs[cur->index];
+	}
+
+	while (--level >= 0) {
+		root->cursor[level].node = cur->node->ptrs[cur->index];
+		root->cursor[level].index = 0;
+		cur--;
+	}
+	if (key)
+		*key = cur->node->keys[0];
+	return cur->node->ptrs[0];
+}
+
+void *
+btree_lookup_next(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	void			*value;
+
+	if (!root->keys_valid)
+		return NULL;
+
+	root->prev_key = root->cur_key;
+	root->prev_value = root->cursor->node->ptrs[root->cursor->index];
+
+	value = btree_move_cursor_to_next(root, &root->cur_key);
+	if (!value) {
+		btree_invalidate_cursor(root);
+		return NULL;
+ 	}
+ 	root->next_value = NULL;	/* on-demand next value fetch */
+	if (key)
+		*key = root->cur_key;
+	return value;
+}
+
+static void *
+btree_move_cursor_to_prev(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	struct btree_cursor	*cur = root->cursor;
+	int			level = 0;
+
+	while (cur->index == 0) {
+		if (++level == root->height)
+			return NULL;
+		cur++;
+	}
+	cur->index--;
+	if (key)	/* the key is in the current level */
+		*key = cur->node->keys[cur->index];
+	while (level > 0) {
+		level--;
+		root->cursor[level].node = cur->node->ptrs[cur->index];
+		root->cursor[level].index = root->cursor[level].node->num_keys;
+		cur--;
+	}
+	return cur->node->ptrs[cur->index];
+}
+
+void *
+btree_lookup_prev(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	void			*value;
+
+	if (!root->keys_valid)
+		return NULL;
+
+	value = btree_move_cursor_to_prev(root, &root->cur_key);
+	if (!value)
+		return NULL;
+	root->prev_value = btree_get_prev(root, &root->prev_key);
+ 	root->next_value = NULL;	/* on-demand next value fetch */
+	if (key)
+		*key = root->cur_key;
+	return value;
+}
+
+void *
+btree_uncached_lookup(
+	struct btree_root	*root,
+	unsigned long		key)
+{
+	/* cursor-less (ie. uncached) lookup */
+	int			height = root->height - 1;
+	struct btree_node	*node = root->root_node;
+	int			i;
+	int			key_found = 0;
+
+	while (height >= 0) {
+		for (i = 0; i < node->num_keys; i++)
+			if (node->keys[i] >= key) {
+				key_found = node->keys[i] == key;
+				break;
+			}
+		node = node->ptrs[i];
+		height--;
+	}
+	return key_found ? node : NULL;
+}
+
+/* Update functions */
+
+static inline void
+btree_update_node_key(
+	struct btree_root	*root,
+	struct btree_cursor	*cursor,
+	int			level,
+	unsigned long		new_key)
+{
+	int			i;
+
+#ifdef BTREE_STATS
+	root->stats.key_update += 1;
+#endif
+
+	cursor += level;
+	for (i = level; i < root->height; i++) {
+		if (cursor->index < cursor->node->num_keys) {
+			cursor->node->keys[cursor->index] = new_key;
+			break;
+		}
+		cursor++;
+	}
+}
+
+int
+btree_update_key(
+	struct btree_root	*root,
+	unsigned long		old_key,
+	unsigned long		new_key)
+{
+	if (!btree_search(root, old_key) || root->cur_key != old_key)
+		return ENOENT;
+
+	if (root->next_value && new_key >= root->next_key)
+		return EINVAL;
+
+	if (root->prev_value && new_key <= root->prev_key)
+		return EINVAL;
+
+	btree_update_node_key(root, root->cursor, 0, new_key);
+
+	return 0;
+}
+
+int
+btree_update_value(
+	struct btree_root	*root,
+	unsigned long		key,
+	void			*new_value)
+{
+	if (!new_value)
+		return EINVAL;
+
+	if (!btree_search(root, key) || root->cur_key != key)
+		return ENOENT;
+
+#ifdef BTREE_STATS
+	root->stats.value_update += 1;
+#endif
+	root->cursor->node->ptrs[root->cursor->index] = new_value;
+
+	return 0;
+}
+
+/*
+ * Cursor modification functions - used for inserting and deleting
+ */
+
+static struct btree_cursor *
+btree_copy_cursor_prev(
+	struct btree_root	*root,
+	struct btree_cursor	*dest_cursor,
+	int			level)
+{
+	struct btree_cursor	*src_cur = root->cursor + level;
+	struct btree_cursor	*dst_cur;
+	int			l = level;
+	int			i;
+
+	if (level >= root->height)
+		return NULL;
+
+	while (src_cur->index == 0) {
+		if (++l >= root->height)
+			return NULL;
+		src_cur++;
+	}
+	for (i = l; i < root->height; i++)
+		dest_cursor[i] = *src_cur++;
+
+	dst_cur = dest_cursor + l;
+	dst_cur->index--;
+	while (l-- >= level) {
+		dest_cursor[l].node = dst_cur->node->ptrs[dst_cur->index];
+		dest_cursor[l].index = dest_cursor[l].node->num_keys;
+		dst_cur--;
+	}
+	return dest_cursor;
+}
+
+static struct btree_cursor *
+btree_copy_cursor_next(
+	struct btree_root	*root,
+	struct btree_cursor	*dest_cursor,
+	int			level)
+{
+	struct btree_cursor	*src_cur = root->cursor + level;
+	struct btree_cursor	*dst_cur;
+	int			l = level;
+	int			i;
+
+	if (level >= root->height)
+		return NULL;
+
+	while (src_cur->index == src_cur->node->num_keys) {
+		if (++l >= root->height)
+			return NULL;
+		src_cur++;
+	}
+	for (i = l; i < root->height; i++)
+		dest_cursor[i] = *src_cur++;
+
+	dst_cur = dest_cursor + l;
+	dst_cur->index++;
+	while (l-- >= level) {
+		dest_cursor[l].node = dst_cur->node->ptrs[dst_cur->index];
+		dest_cursor[l].index = 0;
+		dst_cur--;
+	}
+	return dest_cursor;
+}
+
+/*
+ * Shift functions
+ *
+ * Tries to move items in the current leaf to its sibling if it has space.
+ * Used in both insert and delete functions.
+ * Returns the number of items shifted.
+ */
+
+static int
+btree_shift_to_prev(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*prev_cursor,
+	int			num_children)
+{
+	struct btree_node	*node;
+	struct btree_node	*prev_node;
+	int			num_remain;	/* # of keys left in "node" */
+	unsigned long		key;
+	int			i;
+
+	if (!prev_cursor || !num_children)
+		return 0;
+
+	prev_node = prev_cursor[level].node;
+	node = root->cursor[level].node;
+
+	ASSERT(num_children > 0 && num_children <= node->num_keys + 1);
+
+	if ((prev_node->num_keys + num_children) > BTREE_KEY_MAX)
+		return 0;
+
+#ifdef BTREE_STATS
+	root->stats.shift_prev += 1;
+#endif
+
+	num_remain = node->num_keys - num_children;
+	ASSERT(num_remain == -1 || num_remain >= BTREE_KEY_MIN);
+
+	/* shift parent keys around */
+	level++;
+	if (num_remain > 0)
+		key = node->keys[num_children - 1];
+	else
+		key = btree_key_of_cursor(root->cursor + level,
+						root->height - level);
+	while (prev_cursor[level].index == prev_cursor[level].node->num_keys) {
+		level++;
+		ASSERT(level < root->height);
+	}
+	prev_node->keys[prev_node->num_keys] =
+			prev_cursor[level].node->keys[prev_cursor[level].index];
+	prev_cursor[level].node->keys[prev_cursor[level].index] = key;
+
+	/* copy pointers and keys to the end of the prev node */
+	for (i = 0; i < num_children - 1; i++) {
+		prev_node->keys[prev_node->num_keys + 1 + i] = node->keys[i];
+		prev_node->ptrs[prev_node->num_keys + 1 + i] = node->ptrs[i];
+	}
+	prev_node->ptrs[prev_node->num_keys + 1 + i] = node->ptrs[i];
+	prev_node->num_keys += num_children;
+
+	/* move remaining pointers/keys to start of node */
+	if (num_remain >= 0) {
+		for (i = 0; i < num_remain; i++) {
+			node->keys[i] = node->keys[num_children + i];
+			node->ptrs[i] = node->ptrs[num_children + i];
+		}
+		node->ptrs[i] = node->ptrs[num_children + i];
+		node->num_keys = num_remain;
+	} else
+		node->num_keys = 0;
+
+	return num_children;
+}
+
+static int
+btree_shift_to_next(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*next_cursor,
+	int			num_children)
+{
+	struct btree_node	*node;
+	struct btree_node	*next_node;
+	int			num_remain;	/* # of children left in node */
+	int			i;
+
+	if (!next_cursor || !num_children)
+		return 0;
+
+	node = root->cursor[level].node;
+	next_node = next_cursor[level].node;
+
+	ASSERT(num_children > 0 && num_children <= node->num_keys + 1);
+
+	if ((next_node->num_keys + num_children) > BTREE_KEY_MAX)
+		return 0;
+
+	num_remain = node->num_keys + 1 - num_children;
+	ASSERT(num_remain == 0 || num_remain > BTREE_KEY_MIN);
+
+#ifdef BTREE_STATS
+	root->stats.shift_next += 1;
+#endif
+
+	/* make space for "num_children" items at beginning of next-leaf */
+	i = next_node->num_keys;
+	next_node->ptrs[num_children + i] = next_node->ptrs[i];
+	while (--i >= 0) {
+		next_node->keys[num_children + i] = next_node->keys[i];
+		next_node->ptrs[num_children + i] = next_node->ptrs[i];
+	}
+
+	/* update keys in parent and next node from parent */
+	do {
+		level++;
+		ASSERT(level < root->height);
+	} while (root->cursor[level].index == root->cursor[level].node->num_keys);
+
+	next_node->keys[num_children - 1] =
+		root->cursor[level].node->keys[root->cursor[level].index];
+	root->cursor[level].node->keys[root->cursor[level].index] =
+		node->keys[node->num_keys - num_children];
+
+	/* copy last "num_children" items from node into start of next-node */
+	for (i = 0; i < num_children - 1; i++) {
+		next_node->keys[i] = node->keys[num_remain + i];
+		next_node->ptrs[i] = node->ptrs[num_remain + i];
+	}
+	next_node->ptrs[i] = node->ptrs[num_remain + i];
+	next_node->num_keys += num_children;
+
+	if (num_remain > 0)
+		node->num_keys -= num_children;
+	else
+		node->num_keys = 0;
+
+	return num_children;
+}
+
+/*
+ * Insertion functions
+ */
+
+static struct btree_node *
+btree_increase_height(
+	struct btree_root	*root)
+{
+	struct btree_node	*new_root;
+	struct btree_cursor	*new_cursor;
+
+	new_cursor = realloc(root->cursor, (root->height + 1) *
+				sizeof(struct btree_cursor));
+	if (!new_cursor)
+		return NULL;
+	root->cursor = new_cursor;
+
+	new_root = btree_node_alloc();
+	if (!new_root)
+		return NULL;
+
+#ifdef BTREE_STATS
+	root->stats.alloced += 1;
+	root->stats.inc_height += 1;
+	root->stats.max_items *= BTREE_PTR_MAX;
+#endif
+
+	new_root->ptrs[0] = root->root_node;
+	root->root_node = new_root;
+
+	root->cursor[root->height].node = new_root;
+	root->cursor[root->height].index = 0;
+
+	root->height++;
+
+	return new_root;
+}
+
+static int
+btree_insert_item(
+	struct btree_root	*root,
+	int			level,
+	unsigned long		key,
+	void			*value);
+
+
+static struct btree_node *
+btree_split(
+	struct btree_root	*root,
+	int			level,
+	unsigned long		key,
+	int			*index)
+{
+	struct btree_node	*node = root->cursor[level].node;
+	struct btree_node	*new_node;
+	int			i;
+
+	new_node = btree_node_alloc();
+	if (!new_node)
+		return NULL;
+
+	if (btree_insert_item(root, level + 1, node->keys[BTREE_KEY_MIN],
+							new_node) != 0) {
+		btree_node_free(new_node);
+		return NULL;
+	}
+
+#ifdef BTREE_STATS
+	root->stats.alloced += 1;
+	root->stats.split += 1;
+#endif
+
+	for (i = 0; i < BTREE_KEY_MAX - BTREE_KEY_MIN - 1; i++) {
+		new_node->keys[i] = node->keys[BTREE_KEY_MIN + 1 + i];
+		new_node->ptrs[i] = node->ptrs[BTREE_KEY_MIN + 1 + i];
+	}
+	new_node->ptrs[i] = node->ptrs[BTREE_KEY_MIN + 1 + i];
+	new_node->num_keys = BTREE_KEY_MAX - BTREE_KEY_MIN - 1;
+
+	node->num_keys = BTREE_KEY_MIN;
+	if (key < node->keys[BTREE_KEY_MIN])
+		return node;	/* index doesn't change */
+
+	/* insertion point is in new node... */
+	*index -= BTREE_KEY_MIN + 1;
+	return new_node;
+}
+
+static int
+btree_insert_shift_to_prev(
+	struct btree_root	*root,
+	int			level,
+	int			*index)
+{
+	struct btree_cursor	tmp_cursor[root->height];
+	int			n;
+
+	if (*index <= 0)
+		return -1;
+
+	if (!btree_copy_cursor_prev(root, tmp_cursor, level + 1))
+		return -1;
+
+	n = MIN(*index, (BTREE_PTR_MAX - tmp_cursor[level].node->num_keys) / 2);
+	if (!n || !btree_shift_to_prev(root, level, tmp_cursor, n))
+		return -1;
+
+	*index -= n;
+	return 0;
+}
+
+static int
+btree_insert_shift_to_next(
+	struct btree_root	*root,
+	int			level,
+	int			*index)
+{
+	struct btree_cursor	tmp_cursor[root->height];
+	int			n;
+
+	if (*index >= BTREE_KEY_MAX)
+		return -1;
+
+	if (!btree_copy_cursor_next(root, tmp_cursor, level + 1))
+		return -1;
+
+	n = MIN(BTREE_KEY_MAX - *index,
+		(BTREE_PTR_MAX - tmp_cursor[level].node->num_keys) / 2);
+	if (!n || !btree_shift_to_next(root, level, tmp_cursor, n))
+		return -1;
+	return 0;
+}
+
+static int
+btree_insert_item(
+	struct btree_root	*root,
+	int			level,
+	unsigned long		key,
+	void			*value)
+{
+	struct btree_node	*node = root->cursor[level].node;
+	int			index = root->cursor[level].index;
+	int			i;
+
+	if (node->num_keys == BTREE_KEY_MAX) {
+		if (btree_insert_shift_to_prev(root, level, &index) == 0)
+			goto insert;
+		if (btree_insert_shift_to_next(root, level, &index) == 0)
+			goto insert;
+		if (level == root->height - 1) {
+			if (!btree_increase_height(root))
+				return ENOMEM;
+		}
+		node = btree_split(root, level, key, &index);
+		if (!node)
+			return ENOMEM;
+	}
+insert:
+	ASSERT(index <= node->num_keys);
+
+	i = node->num_keys;
+	node->ptrs[i + 1] = node->ptrs[i];
+	while (--i >= index) {
+		node->keys[i + 1] = node->keys[i];
+		node->ptrs[i + 1] = node->ptrs[i];
+	}
+
+	node->num_keys++;
+	node->keys[index] = key;
+
+	if (level == 0)
+		node->ptrs[index] = value;
+	else
+		node->ptrs[index + 1] = value;
+
+	return 0;
+}
+
+
+
+int
+btree_insert(
+	struct btree_root	*root,
+	unsigned long		key,
+	void			*value)
+{
+	int			result;
+
+	if (!value)
+		return EINVAL;
+
+	if (btree_search(root, key) && root->cur_key == key)
+		return EEXIST;
+
+#ifdef BTREE_STATS
+	root->stats.insert += 1;
+	root->stats.num_items += 1;
+#endif
+
+	result = btree_insert_item(root, 0, key, value);
+
+	btree_invalidate_cursor(root);
+
+	return result;
+}
+
+
+/*
+ * Deletion functions
+ *
+ * Rather more complicated, as deletion has 4 ways to go once a node
+ * ends up with fewer than the minimum number of keys:
+ *   - move remainder to previous node
+ *   - move remainder to next node
+ *       (both will involve a parent deletion which may recurse)
+ *   - balance by moving some items from previous node
+ *   - balance by moving some items from next node
+ */
+
+static void
+btree_decrease_height(
+	struct btree_root	*root)
+{
+	struct btree_node	*old_root = root->root_node;
+
+	ASSERT(old_root->num_keys == 0);
+
+#ifdef BTREE_STATS
+	root->stats.alloced -= 1;
+	root->stats.dec_height += 1;
+	root->stats.max_items /= BTREE_PTR_MAX;
+#endif
+	root->root_node = old_root->ptrs[0];
+	btree_node_free(old_root);
+	root->height--;
+}
+
+static int
+btree_merge_with_prev(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*prev_cursor)
+{
+	if (!prev_cursor)
+		return 0;
+
+	if (!btree_shift_to_prev(root, level, prev_cursor,
+					root->cursor[level].node->num_keys + 1))
+		return 0;
+
+#ifdef BTREE_STATS
+	root->stats.merge_prev += 1;
+#endif
+	return 1;
+}
+
+static int
+btree_merge_with_next(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*next_cursor)
+{
+	if (!next_cursor)
+		return 0;
+
+	if (!btree_shift_to_next(root, level, next_cursor,
+					root->cursor[level].node->num_keys + 1))
+		return 0;
+
+#ifdef BTREE_STATS
+	root->stats.merge_next += 1;
+#endif
+	return 1;
+}
+
+static int
+btree_balance_with_prev(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*prev_cursor)
+{
+	struct btree_cursor	*root_cursor = root->cursor;
+
+	if (!prev_cursor)
+		return 0;
+	ASSERT(prev_cursor[level].node->num_keys > BTREE_KEY_MIN);
+
+#ifdef BTREE_STATS
+	root->stats.balance_prev += 1;
+#endif
+	/*
+	 * Move some nodes from the prev node into the current node.
+	 * As the shift operation is a right shift and is relative to
+	 * the root cursor, make the root cursor the prev cursor and
+	 * pass in the root cursor as the next cursor.
+	 */
+
+	root->cursor = prev_cursor;
+	if (!btree_shift_to_next(root, level, root_cursor,
+		(prev_cursor[level].node->num_keys + 1 - BTREE_KEY_MIN) / 2))
+			abort();
+	root->cursor = root_cursor;
+
+	return 1;
+}
+
+static int
+btree_balance_with_next(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*next_cursor)
+{
+	struct btree_cursor	*root_cursor = root->cursor;
+
+	if (!next_cursor)
+		return 0;
+	ASSERT(next_cursor[level].node->num_keys > BTREE_KEY_MIN);
+
+#ifdef BTREE_STATS
+	root->stats.balance_next += 1;
+#endif
+	/*
+	 * Move some nodes from the next node into the current node.
+	 * As the shift operation is a left shift and is relative to
+	 * the root cursor, make the root cursor the next cursor and
+	 * pass in the root cursor as the prev cursor.
+	 */
+
+	root->cursor = next_cursor;
+	if (!btree_shift_to_prev(root, level, root_cursor,
+		(next_cursor[level].node->num_keys + 1 - BTREE_KEY_MIN) / 2))
+			abort();
+	root->cursor = root_cursor;
+
+	return 1;
+
+}
+
+static void
+btree_delete_key(
+	struct btree_root	*root,
+	int			level);
+
+/*
+ * btree_delete_node:
+ *
+ * Merge with or rebalance against a sibling after the node at this
+ * level has underflowed, recursing up a level if a node gets freed.
+ */
+static void
+btree_delete_node(
+	struct btree_root	*root,
+	int			level)
+{
+	struct btree_cursor	prev_cursor[root->height];
+	struct btree_cursor	next_cursor[root->height];
+	struct btree_cursor	*pc;
+	struct btree_cursor	*nc;
+
+	/*
+	 * the node has underflowed, grab or merge keys/items from a
+	 * neighbouring node.
+	 */
+
+	if (level == root->height - 1) {
+		if (level > 0 && root->root_node->num_keys == 0)
+			btree_decrease_height(root);
+		return;
+	}
+
+	pc = btree_copy_cursor_prev(root, prev_cursor, level + 1);
+	if (!btree_merge_with_prev(root, level, pc)) {
+		nc = btree_copy_cursor_next(root, next_cursor, level + 1);
+		if (!btree_merge_with_next(root, level, nc)) {
+			/* merging failed, try redistribution */
+			if (!btree_balance_with_prev(root, level, pc) &&
+			    !btree_balance_with_next(root, level, nc))
+				abort();
+			return;	/* when balancing, then the node isn't freed */
+		}
+	}
+
+#ifdef BTREE_STATS
+	root->stats.alloced -= 1;
+#endif
+	btree_node_free(root->cursor[level].node);
+
+	btree_delete_key(root, level + 1);
+}
+
+static void
+btree_delete_key(
+	struct btree_root	*root,
+	int			level)
+{
+	struct btree_node	*node = root->cursor[level].node;
+	int			index = root->cursor[level].index;
+
+	node->num_keys--;
+	if (index <= node->num_keys) {
+		/*
+		 * if not deleting the last item, shift higher items down
+		 * to cover the item being deleted
+		 */
+		while (index < node->num_keys) {
+			node->keys[index] = node->keys[index + 1];
+			node->ptrs[index] = node->ptrs[index + 1];
+			index++;
+		}
+		node->ptrs[index] = node->ptrs[index + 1];
+	} else {
+		/*
+		 * else update the associated parent key as the last key
+		 * in the leaf has changed
+		 */
+		btree_update_node_key(root, root->cursor, level + 1,
+						node->keys[node->num_keys]);
+	}
+	/*
+	 * if node underflows, either merge with sibling or rebalance
+	 * with sibling.
+	 */
+	if (node->num_keys < BTREE_KEY_MIN)
+		btree_delete_node(root, level);
+}
+
+void *
+btree_delete(
+	struct btree_root	*root,
+	unsigned long		key)
+{
+	void			*value;
+
+	value = btree_lookup(root, key);
+	if (!value)
+		return NULL;
+
+#ifdef BTREE_STATS
+	root->stats.delete += 1;
+	root->stats.num_items -= 1;
+#endif
+
+	btree_delete_key(root, 0);
+
+	btree_invalidate_cursor(root);
+
+	return value;
+}
+
+#ifdef BTREE_STATS
+void
+btree_print_stats(
+	struct btree_root	*root,
+	FILE			*f)
+{
+	unsigned long		max_items = root->stats.max_items *
+						(root->root_node->num_keys + 1);
+
+	fprintf(f, "\tnum_items = %lu, max_items = %lu (%lu%%)\n",
+			root->stats.num_items, max_items,
+			root->stats.num_items * 100 / max_items);
+	fprintf(f, "\talloced = %d nodes, %lu bytes, %lu bytes per item\n",
+			root->stats.alloced,
+			root->stats.alloced * sizeof(struct btree_node),
+			root->stats.alloced * sizeof(struct btree_node) /
+							root->stats.num_items);
+	fprintf(f, "\tlookup = %d\n", root->stats.lookup);
+	fprintf(f, "\tfind = %d\n", root->stats.find);
+	fprintf(f, "\tcache_hits = %d\n", root->stats.cache_hits);
+	fprintf(f, "\tcache_misses = %d\n", root->stats.cache_misses);
+	fprintf(f, "\tkey_update = %d\n", root->stats.key_update);
+	fprintf(f, "\tvalue_update = %d\n", root->stats.value_update);
+	fprintf(f, "\tinsert = %d\n", root->stats.insert);
+	fprintf(f, "\tshift_prev = %d\n", root->stats.shift_prev);
+	fprintf(f, "\tshift_next = %d\n", root->stats.shift_next);
+	fprintf(f, "\tsplit = %d\n", root->stats.split);
+	fprintf(f, "\tinc_height = %d\n", root->stats.inc_height);
+	fprintf(f, "\tdelete = %d\n", root->stats.delete);
+	fprintf(f, "\tmerge_prev = %d\n", root->stats.merge_prev);
+	fprintf(f, "\tmerge_next = %d\n", root->stats.merge_next);
+	fprintf(f, "\tbalance_prev = %d\n", root->stats.balance_prev);
+	fprintf(f, "\tbalance_next = %d\n", root->stats.balance_next);
+	fprintf(f, "\tdec_height = %d\n", root->stats.dec_height);
+}
+#endif
Index: xfsprogs-dev/repair/btree.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ xfsprogs-dev/repair/btree.h	2009-08-20 00:06:44.000000000 +0000
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef _BTREE_H
+#define _BTREE_H
+
+
+struct btree_root;
+
+void
+btree_init(
+	struct btree_root	**root);
+
+void
+btree_destroy(
+	struct btree_root	*root);
+
+int
+btree_is_empty(
+	struct btree_root	*root);
+
+void *
+btree_lookup(
+	struct btree_root	*root,
+	unsigned long		key);
+
+void *
+btree_find(
+	struct btree_root	*root,
+	unsigned long		key,
+	unsigned long		*actual_key);
+
+void *
+btree_peek_prev(
+	struct btree_root	*root,
+	unsigned long		*key);
+
+void *
+btree_peek_next(
+	struct btree_root	*root,
+	unsigned long		*key);
+
+void *
+btree_lookup_next(
+	struct btree_root	*root,
+	unsigned long		*key);
+
+void *
+btree_lookup_prev(
+	struct btree_root	*root,
+	unsigned long		*key);
+
+int
+btree_insert(
+	struct btree_root	*root,
+	unsigned long		key,
+	void			*value);
+
+void *
+btree_delete(
+	struct btree_root	*root,
+	unsigned long		key);
+
+int
+btree_update_key(
+	struct btree_root	*root,
+	unsigned long		old_key,
+	unsigned long		new_key);
+
+int
+btree_update_value(
+	struct btree_root	*root,
+	unsigned long		key,
+	void 			*new_value);
+
+void
+btree_clear(
+	struct btree_root	*root);
+
+#ifdef BTREE_STATS
+void
+btree_print_stats(
+	struct btree_root	*root,
+	FILE			*f);
+#endif
+
+#endif /* _BTREE_H */
Index: xfsprogs-dev/repair/init.c
===================================================================
--- xfsprogs-dev.orig/repair/init.c	2009-08-20 00:01:58.000000000 +0000
+++ xfsprogs-dev/repair/init.c	2009-08-20 00:06:44.000000000 +0000
@@ -26,7 +26,6 @@
 #include "dir.h"
 #include "incore.h"
 #include "prefetch.h"
-#include "radix-tree.h"
 #include <sys/resource.h>
 
 static pthread_key_t dirbuf_key;
@@ -151,5 +150,4 @@
 	ts_create();
 	ts_init();
 	increase_rlimit();
-	radix_tree_init();
 }
Index: xfsprogs-dev/repair/prefetch.c
===================================================================
--- xfsprogs-dev.orig/repair/prefetch.c	2009-08-20 00:05:36.000000000 +0000
+++ xfsprogs-dev/repair/prefetch.c	2009-08-20 00:14:08.000000000 +0000
@@ -1,6 +1,7 @@
 #include <libxfs.h>
 #include <pthread.h>
 #include "avl.h"
+#include "btree.h"
 #include "globals.h"
 #include "agheader.h"
 #include "incore.h"
@@ -14,7 +15,6 @@
 #include "threads.h"
 #include "prefetch.h"
 #include "progress.h"
-#include "radix-tree.h"
 
 int do_prefetch = 1;
 
@@ -129,10 +129,8 @@
 	pthread_mutex_lock(&args->lock);
 
 	if (fsbno > args->last_bno_read) {
-		radix_tree_insert(&args->primary_io_queue, fsbno, bp);
-		if (!B_IS_INODE(flag))
-			radix_tree_tag_set(&args->primary_io_queue, fsbno, 0);
-		else {
+		btree_insert(args->primary_io_queue, fsbno, bp);
+		if (B_IS_INODE(flag)) {
 			args->inode_bufs_queued++;
 			if (args->inode_bufs_queued == IO_THRESHOLD)
 				pf_start_io_workers(args);
@@ -154,7 +152,7 @@
 #endif
 		ASSERT(!B_IS_INODE(flag));
 		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
-		radix_tree_insert(&args->secondary_io_queue, fsbno, bp);
+		btree_insert(args->secondary_io_queue, fsbno, bp);
 	}
 
 	pf_start_processing(args);
@@ -407,7 +405,7 @@
 	pf_which_t		which,
 	void			*buf)
 {
-	struct radix_tree_root	*queue;
+	struct btree_root	*queue;
 	xfs_buf_t		*bplist[MAX_BUFS];
 	unsigned int		num;
 	off64_t			first_off, last_off, next_off;
@@ -415,27 +413,25 @@
 	int			i;
 	int			inode_bufs;
 	unsigned long		fsbno;
+	unsigned long		max_fsbno;
 	char			*pbuf;
 
-	queue = (which != PF_SECONDARY) ? &args->primary_io_queue
-				: &args->secondary_io_queue;
+	queue = (which != PF_SECONDARY) ? args->primary_io_queue
+				: args->secondary_io_queue;
 
-	while (radix_tree_lookup_first(queue, &fsbno) != NULL) {
-
-		if (which != PF_META_ONLY) {
-			num = radix_tree_gang_lookup_ex(queue,
-					(void**)&bplist[0], fsbno,
-					fsbno + pf_max_fsbs, MAX_BUFS);
-			ASSERT(num > 0);
-			ASSERT(XFS_FSB_TO_DADDR(mp, fsbno) ==
-				XFS_BUF_ADDR(bplist[0]));
-		} else {
-			num = radix_tree_gang_lookup_tag(queue,
-					(void**)&bplist[0], fsbno,
-					MAX_BUFS / 4, 0);
-			if (num == 0)
-				return;
+	while (btree_find(queue, 0, &fsbno) != NULL) {
+		max_fsbno = fsbno + pf_max_fsbs;
+		num = 0;
+
+		bplist[0] = btree_lookup(queue, fsbno);
+		while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
+			if (which != PF_META_ONLY ||
+			    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
+				num++;
+			bplist[num] = btree_lookup_next(queue, &fsbno);
 		}
+		if (!num)
+			return;
 
 		/*
 		 * do a big read if 25% of the potential buffer is useful,
@@ -467,7 +463,7 @@
 		}
 
 		for (i = 0; i < num; i++) {
-			if (radix_tree_delete(queue, XFS_DADDR_TO_FSB(mp,
+			if (btree_delete(queue, XFS_DADDR_TO_FSB(mp,
 					XFS_BUF_ADDR(bplist[i]))) == NULL)
 				do_error(_("prefetch corruption\n"));
 		}
@@ -570,7 +566,7 @@
 		return NULL;
 
 	pthread_mutex_lock(&args->lock);
-	while (!args->queuing_done || args->primary_io_queue.height) {
+	while (!args->queuing_done || btree_find(args->primary_io_queue, 0, NULL)) {
 
 #ifdef XR_PF_TRACE
 		pftrace("waiting to start prefetch I/O for AG %d", args->agno);
@@ -696,8 +692,8 @@
 #endif
 	pthread_mutex_lock(&args->lock);
 
-	ASSERT(args->primary_io_queue.height == 0);
-	ASSERT(args->secondary_io_queue.height == 0);
+	ASSERT(btree_find(args->primary_io_queue, 0, NULL) == NULL);
+	ASSERT(btree_find(args->secondary_io_queue, 0, NULL) == NULL);
 
 	args->prefetch_done = 1;
 	if (args->next_args)
@@ -755,8 +751,8 @@
 
 	args = calloc(1, sizeof(prefetch_args_t));
 
-	INIT_RADIX_TREE(&args->primary_io_queue, 0);
-	INIT_RADIX_TREE(&args->secondary_io_queue, 0);
+	btree_init(&args->primary_io_queue);
+	btree_init(&args->secondary_io_queue);
 	if (pthread_mutex_init(&args->lock, NULL) != 0)
 		do_error(_("failed to initialize prefetch mutex\n"));
 	if (pthread_cond_init(&args->start_reading, NULL) != 0)
@@ -835,6 +831,8 @@
 	pthread_cond_destroy(&args->start_reading);
 	pthread_cond_destroy(&args->start_processing);
 	sem_destroy(&args->ra_count);
+	btree_destroy(args->primary_io_queue);
+	btree_destroy(args->secondary_io_queue);
 
 	free(args);
 }
Index: xfsprogs-dev/repair/prefetch.h
===================================================================
--- xfsprogs-dev.orig/repair/prefetch.h	2009-08-20 00:01:58.000000000 +0000
+++ xfsprogs-dev/repair/prefetch.h	2009-08-20 00:06:44.000000000 +0000
@@ -3,7 +3,6 @@
 
 #include <semaphore.h>
 #include "incore.h"
-#include "radix-tree.h"
 
 
 extern int 	do_prefetch;
@@ -14,8 +13,8 @@
 	pthread_mutex_t		lock;
 	pthread_t		queuing_thread;
 	pthread_t		io_threads[PF_THREAD_COUNT];
-	struct radix_tree_root	primary_io_queue;
-	struct radix_tree_root	secondary_io_queue;
+	struct btree_root	*primary_io_queue;
+	struct btree_root	*secondary_io_queue;
 	pthread_cond_t		start_reading;
 	pthread_cond_t		start_processing;
 	int			agno;
Index: xfsprogs-dev/repair/radix-tree.c
===================================================================
--- xfsprogs-dev.orig/repair/radix-tree.c	2009-08-20 00:01:58.000000000 +0000
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,805 +0,0 @@
-/*
- * Copyright (C) 2001 Momchil Velikov
- * Portions Copyright (C) 2001 Christoph Hellwig
- * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <libxfs.h>
-#include "radix-tree.h"
-
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
-#define RADIX_TREE_MAP_SHIFT	6
-#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
-#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
-
-#ifdef RADIX_TREE_TAGS
-#define RADIX_TREE_TAG_LONGS	\
-	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
-#endif
-
-struct radix_tree_node {
-	unsigned int	count;
-	void		*slots[RADIX_TREE_MAP_SIZE];
-#ifdef RADIX_TREE_TAGS
-	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
-#endif
-};
-
-struct radix_tree_path {
-	struct radix_tree_node *node;
-	int offset;
-};
-
-#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
-
-static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH];
-
-/*
- * Radix tree node cache.
- */
-
-#define radix_tree_node_alloc(r) 	((struct radix_tree_node *) \
-		calloc(1, sizeof(struct radix_tree_node)))
-#define radix_tree_node_free(n) 	free(n)
-
-#ifdef RADIX_TREE_TAGS
-
-static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	*((__uint32_t *)node->tags[tag] + (offset >> 5)) |= (1 << (offset & 31));
-}
-
-static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	__uint32_t 	*p = (__uint32_t*)node->tags[tag] + (offset >> 5);
-	__uint32_t 	m = 1 << (offset & 31);
-	*p &= ~m;
-}
-
-static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	return 1 & (((const __uint32_t *)node->tags[tag])[offset >> 5] >> (offset & 31));
-}
-
-/*
- * Returns 1 if any slot in the node has this tag set.
- * Otherwise returns 0.
- */
-static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
-{
-	int idx;
-	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
-		if (node->tags[tag][idx])
-			return 1;
-	}
-	return 0;
-}
-
-#endif
-
-/*
- *	Return the maximum key which can be store into a
- *	radix tree with height HEIGHT.
- */
-static inline unsigned long radix_tree_maxindex(unsigned int height)
-{
-	return height_to_maxindex[height];
-}
-
-/*
- *	Extend a radix tree so it can store key @index.
- */
-static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
-{
-	struct radix_tree_node *node;
-	unsigned int height;
-#ifdef RADIX_TREE_TAGS
-	char tags[RADIX_TREE_MAX_TAGS];
-	int tag;
-#endif
-
-	/* Figure out what the height should be.  */
-	height = root->height + 1;
-	while (index > radix_tree_maxindex(height))
-		height++;
-
-	if (root->rnode == NULL) {
-		root->height = height;
-		goto out;
-	}
-
-#ifdef RADIX_TREE_TAGS
-	/*
-	 * Prepare the tag status of the top-level node for propagation
-	 * into the newly-pushed top-level node(s)
-	 */
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		tags[tag] = 0;
-		if (any_tag_set(root->rnode, tag))
-			tags[tag] = 1;
-	}
-#endif
-	do {
-		if (!(node = radix_tree_node_alloc(root)))
-			return -ENOMEM;
-
-		/* Increase the height.  */
-		node->slots[0] = root->rnode;
-
-#ifdef RADIX_TREE_TAGS
-		/* Propagate the aggregated tag info into the new root */
-		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-			if (tags[tag])
-				tag_set(node, tag, 0);
-		}
-#endif
-		node->count = 1;
-		root->rnode = node;
-		root->height++;
-	} while (height > root->height);
-out:
-	return 0;
-}
-
-/**
- *	radix_tree_insert    -    insert into a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *	@item:		item to insert
- *
- *	Insert an item into the radix tree at position @index.
- */
-int radix_tree_insert(struct radix_tree_root *root,
-			unsigned long index, void *item)
-{
-	struct radix_tree_node *node = NULL, *slot;
-	unsigned int height, shift;
-	int offset;
-	int error;
-
-	/* Make sure the tree is high enough.  */
-	if ((!index && !root->rnode) ||
-			index > radix_tree_maxindex(root->height)) {
-		error = radix_tree_extend(root, index);
-		if (error)
-			return error;
-	}
-
-	slot = root->rnode;
-	height = root->height;
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-
-	offset = 0;			/* uninitialised var warning */
-	do {
-		if (slot == NULL) {
-			/* Have to add a child node.  */
-			if (!(slot = radix_tree_node_alloc(root)))
-				return -ENOMEM;
-			if (node) {
-				node->slots[offset] = slot;
-				node->count++;
-			} else
-				root->rnode = slot;
-		}
-
-		/* Go a level down */
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		node = slot;
-		slot = node->slots[offset];
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	} while (height > 0);
-
-	if (slot != NULL)
-		return -EEXIST;
-
-	ASSERT(node);
-	node->count++;
-	node->slots[offset] = item;
-#ifdef RADIX_TREE_TAGS
-	ASSERT(!tag_get(node, 0, offset));
-	ASSERT(!tag_get(node, 1, offset));
-#endif
-	return 0;
-}
-
-static inline void **__lookup_slot(struct radix_tree_root *root,
-				   unsigned long index)
-{
-	unsigned int height, shift;
-	struct radix_tree_node **slot;
-
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		return NULL;
-
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-	slot = &root->rnode;
-
-	while (height > 0) {
-		if (*slot == NULL)
-			return NULL;
-
-		slot = (struct radix_tree_node **)
-			((*slot)->slots +
-				((index >> shift) & RADIX_TREE_MAP_MASK));
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	}
-
-	return (void **)slot;
-}
-
-/**
- *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *
- *	Lookup the slot corresponding to the position @index in the radix tree
- *	@root. This is useful for update-if-exists operations.
- */
-void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
-{
-	return __lookup_slot(root, index);
-}
-
-/**
- *	radix_tree_lookup    -    perform lookup operation on a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *
- *	Lookup the item at the position @index in the radix tree @root.
- */
-void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
-{
-	void **slot;
-
-	slot = __lookup_slot(root, index);
-	return slot != NULL ? *slot : NULL;
-}
-
-/**
- *	raid_tree_first_key - find the first index key in the radix tree
- *	@root:		radix tree root
- *	@index:		where the first index will be placed
- *
- *	Returns the first entry and index key in the radix tree @root.
- */
-void *radix_tree_lookup_first(struct radix_tree_root *root, unsigned long *index)
-{
-	unsigned int height, shift;
-	struct radix_tree_node *slot;
-	unsigned long i;
-
-	height = root->height;
-	*index = 0;
-	if (height == 0)
-		return NULL;
-
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	for (; height > 1; height--) {
-		for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
-			if (slot->slots[i] != NULL)
-				break;
-		}
-		ASSERT(i < RADIX_TREE_MAP_SIZE);
-
-		*index |= (i << shift);
-		shift -= RADIX_TREE_MAP_SHIFT;
-		slot = slot->slots[i];
-	}
-	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
-		if (slot->slots[i] != NULL) {
-			*index |= i;
-			return slot->slots[i];
-		}
-	}
-	return NULL;
-}
-
-#ifdef RADIX_TREE_TAGS
-
-/**
- *	radix_tree_tag_set - set a tag on a radix tree node
- *	@root:		radix tree root
- *	@index:		index key
- *	@tag: 		tag index
- *
- *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
- *	corresponding to @index in the radix tree.  From
- *	the root all the way down to the leaf node.
- *
- *	Returns the address of the tagged item.   Setting a tag on a not-present
- *	item is a bug.
- */
-void *radix_tree_tag_set(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag)
-{
-	unsigned int height, shift;
-	struct radix_tree_node *slot;
-
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		return NULL;
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	while (height > 0) {
-		int offset;
-
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		if (!tag_get(slot, tag, offset))
-			tag_set(slot, tag, offset);
-		slot = slot->slots[offset];
-		ASSERT(slot != NULL);
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	}
-
-	return slot;
-}
-
-/**
- *	radix_tree_tag_clear - clear a tag on a radix tree node
- *	@root:		radix tree root
- *	@index:		index key
- *	@tag: 		tag index
- *
- *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
- *	corresponding to @index in the radix tree.  If
- *	this causes the leaf node to have no tags set then clear the tag in the
- *	next-to-leaf node, etc.
- *
- *	Returns the address of the tagged item on success, else NULL.  ie:
- *	has the same return value and semantics as radix_tree_lookup().
- */
-void *radix_tree_tag_clear(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag)
-{
-	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
-	struct radix_tree_node *slot;
-	unsigned int height, shift;
-	void *ret = NULL;
-
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		goto out;
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
-	slot = root->rnode;
-
-	while (height > 0) {
-		int offset;
-
-		if (slot == NULL)
-			goto out;
-
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		pathp[1].offset = offset;
-		pathp[1].node = slot;
-		slot = slot->slots[offset];
-		pathp++;
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	}
-
-	ret = slot;
-	if (ret == NULL)
-		goto out;
-
-	do {
-		if (!tag_get(pathp->node, tag, pathp->offset))
-			goto out;
-		tag_clear(pathp->node, tag, pathp->offset);
-		if (any_tag_set(pathp->node, tag))
-			goto out;
-		pathp--;
-	} while (pathp->node);
-out:
-	return ret;
-}
-
-#endif
-
-static unsigned int
-__lookup(struct radix_tree_root *root, void **results, unsigned long index,
-	unsigned int max_items, unsigned long *next_index)
-{
-	unsigned int nr_found = 0;
-	unsigned int shift, height;
-	struct radix_tree_node *slot;
-	unsigned long i;
-
-	height = root->height;
-	if (height == 0)
-		goto out;
-
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	for ( ; height > 1; height--) {
-
-		for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
-				i < RADIX_TREE_MAP_SIZE; i++) {
-			if (slot->slots[i] != NULL)
-				break;
-			index &= ~((1UL << shift) - 1);
-			index += 1UL << shift;
-			if (index == 0)
-				goto out;	/* 32-bit wraparound */
-		}
-		if (i == RADIX_TREE_MAP_SIZE)
-			goto out;
-
-		shift -= RADIX_TREE_MAP_SHIFT;
-		slot = slot->slots[i];
-	}
-
-	/* Bottom level: grab some items */
-	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
-		index++;
-		if (slot->slots[i]) {
-			results[nr_found++] = slot->slots[i];
-			if (nr_found == max_items)
-				goto out;
-		}
-	}
-out:
-	*next_index = index;
-	return nr_found;
-}
-
-/**
- *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
- *	@root:		radix tree root
- *	@results:	where the results of the lookup are placed
- *	@first_index:	start the lookup from this key
- *	@max_items:	place up to this many items at *results
- *
- *	Performs an index-ascending scan of the tree for present items.  Places
- *	them at *@results and returns the number of items which were placed at
- *	*@results.
- *
- *	The implementation is naive.
- */
-unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned int max_items)
-{
-	const unsigned long max_index = radix_tree_maxindex(root->height);
-	unsigned long cur_index = first_index;
-	unsigned int ret = 0;
-
-	while (ret < max_items) {
-		unsigned int nr_found;
-		unsigned long next_index;	/* Index of next search */
-
-		if (cur_index > max_index)
-			break;
-		nr_found = __lookup(root, results + ret, cur_index,
-					max_items - ret, &next_index);
-		ret += nr_found;
-		if (next_index == 0)
-			break;
-		cur_index = next_index;
-	}
-	return ret;
-}
-
-/**
- *	radix_tree_gang_lookup_ex - perform multiple lookup on a radix tree
- *	@root:		radix tree root
- *	@results:	where the results of the lookup are placed
- *	@first_index:	start the lookup from this key
- *	@last_index:	don't lookup past this key
- *	@max_items:	place up to this many items at *results
- *
- *	Performs an index-ascending scan of the tree for present items starting
- *	@first_index until @last_index up to as many as @max_items.  Places
- *	them at *@results and returns the number of items which were placed
- *	at *@results.
- *
- *	The implementation is naive.
- */
-unsigned int
-radix_tree_gang_lookup_ex(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned long last_index,
-			unsigned int max_items)
-{
-	const unsigned long max_index = radix_tree_maxindex(root->height);
-	unsigned long cur_index = first_index;
-	unsigned int ret = 0;
-
-	while (ret < max_items && cur_index < last_index) {
-		unsigned int nr_found;
-		unsigned long next_index;	/* Index of next search */
-
-		if (cur_index > max_index)
-			break;
-		nr_found = __lookup(root, results + ret, cur_index,
-					max_items - ret, &next_index);
-		ret += nr_found;
-		if (next_index == 0)
-			break;
-		cur_index = next_index;
-	}
-	return ret;
-}
-
-#ifdef RADIX_TREE_TAGS
-
-static unsigned int
-__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
-	unsigned int max_items, unsigned long *next_index, unsigned int tag)
-{
-	unsigned int nr_found = 0;
-	unsigned int shift;
-	unsigned int height = root->height;
-	struct radix_tree_node *slot;
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	while (height > 0) {
-		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
-
-		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
-			if (tag_get(slot, tag, i)) {
-				ASSERT(slot->slots[i] != NULL);
-				break;
-			}
-			index &= ~((1UL << shift) - 1);
-			index += 1UL << shift;
-			if (index == 0)
-				goto out;	/* 32-bit wraparound */
-		}
-		if (i == RADIX_TREE_MAP_SIZE)
-			goto out;
-		height--;
-		if (height == 0) {	/* Bottom level: grab some items */
-			unsigned long j = index & RADIX_TREE_MAP_MASK;
-
-			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
-				index++;
-				if (tag_get(slot, tag, j)) {
-					ASSERT(slot->slots[j] != NULL);
-					results[nr_found++] = slot->slots[j];
-					if (nr_found == max_items)
-						goto out;
-				}
-			}
-		}
-		shift -= RADIX_TREE_MAP_SHIFT;
-		slot = slot->slots[i];
-	}
-out:
-	*next_index = index;
-	return nr_found;
-}
-
-/**
- *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
- *	                             based on a tag
- *	@root:		radix tree root
- *	@results:	where the results of the lookup are placed
- *	@first_index:	start the lookup from this key
- *	@max_items:	place up to this many items at *results
- *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
- *
- *	Performs an index-ascending scan of the tree for present items which
- *	have the tag indexed by @tag set.  Places the items at *@results and
- *	returns the number of items which were placed at *@results.
- */
-unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
-		unsigned long first_index, unsigned int max_items,
-		unsigned int tag)
-{
-	const unsigned long max_index = radix_tree_maxindex(root->height);
-	unsigned long cur_index = first_index;
-	unsigned int ret = 0;
-
-	while (ret < max_items) {
-		unsigned int nr_found;
-		unsigned long next_index;	/* Index of next search */
-
-		if (cur_index > max_index)
-			break;
-		nr_found = __lookup_tag(root, results + ret, cur_index,
-					max_items - ret, &next_index, tag);
-		ret += nr_found;
-		if (next_index == 0)
-			break;
-		cur_index = next_index;
-	}
-	return ret;
-}
-
-#endif
-
-/**
- *	radix_tree_shrink    -    shrink height of a radix tree to minimal
- *	@root		radix tree root
- */
-static inline void radix_tree_shrink(struct radix_tree_root *root)
-{
-	/* try to shrink tree height */
-	while (root->height > 1 &&
-			root->rnode->count == 1 &&
-			root->rnode->slots[0]) {
-		struct radix_tree_node *to_free = root->rnode;
-
-		root->rnode = to_free->slots[0];
-		root->height--;
-		/* must only free zeroed nodes into the slab */
-#ifdef RADIX_TREE_TAGS
-		tag_clear(to_free, 0, 0);
-		tag_clear(to_free, 1, 0);
-#endif
-		to_free->slots[0] = NULL;
-		to_free->count = 0;
-		radix_tree_node_free(to_free);
-	}
-}
-
-/**
- *	radix_tree_delete    -    delete an item from a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *
- *	Remove the item at @index from the radix tree rooted at @root.
- *
- *	Returns the address of the deleted item, or NULL if it was not present.
- */
-void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
-{
-	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
-	struct radix_tree_path *orig_pathp;
-	struct radix_tree_node *slot;
-	unsigned int height, shift;
-	void *ret = NULL;
-#ifdef RADIX_TREE_TAGS
-	char tags[RADIX_TREE_MAX_TAGS];
-	int nr_cleared_tags;
-	int tag;
-#endif
-	int offset;
-
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		goto out;
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
-	slot = root->rnode;
-
-	for ( ; height > 0; height--) {
-		if (slot == NULL)
-			goto out;
-
-		pathp++;
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		pathp->offset = offset;
-		pathp->node = slot;
-		slot = slot->slots[offset];
-		shift -= RADIX_TREE_MAP_SHIFT;
-	}
-
-	ret = slot;
-	if (ret == NULL)
-		goto out;
-
-	orig_pathp = pathp;
-
-#ifdef RADIX_TREE_TAGS
-	/*
-	 * Clear all tags associated with the just-deleted item
-	 */
-	nr_cleared_tags = 0;
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		tags[tag] = 1;
-		if (tag_get(pathp->node, tag, pathp->offset)) {
-			tag_clear(pathp->node, tag, pathp->offset);
-			if (!any_tag_set(pathp->node, tag)) {
-				tags[tag] = 0;
-				nr_cleared_tags++;
-			}
-		}
-	}
-
-	for (pathp--; nr_cleared_tags && pathp->node; pathp--) {
-		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-			if (tags[tag])
-				continue;
-
-			tag_clear(pathp->node, tag, pathp->offset);
-			if (any_tag_set(pathp->node, tag)) {
-				tags[tag] = 1;
-				nr_cleared_tags--;
-			}
-		}
-	}
-#endif
-	/* Now free the nodes we do not need anymore */
-	for (pathp = orig_pathp; pathp->node; pathp--) {
-		pathp->node->slots[pathp->offset] = NULL;
-		pathp->node->count--;
-
-		if (pathp->node->count) {
-			if (pathp->node == root->rnode)
-				radix_tree_shrink(root);
-			goto out;
-		}
-
-		/* Node with zero slots in use so free it */
-		radix_tree_node_free(pathp->node);
-	}
-	root->rnode = NULL;
-	root->height = 0;
-out:
-	return ret;
-}
-
-#ifdef RADIX_TREE_TAGS
-/**
- *	radix_tree_tagged - test whether any items in the tree are tagged
- *	@root:		radix tree root
- *	@tag:		tag to test
- */
-int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
-{
-  	struct radix_tree_node *rnode;
-  	rnode = root->rnode;
-  	if (!rnode)
-  		return 0;
-	return any_tag_set(rnode, tag);
-}
-#endif
-
-static unsigned long __maxindex(unsigned int height)
-{
-	unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
-	unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
-
-	if (tmp >= RADIX_TREE_INDEX_BITS)
-		index = ~0UL;
-	return index;
-}
-
-static void radix_tree_init_maxindex(void)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
-		height_to_maxindex[i] = __maxindex(i);
-}
-
-void radix_tree_init(void)
-{
-	radix_tree_init_maxindex();
-}
Index: xfsprogs-dev/repair/radix-tree.h
===================================================================
--- xfsprogs-dev.orig/repair/radix-tree.h	2009-08-20 00:01:58.000000000 +0000
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,76 +0,0 @@
-/*
- * Copyright (C) 2001 Momchil Velikov
- * Portions Copyright (C) 2001 Christoph Hellwig
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#ifndef __XFS_SUPPORT_RADIX_TREE_H__
-#define __XFS_SUPPORT_RADIX_TREE_H__
-
-#define RADIX_TREE_TAGS
-
-struct radix_tree_root {
-	unsigned int		height;
-	struct radix_tree_node	*rnode;
-};
-
-#define RADIX_TREE_INIT(mask)	{					\
-	.height = 0,							\
-	.rnode = NULL,							\
-}
-
-#define RADIX_TREE(name, mask) \
-	struct radix_tree_root name = RADIX_TREE_INIT(mask)
-
-#define INIT_RADIX_TREE(root, mask)					\
-do {									\
-	(root)->height = 0;						\
-	(root)->rnode = NULL;						\
-} while (0)
-
-#ifdef RADIX_TREE_TAGS
-#define RADIX_TREE_MAX_TAGS 2
-#endif
-
-int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
-void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
-void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
-void *radix_tree_lookup_first(struct radix_tree_root *, unsigned long *);
-void *radix_tree_delete(struct radix_tree_root *, unsigned long);
-unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned int max_items);
-unsigned int
-radix_tree_gang_lookup_ex(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned long last_index,
-			unsigned int max_items);
-
-void radix_tree_init(void);
-
-#ifdef RADIX_TREE_TAGS
-void *radix_tree_tag_set(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag);
-void *radix_tree_tag_clear(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag);
-int radix_tree_tag_get(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag);
-unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned int max_items,
-			unsigned int tag);
-int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
-#endif
-
-#endif /* __XFS_SUPPORT_RADIX_TREE_H__ */
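
For context, a minimal, hypothetical usage example of the btree API this
patch introduces (the calls match the btree.h declarations above; the keys
and values are invented for illustration):

#include <stdio.h>
#include "btree.h"

int
main(void)
{
	struct btree_root	*tree;
	unsigned long		key;
	char			*value;

	btree_init(&tree);

	/* keys are unsigned longs, values are opaque pointers */
	btree_insert(tree, 100, "one hundred");
	btree_insert(tree, 200, "two hundred");

	/* exact-match lookup */
	value = btree_lookup(tree, 100);
	printf("100 -> %s\n", value);

	/* btree_find returns the entry with the first key >= the search key */
	value = btree_find(tree, 150, &key);
	printf("first key >= 150: %lu -> %s\n", key, value);

	/* btree_delete removes the entry and returns the stored value */
	value = btree_delete(tree, 200);
	printf("deleted 200 -> %s\n", value);

	btree_destroy(tree);
	return 0;
}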

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 07/14] repair: use single prefetch queue
  2009-09-02 17:55 [PATCH 00/14] repair memory usage reductions Christoph Hellwig
                   ` (5 preceding siblings ...)
  2009-09-02 17:55 ` [PATCH 06/14] repair: use a btree instead of a radix tree for the prefetch queue Christoph Hellwig
@ 2009-09-02 17:55 ` Christoph Hellwig
  2009-10-21 17:48   ` Alex Elder
  2009-09-02 17:55 ` [PATCH 08/14] repair: clean up prefetch tracing Christoph Hellwig
                   ` (8 subsequent siblings)
  15 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-09-02 17:55 UTC (permalink / raw)
  To: xfs; +Cc: Barry Naujok

[-- Attachment #1: repair-use-single-prefetch-queue --]
[-- Type: text/plain, Size: 5338 bytes --]

We don't need two prefetch queues as we guarantee execution in order anyway.
The single queue is keyed by file system block number, so entries beyond
last_bno_read take over the role of the old primary queue, while entries at
or before it take over the role of the old secondary queue: the
primary/secondary distinction simply falls out of the search start key.
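
As a rough sketch (hypothetical helper names, simplified from the
pf_batch_read() changes below), the two roles become two searches on the
same queue:

#include "btree.h"

/* "primary" I/O starts beyond the highest block already read */
static void *
first_primary_buf(struct btree_root *io_queue, unsigned long last_bno_read,
		  unsigned long *fsbno)
{
	return btree_find(io_queue, last_bno_read, fsbno);
}

/*
 * "secondary" I/O starts at the lowest queued block; the caller caps
 * the range at last_bno_read (see the MIN() in pf_batch_read below).
 */
static void *
first_secondary_buf(struct btree_root *io_queue, unsigned long *fsbno)
{
	return btree_find(io_queue, 0, fsbno);
}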


Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/prefetch.c
===================================================================
--- xfsprogs-dev.orig/repair/prefetch.c	2009-08-20 00:14:08.000000000 +0000
+++ xfsprogs-dev/repair/prefetch.c	2009-08-20 00:16:01.000000000 +0000
@@ -128,8 +128,9 @@
 
 	pthread_mutex_lock(&args->lock);
 
+	btree_insert(args->io_queue, fsbno, bp);
+
 	if (fsbno > args->last_bno_read) {
-		btree_insert(args->primary_io_queue, fsbno, bp);
 		if (B_IS_INODE(flag)) {
 			args->inode_bufs_queued++;
 			if (args->inode_bufs_queued == IO_THRESHOLD)
@@ -152,7 +153,6 @@
 #endif
 		ASSERT(!B_IS_INODE(flag));
 		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
-		btree_insert(args->secondary_io_queue, fsbno, bp);
 	}
 
 	pf_start_processing(args);
@@ -405,7 +405,6 @@
 	pf_which_t		which,
 	void			*buf)
 {
-	struct btree_root	*queue;
 	xfs_buf_t		*bplist[MAX_BUFS];
 	unsigned int		num;
 	off64_t			first_off, last_off, next_off;
@@ -416,19 +415,22 @@
 	unsigned long		max_fsbno;
 	char			*pbuf;
 
-	queue = (which != PF_SECONDARY) ? args->primary_io_queue
-				: args->secondary_io_queue;
-
-	while (btree_find(queue, 0, &fsbno) != NULL) {
-		max_fsbno = fsbno + pf_max_fsbs;
+	for (;;) {
 		num = 0;
-
-		bplist[0] = btree_lookup(queue, fsbno);
+		if (which == PF_SECONDARY) {
+			bplist[0] = btree_find(args->io_queue, 0, &fsbno);
+			max_fsbno = MIN(fsbno + pf_max_fsbs,
+							args->last_bno_read);
+		} else {
+			bplist[0] = btree_find(args->io_queue,
+						args->last_bno_read, &fsbno);
+			max_fsbno = fsbno + pf_max_fsbs;
+		}
 		while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
 			if (which != PF_META_ONLY ||
 			    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
 				num++;
-			bplist[num] = btree_lookup_next(queue, &fsbno);
+			bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
 		}
 		if (!num)
 			return;
@@ -440,21 +442,22 @@
 		 */
 		first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
 		last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
-			XFS_BUF_SIZE(bplist[num-1]);
+						XFS_BUF_SIZE(bplist[num-1]);
 		while (last_off - first_off > pf_max_bytes) {
 			num--;
-			last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
-				XFS_BUF_SIZE(bplist[num-1]);
+			last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(
+				bplist[num-1])) + XFS_BUF_SIZE(bplist[num-1]);
 		}
-		if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
+		if (num < ((last_off - first_off) >>
+						(mp->m_sb.sb_blocklog + 3))) {
 			/*
 			 * not enough blocks for one big read, so determine
 			 * the number of blocks that are close enough.
 			 */
 			last_off = first_off + XFS_BUF_SIZE(bplist[0]);
 			for (i = 1; i < num; i++) {
-				next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
-						XFS_BUF_SIZE(bplist[i]);
+				next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(
+					bplist[i])) + XFS_BUF_SIZE(bplist[i]);
 				if (next_off - last_off > pf_batch_bytes)
 					break;
 				last_off = next_off;
@@ -463,7 +466,7 @@
 		}
 
 		for (i = 0; i < num; i++) {
-			if (btree_delete(queue, XFS_DADDR_TO_FSB(mp,
+			if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
 					XFS_BUF_ADDR(bplist[i]))) == NULL)
 				do_error(_("prefetch corruption\n"));
 		}
@@ -566,7 +569,7 @@
 		return NULL;
 
 	pthread_mutex_lock(&args->lock);
-	while (!args->queuing_done || btree_find(args->primary_io_queue, 0, NULL)) {
+	while (!args->queuing_done || btree_find(args->io_queue, 0, NULL)) {
 
 #ifdef XR_PF_TRACE
 		pftrace("waiting to start prefetch I/O for AG %d", args->agno);
@@ -692,8 +695,7 @@
 #endif
 	pthread_mutex_lock(&args->lock);
 
-	ASSERT(btree_find(args->primary_io_queue, 0, NULL) == NULL);
-	ASSERT(btree_find(args->secondary_io_queue, 0, NULL) == NULL);
+	ASSERT(btree_find(args->io_queue, 0, NULL) == NULL);
 
 	args->prefetch_done = 1;
 	if (args->next_args)
@@ -751,8 +753,7 @@
 
 	args = calloc(1, sizeof(prefetch_args_t));
 
-	btree_init(&args->primary_io_queue);
-	btree_init(&args->secondary_io_queue);
+	btree_init(&args->io_queue);
 	if (pthread_mutex_init(&args->lock, NULL) != 0)
 		do_error(_("failed to initialize prefetch mutex\n"));
 	if (pthread_cond_init(&args->start_reading, NULL) != 0)
@@ -831,8 +832,7 @@
 	pthread_cond_destroy(&args->start_reading);
 	pthread_cond_destroy(&args->start_processing);
 	sem_destroy(&args->ra_count);
-	btree_destroy(args->primary_io_queue);
-	btree_destroy(args->secondary_io_queue);
+	btree_destroy(args->io_queue);
 
 	free(args);
 }
Index: xfsprogs-dev/repair/prefetch.h
===================================================================
--- xfsprogs-dev.orig/repair/prefetch.h	2009-08-20 00:06:44.000000000 +0000
+++ xfsprogs-dev/repair/prefetch.h	2009-08-20 00:16:01.000000000 +0000
@@ -13,8 +13,7 @@
 	pthread_mutex_t		lock;
 	pthread_t		queuing_thread;
 	pthread_t		io_threads[PF_THREAD_COUNT];
-	struct btree_root	*primary_io_queue;
-	struct btree_root	*secondary_io_queue;
+	struct btree_root	*io_queue;
 	pthread_cond_t		start_reading;
 	pthread_cond_t		start_processing;
 	int			agno;

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 08/14] repair: clean up prefetch tracing
  2009-09-02 17:55 [PATCH 00/14] repair memory usage reductions Christoph Hellwig
                   ` (6 preceding siblings ...)
  2009-09-02 17:55 ` [PATCH 07/14] repair: use single prefetch queue Christoph Hellwig
@ 2009-09-02 17:55 ` Christoph Hellwig
  2009-10-21 17:53   ` Alex Elder
  2009-09-02 17:55 ` [PATCH 09/14] repair: track logical to physical block mapping more efficiently Christoph Hellwig
                   ` (7 subsequent siblings)
  15 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-09-02 17:55 UTC (permalink / raw)
  To: xfs; +Cc: Barry Naujok

[-- Attachment #1: repair-cleanup-prefetch-tracing --]
[-- Type: text/plain, Size: 12367 bytes --]

Define a dummy pftrace macro for the non-tracing case to reduce the ifdef hell,
clean up a few trace calls and add proper init/exit handlers for the tracing
setup and teardown.
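
The pattern, in isolation (a sketch mirroring the prefetch.h hunk below):

#ifdef XR_PF_TRACE
/* tracing build: pftrace() expands to a real varargs call */
#define pftrace(msg...)	_pftrace(__FUNCTION__, ## msg)
void	_pftrace(const char *func, const char *msg, ...);
void	pftrace_init(void);
void	pftrace_done(void);
#else
/* non-tracing build: empty inline stubs that compile away to nothing */
static inline void pftrace_init(void) { }
static inline void pftrace_done(void) { }
static inline void pftrace(const char *msg, ...) { }
#endif

so a call site needs no conditional compilation at all:

	pftrace("starting prefetch I/O for AG %d", args->agno);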

Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/dino_chunks.c
===================================================================
--- xfsprogs-dev.orig/repair/dino_chunks.c	2009-08-19 23:42:32.000000000 +0000
+++ xfsprogs-dev/repair/dino_chunks.c	2009-08-20 00:16:53.000000000 +0000
@@ -629,10 +629,9 @@
 			cluster_count * sizeof(xfs_buf_t*));
 
 	for (bp_index = 0; bp_index < cluster_count; bp_index++) {
-#ifdef XR_PF_TRACE
 		pftrace("about to read off %llu in AG %d",
 			(long long)XFS_AGB_TO_DADDR(mp, agno, agbno), agno);
-#endif
+
 		bplist[bp_index] = libxfs_readbuf(mp->m_dev,
 					XFS_AGB_TO_DADDR(mp, agno, agbno),
 					XFS_FSB_TO_BB(mp, blks_per_cluster), 0);
@@ -650,11 +649,9 @@
 		}
 		agbno += blks_per_cluster;
 
-#ifdef XR_PF_TRACE
 		pftrace("readbuf %p (%llu, %d) in AG %d", bplist[bp_index],
 			(long long)XFS_BUF_ADDR(bplist[bp_index]),
 			XFS_BUF_COUNT(bplist[bp_index]), agno);
-#endif
 	}
 	agbno = XFS_AGINO_TO_AGBNO(mp, first_irec->ino_startnum);
 
@@ -906,10 +903,10 @@
 			 * done! - finished up irec and block simultaneously
 			 */
 			for (bp_index = 0; bp_index < cluster_count; bp_index++) {
-#ifdef XR_PF_TRACE
-				pftrace("put/writebuf %p (%llu) in AG %d", bplist[bp_index],
-					(long long)XFS_BUF_ADDR(bplist[bp_index]), agno);
-#endif
+				pftrace("put/writebuf %p (%llu) in AG %d",
+					bplist[bp_index], (long long)
+					XFS_BUF_ADDR(bplist[bp_index]), agno);
+
 				if (dirty && !no_modify)
 					libxfs_writebuf(bplist[bp_index], 0);
 				else
Index: xfsprogs-dev/repair/dir2.c
===================================================================
--- xfsprogs-dev.orig/repair/dir2.c	2009-08-19 23:42:32.000000000 +0000
+++ xfsprogs-dev/repair/dir2.c	2009-08-20 00:16:53.000000000 +0000
@@ -103,21 +103,19 @@
 		bplist = bparray;
 	}
 	for (i = 0; i < nex; i++) {
-#ifdef XR_PF_TRACE
 		pftrace("about to read off %llu (len = %d)",
 			(long long)XFS_FSB_TO_DADDR(mp, bmp[i].startblock),
 			XFS_FSB_TO_BB(mp, bmp[i].blockcount));
-#endif
+
 		bplist[i] = libxfs_readbuf(mp->m_dev,
 				XFS_FSB_TO_DADDR(mp, bmp[i].startblock),
 				XFS_FSB_TO_BB(mp, bmp[i].blockcount), 0);
 		if (!bplist[i])
 			goto failed;
-#ifdef XR_PF_TRACE
+
 		pftrace("readbuf %p (%llu, %d)", bplist[i],
 			(long long)XFS_BUF_ADDR(bplist[i]),
 			XFS_BUF_COUNT(bplist[i]));
-#endif
 	}
 	dabuf = malloc(XFS_DA_BUF_SIZE(nex));
 	if (dabuf == NULL) {
@@ -248,10 +246,8 @@
 	}
 	da_buf_done(dabuf);
 	for (i = 0; i < nbuf; i++) {
-#ifdef XR_PF_TRACE
 		pftrace("putbuf %p (%llu)", bplist[i],
 					(long long)XFS_BUF_ADDR(bplist[i]));
-#endif
 		libxfs_putbuf(bplist[i]);
 	}
 	if (bplist != &bp)
@@ -538,7 +534,7 @@
 	/*
 	 * bail out if this is the root block (top of tree)
 	 */
-	if (this_level >= cursor->active)  
+	if (this_level >= cursor->active)
 		return(0);
 	/*
 	 * set hashvalue to correctl reflect the now-validated
@@ -1425,7 +1421,7 @@
 		 * numbers.  Do NOT touch the name until after we've computed
 		 * the hashvalue and done a namecheck() on the name.
 		 *
-		 * Conditions must either set clearino to zero or set 
+		 * Conditions must either set clearino to zero or set
 		 * clearreason why it's being cleared.
 		 */
 		if (!ino_discovery && ent_ino == BADFSINO) {
@@ -1456,7 +1452,7 @@
 				if (ino_discovery) {
 					add_inode_uncertain(mp, ent_ino, 0);
 					clearino = 0;
-				} else 
+				} else
 					clearreason = _("non-existent");
 			} else {
 				/*
Index: xfsprogs-dev/repair/globals.h
===================================================================
--- xfsprogs-dev.orig/repair/globals.h	2009-08-19 23:42:32.000000000 +0000
+++ xfsprogs-dev/repair/globals.h	2009-08-20 00:16:53.000000000 +0000
@@ -199,10 +199,6 @@
 EXTERN int 		report_interval;
 EXTERN __uint64_t 	*prog_rpt_done;
 
-#ifdef XR_PF_TRACE
-EXTERN FILE		*pf_trace_file;
-#endif
-
 EXTERN int		ag_stride;
 EXTERN int		thread_count;
 
Index: xfsprogs-dev/repair/init.c
===================================================================
--- xfsprogs-dev.orig/repair/init.c	2009-08-20 00:06:44.000000000 +0000
+++ xfsprogs-dev/repair/init.c	2009-08-20 00:16:53.000000000 +0000
@@ -150,4 +150,5 @@
 	ts_create();
 	ts_init();
 	increase_rlimit();
+	pftrace_init();
 }
Index: xfsprogs-dev/repair/prefetch.c
===================================================================
--- xfsprogs-dev.orig/repair/prefetch.c	2009-08-20 00:16:01.000000000 +0000
+++ xfsprogs-dev/repair/prefetch.c	2009-08-20 00:16:53.000000000 +0000
@@ -83,9 +83,8 @@
 	prefetch_args_t		*args)
 {
 	if (!args->can_start_processing) {
-#ifdef XR_PF_TRACE
 		pftrace("signalling processing for AG %d", args->agno);
-#endif
+
 		args->can_start_processing = 1;
 		pthread_cond_signal(&args->start_processing);
 	}
@@ -96,9 +95,8 @@
 	prefetch_args_t		*args)
 {
 	if (!args->can_start_reading) {
-#ifdef XR_PF_TRACE
 		pftrace("signalling reading for AG %d", args->agno);
-#endif
+
 		args->can_start_reading = 1;
 		pthread_cond_broadcast(&args->start_reading);
 	}
@@ -136,25 +134,16 @@
 			if (args->inode_bufs_queued == IO_THRESHOLD)
 				pf_start_io_workers(args);
 		}
-#ifdef XR_PF_TRACE
-		pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to "
-			"primary queue (inode_bufs_queued = %d, last_bno = %lu)",
-			B_IS_INODE(flag) ? 'I' : 'M', bp,
-			(long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
-			args->inode_bufs_queued, args->last_bno_read);
-#endif
 	} else {
-#ifdef XR_PF_TRACE
-		pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to "
-			"secondary queue (last_bno = %lu)",
-			B_IS_INODE(flag) ? 'I' : 'M', bp,
-			(long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
-			args->last_bno_read);
-#endif
 		ASSERT(!B_IS_INODE(flag));
 		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
 	}
 
+	pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue"
+		"(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
+		'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
+		args->inode_bufs_queued, args->last_bno_read);
+
 	pf_start_processing(args);
 
 	pthread_mutex_unlock(&args->lock);
@@ -192,9 +181,9 @@
 
 		while (irec.br_blockcount) {
 			unsigned int	len;
-#ifdef XR_PF_TRACE
+
 			pftrace("queuing dir extent in AG %d", args->agno);
-#endif
+
 			len = (irec.br_blockcount > mp->m_dirblkfsbs) ?
 					mp->m_dirblkfsbs : irec.br_blockcount;
 			pf_queue_io(args, irec.br_startblock, len, B_DIR_META);
@@ -520,20 +509,16 @@
 			}
 		}
 		for (i = 0; i < num; i++) {
-#ifdef XR_PF_TRACE
 			pftrace("putbuf %c %p (%llu) in AG %d",
 				B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
 				bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
 				args->agno);
-#endif
 			libxfs_putbuf(bplist[i]);
 		}
 		pthread_mutex_lock(&args->lock);
 		if (which != PF_SECONDARY) {
-#ifdef XR_PF_TRACE
 			pftrace("inode_bufs_queued for AG %d = %d", args->agno,
 				args->inode_bufs_queued);
-#endif
 			/*
 			 * if primary inode queue running low, process metadata
 			 * in boths queues to avoid I/O starvation as the
@@ -542,15 +527,14 @@
 			 */
 			if (which == PF_PRIMARY && !args->queuing_done &&
 					args->inode_bufs_queued < IO_THRESHOLD) {
-#ifdef XR_PF_TRACE
 				pftrace("reading metadata bufs from primary queue for AG %d",
 					args->agno);
-#endif
+
 				pf_batch_read(args, PF_META_ONLY, buf);
-#ifdef XR_PF_TRACE
+
 				pftrace("reading bufs from secondary queue for AG %d",
 					args->agno);
-#endif
+
 				pf_batch_read(args, PF_SECONDARY, buf);
 			}
 		}
@@ -571,20 +555,18 @@
 	pthread_mutex_lock(&args->lock);
 	while (!args->queuing_done || btree_find(args->io_queue, 0, NULL)) {
 
-#ifdef XR_PF_TRACE
 		pftrace("waiting to start prefetch I/O for AG %d", args->agno);
-#endif
+
 		while (!args->can_start_reading && !args->queuing_done)
 			pthread_cond_wait(&args->start_reading, &args->lock);
-#ifdef XR_PF_TRACE
+
 		pftrace("starting prefetch I/O for AG %d", args->agno);
-#endif
+
 		pf_batch_read(args, PF_PRIMARY, buf);
 		pf_batch_read(args, PF_SECONDARY, buf);
 
-#ifdef XR_PF_TRACE
 		pftrace("ran out of bufs to prefetch for AG %d", args->agno);
-#endif
+
 		if (!args->queuing_done)
 			args->can_start_reading = 0;
 	}
@@ -592,9 +574,8 @@
 
 	free(buf);
 
-#ifdef XR_PF_TRACE
 	pftrace("finished prefetch I/O for AG %d", args->agno);
-#endif
+
 	return NULL;
 }
 
@@ -636,10 +617,7 @@
 			break;
 		}
 	}
-
-#ifdef XR_PF_TRACE
 	pftrace("starting prefetch for AG %d", args->agno);
-#endif
 
 	for (irec = findfirst_inode_rec(args->agno); irec != NULL;
 			irec = next_ino_rec(irec)) {
@@ -676,10 +654,9 @@
 
 	pthread_mutex_lock(&args->lock);
 
-#ifdef XR_PF_TRACE
 	pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
 		args->agno, args->inode_bufs_queued);
-#endif
+
 	args->queuing_done = 1;
 	pf_start_io_workers(args);
 	pf_start_processing(args);
@@ -690,9 +667,8 @@
 		if (args->io_threads[i])
 			pthread_join(args->io_threads[i], NULL);
 
-#ifdef XR_PF_TRACE
 	pftrace("prefetch for AG %d finished", args->agno);
-#endif
+
 	pthread_mutex_lock(&args->lock);
 
 	ASSERT(btree_find(args->io_queue, 0, NULL) == NULL);
@@ -712,9 +688,8 @@
 {
 	int			err;
 
-#ifdef XR_PF_TRACE
 	pftrace("creating queue thread for AG %d", args->agno);
-#endif
+
 	err = pthread_create(&args->queuing_thread, NULL,
 			pf_queuing_worker, args);
 	if (err != 0) {
@@ -801,14 +776,12 @@
 	pthread_mutex_lock(&args->lock);
 
 	while (!args->can_start_processing) {
-#ifdef XR_PF_TRACE
 		pftrace("waiting to start processing AG %d", args->agno);
-#endif
+
 		pthread_cond_wait(&args->start_processing, &args->lock);
 	}
-#ifdef XR_PF_TRACE
 	pftrace("can start processing AG %d", args->agno);
-#endif
+
 	pthread_mutex_unlock(&args->lock);
 }
 
@@ -819,15 +792,13 @@
 	if (args == NULL)
 		return;
 
-#ifdef XR_PF_TRACE
 	pftrace("waiting AG %d prefetch to finish", args->agno);
-#endif
+
 	if (args->queuing_thread)
 		pthread_join(args->queuing_thread, NULL);
 
-#ifdef XR_PF_TRACE
 	pftrace("AG %d prefetch done", args->agno);
-#endif
+
 	pthread_mutex_destroy(&args->lock);
 	pthread_cond_destroy(&args->start_reading);
 	pthread_cond_destroy(&args->start_processing);
@@ -839,6 +810,21 @@
 
 #ifdef XR_PF_TRACE
 
+static FILE	*pf_trace_file;
+
+void
+pftrace_init(void)
+{
+	pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
+	setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
+}
+
+void
+pftrace_done(void)
+{
+	fclose(pf_trace_file);
+}
+
 void
 _pftrace(const char *func, const char *msg, ...)
 {
@@ -853,7 +839,8 @@
 	buf[sizeof(buf)-1] = '\0';
 	va_end(args);
 
-	fprintf(pf_trace_file, "%lu.%06lu  %s: %s\n", tv.tv_sec, tv.tv_usec, func, buf);
+	fprintf(pf_trace_file, "%lu.%06lu  %s: %s\n", tv.tv_sec, tv.tv_usec,
+		func, buf);
 }
 
 #endif
Index: xfsprogs-dev/repair/prefetch.h
===================================================================
--- xfsprogs-dev.orig/repair/prefetch.h	2009-08-20 00:16:01.000000000 +0000
+++ xfsprogs-dev/repair/prefetch.h	2009-08-20 00:16:53.000000000 +0000
@@ -50,8 +50,15 @@
 
 
 #ifdef XR_PF_TRACE
+void	pftrace_init(void);
+void	pftrace_done(void);
+
 #define pftrace(msg...)	_pftrace(__FUNCTION__, ## msg)
 void	_pftrace(const char *, const char *, ...);
+#else
+static inline void pftrace_init(void) { }
+static inline void pftrace_done(void) { }
+static inline void pftrace(const char *msg, ...) { }
 #endif
 
 #endif /* _XFS_REPAIR_PREFETCH_H */
Index: xfsprogs-dev/repair/xfs_repair.c
===================================================================
--- xfsprogs-dev.orig/repair/xfs_repair.c	2009-08-19 23:42:32.000000000 +0000
+++ xfsprogs-dev/repair/xfs_repair.c	2009-08-20 00:16:53.000000000 +0000
@@ -542,11 +542,6 @@
 	bindtextdomain(PACKAGE, LOCALEDIR);
 	textdomain(PACKAGE);
 
-#ifdef XR_PF_TRACE
-	pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
-	setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
-#endif
-
 	temp_mp = &xfs_m;
 	setbuf(stdout, NULL);
 
@@ -850,8 +845,7 @@
 	if (verbose)
 		summary_report();
 	do_log(_("done\n"));
-#ifdef XR_PF_TRACE
-	fclose(pf_trace_file);
-#endif
+	pftrace_done();
+
 	return (0);
 }

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 09/14] repair: track logical to physical block mapping more efficiently
  2009-09-02 17:55 [PATCH 00/14] repair memory usage reductions Christoph Hellwig
                   ` (7 preceding siblings ...)
  2009-09-02 17:55 ` [PATCH 08/14] repair: clean up prefetch tracing Christoph Hellwig
@ 2009-09-02 17:55 ` Christoph Hellwig
  2009-10-21 19:06   ` [PATCH 09/14] repair: track logical to physical block mapping more efficiently Alex Elder
  2009-09-02 17:55 ` [PATCH 10/14] repair: cleanup helpers for tracking block usage Christoph Hellwig
                   ` (6 subsequent siblings)
  15 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-09-02 17:55 UTC (permalink / raw)
  To: xfs; +Cc: Barry Naujok

[-- Attachment #1: repair-blkmap-opt --]
[-- Type: text/plain, Size: 17623 bytes --]

Currently we track the logical to physical block mapping with a structure
that contains an array of physical blocks, one entry per block.  This is
extremely inefficient, and this patch replaces it with the normal
startblock/blockcount extent storage we use in the kernel and on disk.

In addition, use thread-local storage for the block map.  This is possible
because repair only processes one inode at a time per thread, and the
block map does not have to outlive the processing of a single inode.

The combination of those factors means we can use pthread thread-local
storage to store the block map and re-use the allocation over and over
again.

This should be ported over to xfs_db eventually, or even better we could try
to share the code.

[hch: added a small fix in blkmap_set_ext to not call memmove unless needed]
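
For illustration, a condensed sketch of the new representation (the field
names match the patch, the integer types are simplified; the real
definitions live in repair/bmap.h and the lookup mirrors blkmap_get below):

typedef struct bmap_ext {
	unsigned long long	startoff;	/* first logical block */
	unsigned long long	startblock;	/* first physical block */
	unsigned long long	blockcount;	/* extent length in blocks */
} bmap_ext_t;

typedef struct blkmap {
	int		naexts;		/* number of extents allocated */
	int		nexts;		/* number of extents in use */
	bmap_ext_t	exts[1];	/* variable-length extent array */
} blkmap_t;

/* logical-to-physical lookup: walk the extent list */
static unsigned long long
sketch_blkmap_get(blkmap_t *blkmap, unsigned long long o)
{
	bmap_ext_t	*ext = blkmap->exts;
	int		i;

	for (i = 0; i < blkmap->nexts; i++, ext++) {
		if (o >= ext->startoff && o < ext->startoff + ext->blockcount)
			return ext->startblock + (o - ext->startoff);
	}
	return ~0ULL;	/* no mapping; the patch returns NULLDFSBNO */
}

One extent now covers an arbitrarily long contiguous run instead of one
array entry per block, which is where the memory saving comes from.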

Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/bmap.c
===================================================================
--- xfsprogs-dev.orig/repair/bmap.c	2009-08-20 02:32:34.000000000 +0000
+++ xfsprogs-dev/repair/bmap.c	2009-08-20 02:32:45.000000000 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2001,2005,2008 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -21,106 +21,46 @@
 #include "bmap.h"
 
 /*
- * Block mapping code taken from xfs_db.
- */
-
-/*
- * Append an extent to the block entry.
- */
-void
-blkent_append(
-	blkent_t	**entp,
-	xfs_dfsbno_t	b,
-	xfs_dfilblks_t	c)
-{
-	blkent_t	*ent;
-	size_t		size;
-	int		i;
-
-	ent = *entp;
-	size = BLKENT_SIZE(c + ent->nblks);
-	if ((*entp = ent = realloc(ent, size)) == NULL) {
-		do_warn(_("realloc failed in blkent_append (%u bytes)\n"),
-			size);
-		return;
-	}
-	for (i = 0; i < c; i++)
-		ent->blks[ent->nblks + i] = b + i;
-	ent->nblks += c;
-}
-
-/*
- * Make a new block entry.
- */
-blkent_t *
-blkent_new(
-	xfs_dfiloff_t	o,
-	xfs_dfsbno_t	b,
-	xfs_dfilblks_t	c)
-{
-	blkent_t	*ent;
-	int		i;
-
-	if ((ent = malloc(BLKENT_SIZE(c))) == NULL) {
-		do_warn(_("malloc failed in blkent_new (%u bytes)\n"),
-			BLKENT_SIZE(c));
-		return ent;
-	}
-	ent->nblks = c;
-	ent->startoff = o;
-	for (i = 0; i < c; i++)
-		ent->blks[i] = b + i;
-	return ent;
-}
-
-/*
- * Prepend an extent to the block entry.
+ * Track the logical to physical block mapping for inodes.
+ *
+ * Repair only processes one inode at a given time per thread, and the
+ * block map does not have to outlive the processing of a single inode.
+ *
+ * The combination of those factors means we can use pthreads thread-local
+ * storage to store the block map, and we can re-use the allocation over
+ * and over again.
  */
-void
-blkent_prepend(
-	blkent_t	**entp,
-	xfs_dfsbno_t	b,
-	xfs_dfilblks_t	c)
-{
-	int		i;
-	blkent_t	*newent;
-	blkent_t	*oldent;
 
-	oldent = *entp;
-	if ((newent = malloc(BLKENT_SIZE(oldent->nblks + c))) == NULL) {
-		do_warn(_("malloc failed in blkent_prepend (%u bytes)\n"),
-			BLKENT_SIZE(oldent->nblks + c));
-		*entp = newent;
-		return;
-	}
-	newent->nblks = oldent->nblks + c;
-	newent->startoff = oldent->startoff - c;
-	for (i = 0; i < c; i++)
-		newent->blks[i] = b + c;
-	for (; i < oldent->nblks + c; i++)
-		newent->blks[i] = oldent->blks[i - c];
-	free(oldent);
-	*entp = newent;
-}
+pthread_key_t	dblkmap_key;
+pthread_key_t	ablkmap_key;
 
-/*
- * Allocate a block map.
- */
 blkmap_t *
 blkmap_alloc(
-	xfs_extnum_t	nex)
+	xfs_extnum_t	nex,
+	int		whichfork)
 {
+	pthread_key_t	key;
 	blkmap_t	*blkmap;
 
+	ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK);
+
 	if (nex < 1)
 		nex = 1;
-	if ((blkmap = malloc(BLKMAP_SIZE(nex))) == NULL) {
-		do_warn(_("malloc failed in blkmap_alloc (%u bytes)\n"),
-			BLKMAP_SIZE(nex));
-		return blkmap;
+
+	key = whichfork ? ablkmap_key : dblkmap_key;
+	blkmap = pthread_getspecific(key);
+	if (!blkmap || blkmap->naexts < nex) {
+		blkmap = realloc(blkmap, BLKMAP_SIZE(nex));
+		if (!blkmap) {
+			do_warn(_("malloc failed in blkmap_alloc (%u bytes)\n"),
+				BLKMAP_SIZE(nex));
+			return NULL;
+		}
+		pthread_setspecific(key, blkmap);
+		blkmap->naexts = nex;
 	}
-	blkmap->naents = nex;
-	blkmap->nents = 0;
+
+	blkmap->nexts = 0;
 	return blkmap;
 }
 
@@ -131,14 +71,7 @@ void
 blkmap_free(
 	blkmap_t	*blkmap)
 {
-	blkent_t	**entp;
-	xfs_extnum_t	i;
-
-	if (blkmap == NULL)
-		return;
-	for (i = 0, entp = blkmap->ents; i < blkmap->nents; i++, entp++)
-		free(*entp);
-	free(blkmap);
+	/* nothing to do! - keep the memory around for the next inode */
 }
 
 /*
@@ -149,20 +82,18 @@ blkmap_get(
 	blkmap_t	*blkmap,
 	xfs_dfiloff_t	o)
 {
-	blkent_t	*ent;
-	blkent_t	**entp;
+	bmap_ext_t	*ext = blkmap->exts;
 	int		i;
 
-	for (i = 0, entp = blkmap->ents; i < blkmap->nents; i++, entp++) {
-		ent = *entp;
-		if (o >= ent->startoff && o < ent->startoff + ent->nblks)
-			return ent->blks[o - ent->startoff];
+	for (i = 0; i < blkmap->nexts; i++, ext++) {
+		if (o >= ext->startoff && o < ext->startoff + ext->blockcount)
+			return ext->startblock + (o - ext->startoff);
 	}
 	return NULLDFSBNO;
 }
 
 /*
- * Get a chunk of entries from a block map.
+ * Get a chunk of entries from a block map - only used for reading dirv2 blocks
  */
 int
 blkmap_getn(
@@ -172,93 +103,62 @@ blkmap_getn(
 	bmap_ext_t	**bmpp,
 	bmap_ext_t	*bmpp_single)
 {
-	bmap_ext_t	*bmp;
-	blkent_t	*ent;
-	xfs_dfiloff_t	ento;
-	blkent_t	**entp;
+	bmap_ext_t	*bmp = NULL;
+	bmap_ext_t	*ext;
 	int		i;
 	int		nex;
 
 	if (nb == 1) {
-		/* 
+		/*
 		 * in the common case, when mp->m_dirblkfsbs == 1,
 		 * avoid additional malloc/free overhead
 		 */
 		bmpp_single->startblock = blkmap_get(blkmap, o);
-		bmpp_single->blockcount = 1;
-		bmpp_single->startoff = 0;
-		bmpp_single->flag = 0;
-		*bmpp = bmpp_single;
-		return (bmpp_single->startblock != NULLDFSBNO) ? 1 : 0;
+		goto single_ext;
 	}
-	for (i = nex = 0, bmp = NULL, entp = blkmap->ents;
-	     i < blkmap->nents;
-	     i++, entp++) {
-		ent = *entp;
-		if (ent->startoff >= o + nb)
+	ext = blkmap->exts;
+	nex = 0;
+	for (i = 0; i < blkmap->nexts; i++, ext++) {
+
+		if (ext->startoff >= o + nb)
 			break;
-		if (ent->startoff + ent->nblks <= o)
+		if (ext->startoff + ext->blockcount <= o)
 			continue;
-		for (ento = ent->startoff;
-		     ento < ent->startoff + ent->nblks && ento < o + nb;
-		     ento++) {
-			if (ento < o)
-				continue;
-			if (bmp &&
-			    bmp[nex - 1].startoff + bmp[nex - 1].blockcount ==
-				    ento &&
-			    bmp[nex - 1].startblock + bmp[nex - 1].blockcount ==
-				    ent->blks[ento - ent->startoff])
-				bmp[nex - 1].blockcount++;
-			else {
-				bmp = realloc(bmp, ++nex * sizeof(*bmp));
-				if (bmp == NULL) {
-					do_warn(_("blkmap_getn realloc failed"
-						" (%u bytes)\n"),
-						nex * sizeof(*bmp));
-					continue;
-				}
-				bmp[nex - 1].startoff = ento;
-				bmp[nex - 1].startblock =
-					ent->blks[ento - ent->startoff];
-				bmp[nex - 1].blockcount = 1;
-				bmp[nex - 1].flag = 0;
-			}
+
+		/*
+		 * if all the requested blocks are in one extent (also common),
+		 * use the bmpp_single option as well
+		 */
+		if (!bmp && o >= ext->startoff &&
+		    o + nb <= ext->startoff + ext->blockcount) {
+			bmpp_single->startblock =
+				 ext->startblock + (o - ext->startoff);
+			goto single_ext;
 		}
+
+		/*
+		 * rare case - multiple extents for a single dir block
+		 */
+		if (!bmp)
+			bmp = malloc(nb * sizeof(bmap_ext_t));
+		if (!bmp)
+			do_error(_("blkmap_getn malloc failed (%u bytes)\n"),
+						nb * sizeof(bmap_ext_t));
+
+		bmp[nex].startblock = ext->startblock + (o - ext->startoff);
+		bmp[nex].blockcount = MIN(nb, ext->blockcount -
+				(bmp[nex].startblock - ext->startblock));
+		o += bmp[nex].blockcount;
+		nb -= bmp[nex].blockcount;
+		nex++;
 	}
 	*bmpp = bmp;
 	return nex;
-}
-
-/*
- * Make a block map larger.
- */
-void
-blkmap_grow(
-	blkmap_t	**blkmapp,
-	blkent_t	**entp,
-	blkent_t	*newent)
-{
-	blkmap_t	*blkmap;
-	size_t		size;
-	int		i;
-	int		idx;
 
-	blkmap = *blkmapp;
-	idx = (int)(entp - blkmap->ents);
-	if (blkmap->naents == blkmap->nents) {
-		size = BLKMAP_SIZE(blkmap->nents + 1);
-		if ((*blkmapp = blkmap = realloc(blkmap, size)) == NULL) {
-			do_warn(_("realloc failed in blkmap_grow (%u bytes)\n"),
-				size);
-			return;
-		}
-		blkmap->naents++;
-	}
-	for (i = blkmap->nents; i > idx; i--)
-		blkmap->ents[i] = blkmap->ents[i - 1];
-	blkmap->ents[idx] = newent;
-	blkmap->nents++;
+single_ext:
+	bmpp_single->blockcount = nb;
+	bmpp_single->startoff = 0;	/* not even used by caller! */
+	*bmpp = bmpp_single;
+	return (bmpp_single->startblock != NULLDFSBNO) ? 1 : 0;
 }
 
 /*
@@ -268,12 +168,12 @@ xfs_dfiloff_t
 blkmap_last_off(
 	blkmap_t	*blkmap)
 {
-	blkent_t	*ent;
+	bmap_ext_t	*ext;
 
-	if (!blkmap->nents)
+	if (!blkmap->nexts)
 		return NULLDFILOFF;
-	ent = blkmap->ents[blkmap->nents - 1];
-	return ent->startoff + ent->nblks;
+	ext = blkmap->exts + blkmap->nexts - 1;
+	return ext->startoff + ext->blockcount;
 }
 
 /*
@@ -285,73 +185,45 @@ blkmap_next_off(
 	xfs_dfiloff_t	o,
 	int		*t)
 {
-	blkent_t	*ent;
-	blkent_t	**entp;
+	bmap_ext_t	*ext;
 
-	if (!blkmap->nents)
+	if (!blkmap->nexts)
 		return NULLDFILOFF;
 	if (o == NULLDFILOFF) {
 		*t = 0;
-		ent = blkmap->ents[0];
-		return ent->startoff;
+		return blkmap->exts[0].startoff;
 	}
-	entp = &blkmap->ents[*t];
-	ent = *entp;
-	if (o < ent->startoff + ent->nblks - 1)
+	ext = blkmap->exts + *t;
+	if (o < ext->startoff + ext->blockcount - 1)
 		return o + 1;
-	entp++;
-	if (entp >= &blkmap->ents[blkmap->nents])
+	if (*t >= blkmap->nexts - 1)
 		return NULLDFILOFF;
 	(*t)++;
-	ent = *entp;
-	return ent->startoff;
+	return ext[1].startoff;
 }
 
 /*
- * Set a block value in a block map.
+ * Make a block map larger.
  */
-void
-blkmap_set_blk(
-	blkmap_t	**blkmapp,
-	xfs_dfiloff_t	o,
-	xfs_dfsbno_t	b)
+static blkmap_t *
+blkmap_grow(
+	blkmap_t	**blkmapp)
 {
-	blkmap_t	*blkmap;
-	blkent_t	*ent;
-	blkent_t	**entp;
-	blkent_t	*nextent;
-
-	blkmap = *blkmapp;
-	for (entp = blkmap->ents; entp < &blkmap->ents[blkmap->nents]; entp++) {
-		ent = *entp;
-		if (o < ent->startoff - 1) {
-			ent = blkent_new(o, b, 1);
-			blkmap_grow(blkmapp, entp, ent);
-			return;
-		}
-		if (o == ent->startoff - 1) {
-			blkent_prepend(entp, b, 1);
-			return;
-		}
-		if (o >= ent->startoff && o < ent->startoff + ent->nblks) {
-			ent->blks[o - ent->startoff] = b;
-			return;
-		}
-		if (o > ent->startoff + ent->nblks)
-			continue;
-		blkent_append(entp, b, 1);
-		if (entp == &blkmap->ents[blkmap->nents - 1])
-			return;
-		ent = *entp;
-		nextent = entp[1];
-		if (ent->startoff + ent->nblks < nextent->startoff)
-			return;
-		blkent_append(entp, nextent->blks[0], nextent->nblks);
-		blkmap_shrink(blkmap, &entp[1]);
-		return;
+	pthread_key_t	key = dblkmap_key;
+	blkmap_t	*blkmap = *blkmapp;
+
+	if (pthread_getspecific(key) != blkmap) {
+		key = ablkmap_key;
+		ASSERT(pthread_getspecific(key) == blkmap);
 	}
-	ent = blkent_new(o, b, 1);
-	blkmap_grow(blkmapp, entp, ent);
+
+	blkmap->naexts += 4;
+	blkmap = realloc(blkmap, BLKMAP_SIZE(blkmap->naexts));
+	if (blkmap == NULL)
+		do_error(_("realloc failed in blkmap_grow\n"));
+	*blkmapp = blkmap;
+	pthread_setspecific(key, blkmap);
+	return blkmap;
 }
 
 /*
@@ -364,46 +236,23 @@ blkmap_set_ext(
 	xfs_dfsbno_t	b,
 	xfs_dfilblks_t	c)
 {
-	blkmap_t	*blkmap;
-	blkent_t	*ent;
-	blkent_t	**entp;
+	blkmap_t	*blkmap = *blkmapp;
 	xfs_extnum_t	i;
 
-	blkmap = *blkmapp;
-	if (!blkmap->nents) {
-		blkmap->ents[0] = blkent_new(o, b, c);
-		blkmap->nents = 1;
-		return;
-	}
-	entp = &blkmap->ents[blkmap->nents - 1];
-	ent = *entp;
-	if (ent->startoff + ent->nblks == o) {
-		blkent_append(entp, b, c);
-		return;
-	}
-	if (ent->startoff + ent->nblks < o) {
-		ent = blkent_new(o, b, c);
-		blkmap_grow(blkmapp, &blkmap->ents[blkmap->nents], ent);
-		return;
-	}
-	for (i = 0; i < c; i++)
-		blkmap_set_blk(blkmapp, o + i, b + i);
-}
+	if (blkmap->nexts == blkmap->naexts)
+		blkmap = blkmap_grow(blkmapp);
 
-/*
- * Make a block map smaller.
- */
-void
-blkmap_shrink(
-	blkmap_t	*blkmap,
-	blkent_t	**entp)
-{
-	int		i;
-	int		idx;
+	for (i = 0; i < blkmap->nexts; i++) {
+		if (blkmap->exts[i].startoff > o) {
+			memmove(blkmap->exts + i + 1,
+				blkmap->exts + i,
+				sizeof(bmap_ext_t) * (blkmap->nexts - i));
+			break;
+		}
+	}
 
-	free(*entp);
-	idx = (int)(entp - blkmap->ents);
-	for (i = idx + 1; i < blkmap->nents; i++)
-		blkmap->ents[i] = blkmap->ents[i - 1];
-	blkmap->nents--;
+	blkmap->exts[i].startoff = o;
+	blkmap->exts[i].startblock = b;
+	blkmap->exts[i].blockcount = c;
+	blkmap->nexts++;
 }
Index: xfsprogs-dev/repair/bmap.h
===================================================================
--- xfsprogs-dev.orig/repair/bmap.h	2009-08-20 02:32:34.000000000 +0000
+++ xfsprogs-dev/repair/bmap.h	2009-08-20 02:32:45.000000000 +0000
@@ -16,59 +16,41 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- * Block mapping code taken from xfs_db.
- */
+#ifndef _XFS_REPAIR_BMAP_H
+#define _XFS_REPAIR_BMAP_H
 
 /*
- * Block map entry.
+ * Extent descriptor.
  */
-typedef struct blkent {
+typedef struct bmap_ext {
 	xfs_dfiloff_t	startoff;
-	xfs_dfilblks_t	nblks;
-	xfs_dfsbno_t	blks[1];
-} blkent_t;
-#define	BLKENT_SIZE(n)	\
-	(offsetof(blkent_t, blks) + (sizeof(xfs_dfsbno_t) * (n)))
+	xfs_dfsbno_t	startblock;
+	xfs_dfilblks_t	blockcount;
+} bmap_ext_t;
 
 /*
  * Block map.
  */
 typedef	struct blkmap {
-	int		naents;
-	int		nents;
-	blkent_t	*ents[1];
+	int		naexts;
+	int		nexts;
+	bmap_ext_t	exts[1];
 } blkmap_t;
-#define	BLKMAP_SIZE(n)	\
-	(offsetof(blkmap_t, ents) + (sizeof(blkent_t *) * (n)))
 
-/*
- * Extent descriptor.
- */
-typedef struct bmap_ext {
-	xfs_dfiloff_t	startoff;
-	xfs_dfsbno_t	startblock;
-	xfs_dfilblks_t	blockcount;
-	int		flag;
-} bmap_ext_t;
+#define	BLKMAP_SIZE(n)	\
+	(offsetof(blkmap_t, exts) + (sizeof(bmap_ext_t) * (n)))
 
-void		blkent_append(blkent_t **entp, xfs_dfsbno_t b,
-			      xfs_dfilblks_t c);
-blkent_t	*blkent_new(xfs_dfiloff_t o, xfs_dfsbno_t b, xfs_dfilblks_t c);
-void		blkent_prepend(blkent_t **entp, xfs_dfsbno_t b,
-			       xfs_dfilblks_t c);
-blkmap_t	*blkmap_alloc(xfs_extnum_t);
+blkmap_t	*blkmap_alloc(xfs_extnum_t nex, int whichfork);
 void		blkmap_free(blkmap_t *blkmap);
+
+void		blkmap_set_ext(blkmap_t **blkmapp, xfs_dfiloff_t o,
+			       xfs_dfsbno_t b, xfs_dfilblks_t c);
+
 xfs_dfsbno_t	blkmap_get(blkmap_t *blkmap, xfs_dfiloff_t o);
 int		blkmap_getn(blkmap_t *blkmap, xfs_dfiloff_t o,
-			    xfs_dfilblks_t nb, bmap_ext_t **bmpp, 
+			    xfs_dfilblks_t nb, bmap_ext_t **bmpp,
 			    bmap_ext_t *bmpp_single);
-void		blkmap_grow(blkmap_t **blkmapp, blkent_t **entp,
-			    blkent_t *newent);
 xfs_dfiloff_t	blkmap_last_off(blkmap_t *blkmap);
 xfs_dfiloff_t	blkmap_next_off(blkmap_t *blkmap, xfs_dfiloff_t o, int *t);
-void		blkmap_set_blk(blkmap_t **blkmapp, xfs_dfiloff_t o,
-			       xfs_dfsbno_t b);
-void		blkmap_set_ext(blkmap_t **blkmapp, xfs_dfiloff_t o,
-			       xfs_dfsbno_t b, xfs_dfilblks_t c);
-void		blkmap_shrink(blkmap_t *blkmap, blkent_t **entp);
+
+#endif /* _XFS_REPAIR_BMAP_H */
Index: xfsprogs-dev/repair/dinode.c
===================================================================
--- xfsprogs-dev.orig/repair/dinode.c	2009-08-20 02:32:34.000000000 +0000
+++ xfsprogs-dev/repair/dinode.c	2009-08-21 01:23:34.000000000 +0000
@@ -2050,7 +2050,7 @@ process_inode_data_fork(
 		*nextents = 1;
 
 	if (dinoc->di_format != XFS_DINODE_FMT_LOCAL && type != XR_INO_RTDATA)
-		*dblkmap = blkmap_alloc(*nextents);
+		*dblkmap = blkmap_alloc(*nextents, XFS_DATA_FORK);
 	*nextents = 0;
 
 	switch (dinoc->di_format) {
@@ -2172,14 +2172,14 @@ process_inode_attr_fork(
 		err = process_lclinode(mp, agno, ino, dino, XFS_ATTR_FORK);
 		break;
 	case XFS_DINODE_FMT_EXTENTS:
-		ablkmap = blkmap_alloc(*anextents);
+		ablkmap = blkmap_alloc(*anextents, XFS_ATTR_FORK);
 		*anextents = 0;
 		err = process_exinode(mp, agno, ino, dino, type, dirty,
 				atotblocks, anextents, &ablkmap,
 				XFS_ATTR_FORK, check_dups);
 		break;
 	case XFS_DINODE_FMT_BTREE:
-		ablkmap = blkmap_alloc(*anextents);
+		ablkmap = blkmap_alloc(*anextents, XFS_ATTR_FORK);
 		*anextents = 0;
 		err = process_btinode(mp, agno, ino, dino, type, dirty,
 				atotblocks, anextents, &ablkmap,
Index: xfsprogs-dev/repair/init.c
===================================================================
--- xfsprogs-dev.orig/repair/init.c	2009-08-20 02:32:34.000000000 +0000
+++ xfsprogs-dev/repair/init.c	2009-08-20 02:32:45.000000000 +0000
@@ -24,19 +24,24 @@
 #include "pthread.h"
 #include "avl.h"
 #include "dir.h"
+#include "bmap.h"
 #include "incore.h"
 #include "prefetch.h"
 #include <sys/resource.h>
 
+/* TODO: dirbuf/freemap key usage is completely b0rked - only used for dirv1 */
 static pthread_key_t dirbuf_key;
 static pthread_key_t dir_freemap_key;
 static pthread_key_t attr_freemap_key;
 
+extern pthread_key_t dblkmap_key;
+extern pthread_key_t ablkmap_key;
+
 static void
 ts_alloc(pthread_key_t key, unsigned n, size_t size)
 {
 	void *voidp;
-	voidp = malloc((n)*(size));
+	voidp = calloc(n, size);
 	if (voidp == NULL) {
 		do_error(_("ts_alloc: cannot allocate thread specific storage\n"));
 		/* NO RETURN */
@@ -52,6 +57,9 @@ ts_create(void)
 	pthread_key_create(&dirbuf_key, NULL);
 	pthread_key_create(&dir_freemap_key, NULL);
 	pthread_key_create(&attr_freemap_key, NULL);
+
+	pthread_key_create(&dblkmap_key, NULL);
+	pthread_key_create(&ablkmap_key, NULL);
 }
 
 void

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 10/14] repair: cleanup helpers for tracking block usage
  2009-09-02 17:55 [PATCH 00/14] repair memory usage reductions Christoph Hellwig
                   ` (8 preceding siblings ...)
  2009-09-02 17:55 ` [PATCH 09/14] repair: track logical to physical block mapping more efficiently Christoph Hellwig
@ 2009-09-02 17:55 ` Christoph Hellwig
  2009-10-21 19:33   ` Alex Elder
  2009-09-02 17:55 ` [PATCH 11/14] repair: cleanup alloc/free/reset of the block usage tracking Christoph Hellwig
                   ` (5 subsequent siblings)
  15 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-09-02 17:55 UTC (permalink / raw)
  To: xfs; +Cc: Barry Naujok

[-- Attachment #1: repair-cleanup-bmap-helpers --]
[-- Type: text/plain, Size: 22216 bytes --]

Rename get_agbno_state/set_agbno_state to get_bmap/set_bmap because
those names are more self-descriptive.  Remove the superfluous mount
argument, as the current filesystem is a global in repair.  Remove the
fsbno-taking variants as they just complicated the code.  Bring all
uses into the canonical form.
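
The canonical form referred to above looks like this (a schematic
example - the exact states handled and the warning text vary per call
site, as the diffs below show):

	state = get_bmap(agno, agbno);
	switch (state) {
	case XR_E_UNKNOWN:
		set_bmap(agno, agbno, XR_E_INUSE);
		break;
	default:
		set_bmap(agno, agbno, XR_E_MULT);
		do_warn(_("block %d/%d multiply claimed, state was %d\n"),
			agno, agbno, state);
		break;
	}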


Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/dinode.c
===================================================================
--- xfsprogs-dev.orig/repair/dinode.c	2009-08-21 19:05:41.000000000 +0000
+++ xfsprogs-dev/repair/dinode.c	2009-08-21 19:05:51.000000000 +0000
@@ -545,40 +545,33 @@ process_rt_rec(
 			continue;
 		}
 
-		state = get_rtbno_state(mp, ext);
-
+		state = get_rtbmap(ext);
 		switch (state)  {
-			case XR_E_FREE:
-			case XR_E_UNKNOWN:
-				set_rtbno_state(mp, ext, XR_E_INUSE);
+		case XR_E_FREE:
+		case XR_E_UNKNOWN:
+			set_rtbmap(ext, XR_E_INUSE);
+			break;
+		case XR_E_BAD_STATE:
+			do_error(_("bad state in rt block map %llu\n"), ext);
+		case XR_E_FS_MAP:
+		case XR_E_INO:
+		case XR_E_INUSE_FS:
+			do_error(_("data fork in rt inode %llu found "
+				"metadata block %llu in rt bmap\n"),
+				ino, ext);
+		case XR_E_INUSE:
+			if (pwe)
 				break;
-
-			case XR_E_BAD_STATE:
-				do_error(_("bad state in rt block map %llu\n"),
-						ext);
-
-			case XR_E_FS_MAP:
-			case XR_E_INO:
-			case XR_E_INUSE_FS:
-				do_error(_("data fork in rt inode %llu found "
-					"metadata block %llu in rt bmap\n"),
+		case XR_E_MULT:
+			set_rtbmap(ext, XR_E_MULT);
+			do_warn(_("data fork in rt inode %llu claims "
+					"used rt block %llu\n"),
 					ino, ext);
-
-			case XR_E_INUSE:
-				if (pwe)
-					break;
-
-			case XR_E_MULT:
-				set_rtbno_state(mp, ext, XR_E_MULT);
-				do_warn(_("data fork in rt inode %llu claims "
-						"used rt block %llu\n"),
-						ino, ext);
-				return 1;
-
-			case XR_E_FREE1:
-			default:
-				do_error(_("illegal state %d in rt block map "
-						"%llu\n"), state, b);
+			return 1;
+		case XR_E_FREE1:
+		default:
+			do_error(_("illegal state %d in rt block map "
+					"%llu\n"), state, b);
 		}
 	}
 
@@ -770,8 +763,7 @@ process_bmbt_reclist_int(
 
 			}
 
-			state = get_agbno_state(mp, agno, agbno);
-
+			state = get_bmap(agno, agbno);
 			switch (state)  {
 			case XR_E_FREE:
 			case XR_E_FREE1:
@@ -780,7 +772,7 @@ process_bmbt_reclist_int(
 					forkname, ino, (__uint64_t) b);
 				/* fall through ... */
 			case XR_E_UNKNOWN:
-				set_agbno_state(mp, agno, agbno, XR_E_INUSE);
+				set_bmap(agno, agbno, XR_E_INUSE);
 				break;
 
 			case XR_E_BAD_STATE:
@@ -796,7 +788,7 @@ process_bmbt_reclist_int(
 
 			case XR_E_INUSE:
 			case XR_E_MULT:
-				set_agbno_state(mp, agno, agbno, XR_E_MULT);
+				set_bmap(agno, agbno, XR_E_MULT);
 				do_warn(_("%s fork in %s inode %llu claims "
 					"used block %llu\n"),
 					forkname, ftype, ino, (__uint64_t) b);
Index: xfsprogs-dev/repair/dino_chunks.c
===================================================================
--- xfsprogs-dev.orig/repair/dino_chunks.c	2009-08-21 19:05:40.000000000 +0000
+++ xfsprogs-dev/repair/dino_chunks.c	2009-08-21 19:05:51.000000000 +0000
@@ -151,7 +151,8 @@ verify_inode_chunk(xfs_mount_t		*mp,
 
 		pthread_mutex_lock(&ag_locks[agno]);
 
-		switch (state = get_agbno_state(mp, agno, agbno))  {
+		state = get_bmap(agno, agbno);
+		switch (state) {
 		case XR_E_INO:
 			do_warn(
 		_("uncertain inode block %d/%d already known\n"),
@@ -160,7 +161,7 @@ verify_inode_chunk(xfs_mount_t		*mp,
 		case XR_E_UNKNOWN:
 		case XR_E_FREE1:
 		case XR_E_FREE:
-			set_agbno_state(mp, agno, agbno, XR_E_INO);
+			set_bmap(agno, agbno, XR_E_INO);
 			break;
 		case XR_E_MULT:
 		case XR_E_INUSE:
@@ -172,14 +173,14 @@ verify_inode_chunk(xfs_mount_t		*mp,
 			do_warn(
 		_("inode block %d/%d multiply claimed, (state %d)\n"),
 				agno, agbno, state);
-			set_agbno_state(mp, agno, agbno, XR_E_MULT);
+			set_bmap(agno, agbno, XR_E_MULT);
 			pthread_mutex_unlock(&ag_locks[agno]);
 			return(0);
 		default:
 			do_warn(
 		_("inode block %d/%d bad state, (state %d)\n"),
 				agno, agbno, state);
-			set_agbno_state(mp, agno, agbno, XR_E_INO);
+			set_bmap(agno, agbno, XR_E_INO);
 			break;
 		}
 
@@ -434,7 +435,8 @@ verify_inode_chunk(xfs_mount_t		*mp,
 	pthread_mutex_lock(&ag_locks[agno]);
 	for (j = 0, cur_agbno = chunk_start_agbno;
 			cur_agbno < chunk_stop_agbno; cur_agbno++)  {
-		switch (state = get_agbno_state(mp, agno, cur_agbno))  {
+		state = get_bmap(agno, cur_agbno);
+		switch (state) {
 		case XR_E_MULT:
 		case XR_E_INUSE:
 		case XR_E_INUSE_FS:
@@ -442,7 +444,7 @@ verify_inode_chunk(xfs_mount_t		*mp,
 			do_warn(
 		_("inode block %d/%d multiply claimed, (state %d)\n"),
 				agno, cur_agbno, state);
-			set_agbno_state(mp, agno, cur_agbno, XR_E_MULT);
+			set_bmap(agno, cur_agbno, XR_E_MULT);
 			j = 1;
 			break;
 		case XR_E_INO:
@@ -486,7 +488,8 @@ verify_inode_chunk(xfs_mount_t		*mp,
 
 	for (cur_agbno = chunk_start_agbno;
 			cur_agbno < chunk_stop_agbno; cur_agbno++)  {
-		switch (state = get_agbno_state(mp, agno, cur_agbno))  {
+		state = get_bmap(agno, cur_agbno);
+		switch (state) {
 		case XR_E_INO:
 			do_error(
 		_("uncertain inode block %llu already known\n"),
@@ -495,7 +498,7 @@ verify_inode_chunk(xfs_mount_t		*mp,
 		case XR_E_UNKNOWN:
 		case XR_E_FREE1:
 		case XR_E_FREE:
-			set_agbno_state(mp, agno, cur_agbno, XR_E_INO);
+			set_bmap(agno, cur_agbno, XR_E_INO);
 			break;
 		case XR_E_MULT:
 		case XR_E_INUSE:
@@ -509,7 +512,7 @@ verify_inode_chunk(xfs_mount_t		*mp,
 			do_warn(
 		_("inode block %d/%d bad state, (state %d)\n"),
 				agno, cur_agbno, state);
-			set_agbno_state(mp, agno, cur_agbno, XR_E_INO);
+			set_bmap(agno, cur_agbno, XR_E_INO);
 			break;
 		}
 	}
@@ -742,22 +745,23 @@ process_inode_chunk(
 	 * mark block as an inode block in the incore bitmap
 	 */
 	pthread_mutex_lock(&ag_locks[agno]);
-	switch (state = get_agbno_state(mp, agno, agbno))  {
-		case XR_E_INO:	/* already marked */
-			break;
-		case XR_E_UNKNOWN:
-		case XR_E_FREE:
-		case XR_E_FREE1:
-			set_agbno_state(mp, agno, agbno, XR_E_INO);
-			break;
-		case XR_E_BAD_STATE:
-			do_error(_("bad state in block map %d\n"), state);
-			break;
-		default:
-			set_agbno_state(mp, agno, agbno, XR_E_MULT);
-			do_warn(_("inode block %llu multiply claimed, state was %d\n"),
-				XFS_AGB_TO_FSB(mp, agno, agbno), state);
-			break;
+	state = get_bmap(agno, agbno);
+	switch (state) {
+	case XR_E_INO:	/* already marked */
+		break;
+	case XR_E_UNKNOWN:
+	case XR_E_FREE:
+	case XR_E_FREE1:
+		set_bmap(agno, agbno, XR_E_INO);
+		break;
+	case XR_E_BAD_STATE:
+		do_error(_("bad state in block map %d\n"), state);
+		break;
+	default:
+		set_bmap(agno, agbno, XR_E_MULT);
+		do_warn(_("inode block %llu multiply claimed, state was %d\n"),
+			XFS_AGB_TO_FSB(mp, agno, agbno), state);
+		break;
 	}
 	pthread_mutex_unlock(&ag_locks[agno]);
 
@@ -923,20 +927,21 @@ process_inode_chunk(
 			agbno++;
 
 			pthread_mutex_lock(&ag_locks[agno]);
-			switch (state = get_agbno_state(mp, agno, agbno))  {
+			state = get_bmap(agno, agbno);
+			switch (state) {
 			case XR_E_INO:	/* already marked */
 				break;
 			case XR_E_UNKNOWN:
 			case XR_E_FREE:
 			case XR_E_FREE1:
-				set_agbno_state(mp, agno, agbno, XR_E_INO);
+				set_bmap(agno, agbno, XR_E_INO);
 				break;
 			case XR_E_BAD_STATE:
 				do_error(_("bad state in block map %d\n"),
 					state);
 				break;
 			default:
-				set_agbno_state(mp, agno, agbno, XR_E_MULT);
+				set_bmap(agno, agbno, XR_E_MULT);
 				do_warn(_("inode block %llu multiply claimed, "
 					  "state was %d\n"),
 					XFS_AGB_TO_FSB(mp, agno, agbno), state);
Index: xfsprogs-dev/repair/phase4.c
===================================================================
--- xfsprogs-dev.orig/repair/phase4.c	2009-08-21 18:59:24.000000000 +0000
+++ xfsprogs-dev/repair/phase4.c	2009-08-21 19:05:51.000000000 +0000
@@ -247,8 +247,7 @@ phase4(xfs_mount_t *mp)
 				}
 			}
 
-			bstate = get_agbno_state(mp, i, j);
-
+			bstate = get_bmap(i, j);
 			switch (bstate)  {
 			case XR_E_BAD_STATE:
 			default:
@@ -305,9 +304,7 @@ phase4(xfs_mount_t *mp)
 	rt_len = 0;
 
 	for (bno = 0; bno < mp->m_sb.sb_rextents; bno++)  {
-
-		bstate = get_rtbno_state(mp, bno);
-
+		bstate = get_rtbmap(bno);
 		switch (bstate)  {
 		case XR_E_BAD_STATE:
 		default:
@@ -366,7 +363,7 @@ phase4(xfs_mount_t *mp)
 		    roundup((mp->m_sb.sb_agblocks+(NBBY/XR_BB)-1)/(NBBY/XR_BB),
 						sizeof(__uint64_t)));
 		for (j = 0; j < ag_hdr_block; j++)
-			set_agbno_state(mp, i, j, XR_E_INUSE_FS);
+			set_bmap(i, j, XR_E_INUSE_FS);
 	}
 	set_bmap_rt(mp->m_sb.sb_rextents);
 	set_bmap_log(mp);
Index: xfsprogs-dev/repair/phase5.c
===================================================================
--- xfsprogs-dev.orig/repair/phase5.c	2009-08-21 18:59:24.000000000 +0000
+++ xfsprogs-dev/repair/phase5.c	2009-08-21 19:05:51.000000000 +0000
@@ -123,7 +123,7 @@ mk_incore_fstree(xfs_mount_t *mp, xfs_ag
 	for (agbno = 0; agbno < ag_end; agbno++)  {
 #if 0
 		old_state = state;
-		state = get_agbno_state(mp, agno, agbno);
+		state = get_bmap(agno, agbno);
 		if (state != old_state)  {
 			fprintf(stderr, "agbno %u - new state is %d\n",
 					agbno, state);
@@ -142,7 +142,7 @@ mk_incore_fstree(xfs_mount_t *mp, xfs_ag
 			}
 
 		}
-		if (get_agbno_state(mp, agno, agbno) < XR_E_INUSE)  {
+		if (get_bmap(agno, agbno) < XR_E_INUSE)  {
 			free_blocks++;
 			if (in_extent == 0)  {
 				/*
Index: xfsprogs-dev/repair/scan.c
===================================================================
--- xfsprogs-dev.orig/repair/scan.c	2009-08-21 19:05:32.000000000 +0000
+++ xfsprogs-dev/repair/scan.c	2009-08-21 19:06:51.000000000 +0000
@@ -148,6 +148,9 @@ scanfunc_bmap(
 	xfs_dfiloff_t		last_key;
 	char			*forkname;
 	int			numrecs;
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	int			state;
 
 	if (whichfork == XFS_DATA_FORK)
 		forkname = _("data");
@@ -229,11 +232,15 @@ _("bad back (left) sibling pointer (saw 
 		bm_cursor->level[level].right_fsbno =
 					be64_to_cpu(block->bb_u.l.bb_rightsib);
 
-		switch (get_fsbno_state(mp, bno))  {
+		agno = XFS_FSB_TO_AGNO(mp, bno);
+		agbno = XFS_FSB_TO_AGBNO(mp, bno);
+
+		state = get_bmap(agno, agbno);
+		switch (state) {
 		case XR_E_UNKNOWN:
 		case XR_E_FREE1:
 		case XR_E_FREE:
-			set_fsbno_state(mp, bno, XR_E_INUSE);
+			set_bmap(agno, agbno, XR_E_INUSE);
 			break;
 		case XR_E_FS_MAP:
 		case XR_E_INUSE:
@@ -245,19 +252,17 @@ _("bad back (left) sibling pointer (saw 
 			 * we made it here, the block probably
 			 * contains btree data.
 			 */
-			set_fsbno_state(mp, bno, XR_E_MULT);
+			set_bmap(agno, agbno, XR_E_MULT);
 			do_warn(
 		_("inode 0x%llx bmap block 0x%llx claimed, state is %d\n"),
-				ino, (__uint64_t) bno,
-				get_fsbno_state(mp, bno));
+				ino, (__uint64_t) bno, state);
 			break;
 		case XR_E_MULT:
 		case XR_E_INUSE_FS:
-			set_fsbno_state(mp, bno, XR_E_MULT);
+			set_bmap(agno, agbno, XR_E_MULT);
 			do_warn(
 		_("inode 0x%llx bmap block 0x%llx claimed, state is %d\n"),
-				ino, (__uint64_t) bno,
-				get_fsbno_state(mp, bno));
+				ino, (__uint64_t) bno, state);
 			/*
 			 * if we made it to here, this is probably a bmap block
 			 * that is being used by *another* file as a bmap block
@@ -272,8 +277,7 @@ _("bad back (left) sibling pointer (saw 
 		default:
 			do_warn(
 		_("bad state %d, inode 0x%llx bmap block 0x%llx\n"),
-				get_fsbno_state(mp, bno),
-				ino, (__uint64_t) bno);
+				state, ino, (__uint64_t) bno);
 			break;
 		}
 	} else  {
@@ -476,19 +480,15 @@ scanfunc_allocbt(
 	/*
 	 * check for btree blocks multiply claimed
 	 */
-	state = get_agbno_state(mp, agno, bno);
-
-	switch (state)  {
-	case XR_E_UNKNOWN:
-		set_agbno_state(mp, agno, bno, XR_E_FS_MAP);
-		break;
-	default:
-		set_agbno_state(mp, agno, bno, XR_E_MULT);
+	state = get_bmap(agno, bno);
+	if (state != XR_E_UNKNOWN)  {
+		set_bmap(agno, bno, XR_E_MULT);
 		do_warn(
 _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
 				name, state, agno, bno, suspect);
 		return;
 	}
+	set_bmap(agno, bno, XR_E_FS_MAP);
 
 	numrecs = be16_to_cpu(block->bb_numrecs);
 
@@ -523,11 +523,10 @@ _("%s freespace btree block claimed (sta
 				continue;
 
 			for ( ; b < end; b++)  {
-				state = get_agbno_state(mp, agno, b);
+				state = get_bmap(agno, b);
 				switch (state) {
 				case XR_E_UNKNOWN:
-					set_agbno_state(mp, agno, b,
-							XR_E_FREE1);
+					set_bmap(agno, b, XR_E_FREE1);
 					break;
 				case XR_E_FREE1:
 					/*
@@ -535,8 +534,7 @@ _("%s freespace btree block claimed (sta
 					 * FREE1 blocks later
 					 */
 					if (magic != XFS_ABTB_MAGIC) {
-						set_agbno_state(mp, agno, b,
-								XR_E_FREE);
+						set_bmap(agno, b, XR_E_FREE);
 						break;
 					}
 				default:
@@ -698,13 +696,14 @@ _("bad ending inode # (%llu (0x%x 0x%x))
 		     j < XFS_INODES_PER_CHUNK;
 		     j += mp->m_sb.sb_inopblock)  {
 			agbno = XFS_AGINO_TO_AGBNO(mp, ino + j);
-			state = get_agbno_state(mp, agno, agbno);
+
+			state = get_bmap(agno, agbno);
 			if (state == XR_E_UNKNOWN)  {
-				set_agbno_state(mp, agno, agbno, XR_E_INO);
+				set_bmap(agno, agbno, XR_E_INO);
 			} else if (state == XR_E_INUSE_FS && agno == 0 &&
 				   ino + j >= first_prealloc_ino &&
 				   ino + j < last_prealloc_ino)  {
-				set_agbno_state(mp, agno, agbno, XR_E_INO);
+				set_bmap(agno, agbno, XR_E_INO);
 			} else  {
 				do_warn(
 _("inode chunk claims used block, inobt block - agno %d, bno %d, inopb %d\n"),
@@ -842,16 +841,15 @@ scanfunc_ino(
 	 * check for btree blocks multiply claimed, any unknown/free state
 	 * is ok in the bitmap block.
 	 */
-	state = get_agbno_state(mp, agno, bno);
-
+	state = get_bmap(agno, bno);
 	switch (state)  {
 	case XR_E_UNKNOWN:
 	case XR_E_FREE1:
 	case XR_E_FREE:
-		set_agbno_state(mp, agno, bno, XR_E_FS_MAP);
+		set_bmap(agno, bno, XR_E_FS_MAP);
 		break;
 	default:
-		set_agbno_state(mp, agno, bno, XR_E_MULT);
+		set_bmap(agno, bno, XR_E_MULT);
 		do_warn(
 _("inode btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
 			state, agno, bno, suspect);
@@ -953,7 +951,7 @@ scan_freelist(
 	if (XFS_SB_BLOCK(mp) != XFS_AGFL_BLOCK(mp) &&
 	    XFS_AGF_BLOCK(mp) != XFS_AGFL_BLOCK(mp) &&
 	    XFS_AGI_BLOCK(mp) != XFS_AGFL_BLOCK(mp))
-		set_agbno_state(mp, agno, XFS_AGFL_BLOCK(mp), XR_E_FS_MAP);
+		set_bmap(agno, XFS_AGFL_BLOCK(mp), XR_E_FS_MAP);
 
 	if (be32_to_cpu(agf->agf_flcount) == 0)
 		return;
@@ -971,7 +969,7 @@ scan_freelist(
 	for (;;) {
 		bno = be32_to_cpu(agfl->agfl_bno[i]);
 		if (verify_agbno(mp, agno, bno))
-			set_agbno_state(mp, agno, bno, XR_E_FREE);
+			set_bmap(agno, bno, XR_E_FREE);
 		else
 			do_warn(_("bad agbno %u in agfl, agno %d\n"),
 				bno, agno);
Index: xfsprogs-dev/repair/Makefile
===================================================================
--- xfsprogs-dev.orig/repair/Makefile	2009-08-21 19:05:38.000000000 +0000
+++ xfsprogs-dev/repair/Makefile	2009-08-21 19:05:51.000000000 +0000
@@ -32,9 +32,7 @@ include $(BUILDRULES)
 
 #
 # Tracing flags:
-# -DXR_BMAP_DBG		incore block bitmap debugging
 # -DXR_INODE_TRACE	inode processing
-# -DXR_BMAP_TRACE	bmap btree processing
 # -DXR_DIR_TRACE	directory processing
 # -DXR_DUP_TRACE	duplicate extent processing
 # -DXR_BCNT_TRACE	incore bcnt freespace btree building
Index: xfsprogs-dev/repair/incore.c
===================================================================
--- xfsprogs-dev.orig/repair/incore.c	2009-08-21 18:59:24.000000000 +0000
+++ xfsprogs-dev/repair/incore.c	2009-08-21 19:05:51.000000000 +0000
@@ -185,7 +185,8 @@ set_bmap_log(xfs_mount_t *mp)
 	logend = mp->m_sb.sb_logstart + mp->m_sb.sb_logblocks;
 
 	for (i = mp->m_sb.sb_logstart; i < logend ; i++)  {
-		set_fsbno_state(mp, i, XR_E_INUSE_FS);
+		set_bmap(XFS_FSB_TO_AGNO(mp, i),
+			 XFS_FSB_TO_AGBNO(mp, i), XR_E_INUSE_FS);
 	}
 
 	return;
@@ -205,7 +206,7 @@ set_bmap_fs(xfs_mount_t *mp)
 
 	for (i = 0; i < mp->m_sb.sb_agcount; i++)
 		for (j = 0; j < end; j++)
-			set_agbno_state(mp, i, j, XR_E_INUSE_FS);
+			set_bmap(i, j, XR_E_INUSE_FS);
 
 	return;
 }
@@ -227,7 +228,7 @@ set_bmap_fs_bt(xfs_mount_t *mp)
 		 * account for btree roots
 		 */
 		for (j = begin; j < end; j++)
-			set_agbno_state(mp, i, j, XR_E_INUSE_FS);
+			set_bmap(i, j, XR_E_INUSE_FS);
 	}
 
 	return;
@@ -253,44 +254,3 @@ incore_init(xfs_mount_t *mp)
 
 	return;
 }
-
-#if defined(XR_BMAP_TRACE) || defined(XR_BMAP_DBG)
-int
-get_agbno_state(xfs_mount_t *mp, xfs_agnumber_t agno,
-		xfs_agblock_t ag_blockno)
-{
-	__uint64_t *addr;
-
-	addr = ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM;
-
-	return((*addr >> (((ag_blockno)%XR_BB_NUM)*XR_BB)) & XR_BB_MASK);
-}
-
-void set_agbno_state(xfs_mount_t *mp, xfs_agnumber_t agno,
-	xfs_agblock_t ag_blockno, int state)
-{
-	__uint64_t *addr;
-
-	addr = ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM;
-
-	*addr = (((*addr) &
-	  (~((__uint64_t) XR_BB_MASK << (((ag_blockno)%XR_BB_NUM)*XR_BB)))) |
-	 (((__uint64_t) (state)) << (((ag_blockno)%XR_BB_NUM)*XR_BB)));
-}
-
-int
-get_fsbno_state(xfs_mount_t *mp, xfs_dfsbno_t blockno)
-{
-	return(get_agbno_state(mp, XFS_FSB_TO_AGNO(mp, blockno),
-			XFS_FSB_TO_AGBNO(mp, blockno)));
-}
-
-void
-set_fsbno_state(xfs_mount_t *mp, xfs_dfsbno_t blockno, int state)
-{
-	set_agbno_state(mp, XFS_FSB_TO_AGNO(mp, blockno),
-		XFS_FSB_TO_AGBNO(mp, blockno), state);
-
-	return;
-}
-#endif
Index: xfsprogs-dev/repair/incore.h
===================================================================
--- xfsprogs-dev.orig/repair/incore.h	2009-08-21 18:59:24.000000000 +0000
+++ xfsprogs-dev/repair/incore.h	2009-08-21 19:05:51.000000000 +0000
@@ -72,51 +72,23 @@ void			teardown_bmap_finish(xfs_mount_t 
  * you want to use the regular block map.
  */
 
-#if defined(XR_BMAP_TRACE) || defined(XR_BMAP_DBG)
-/*
- * implemented as functions for debugging purposes
- */
-int get_agbno_state(xfs_mount_t *mp, xfs_agnumber_t agno,
-	xfs_agblock_t ag_blockno);
-void set_agbno_state(xfs_mount_t *mp, xfs_agnumber_t agno,
-	xfs_agblock_t ag_blockno, int state);
-
-int get_fsbno_state(xfs_mount_t *mp, xfs_dfsbno_t blockno);
-void set_fsbno_state(xfs_mount_t *mp, xfs_dfsbno_t blockno, int state);
-#else
-/*
- * implemented as macros for performance purposes
- */
-
-#define get_agbno_state(mp, agno, ag_blockno) \
+#define get_bmap(agno, ag_blockno) \
 			((int) (*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) \
 				 >> (((ag_blockno)%XR_BB_NUM)*XR_BB)) \
 				& XR_BB_MASK)
-#define set_agbno_state(mp, agno, ag_blockno, state) \
+#define set_bmap(agno, ag_blockno, state) \
 	*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) = \
 		((*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) & \
 	  (~((__uint64_t) XR_BB_MASK << (((ag_blockno)%XR_BB_NUM)*XR_BB)))) | \
 	 (((__uint64_t) (state)) << (((ag_blockno)%XR_BB_NUM)*XR_BB)))
 
-#define get_fsbno_state(mp, blockno) \
-		get_agbno_state(mp, XFS_FSB_TO_AGNO(mp, (blockno)), \
-				XFS_FSB_TO_AGBNO(mp, (blockno)))
-#define set_fsbno_state(mp, blockno, state) \
-		set_agbno_state(mp, XFS_FSB_TO_AGNO(mp, (blockno)), \
-			XFS_FSB_TO_AGBNO(mp, (blockno)), (state))
-
-
-#define get_agbno_rec(mp, agno, ag_blockno) \
-			(*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM))
-#endif /* XR_BMAP_TRACE */
-
 /*
  * these work in real-time extents (e.g. fsbno == rt extent number)
  */
-#define get_rtbno_state(mp, fsbno) \
+#define get_rtbmap(fsbno) \
 			((*(rt_ba_bmap + (fsbno)/XR_BB_NUM) >> \
 			(((fsbno)%XR_BB_NUM)*XR_BB)) & XR_BB_MASK)
-#define set_rtbno_state(mp, fsbno, state) \
+#define set_rtbmap(fsbno, state) \
 	*(rt_ba_bmap + (fsbno)/XR_BB_NUM) = \
 	 ((*(rt_ba_bmap + (fsbno)/XR_BB_NUM) & \
 	  (~((__uint64_t) XR_BB_MASK << (((fsbno)%XR_BB_NUM)*XR_BB)))) | \
Index: xfsprogs-dev/repair/phase2.c
===================================================================
--- xfsprogs-dev.orig/repair/phase2.c	2009-08-21 18:59:24.000000000 +0000
+++ xfsprogs-dev/repair/phase2.c	2009-08-21 19:05:51.000000000 +0000
@@ -176,7 +176,7 @@ phase2(xfs_mount_t *mp)
 		 * also mark blocks
 		 */
 		for (b = 0; b < mp->m_ialloc_blks; b++)  {
-			set_agbno_state(mp, 0,
+			set_bmap(0,
 				b + XFS_INO_TO_AGBNO(mp, mp->m_sb.sb_rootino),
 				XR_E_INO);
 		}
Index: xfsprogs-dev/repair/phase3.c
===================================================================
--- xfsprogs-dev.orig/repair/phase3.c	2009-08-21 18:59:24.000000000 +0000
+++ xfsprogs-dev/repair/phase3.c	2009-08-21 19:05:51.000000000 +0000
@@ -61,14 +61,8 @@ walk_unlinked_list(xfs_mount_t *mp, xfs_
 				agbno = XFS_AGINO_TO_AGBNO(mp, current_ino);
 
 				pthread_mutex_lock(&ag_locks[agno]);
-				switch (state = get_agbno_state(mp,
-							agno, agbno))  {
-				case XR_E_UNKNOWN:
-				case XR_E_FREE:
-				case XR_E_FREE1:
-					set_agbno_state(mp, agno, agbno,
-						XR_E_INO);
-					break;
+				state = get_bmap(agno, agbno);
+				switch (state) {
 				case XR_E_BAD_STATE:
 					do_error(_(
 						"bad state in block map %d\n"),
@@ -85,8 +79,7 @@ walk_unlinked_list(xfs_mount_t *mp, xfs_
 					 * anyway, hopefully without
 					 * losing too much other data
 					 */
-					set_agbno_state(mp, agno, agbno,
-						XR_E_INO);
+					set_bmap(agno, agbno, XR_E_INO);
 					break;
 				}
 				pthread_mutex_unlock(&ag_locks[agno]);
Index: xfsprogs-dev/repair/rt.c
===================================================================
--- xfsprogs-dev.orig/repair/rt.c	2009-08-21 18:59:24.000000000 +0000
+++ xfsprogs-dev/repair/rt.c	2009-08-21 19:05:51.000000000 +0000
@@ -91,7 +91,7 @@ generate_rtinfo(xfs_mount_t	*mp,
 		bits = 0;
 		for (i = 0; i < sizeof(xfs_rtword_t) * NBBY &&
 				extno < mp->m_sb.sb_rextents; i++, extno++)  {
-			if (get_rtbno_state(mp, extno) == XR_E_FREE)  {
+			if (get_rtbmap(extno) == XR_E_FREE)  {
 				sb_frextents++;
 				bits |= freebit;
 
@@ -218,7 +218,7 @@ process_rtbitmap(xfs_mount_t	*mp,
 		     bit < bitsperblock && extno < mp->m_sb.sb_rextents;
 		     bit++, extno++) {
 			if (xfs_isset(words, bit)) {
-				set_rtbno_state(mp, extno, XR_E_FREE);
+				set_rtbmap(extno, XR_E_FREE);
 				sb_frextents++;
 				if (prevbit == 0) {
 					start_bmbno = bmbno;

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 11/14] repair: cleanup alloc/free/reset of the block usage tracking
  2009-09-02 17:55 [PATCH 00/14] repair memory usage reductions Christoph Hellwig
                   ` (9 preceding siblings ...)
  2009-09-02 17:55 ` [PATCH 10/14] repair: cleanup helpers for tracking block usage Christoph Hellwig
@ 2009-09-02 17:55 ` Christoph Hellwig
  2009-10-21 20:22   ` [PATCH 11/14] repair: cleanup alloc/free/reset of the block usagetracking Alex Elder
  2009-09-02 17:55 ` [PATCH 12/14] repair: switch block usage bitmap to a btree Christoph Hellwig
                   ` (4 subsequent siblings)
  15 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-09-02 17:55 UTC (permalink / raw)
  To: xfs; +Cc: Barry Naujok

[-- Attachment #1: repair-cleanup-bmap-helpers-2 --]
[-- Type: text/plain, Size: 12788 bytes --]

Currently the code to allocate, free and reset the block usage bitmaps
is a complete mess.  This patch reorganizes it into logical helpers.

Details:

 - the current incore_init code is called just before phase2 is called,
   which then marks the log and the AG headers as used.
 - we get rid of incore_init and replace it with direct calls to the
   unchanged incore_ino_init/incore_ext_init functions and our new
   init_bmaps, which does all the allocations for the block usage
   tracking, as well as a call to reset_bmaps to initialize it to the
   default values.
 - reset_bmaps is also called from the early phase4 code to reset all
   state instead of open-coding it.
 - there is a new free_bmaps helper which we call to free our block usage
   bitmaps when we don't need them anymore after phase5 (see the sketch
   after this list).  The current code frees some of it a bit early in
   phase5, but needs to take care of it in phase6 in case we didn't call
   phase5 due to no-modify mode, and leaks it if we don't call phase6,
   which can happen in case of a bad inode allocation btree.
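
Schematically, the resulting lifecycle looks like this (a simplified
sketch condensed from the xfs_repair.c and phase4.c hunks below; error
handling and timestamps omitted):

	init_bmaps(mp);		/* allocate per-AG maps, AG locks and the
				 * rt map, then reset to default state */
	incore_ino_init(mp);
	incore_ext_init(mp);

	phase2(mp);
	phase3(mp);

	reset_bmaps(mp);	/* early in phase4: back to default state */
	/* ... rest of phase4, then phase5 unless in no-modify mode ... */

	free_bmaps(mp);		/* done with the block usage maps */

	if (!bad_ino_btree)
		phase6(mp);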


Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/phase4.c
===================================================================
--- xfsprogs-dev.orig/repair/phase4.c	2009-08-21 01:59:26.000000000 +0000
+++ xfsprogs-dev/repair/phase4.c	2009-08-21 02:41:44.000000000 +0000
@@ -355,19 +355,7 @@ phase4(xfs_mount_t *mp)
 	/*
 	 * initialize bitmaps for all AGs
 	 */
-	for (i = 0; i < mp->m_sb.sb_agcount; i++)  {
-		/*
-		 * now reset the bitmap for all ags
-		 */
-		memset(ba_bmap[i], 0,
-		    roundup((mp->m_sb.sb_agblocks+(NBBY/XR_BB)-1)/(NBBY/XR_BB),
-						sizeof(__uint64_t)));
-		for (j = 0; j < ag_hdr_block; j++)
-			set_bmap(i, j, XR_E_INUSE_FS);
-	}
-	set_bmap_rt(mp->m_sb.sb_rextents);
-	set_bmap_log(mp);
-	set_bmap_fs(mp);
+	reset_bmaps(mp);
 
 	do_log(_("        - check for inodes claiming duplicate blocks...\n"));
 	set_progress_msg(PROG_FMT_DUP_BLOCKS, (__uint64_t) mp->m_sb.sb_icount);
Index: xfsprogs-dev/repair/incore.c
===================================================================
--- xfsprogs-dev.orig/repair/incore.c	2009-08-21 01:59:26.000000000 +0000
+++ xfsprogs-dev/repair/incore.c	2009-08-21 03:02:28.000000000 +0000
@@ -52,205 +52,117 @@ free_allocations(ba_rec_t *list)
 	return;
 }
 
-/* ba bmap setupstuff.  setting/getting state is in incore.h  */
 
-void
-setup_bmap(xfs_agnumber_t agno, xfs_agblock_t numblocks, xfs_drtbno_t rtblocks)
-{
-	int i;
-	size_t size = 0;
+static size_t		rt_bmap_size;
 
-	ba_bmap = (__uint64_t**)malloc(agno*sizeof(__uint64_t *));
-	if (!ba_bmap)
-		do_error(_("couldn't allocate block map pointers\n"));
-	ag_locks = malloc(agno * sizeof(pthread_mutex_t));
-	if (!ag_locks)
-		do_error(_("couldn't allocate block map locks\n"));
-
-	for (i = 0; i < agno; i++)  {
-		size = roundup((numblocks+(NBBY/XR_BB)-1) / (NBBY/XR_BB),
-		       		sizeof(__uint64_t));
-
-		ba_bmap[i] = (__uint64_t*)memalign(sizeof(__uint64_t), size);
-		if (!ba_bmap[i]) {
-			do_error(_("couldn't allocate block map, size = %d\n"),
-				numblocks);
-			return;
-		}
-		memset(ba_bmap[i], 0, size);
-		pthread_mutex_init(&ag_locks[i], NULL);
-	}
+static void
+reset_rt_bmap(void)
+{
+	if (rt_ba_bmap)
+		memset(rt_ba_bmap, 0x22, rt_bmap_size);	/* XR_E_FREE */
+}
 
-	if (rtblocks == 0)  {
-		rt_ba_bmap = NULL;
+static void
+init_rt_bmap(
+	xfs_mount_t	*mp)
+{
+	if (mp->m_sb.sb_rextents == 0)
 		return;
-	}
 
-	size = roundup(rtblocks / (NBBY/XR_BB), sizeof(__uint64_t));
+	rt_bmap_size = roundup(mp->m_sb.sb_rextents / (NBBY / XR_BB),
+			       sizeof(__uint64_t));
 
-	rt_ba_bmap=(__uint64_t*)memalign(sizeof(__uint64_t), size);
+	rt_ba_bmap = memalign(sizeof(__uint64_t), rt_bmap_size);
 	if (!rt_ba_bmap) {
-			do_error(
+		do_error(
 		_("couldn't allocate realtime block map, size = %llu\n"),
-				rtblocks);
-			return;
+			mp->m_sb.sb_rextents);
+		return;
 	}
-
-	/*
-	 * start all real-time as free blocks
-	 */
-	set_bmap_rt(rtblocks);
-
-	return;
 }
 
-/* ARGSUSED */
-void
-teardown_rt_bmap(xfs_mount_t *mp)
+static void
+free_rt_bmap(xfs_mount_t *mp)
 {
-	if (rt_ba_bmap != NULL)  {
-		free(rt_ba_bmap);
-		rt_ba_bmap = NULL;
-	}
-
-	return;
+	free(rt_ba_bmap);
+	rt_ba_bmap = NULL;
 }
 
-/* ARGSUSED */
-void
-teardown_ag_bmap(xfs_mount_t *mp, xfs_agnumber_t agno)
-{
-	ASSERT(ba_bmap[agno] != NULL);
-
-	free(ba_bmap[agno]);
-	ba_bmap[agno] = NULL;
-
-	return;
-}
 
-/* ARGSUSED */
 void
-teardown_bmap_finish(xfs_mount_t *mp)
+reset_bmaps(xfs_mount_t *mp)
 {
-	free(ba_bmap);
-	ba_bmap = NULL;
-
-	return;
-}
+	xfs_agnumber_t	agno;
+	int		ag_hdr_block;
+	int		i;
 
-void
-teardown_bmap(xfs_mount_t *mp)
-{
-	xfs_agnumber_t i;
+	ag_hdr_block = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
 
-	for (i = 0; i < mp->m_sb.sb_agcount; i++)  {
-		teardown_ag_bmap(mp, i);
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++)  {
+		memset(ba_bmap[agno], 0,
+		       roundup((mp->m_sb.sb_agblocks + (NBBY / XR_BB) - 1) /
+				(NBBY / XR_BB), sizeof(__uint64_t)));
+		for (i = 0; i < ag_hdr_block; i++)
+			set_bmap(agno, i, XR_E_INUSE_FS);
 	}
 
-	teardown_rt_bmap(mp);
-	teardown_bmap_finish(mp);
+	if (mp->m_sb.sb_logstart != 0) {
+		xfs_dfsbno_t	logend;
 
-	return;
-}
+		logend = mp->m_sb.sb_logstart + mp->m_sb.sb_logblocks;
 
-/*
- * block map initialization routines -- realtime, log, fs
- */
-void
-set_bmap_rt(xfs_drtbno_t num)
-{
-	xfs_drtbno_t j;
-	xfs_drtbno_t size;
-
-	/*
-	 * for now, initialize all realtime blocks to be free
-	 * (state == XR_E_FREE)
-	 */
-	size = howmany(num / (NBBY/XR_BB), sizeof(__uint64_t));
-
-	for (j = 0; j < size; j++)
-		rt_ba_bmap[j] = 0x2222222222222222LL;
-
-	return;
-}
-
-void
-set_bmap_log(xfs_mount_t *mp)
-{
-	xfs_dfsbno_t	logend, i;
-
-	if (mp->m_sb.sb_logstart == 0)
-		return;
-
-	logend = mp->m_sb.sb_logstart + mp->m_sb.sb_logblocks;
-
-	for (i = mp->m_sb.sb_logstart; i < logend ; i++)  {
-		set_bmap(XFS_FSB_TO_AGNO(mp, i),
-			 XFS_FSB_TO_AGBNO(mp, i), XR_E_INUSE_FS);
+		for (i = mp->m_sb.sb_logstart; i < logend ; i++)  {
+			set_bmap(XFS_FSB_TO_AGNO(mp, i),
+				 XFS_FSB_TO_AGBNO(mp, i), XR_E_INUSE_FS);
+		}
 	}
 
-	return;
+	reset_rt_bmap();
 }
 
 void
-set_bmap_fs(xfs_mount_t *mp)
+init_bmaps(xfs_mount_t *mp)
 {
-	xfs_agnumber_t	i;
-	xfs_agblock_t	j;
-	xfs_agblock_t	end;
-
-	/*
-	 * AG header is 4 sectors
-	 */
-	end = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
+	xfs_agblock_t numblocks = mp->m_sb.sb_agblocks;
+	int agcount = mp->m_sb.sb_agcount;
+	int i;
+	size_t size = 0;
 
-	for (i = 0; i < mp->m_sb.sb_agcount; i++)
-		for (j = 0; j < end; j++)
-			set_bmap(i, j, XR_E_INUSE_FS);
+	ba_bmap = calloc(agcount, sizeof(__uint64_t *));
+	if (!ba_bmap)
+		do_error(_("couldn't allocate block map pointers\n"));
 
-	return;
-}
+	ag_locks = calloc(agcount, sizeof(pthread_mutex_t));
+	if (!ag_locks)
+		do_error(_("couldn't allocate block map locks\n"));
 
-#if 0
-void
-set_bmap_fs_bt(xfs_mount_t *mp)
-{
-	xfs_agnumber_t	i;
-	xfs_agblock_t	j;
-	xfs_agblock_t	begin;
-	xfs_agblock_t	end;
-
-	begin = bnobt_root;
-	end = inobt_root + 1;
-
-	for (i = 0; i < mp->m_sb.sb_agcount; i++)  {
-		/*
-		 * account for btree roots
-		 */
-		for (j = begin; j < end; j++)
-			set_bmap(i, j, XR_E_INUSE_FS);
+	for (i = 0; i < agcount; i++)  {
+		size = roundup((numblocks+(NBBY/XR_BB)-1) / (NBBY/XR_BB),
+		       		sizeof(__uint64_t));
+
+		ba_bmap[i] = memalign(sizeof(__uint64_t), size);
+		if (!ba_bmap[i]) {
+			do_error(_("couldn't allocate block map, size = %d\n"),
+				numblocks);
+			return;
+		}
+		memset(ba_bmap[i], 0, size);
+		pthread_mutex_init(&ag_locks[i], NULL);
 	}
 
-	return;
+	init_rt_bmap(mp);
+	reset_bmaps(mp);
 }
-#endif
 
 void
-incore_init(xfs_mount_t *mp)
+free_bmaps(xfs_mount_t *mp)
 {
-	int agcount = mp->m_sb.sb_agcount;
-	extern void incore_ino_init(xfs_mount_t *);
-	extern void incore_ext_init(xfs_mount_t *);
-
-	/* init block alloc bmap */
-
-	setup_bmap(agcount, mp->m_sb.sb_agblocks, mp->m_sb.sb_rextents);
-	incore_ino_init(mp);
-	incore_ext_init(mp);
-
-	/* initialize random globals now that we know the fs geometry */
+	xfs_agnumber_t i;
 
-	inodes_per_block = mp->m_sb.sb_inopblock;
+	for (i = 0; i < mp->m_sb.sb_agcount; i++)
+		free(ba_bmap[i]);
+	free(ba_bmap);
+	ba_bmap = NULL;
 
-	return;
+	free_rt_bmap(mp);
 }
Index: xfsprogs-dev/repair/incore.h
===================================================================
--- xfsprogs-dev.orig/repair/incore.h	2009-08-21 01:59:26.000000000 +0000
+++ xfsprogs-dev/repair/incore.h	2009-08-21 03:00:13.000000000 +0000
@@ -43,14 +43,10 @@ void			free_allocations(ba_rec_t *list);
  */
 #define BA_BMAP_SIZE(x)		(howmany(x, 4))
 
-void			set_bmap_rt(xfs_drfsbno_t numblocks);
-void			set_bmap_log(xfs_mount_t *mp);
-void			set_bmap_fs(xfs_mount_t *mp);
-void			teardown_bmap(xfs_mount_t *mp);
-
-void			teardown_rt_bmap(xfs_mount_t *mp);
-void			teardown_ag_bmap(xfs_mount_t *mp, xfs_agnumber_t agno);
-void			teardown_bmap_finish(xfs_mount_t *mp);
+void			init_bmaps(xfs_mount_t *mp);
+void			reset_bmaps(xfs_mount_t *mp);
+void			free_bmaps(xfs_mount_t *mp);
+
 
 /* blocks are numbered from zero */
 
@@ -254,6 +250,7 @@ void		release_agbcnt_extent_tree(xfs_agn
  */
 void		free_rt_dup_extent_tree(xfs_mount_t *mp);
 
+void		incore_ext_init(xfs_mount_t *);
 /*
  * per-AG extent trees shutdown routine -- all (bno, bcnt and dup)
  * at once.  this one actually frees the memory instead of just recyling
@@ -261,6 +258,8 @@ void		free_rt_dup_extent_tree(xfs_mount_
  */
 void		incore_ext_teardown(xfs_mount_t *mp);
 
+void		incore_ino_init(xfs_mount_t *);
+
 /*
  * inode definitions
  */
Index: xfsprogs-dev/repair/phase2.c
===================================================================
--- xfsprogs-dev.orig/repair/phase2.c	2009-08-21 02:04:25.000000000 +0000
+++ xfsprogs-dev/repair/phase2.c	2009-08-21 02:41:43.000000000 +0000
@@ -134,12 +134,6 @@ phase2(xfs_mount_t *mp)
 
 	do_log(_("        - scan filesystem freespace and inode maps...\n"));
 
-	/*
-	 * account for space used by ag headers and log if internal
-	 */
-	set_bmap_log(mp);
-	set_bmap_fs(mp);
-
 	bad_ino_btree = 0;
 
 	set_progress_msg(PROG_FMT_SCAN_AG, (__uint64_t) glob_agcount);
Index: xfsprogs-dev/repair/xfs_repair.c
===================================================================
--- xfsprogs-dev.orig/repair/xfs_repair.c	2009-08-21 02:47:02.000000000 +0000
+++ xfsprogs-dev/repair/xfs_repair.c	2009-08-21 03:03:51.000000000 +0000
@@ -39,7 +39,6 @@ extern void	phase4(xfs_mount_t *);
 extern void	phase5(xfs_mount_t *);
 extern void	phase6(xfs_mount_t *);
 extern void	phase7(xfs_mount_t *);
-extern void	incore_init(xfs_mount_t *);
 
 #define		XR_MAX_SECT_SIZE	(64 * 1024)
 
@@ -694,9 +693,14 @@ main(int argc, char **argv)
 	calc_mkfs(mp);
 
 	/*
-	 * check sb filesystem stats and initialize in-core data structures
+	 * initialize block alloc map
 	 */
-	incore_init(mp);
+	init_bmaps(mp);
+	incore_ino_init(mp);
+	incore_ext_init(mp);
+
+	/* initialize random globals now that we know the fs geometry */
+	inodes_per_block = mp->m_sb.sb_inopblock;
 
 	if (parse_sb_version(&mp->m_sb))  {
 		do_warn(
@@ -724,6 +728,11 @@ main(int argc, char **argv)
 	}
 	timestamp(PHASE_END, 5, NULL);
 
+	/*
+	 * Done with the block usage maps, toss them...
+	 */
+	free_bmaps(mp);
+
 	if (!bad_ino_btree)  {
 		phase6(mp);
 		timestamp(PHASE_END, 6, NULL);
Index: xfsprogs-dev/repair/phase6.c
===================================================================
--- xfsprogs-dev.orig/repair/phase6.c	2009-08-21 02:44:58.000000000 +0000
+++ xfsprogs-dev/repair/phase6.c	2009-08-21 02:54:54.000000000 +0000
@@ -3661,11 +3661,6 @@ phase6(xfs_mount_t *mp)
 
 	do_log(_("Phase 6 - check inode connectivity...\n"));
 
-	if (!no_modify)
-		teardown_bmap_finish(mp);
-	else
-		teardown_bmap(mp);
-
 	incore_ext_teardown(mp);
 
 	add_ino_ex_data(mp);
Index: xfsprogs-dev/repair/phase5.c
===================================================================
--- xfsprogs-dev.orig/repair/phase5.c	2009-08-21 02:42:26.000000000 +0000
+++ xfsprogs-dev/repair/phase5.c	2009-08-21 03:00:07.000000000 +0000
@@ -1465,11 +1465,6 @@ phase5_func(
 		}
 
 		/*
-		 * done with the AG bitmap, toss it...
-		 */
-		teardown_ag_bmap(mp, agno);
-
-		/*
 		 * ok, now set up the btree cursors for the
 		 * on-disk btrees (includs pre-allocating all
 		 * required blocks for the trees themselves)
@@ -1655,7 +1650,6 @@ phase5(xfs_mount_t *mp)
 		_("        - generate realtime summary info and bitmap...\n"));
 		rtinit(mp);
 		generate_rtinfo(mp, btmcompute, sumcompute);
-		teardown_rt_bmap(mp);
 	}
 
 	do_log(_("        - reset superblock...\n"));

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 12/14] repair: switch block usage bitmap to a btree
  2009-09-02 17:55 [PATCH 00/14] repair memory usage reductions Christoph Hellwig
                   ` (10 preceding siblings ...)
  2009-09-02 17:55 ` [PATCH 11/14] repair: cleanup alloc/free/reset of the block usage tracking Christoph Hellwig
@ 2009-09-02 17:55 ` Christoph Hellwig
  2009-10-22 16:22   ` Alex Elder
  2009-09-02 17:55 ` [PATCH 13/14] repair: optimize duplicate extent tracking Christoph Hellwig
                   ` (3 subsequent siblings)
  15 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-09-02 17:55 UTC (permalink / raw)
  To: xfs; +Cc: Barry Naujok

[-- Attachment #1: repair-bmap_extents-btree --]
[-- Type: text/plain, Size: 24884 bytes --]

Using a btree to represent the extents is much more space-efficient
than using a bitmap that tracks every single block.  In addition it
allows for better algorithms that check range overlaps instead of
walking every block in various places.

Also move the RT tracking bitmap into incore.c instead of leaving it
as macros - this keeps the implementation contained.
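
The key interface change visible in the conversions below is that
get_bmap_ext() returns the state of a whole run of same-state blocks at
once via the blen out-parameter, so callers advance per extent rather
than per block.  A schematic converted loop, based on the call sites in
the diff (start_agbno/end_agbno stand in for the caller's range, agno
for the AG being scanned):

	xfs_agblock_t	agbno;
	xfs_extlen_t	blen;
	int		state;

	for (agbno = start_agbno; agbno < end_agbno; agbno += blen) {
		/* state of the run starting at agbno, length in blen */
		state = get_bmap_ext(agno, agbno, end_agbno, &blen);
		switch (state) {
		case XR_E_UNKNOWN:
			/* claim the whole run with one call */
			set_bmap_ext(agno, agbno, blen, XR_E_INUSE);
			break;
		default:
			set_bmap_ext(agno, agbno, blen, XR_E_MULT);
			break;
		}
	}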


Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/dino_chunks.c
===================================================================
--- xfsprogs-dev.orig/repair/dino_chunks.c	2009-09-02 14:51:09.449268859 -0300
+++ xfsprogs-dev/repair/dino_chunks.c	2009-09-02 14:51:18.593298964 -0300
@@ -118,6 +118,7 @@ verify_inode_chunk(xfs_mount_t		*mp,
 	int		i;
 	int		j;
 	int		state;
+	xfs_extlen_t	blen;
 
 	agno = XFS_INO_TO_AGNO(mp, ino);
 	agino = XFS_INO_TO_AGINO(mp, ino);
@@ -433,9 +434,10 @@ verify_inode_chunk(xfs_mount_t		*mp,
 	 * entry or an iunlinked pointer
 	 */
 	pthread_mutex_lock(&ag_locks[agno]);
-	for (j = 0, cur_agbno = chunk_start_agbno;
-			cur_agbno < chunk_stop_agbno; cur_agbno++)  {
-		state = get_bmap(agno, cur_agbno);
+	for (cur_agbno = chunk_start_agbno;
+	     cur_agbno < chunk_stop_agbno;
+	     cur_agbno += blen)  {
+		state = get_bmap_ext(agno, cur_agbno, chunk_stop_agbno, &blen);
 		switch (state) {
 		case XR_E_MULT:
 		case XR_E_INUSE:
@@ -444,9 +446,9 @@ verify_inode_chunk(xfs_mount_t		*mp,
 			do_warn(
 		_("inode block %d/%d multiply claimed, (state %d)\n"),
 				agno, cur_agbno, state);
-			set_bmap(agno, cur_agbno, XR_E_MULT);
-			j = 1;
-			break;
+			set_bmap_ext(agno, cur_agbno, blen, XR_E_MULT);
+			pthread_mutex_unlock(&ag_locks[agno]);
+			return 0;
 		case XR_E_INO:
 			do_error(
 		_("uncertain inode block overlap, agbno = %d, ino = %llu\n"),
@@ -455,11 +457,6 @@ verify_inode_chunk(xfs_mount_t		*mp,
 		default:
 			break;
 		}
-
-		if (j) {
-			pthread_mutex_unlock(&ag_locks[agno]);
-			return(0);
-		}
 	}
 	pthread_mutex_unlock(&ag_locks[agno]);
 
@@ -487,8 +484,9 @@ verify_inode_chunk(xfs_mount_t		*mp,
 	pthread_mutex_lock(&ag_locks[agno]);
 
 	for (cur_agbno = chunk_start_agbno;
-			cur_agbno < chunk_stop_agbno; cur_agbno++)  {
-		state = get_bmap(agno, cur_agbno);
+	     cur_agbno < chunk_stop_agbno;
+	     cur_agbno += blen)  {
+		state = get_bmap_ext(agno, cur_agbno, chunk_stop_agbno, &blen);
 		switch (state) {
 		case XR_E_INO:
 			do_error(
@@ -498,7 +496,7 @@ verify_inode_chunk(xfs_mount_t		*mp,
 		case XR_E_UNKNOWN:
 		case XR_E_FREE1:
 		case XR_E_FREE:
-			set_bmap(agno, cur_agbno, XR_E_INO);
+			set_bmap_ext(agno, cur_agbno, blen, XR_E_INO);
 			break;
 		case XR_E_MULT:
 		case XR_E_INUSE:
@@ -512,7 +510,7 @@ verify_inode_chunk(xfs_mount_t		*mp,
 			do_warn(
 		_("inode block %d/%d bad state, (state %d)\n"),
 				agno, cur_agbno, state);
-			set_bmap(agno, cur_agbno, XR_E_INO);
+			set_bmap_ext(agno, cur_agbno, blen, XR_E_INO);
 			break;
 		}
 	}
Index: xfsprogs-dev/repair/dinode.c
===================================================================
--- xfsprogs-dev.orig/repair/dinode.c	2009-09-02 14:51:09.457268829 -0300
+++ xfsprogs-dev/repair/dinode.c	2009-09-02 14:51:18.593298964 -0300
@@ -524,6 +524,7 @@ process_rt_rec(
 
 	/*
 	 * set the appropriate number of extents
+	 * this iterates block by block, this can be optimised using extents
 	 */
 	for (b = irec->br_startblock; b < irec->br_startblock +
 			irec->br_blockcount; b += mp->m_sb.sb_rextsize)  {
@@ -614,9 +615,10 @@ process_bmbt_reclist_int(
 	char			*forkname;
 	int			i;
 	int			state;
-	xfs_dfsbno_t		e;
 	xfs_agnumber_t		agno;
 	xfs_agblock_t		agbno;
+	xfs_agblock_t		ebno;
+	xfs_extlen_t		blen;
 	xfs_agnumber_t		locked_agno = -1;
 	int			error = 1;
 
@@ -718,7 +720,7 @@ process_bmbt_reclist_int(
 		 */
 		agno = XFS_FSB_TO_AGNO(mp, irec.br_startblock);
 		agbno = XFS_FSB_TO_AGBNO(mp, irec.br_startblock);
-		e = irec.br_startblock + irec.br_blockcount;
+		ebno = agbno + irec.br_blockcount;
 		if (agno != locked_agno) {
 			if (locked_agno != -1)
 				pthread_mutex_unlock(&ag_locks[locked_agno]);
@@ -733,7 +735,9 @@ process_bmbt_reclist_int(
 			 * checking each entry without setting the
 			 * block bitmap
 			 */
-			for (b = irec.br_startblock; b < e; b++, agbno++)  {
+			for (b = irec.br_startblock;
+			     agbno < ebno;
+			     b++, agbno++)  {
 				if (search_dup_extent(mp, agno, agbno)) {
 					do_warn(_("%s fork in ino %llu claims "
 						"dup extent, off - %llu, "
@@ -748,22 +752,10 @@ process_bmbt_reclist_int(
 			continue;
 		}
 
-		for (b = irec.br_startblock; b < e; b++, agbno++)  {
-			/*
-			 * Process in chunks of 16 (XR_BB_UNIT/XR_BB)
-			 * for common XR_E_UNKNOWN to XR_E_INUSE transition
-			 */
-			if (((agbno & XR_BB_MASK) == 0) && ((irec.br_startblock + irec.br_blockcount - b) >= (XR_BB_UNIT/XR_BB))) {
-				if (ba_bmap[agno][agbno>>XR_BB] == XR_E_UNKNOWN_LL) {
-					ba_bmap[agno][agbno>>XR_BB] = XR_E_INUSE_LL;
-					agbno += (XR_BB_UNIT/XR_BB) - 1;
-					b += (XR_BB_UNIT/XR_BB) - 1;
-					continue;
-				}
-
-			}
-
-			state = get_bmap(agno, agbno);
+		for (b = irec.br_startblock;
+		     agbno < ebno;
+		     b += blen, agbno += blen) {
+			state = get_bmap_ext(agno, agbno, ebno, &blen);
 			switch (state)  {
 			case XR_E_FREE:
 			case XR_E_FREE1:
@@ -772,7 +764,7 @@ process_bmbt_reclist_int(
 					forkname, ino, (__uint64_t) b);
 				/* fall through ... */
 			case XR_E_UNKNOWN:
-				set_bmap(agno, agbno, XR_E_INUSE);
+				set_bmap_ext(agno, agbno, blen, XR_E_INUSE);
 				break;
 
 			case XR_E_BAD_STATE:
@@ -788,7 +780,7 @@ process_bmbt_reclist_int(
 
 			case XR_E_INUSE:
 			case XR_E_MULT:
-				set_bmap(agno, agbno, XR_E_MULT);
+				set_bmap_ext(agno, agbno, blen, XR_E_MULT);
 				do_warn(_("%s fork in %s inode %llu claims "
 					"used block %llu\n"),
 					forkname, ftype, ino, (__uint64_t) b);
Index: xfsprogs-dev/repair/globals.h
===================================================================
--- xfsprogs-dev.orig/repair/globals.h	2009-09-02 14:51:09.461268919 -0300
+++ xfsprogs-dev/repair/globals.h	2009-09-02 14:51:18.597292070 -0300
@@ -156,11 +156,6 @@ EXTERN int		chunks_pblock;	/* # of 64-in
 EXTERN int		max_symlink_blocks;
 EXTERN __int64_t	fs_max_file_offset;
 
-/* block allocation bitmaps */
-
-EXTERN __uint64_t	**ba_bmap;	/* see incore.h */
-EXTERN __uint64_t	*rt_ba_bmap;	/* see incore.h */
-
 /* realtime info */
 
 EXTERN xfs_rtword_t	*btmcompute;
Index: xfsprogs-dev/repair/phase2.c
===================================================================
--- xfsprogs-dev.orig/repair/phase2.c	2009-09-02 14:51:09.465298621 -0300
+++ xfsprogs-dev/repair/phase2.c	2009-09-02 14:51:18.605297206 -0300
@@ -109,7 +109,6 @@ void
 phase2(xfs_mount_t *mp)
 {
 	xfs_agnumber_t		i;
-	xfs_agblock_t		b;
 	int			j;
 	ino_tree_node_t		*ino_rec;
 
@@ -169,11 +168,8 @@ phase2(xfs_mount_t *mp)
 		/*
 		 * also mark blocks
 		 */
-		for (b = 0; b < mp->m_ialloc_blks; b++)  {
-			set_bmap(0,
-				b + XFS_INO_TO_AGBNO(mp, mp->m_sb.sb_rootino),
-				XR_E_INO);
-		}
+		set_bmap_ext(0, XFS_INO_TO_AGBNO(mp, mp->m_sb.sb_rootino),
+			     mp->m_ialloc_blks, XR_E_INO);
 	} else  {
 		do_log(_("        - found root inode chunk\n"));
 
Index: xfsprogs-dev/repair/phase4.c
===================================================================
--- xfsprogs-dev.orig/repair/phase4.c	2009-09-02 14:51:09.533268366 -0300
+++ xfsprogs-dev/repair/phase4.c	2009-09-02 14:51:18.609296598 -0300
@@ -192,8 +192,7 @@ phase4(xfs_mount_t *mp)
 	xfs_agnumber_t		i;
 	xfs_agblock_t		j;
 	xfs_agblock_t		ag_end;
-	xfs_agblock_t		extent_start;
-	xfs_extlen_t		extent_len;
+	xfs_extlen_t		blen;
 	int			ag_hdr_len = 4 * mp->m_sb.sb_sectsize;
 	int			ag_hdr_block;
 	int			bstate;
@@ -226,29 +225,13 @@ phase4(xfs_mount_t *mp)
 		ag_end = (i < mp->m_sb.sb_agcount - 1) ? mp->m_sb.sb_agblocks :
 			mp->m_sb.sb_dblocks -
 				(xfs_drfsbno_t) mp->m_sb.sb_agblocks * i;
-		extent_start = extent_len = 0;
+
 		/*
 		 * set up duplicate extent list for this ag
 		 */
-		for (j = ag_hdr_block; j < ag_end; j++)  {
-
-			/* Process in chunks of 16 (XR_BB_UNIT/XR_BB) */
-			if ((extent_start == 0) && ((j & XR_BB_MASK) == 0)) {
-				switch(ba_bmap[i][j>>XR_BB]) {
-				case XR_E_UNKNOWN_LL:
-				case XR_E_FREE1_LL:
-				case XR_E_FREE_LL:
-				case XR_E_INUSE_LL:
-				case XR_E_INUSE_FS_LL:
-				case XR_E_INO_LL:
-				case XR_E_FS_MAP_LL:
-					j += (XR_BB_UNIT/XR_BB) - 1;
-					continue;
-				}
-			}
-
-			bstate = get_bmap(i, j);
-			switch (bstate)  {
+		for (j = ag_hdr_block; j < ag_end; j += blen)  {
+			bstate = get_bmap_ext(i, j, ag_end, &blen);
+			switch (bstate) {
 			case XR_E_BAD_STATE:
 			default:
 				do_warn(
@@ -262,37 +245,13 @@ phase4(xfs_mount_t *mp)
 			case XR_E_INUSE_FS:
 			case XR_E_INO:
 			case XR_E_FS_MAP:
-				if (extent_start == 0)
-					continue;
-				else  {
-					/*
-					 * add extent and reset extent state
-					 */
-					add_dup_extent(i, extent_start,
-							extent_len);
-					extent_start = 0;
-					extent_len = 0;
-				}
 				break;
 			case XR_E_MULT:
-				if (extent_start == 0)  {
-					extent_start = j;
-					extent_len = 1;
-				} else if (extent_len == MAXEXTLEN)  {
-					add_dup_extent(i, extent_start,
-							extent_len);
-					extent_start = j;
-					extent_len = 1;
-				} else
-					extent_len++;
+				add_dup_extent(i, j, blen);
 				break;
 			}
 		}
-		/*
-		 * catch tail-case, extent hitting the end of the ag
-		 */
-		if (extent_start != 0)
-			add_dup_extent(i, extent_start, extent_len);
+
 		PROG_RPT_INC(prog_rpt_done[i], 1);
 	}
 	print_final_rpt();
Index: xfsprogs-dev/repair/phase5.c
===================================================================
--- xfsprogs-dev.orig/repair/phase5.c	2009-09-02 14:51:09.561269620 -0300
+++ xfsprogs-dev/repair/phase5.c	2009-09-02 14:51:18.613269588 -0300
@@ -88,10 +88,8 @@ mk_incore_fstree(xfs_mount_t *mp, xfs_ag
 	xfs_agblock_t		agbno;
 	xfs_agblock_t		ag_end;
 	uint			free_blocks;
-#ifdef XR_BLD_FREE_TRACE
-	int			old_state;
-	int			state = XR_E_BAD_STATE;
-#endif
+	xfs_extlen_t		blen;
+	int			bstate;
 
 	/*
 	 * scan the bitmap for the ag looking for continuous
@@ -120,30 +118,10 @@ mk_incore_fstree(xfs_mount_t *mp, xfs_ag
 	 * ok, now find the number of extents, keep track of the
 	 * largest extent.
 	 */
-	for (agbno = 0; agbno < ag_end; agbno++)  {
-#if 0
-		old_state = state;
-		state = get_bmap(agno, agbno);
-		if (state != old_state)  {
-			fprintf(stderr, "agbno %u - new state is %d\n",
-					agbno, state);
-		}
-#endif
-		/* Process in chunks of 16 (XR_BB_UNIT/XR_BB) */
-		if ((in_extent == 0) && ((agbno & XR_BB_MASK) == 0)) {
-			/* testing >= XR_E_INUSE */
-			switch (ba_bmap[agno][agbno>>XR_BB]) {
-			case XR_E_INUSE_LL:
-			case XR_E_INUSE_FS_LL:
-			case XR_E_INO_LL:
-			case XR_E_FS_MAP_LL:
-				agbno += (XR_BB_UNIT/XR_BB) - 1;
-				continue;
-			}
-
-		}
-		if (get_bmap(agno, agbno) < XR_E_INUSE)  {
-			free_blocks++;
+	for (agbno = 0; agbno < ag_end; agbno += blen) {
+		bstate = get_bmap_ext(agno, agbno, ag_end, &blen);
+		if (bstate < XR_E_INUSE)  {
+			free_blocks += blen;
 			if (in_extent == 0)  {
 				/*
 				 * found the start of a free extent
@@ -151,9 +129,9 @@ mk_incore_fstree(xfs_mount_t *mp, xfs_ag
 				in_extent = 1;
 				num_extents++;
 				extent_start = agbno;
-				extent_len = 1;
+				extent_len = blen;
 			} else  {
-				extent_len++;
+				extent_len += blen;
 			}
 		} else   {
 			if (in_extent)  {
Index: xfsprogs-dev/repair/incore.c
===================================================================
--- xfsprogs-dev.orig/repair/incore.c	2009-09-02 14:51:09.565269570 -0300
+++ xfsprogs-dev/repair/incore.c	2009-09-02 14:51:29.072772399 -0300
@@ -18,6 +18,7 @@
 
 #include <libxfs.h>
 #include "avl.h"
+#include "btree.h"
 #include "globals.h"
 #include "incore.h"
 #include "agheader.h"
@@ -52,14 +53,192 @@ free_allocations(ba_rec_t *list)
 	return;
 }
 
+/*
+ * The following manages the in-core bitmap of the entire filesystem
+ * using extents in a btree.
+ *
+ * The btree items will point to one of the state values below,
+ * rather than storing the value itself in the pointer.
+ */
+static int states[16] =
+	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+static struct btree_root	**ag_bmap;
+
+static void
+update_bmap(
+	struct btree_root	*bmap,
+	unsigned long		offset,
+	xfs_extlen_t		blen,
+	void			*new_state)
+{
+	unsigned long		end = offset + blen;
+	int			*cur_state;
+	unsigned long		cur_key;
+	int			*next_state;
+	unsigned long		next_key;
+	int			*prev_state;
+
+	cur_state = btree_find(bmap, offset, &cur_key);
+	if (!cur_state)
+		return;
+
+	if (offset == cur_key) {
+		/* if the start is the same as the "item" extent */
+		if (cur_state == new_state)
+			return;
+
+		/*
+		 * Note: this may be NULL if we are updating the map for
+		 * the superblock.
+		 */
+		prev_state = btree_peek_prev(bmap, NULL);
+
+		next_state = btree_peek_next(bmap, &next_key);
+		if (next_key > end) {
+			/* different end */
+			if (new_state == prev_state) {
+				/* #1: prev has same state, move offset up */
+				btree_update_key(bmap, offset, end);
+				return;
+			}
+
+			/* #4: insert new extent after, update current value */
+			btree_update_value(bmap, offset, new_state);
+			btree_insert(bmap, end, cur_state);
+			return;
+		}
+
+		/* same end (and same start) */
+		if (new_state == next_state) {
+			/* next has same state */
+			if (new_state == prev_state) {
+				/* #3: merge prev & next */
+				btree_delete(bmap, offset);
+				btree_delete(bmap, end);
+				return;
+			}
+
+			/* #8: merge next */
+			btree_update_value(bmap, offset, new_state);
+			btree_delete(bmap, end);
+			return;
+		}
+
+		/* same start, same end, next has different state */
+		if (new_state == prev_state) {
+			/* #5: prev has same state */
+			btree_delete(bmap, offset);
+			return;
+		}
+
+		/* #6: update value only */
+		btree_update_value(bmap, offset, new_state);
+		return;
+	}
+
+	/* different start, offset is in the middle of "cur" */
+	prev_state = btree_peek_prev(bmap, NULL);
+	ASSERT(prev_state != NULL);
+	if (prev_state == new_state)
+		return;
+
+	if (end == cur_key) {
+		/* end is at the same point as the current extent */
+		if (new_state == cur_state) {
+			/* #7: move next extent down */
+			btree_update_key(bmap, end, offset);
+			return;
+		}
+
+		/* #9: different start, same end, add new extent */
+		btree_insert(bmap, offset, new_state);
+		return;
+	}
+
+	/* #2: insert an extent into the middle of another extent */
+	btree_insert(bmap, offset, new_state);
+	btree_insert(bmap, end, prev_state);
+}
+
+void
+set_bmap_ext(
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		blen,
+	int			state)
+{
+	update_bmap(ag_bmap[agno], agbno, blen, &states[state]);
+}
+
+int
+get_bmap_ext(
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		maxbno,
+	xfs_extlen_t		*blen)
+{
+	int			*statep;
+	unsigned long		key;
+
+	statep = btree_find(ag_bmap[agno], agbno, &key);
+	if (!statep)
+		return -1;
+
+	if (key == agbno) {
+		if (blen) {
+			if (!btree_peek_next(ag_bmap[agno], &key))
+				return -1;
+			*blen = MIN(maxbno, key) - agbno;
+		}
+		return *statep;
+	}
+
+	statep = btree_peek_prev(ag_bmap[agno], NULL);
+	if (!statep)
+		return -1;
+	if (blen)
+		*blen = MIN(maxbno, key) - agbno;
+
+	return *statep;
+}
 
+static uint64_t		*rt_bmap;
 static size_t		rt_bmap_size;
 
+/* block records fit into __uint64_t's units */
+#define XR_BB_UNIT	64			/* number of bits/unit */
+#define XR_BB		4			/* bits per block record */
+#define XR_BB_NUM	(XR_BB_UNIT/XR_BB)	/* number of records per unit */
+#define XR_BB_MASK	0xF			/* block record mask */
+
+/*
+ * these work in real-time extents (e.g. fsbno == rt extent number)
+ */
+int
+get_rtbmap(
+	xfs_drtbno_t	bno)
+{
+	return (*(rt_bmap + bno /  XR_BB_NUM) >>
+		((bno % XR_BB_NUM) * XR_BB)) & XR_BB_MASK;
+}
+
+void
+set_rtbmap(
+	xfs_drtbno_t	bno,
+	int		state)
+{
+	*(rt_bmap + bno / XR_BB_NUM) =
+	 ((*(rt_bmap + bno / XR_BB_NUM) &
+	  (~((__uint64_t) XR_BB_MASK << ((bno % XR_BB_NUM) * XR_BB)))) |
+	 (((__uint64_t) state) << ((bno % XR_BB_NUM) * XR_BB)));
+}
+
 static void
 reset_rt_bmap(void)
 {
-	if (rt_ba_bmap)
-		memset(rt_ba_bmap, 0x22, rt_bmap_size);	/* XR_E_FREE */
+	if (rt_bmap)
+		memset(rt_bmap, 0x22, rt_bmap_size);	/* XR_E_FREE */
 }
 
 static void
@@ -72,8 +251,8 @@ init_rt_bmap(
 	rt_bmap_size = roundup(mp->m_sb.sb_rextents / (NBBY / XR_BB),
 			       sizeof(__uint64_t));
 
-	rt_ba_bmap = memalign(sizeof(__uint64_t), rt_bmap_size);
-	if (!rt_ba_bmap) {
+	rt_bmap = memalign(sizeof(__uint64_t), rt_bmap_size);
+	if (!rt_bmap) {
 		do_error(
 		_("couldn't allocate realtime block map, size = %llu\n"),
 			mp->m_sb.sb_rextents);
@@ -84,8 +263,8 @@ init_rt_bmap(
 static void
 free_rt_bmap(xfs_mount_t *mp)
 {
-	free(rt_ba_bmap);
-	rt_ba_bmap = NULL;
+	free(rt_bmap);
+	rt_bmap = NULL;
 }
 
 
@@ -93,28 +272,41 @@ void
 reset_bmaps(xfs_mount_t *mp)
 {
 	xfs_agnumber_t	agno;
+	xfs_agblock_t	ag_size;
 	int		ag_hdr_block;
-	int		i;
 
 	ag_hdr_block = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
+	ag_size = mp->m_sb.sb_agblocks;
 
-	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++)  {
-		memset(ba_bmap[agno], 0,
-		       roundup((mp->m_sb.sb_agblocks + (NBBY / XR_BB) - 1) /
-				(NBBY / XR_BB), sizeof(__uint64_t)));
-		for (i = 0; i < ag_hdr_block; i++)
-			set_bmap(agno, i, XR_E_INUSE_FS);
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		if (agno == mp->m_sb.sb_agcount - 1)
+			ag_size = (xfs_extlen_t)(mp->m_sb.sb_dblocks -
+				   (xfs_drfsbno_t)mp->m_sb.sb_agblocks * agno);
+#ifdef BTREE_STATS
+		if (btree_find(ag_bmap[agno], 0, NULL)) {
+			printf("ag_bmap[%d] btree stats:\n", agno);
+			btree_print_stats(ag_bmap[agno], stdout);
+		}
+#endif
+		/*
+		 * We always insert an item for the first block having a
+		 * given state.  So the code below means:
+		 *
+		 *	block 0..ag_hdr_block-1:	XR_E_INUSE_FS
+		 *	ag_hdr_block..ag_size:		XR_E_UNKNOWN
+		 *	ag_size...			XR_E_BAD_STATE
+		 */
+		btree_clear(ag_bmap[agno]);
+		btree_insert(ag_bmap[agno], 0, &states[XR_E_INUSE_FS]);
+		btree_insert(ag_bmap[agno],
+				ag_hdr_block, &states[XR_E_UNKNOWN]);
+		btree_insert(ag_bmap[agno], ag_size, &states[XR_E_BAD_STATE]);
 	}
 
 	if (mp->m_sb.sb_logstart != 0) {
-		xfs_dfsbno_t	logend;
-
-		logend = mp->m_sb.sb_logstart + mp->m_sb.sb_logblocks;
-
-		for (i = mp->m_sb.sb_logstart; i < logend ; i++)  {
-			set_bmap(XFS_FSB_TO_AGNO(mp, i),
-				 XFS_FSB_TO_AGBNO(mp, i), XR_E_INUSE_FS);
-		}
+		set_bmap_ext(XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart),
+			     XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart),
+			     mp->m_sb.sb_logblocks, XR_E_INUSE_FS);
 	}
 
 	reset_rt_bmap();
@@ -123,30 +315,18 @@ reset_bmaps(xfs_mount_t *mp)
 void
 init_bmaps(xfs_mount_t *mp)
 {
-	xfs_agblock_t numblocks = mp->m_sb.sb_agblocks;
-	int agcount = mp->m_sb.sb_agcount;
-	int i;
-	size_t size = 0;
-
-	ba_bmap = calloc(agcount, sizeof(__uint64_t *));
-	if (!ba_bmap)
-		do_error(_("couldn't allocate block map pointers\n"));
+	xfs_agnumber_t i;
 
-	ag_locks = calloc(agcount, sizeof(pthread_mutex_t));
+	ag_bmap = calloc(mp->m_sb.sb_agcount, sizeof(struct btree_root *));
+	if (!ag_bmap)
+		do_error(_("couldn't allocate block map btree roots\n"));
+
+	ag_locks = calloc(mp->m_sb.sb_agcount, sizeof(pthread_mutex_t));
 	if (!ag_locks)
 		do_error(_("couldn't allocate block map locks\n"));
 
-	for (i = 0; i < agcount; i++)  {
-		size = roundup((numblocks+(NBBY/XR_BB)-1) / (NBBY/XR_BB),
-		       		sizeof(__uint64_t));
-
-		ba_bmap[i] = memalign(sizeof(__uint64_t), size);
-		if (!ba_bmap[i]) {
-			do_error(_("couldn't allocate block map, size = %d\n"),
-				numblocks);
-			return;
-		}
-		memset(ba_bmap[i], 0, size);
+	for (i = 0; i < mp->m_sb.sb_agcount; i++)  {
+		btree_init(&ag_bmap[i]);
 		pthread_mutex_init(&ag_locks[i], NULL);
 	}
 
@@ -160,9 +340,9 @@ free_bmaps(xfs_mount_t *mp)
 	xfs_agnumber_t i;
 
 	for (i = 0; i < mp->m_sb.sb_agcount; i++)
-		free(ba_bmap[i]);
-	free(ba_bmap);
-	ba_bmap = NULL;
+		btree_destroy(ag_bmap[i]);
+	free(ag_bmap);
+	ag_bmap = NULL;
 
 	free_rt_bmap(mp);
 }
Index: xfsprogs-dev/repair/incore.h
===================================================================
--- xfsprogs-dev.orig/repair/incore.h	2009-09-02 14:51:09.573269190 -0300
+++ xfsprogs-dev/repair/incore.h	2009-09-02 14:51:18.621298890 -0300
@@ -37,59 +37,32 @@ void			record_allocation(ba_rec_t *addr,
 void			free_allocations(ba_rec_t *list);
 
 /*
- * block bit map defs -- track state of each filesystem block.
- * ba_bmap is an array of bitstrings declared in the globals.h file.
- * the bitstrings are broken up into 64-bit chunks.  one bitstring per AG.
- */
-#define BA_BMAP_SIZE(x)		(howmany(x, 4))
-
-void			init_bmaps(xfs_mount_t *mp);
-void			reset_bmaps(xfs_mount_t *mp);
-void			free_bmaps(xfs_mount_t *mp);
-
-
-/* blocks are numbered from zero */
-
-/* block records fit into __uint64_t's units */
-
-#define XR_BB_UNIT	64			/* number of bits/unit */
-#define XR_BB		4			/* bits per block record */
-#define XR_BB_NUM	(XR_BB_UNIT/XR_BB)	/* number of records per unit */
-#define XR_BB_MASK	0xF			/* block record mask */
-
-/*
- * bitstring ops -- set/get block states, either in filesystem
- * bno's or in agbno's.  turns out that fsbno addressing is
- * more convenient when dealing with bmap extracted addresses
- * and agbno addressing is more convenient when dealing with
- * meta-data extracted addresses.  So the fsbno versions use
- * mtype (which can be one of the block map types above) to
- * set the correct block map while the agbno versions assume
- * you want to use the regular block map.
- */
-
-#define get_bmap(agno, ag_blockno) \
-			((int) (*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) \
-				 >> (((ag_blockno)%XR_BB_NUM)*XR_BB)) \
-				& XR_BB_MASK)
-#define set_bmap(agno, ag_blockno, state) \
-	*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) = \
-		((*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) & \
-	  (~((__uint64_t) XR_BB_MASK << (((ag_blockno)%XR_BB_NUM)*XR_BB)))) | \
-	 (((__uint64_t) (state)) << (((ag_blockno)%XR_BB_NUM)*XR_BB)))
-
-/*
- * these work in real-time extents (e.g. fsbno == rt extent number)
- */
-#define get_rtbmap(fsbno) \
-			((*(rt_ba_bmap + (fsbno)/XR_BB_NUM) >> \
-			(((fsbno)%XR_BB_NUM)*XR_BB)) & XR_BB_MASK)
-#define set_rtbmap(fsbno, state) \
-	*(rt_ba_bmap + (fsbno)/XR_BB_NUM) = \
-	 ((*(rt_ba_bmap + (fsbno)/XR_BB_NUM) & \
-	  (~((__uint64_t) XR_BB_MASK << (((fsbno)%XR_BB_NUM)*XR_BB)))) | \
-	 (((__uint64_t) (state)) << (((fsbno)%XR_BB_NUM)*XR_BB)))
+ * block map -- track state of each filesystem block.
+ */
+
+void		init_bmaps(xfs_mount_t *mp);
+void		reset_bmaps(xfs_mount_t *mp);
+void		free_bmaps(xfs_mount_t *mp);
+
+void		set_bmap_ext(xfs_agnumber_t agno, xfs_agblock_t agbno,
+			     xfs_extlen_t blen, int state);
+int		get_bmap_ext(xfs_agnumber_t agno, xfs_agblock_t agbno,
+			     xfs_agblock_t maxbno, xfs_extlen_t *blen);
 
+void		set_rtbmap(xfs_drtbno_t bno, int state);
+int		get_rtbmap(xfs_drtbno_t bno);
+
+static inline void
+set_bmap(xfs_agnumber_t agno, xfs_agblock_t agbno, int state)
+{
+	set_bmap_ext(agno, agbno, 1, state);
+}
+
+static inline int
+get_bmap(xfs_agnumber_t agno, xfs_agblock_t agbno)
+{
+	return get_bmap_ext(agno, agbno, agbno + 1, NULL);
+}
 
 /*
  * extent tree definitions
Index: xfsprogs-dev/repair/scan.c
===================================================================
--- xfsprogs-dev.orig/repair/scan.c	2009-09-02 14:51:09.577269000 -0300
+++ xfsprogs-dev/repair/scan.c	2009-09-02 14:51:18.629269735 -0300
@@ -509,7 +509,7 @@ _("%s freespace btree block claimed (sta
 		rp = XFS_ALLOC_REC_ADDR(mp, block, 1);
 		for (i = 0; i < numrecs; i++) {
 			xfs_agblock_t		b, end;
-			xfs_extlen_t		len;
+			xfs_extlen_t		len, blen;
 
 			b = be32_to_cpu(rp[i].ar_startblock);
 			len = be32_to_cpu(rp[i].ar_blockcount);
@@ -522,8 +522,8 @@ _("%s freespace btree block claimed (sta
 			if (!verify_agbno(mp, agno, end - 1))
 				continue;
 
-			for ( ; b < end; b++)  {
-				state = get_bmap(agno, b);
+			for ( ; b < end; b += blen)  {
+				state = get_bmap_ext(agno, b, end, &blen);
 				switch (state) {
 				case XR_E_UNKNOWN:
 					set_bmap(agno, b, XR_E_FREE1);
@@ -534,13 +534,15 @@ _("%s freespace btree block claimed (sta
 					 * FREE1 blocks later
 					 */
 					if (magic != XFS_ABTB_MAGIC) {
-						set_bmap(agno, b, XR_E_FREE);
+						set_bmap_ext(agno, b, blen,
+							     XR_E_FREE);
 						break;
 					}
 				default:
 					do_warn(
-	_("block (%d,%d) multiply claimed by %s space tree, state - %d\n"),
-						agno, b, name, state);
+	_("block (%d,%d-%d) multiply claimed by %s space tree, state - %d\n"),
+						agno, b, b + blen - 1,
+						name, state);
 					break;
 				}
 			}
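
The core idea is easier to see outside the patch context: the block map
becomes a run-length encoding, where each btree entry marks the first
block of a run and the run extends to the next entry's key.  Below is a
minimal standalone sketch of that idea, not the repair code itself; it
uses a small sorted array in place of the btree, and the ext_map,
ext_find and ext_get names are invented for the example.  The real code
keeps one btree per AG and stores, as each run's value, a pointer into
the static states[] array, so that every state has a unique, stable
pointer to put in the tree.  Only the realtime device keeps the old
packed 4-bits-per-block bitmap (get_rtbmap/set_rtbmap above).

#include <assert.h>
#include <stdio.h>

#define XR_E_UNKNOWN	0
#define XR_E_INUSE	4

struct ext_map {
	unsigned long	start[16];	/* first block of each run */
	int		state[16];	/* state of that run */
	int		nr;		/* number of runs */
};

/* index of the run containing bno; runs are sorted by start block */
static int
ext_find(struct ext_map *m, unsigned long bno)
{
	int	i;

	for (i = m->nr - 1; i >= 0; i--)
		if (m->start[i] <= bno)
			return i;
	return -1;
}

/*
 * state of bno plus the length of the rest of its run, clamped to
 * maxbno (compare get_bmap_ext() above)
 */
static int
ext_get(struct ext_map *m, unsigned long bno, unsigned long maxbno,
	unsigned long *blen)
{
	int		i = ext_find(m, bno);
	unsigned long	end;

	assert(i >= 0);
	end = (i + 1 < m->nr) ? m->start[i + 1] : maxbno;
	if (end > maxbno)
		end = maxbno;
	*blen = end - bno;
	return m->state[i];
}

int
main(void)
{
	/* blocks 0-3 in use by the fs, 4-99 unknown, as after reset_bmaps */
	struct ext_map	m = {
		.start = { 0, 4 },
		.state = { XR_E_INUSE, XR_E_UNKNOWN },
		.nr = 2,
	};
	unsigned long	bno, blen;

	/* walk the whole "AG" one run at a time instead of block by block */
	for (bno = 0; bno < 100; bno += blen) {
		int	state = ext_get(&m, bno, 100, &blen);

		printf("blocks %lu-%lu: state %d\n",
			bno, bno + blen - 1, state);
	}
	return 0;
}

Two runs cover all 100 blocks here, which is the whole point: the loops
that used to touch every block now advance by whole runs.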

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 13/14] repair: optimize duplicate extent tracking
  2009-09-02 17:55 [PATCH 00/14] repair memory usage reductions Christoph Hellwig
                   ` (11 preceding siblings ...)
  2009-09-02 17:55 ` [PATCH 12/14] repair: switch block usage bitmap to a btree Christoph Hellwig
@ 2009-09-02 17:55 ` Christoph Hellwig
  2009-10-22 16:41   ` Alex Elder
  2009-09-02 17:55 ` [PATCH 14/14] repair: add missing locking in scanfunc_bmap Christoph Hellwig
                   ` (2 subsequent siblings)
  15 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-09-02 17:55 UTC (permalink / raw)
  To: xfs; +Cc: Barry Naujok

[-- Attachment #1: repair-dup_extents-btree --]
[-- Type: text/plain, Size: 9885 bytes --]

Switch the duplicate extent tracking from an avl tree to our new btree
implementation.  Modify search_dup_extent to find overlapping extents
with differing start blocks instead of having the caller walk every
possible start block.


Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: xfsprogs-dev/repair/dinode.c
===================================================================
--- xfsprogs-dev.orig/repair/dinode.c	2009-08-21 15:11:16.000000000 +0000
+++ xfsprogs-dev/repair/dinode.c	2009-08-21 15:11:29.000000000 +0000
@@ -735,18 +735,14 @@ process_bmbt_reclist_int(
 			 * checking each entry without setting the
 			 * block bitmap
 			 */
-			for (b = irec.br_startblock;
-			     agbno < ebno;
-			     b++, agbno++)  {
-				if (search_dup_extent(mp, agno, agbno)) {
-					do_warn(_("%s fork in ino %llu claims "
-						"dup extent, off - %llu, "
-						"start - %llu, cnt %llu\n"),
-						forkname, ino, irec.br_startoff,
-						irec.br_startblock,
-						irec.br_blockcount);
-					goto done;
-				}
+			if (search_dup_extent(agno, agbno, ebno)) {
+				do_warn(_("%s fork in ino %llu claims "
+					"dup extent, off - %llu, "
+					"start - %llu, cnt %llu\n"),
+					forkname, ino, irec.br_startoff,
+					irec.br_startblock,
+					irec.br_blockcount);
+				goto done;
 			}
 			*tot += irec.br_blockcount;
 			continue;
Index: xfsprogs-dev/repair/incore.h
===================================================================
--- xfsprogs-dev.orig/repair/incore.h	2009-08-21 15:11:16.000000000 +0000
+++ xfsprogs-dev/repair/incore.h	2009-08-21 15:11:29.000000000 +0000
@@ -20,6 +20,8 @@
 #define XFS_REPAIR_INCORE_H
 
 #include "avl.h"
+
+
 /*
  * contains definition information.  implementation (code)
  * is spread out in separate files.
@@ -179,23 +181,11 @@ get_bcnt_extent(xfs_agnumber_t agno, xfs
 /*
  * duplicate extent tree functions
  */
-void		add_dup_extent(xfs_agnumber_t agno,
-				xfs_agblock_t startblock,
-				xfs_extlen_t blockcount);
-
-extern avltree_desc_t   **extent_tree_ptrs;
-/* ARGSUSED */
-static inline int
-search_dup_extent(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agblock_t agbno)
-{
-	ASSERT(agno < glob_agcount);
-
-	if (avl_findrange(extent_tree_ptrs[agno], agbno) != NULL)
-		return(1);
-
-	return(0);
-}
 
+int		add_dup_extent(xfs_agnumber_t agno, xfs_agblock_t startblock,
+			xfs_extlen_t blockcount);
+int		search_dup_extent(xfs_agnumber_t agno,
+			xfs_agblock_t start_agbno, xfs_agblock_t end_agbno);
 void		add_rt_dup_extent(xfs_drtbno_t	startblock,
 				xfs_extlen_t	blockcount);
 
Index: xfsprogs-dev/repair/incore_ext.c
===================================================================
--- xfsprogs-dev.orig/repair/incore_ext.c	2009-08-21 15:11:16.000000000 +0000
+++ xfsprogs-dev/repair/incore_ext.c	2009-08-21 15:24:07.000000000 +0000
@@ -18,6 +18,7 @@
 
 #include <libxfs.h>
 #include "avl.h"
+#include "btree.h"
 #include "globals.h"
 #include "incore.h"
 #include "agheader.h"
@@ -72,8 +73,8 @@ static rt_ext_flist_t rt_ext_flist;
 
 static avl64tree_desc_t	*rt_ext_tree_ptr;	/* dup extent tree for rt */
 
-avltree_desc_t	**extent_tree_ptrs;		/* array of extent tree ptrs */
-						/* one per ag for dups */
+static struct btree_root **dup_extent_trees;	/* per ag dup extent trees */
+
 static avltree_desc_t	**extent_bno_ptrs;	/*
 						 * array of extent tree ptrs
 						 * one per ag for free extents
@@ -100,6 +101,48 @@ static pthread_mutex_t	rt_ext_tree_lock;
 static pthread_mutex_t	rt_ext_flist_lock;
 
 /*
+ * duplicate extent tree functions
+ */
+
+void
+release_dup_extent_tree(
+	xfs_agnumber_t		agno)
+{
+	btree_clear(dup_extent_trees[agno]);
+}
+
+int
+add_dup_extent(
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		startblock,
+	xfs_extlen_t		blockcount)
+{
+#ifdef XR_DUP_TRACE
+	fprintf(stderr, "Adding dup extent - %d/%d %d\n", agno, startblock,
+		blockcount);
+#endif
+	return btree_insert(dup_extent_trees[agno], startblock,
+				(void *)(uintptr_t)(startblock + blockcount));
+}
+
+int
+search_dup_extent(
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		start_agbno,
+	xfs_agblock_t		end_agbno)
+{
+	unsigned long	bno;
+
+	if (!btree_find(dup_extent_trees[agno], start_agbno, &bno))
+		return 0;	/* this really shouldn't happen */
+	if (bno < end_agbno)
+		return 1;
+	return (uintptr_t)btree_peek_prev(dup_extent_trees[agno], NULL) >
+								start_agbno;
+}
+
+
+/*
  * extent tree stuff is avl trees of duplicate extents,
  * sorted in order by block number.  there is one tree per ag.
  */
@@ -211,14 +254,6 @@ release_extent_tree(avltree_desc_t *tree
  * top-level (visible) routines
  */
 void
-release_dup_extent_tree(xfs_agnumber_t agno)
-{
-	release_extent_tree(extent_tree_ptrs[agno]);
-
-	return;
-}
-
-void
 release_agbno_extent_tree(xfs_agnumber_t agno)
 {
 	release_extent_tree(extent_bno_ptrs[agno]);
@@ -522,93 +557,6 @@ get_bcnt_extent(xfs_agnumber_t agno, xfs
 	return(ext);
 }
 
-/*
- * the next 2 routines manage the trees of duplicate extents -- 1 tree
- * per AG
- */
-void
-add_dup_extent(xfs_agnumber_t agno, xfs_agblock_t startblock,
-		xfs_extlen_t blockcount)
-{
-	extent_tree_node_t *first, *last, *ext, *next_ext;
-	xfs_agblock_t new_startblock;
-	xfs_extlen_t new_blockcount;
-
-	ASSERT(agno < glob_agcount);
-
-#ifdef XR_DUP_TRACE
-	fprintf(stderr, "Adding dup extent - %d/%d %d\n", agno, startblock, blockcount);
-#endif
-	avl_findranges(extent_tree_ptrs[agno], startblock - 1,
-		startblock + blockcount + 1,
-		(avlnode_t **) &first, (avlnode_t **) &last);
-	/*
-	 * find adjacent and overlapping extent blocks
-	 */
-	if (first == NULL && last == NULL)  {
-		/* nothing, just make and insert new extent */
-
-		ext = mk_extent_tree_nodes(startblock, blockcount, XR_E_MULT);
-
-		if (avl_insert(extent_tree_ptrs[agno],
-				(avlnode_t *) ext) == NULL)  {
-			do_error(_("duplicate extent range\n"));
-		}
-
-		return;
-	}
-
-	ASSERT(first != NULL && last != NULL);
-
-	/*
-	 * find the new composite range, delete old extent nodes
-	 * as we go
-	 */
-	new_startblock = startblock;
-	new_blockcount = blockcount;
-
-	for (ext = first;
-		ext != (extent_tree_node_t *) last->avl_node.avl_nextino;
-		ext = next_ext)  {
-		/*
-		 * preserve the next inorder node
-		 */
-		next_ext = (extent_tree_node_t *) ext->avl_node.avl_nextino;
-		/*
-		 * just bail if the new extent is contained within an old one
-		 */
-		if (ext->ex_startblock <= startblock &&
-				ext->ex_blockcount >= blockcount)
-			return;
-		/*
-		 * now check for overlaps and adjacent extents
-		 */
-		if (ext->ex_startblock + ext->ex_blockcount >= startblock
-			|| ext->ex_startblock <= startblock + blockcount)  {
-
-			if (ext->ex_startblock < new_startblock)
-				new_startblock = ext->ex_startblock;
-
-			if (ext->ex_startblock + ext->ex_blockcount >
-					new_startblock + new_blockcount)
-				new_blockcount = ext->ex_startblock +
-							ext->ex_blockcount -
-							new_startblock;
-
-			avl_delete(extent_tree_ptrs[agno], (avlnode_t *) ext);
-			continue;
-		}
-	}
-
-	ext = mk_extent_tree_nodes(new_startblock, new_blockcount, XR_E_MULT);
-
-	if (avl_insert(extent_tree_ptrs[agno], (avlnode_t *) ext) == NULL)  {
-		do_error(_("duplicate extent range\n"));
-	}
-
-	return;
-}
-
 static __psunsigned_t
 avl_ext_start(avlnode_t *node)
 {
@@ -901,10 +849,9 @@ incore_ext_init(xfs_mount_t *mp)
 	pthread_mutex_init(&rt_ext_tree_lock, NULL);
 	pthread_mutex_init(&rt_ext_flist_lock, NULL);
 
-	if ((extent_tree_ptrs = malloc(agcount *
-					sizeof(avltree_desc_t *))) == NULL)
-		do_error(
-	_("couldn't malloc dup extent tree descriptor table\n"));
+	dup_extent_trees = calloc(agcount, sizeof(struct btree_root *));
+	if (!dup_extent_trees)
+		do_error(_("couldn't malloc dup extent tree descriptor table\n"));
 
 	if ((extent_bno_ptrs = malloc(agcount *
 					sizeof(avltree_desc_t *))) == NULL)
@@ -917,10 +864,6 @@ incore_ext_init(xfs_mount_t *mp)
 	_("couldn't malloc free by-bcnt extent tree descriptor table\n"));
 
 	for (i = 0; i < agcount; i++)  {
-		if ((extent_tree_ptrs[i] =
-				malloc(sizeof(avltree_desc_t))) == NULL)
-			do_error(
-			_("couldn't malloc dup extent tree descriptor\n"));
 		if ((extent_bno_ptrs[i] =
 				malloc(sizeof(avltree_desc_t))) == NULL)
 			do_error(
@@ -932,7 +875,7 @@ incore_ext_init(xfs_mount_t *mp)
 	}
 
 	for (i = 0; i < agcount; i++)  {
-		avl_init_tree(extent_tree_ptrs[i], &avl_extent_tree_ops);
+		btree_init(&dup_extent_trees[i]);
 		avl_init_tree(extent_bno_ptrs[i], &avl_extent_tree_ops);
 		avl_init_tree(extent_bcnt_ptrs[i], &avl_extent_bcnt_tree_ops);
 	}
@@ -959,18 +902,18 @@ incore_ext_teardown(xfs_mount_t *mp)
 	free_allocations(ba_list);
 
 	for (i = 0; i < mp->m_sb.sb_agcount; i++)  {
-		free(extent_tree_ptrs[i]);
+		btree_destroy(dup_extent_trees[i]);
 		free(extent_bno_ptrs[i]);
 		free(extent_bcnt_ptrs[i]);
 	}
 
+	free(dup_extent_trees);
 	free(extent_bcnt_ptrs);
 	free(extent_bno_ptrs);
-	free(extent_tree_ptrs);
 
-	extent_bcnt_ptrs = extent_bno_ptrs = extent_tree_ptrs = NULL;
-
-	return;
+	dup_extent_trees = NULL;
+	extent_bcnt_ptrs = NULL;
+	extent_bno_ptrs = NULL;
 }
 
 int
Index: xfsprogs-dev/repair/scan.c
===================================================================
--- xfsprogs-dev.orig/repair/scan.c	2009-08-21 15:11:16.000000000 +0000
+++ xfsprogs-dev/repair/scan.c	2009-08-21 15:23:51.000000000 +0000
@@ -286,8 +286,9 @@ _("bad back (left) sibling pointer (saw 
 		 * filesystem
 		 */
 		if (type != XR_INO_RTDATA || whichfork != XFS_DATA_FORK)  {
-			if (search_dup_extent(mp, XFS_FSB_TO_AGNO(mp, bno),
-					XFS_FSB_TO_AGBNO(mp, bno)))
+			if (search_dup_extent(XFS_FSB_TO_AGNO(mp, bno),
+					XFS_FSB_TO_AGBNO(mp, bno),
+					XFS_FSB_TO_AGBNO(mp, bno) + 1))
 				return(1);
 		} else  {
 			if (search_rt_dup_extent(mp, bno))
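
To spell out the overlap test: btree_find() is assumed to return the
entry with the smallest key greater than or equal to the search key,
and each extent stores its end block (start + length) as the btree
value.  A range then overlaps a recorded extent if either an extent
starts inside the range, or the extent just before the range runs past
the range's start.  Here is a standalone sketch of the same logic over
a sorted array; search_dup and dup[] are stand-ins for the per-AG
btree, not the repair code:

#include <stdio.h>

struct dup_ext {
	unsigned long	start;		/* first block of the extent */
	unsigned long	end;		/* one past the last block */
};

/* duplicate extents, sorted by start and already merged */
static struct dup_ext	dup[] = { { 10, 15 }, { 40, 60 } };
static const int	nr_dup = sizeof(dup) / sizeof(dup[0]);

/* does [start, end) overlap any recorded extent? */
static int
search_dup(unsigned long start, unsigned long end)
{
	int	i;

	/* find the first extent whose start is >= the query start */
	for (i = 0; i < nr_dup && dup[i].start < start; i++)
		;
	/* an extent starts inside the query range */
	if (i < nr_dup && dup[i].start < end)
		return 1;
	/* or the previous extent extends past the query start */
	return i > 0 && dup[i - 1].end > start;
}

int
main(void)
{
	printf("%d\n", search_dup(5, 10));	/* 0: ends where [10,15) starts */
	printf("%d\n", search_dup(12, 13));	/* 1: inside [10,15) */
	printf("%d\n", search_dup(14, 40));	/* 1: overlaps tail of [10,15) */
	printf("%d\n", search_dup(15, 40));	/* 0: falls in the gap */
	return 0;
}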

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 14/14] repair: add missing locking in scanfunc_bmap
  2009-09-02 17:55 [PATCH 00/14] repair memory usage reductions Christoph Hellwig
                   ` (12 preceding siblings ...)
  2009-09-02 17:55 ` [PATCH 13/14] repair: optimize duplicate extent tracking Christoph Hellwig
@ 2009-09-02 17:55 ` Christoph Hellwig
  2009-10-22 16:42   ` Alex Elder
  2009-09-03 20:49 ` [PATCH 00/14] repair memory usage reductions Geoffrey Wehrman
  2009-11-12 15:58 ` Christoph Hellwig
  15 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-09-02 17:55 UTC (permalink / raw)
  To: xfs; +Cc: Barry Naujok

[-- Attachment #1: repair-scanfunc_bmap-locking --]
[-- Type: text/plain, Size: 941 bytes --]

Make sure to protect access to the block usage tracking btree with
the ag_lock.


Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>


Index: xfsprogs-dev/repair/scan.c
===================================================================
--- xfsprogs-dev.orig/repair/scan.c	2009-08-20 03:16:13.000000000 +0000
+++ xfsprogs-dev/repair/scan.c	2009-08-20 03:18:17.000000000 +0000
@@ -235,6 +235,7 @@
 		agno = XFS_FSB_TO_AGNO(mp, bno);
 		agbno = XFS_FSB_TO_AGBNO(mp, bno);
 
+		pthread_mutex_lock(&ag_locks[agno]);
 		state = get_bmap(agno, agbno);
 		switch (state) {
 		case XR_E_UNKNOWN:
@@ -280,6 +281,7 @@
 				state, ino, (__uint64_t) bno);
 			break;
 		}
+		pthread_mutex_unlock(&ag_locks[agno]);
 	} else  {
 		/*
 		 * attribute fork for realtime files is in the regular
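
The pattern the patch enforces is simply "take the AG's lock around
every access to that AG's block map", since the prefetch and processing
threads can hit the same AG concurrently.  A hedged sketch of the
pattern follows; claim_block and the array-backed map are illustrative
only, the real code goes through get_bmap()/set_bmap() on the per-AG
btree:

#include <pthread.h>

#define NUM_AGS		4

static pthread_mutex_t	ag_locks[NUM_AGS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};
static int		bmap_state[NUM_AGS][128];	/* toy stand-in */

/* read-modify-write of one block's state, serialized per AG */
static int
claim_block(int agno, int agbno, int new_state)
{
	int	old;

	pthread_mutex_lock(&ag_locks[agno]);
	old = bmap_state[agno][agbno];		/* get_bmap() in repair */
	bmap_state[agno][agbno] = new_state;	/* set_bmap() in repair */
	pthread_mutex_unlock(&ag_locks[agno]);
	return old;
}

int
main(void)
{
	return claim_block(0, 10, 4);
}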

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] repair memory usage reductions
  2009-09-02 17:55 [PATCH 00/14] repair memory usage reductions Christoph Hellwig
                   ` (13 preceding siblings ...)
  2009-09-02 17:55 ` [PATCH 14/14] repair: add missing locking in scanfunc_bmap Christoph Hellwig
@ 2009-09-03 20:49 ` Geoffrey Wehrman
  2009-09-04  2:57   ` Dave Chinner
  2009-11-12 15:58 ` Christoph Hellwig
  15 siblings, 1 reply; 50+ messages in thread
From: Geoffrey Wehrman @ 2009-09-03 20:49 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Wed, Sep 02, 2009 at 01:55:31PM -0400, Christoph Hellwig wrote:
| This is a respin of the patches Barry Naujok wrote at SGI for reducing
| the memory usage in repair.  I've split it up, fixed a few small bugs
| and added two preparatory cleanups - but all the real work is Barry's.
| There has been lots of heavy testing on large filesystems by Barry
| on the original patches, and quite a lot of testing on slightly smaller
| filesystems by me.  These were all ad-hoc tests as XFSQA coverage is
| rather low on repair.  My plan is to add various additional testcase
| for XFSQA both for intentional corruptions as well as reproducing past
| reported bugs before we'll release these patches in xfsprogs.  But I think
| it would be good if we could get them into the development git tree to
| get wider coverage already.

How do these changes affect xfs_repair I/O performance?  Barry's changes
were previously withheld within SGI due to a regression in performance.


-- 
Geoffrey Wehrman  651-683-5496  gwehrman@sgi.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] repair memory usage reductions
  2009-09-03 20:49 ` [PATCH 00/14] repair memory usage reductions Geoffrey Wehrman
@ 2009-09-04  2:57   ` Dave Chinner
  2009-09-04 13:37     ` Geoffrey Wehrman
  0 siblings, 1 reply; 50+ messages in thread
From: Dave Chinner @ 2009-09-04  2:57 UTC (permalink / raw)
  To: Geoffrey Wehrman; +Cc: Christoph Hellwig, xfs

On Thu, Sep 03, 2009 at 03:49:40PM -0500, Geoffrey Wehrman wrote:
> On Wed, Sep 02, 2009 at 01:55:31PM -0400, Christoph Hellwig wrote:
> | This is a respin of the patches Barry Naujok wrote at SGI for reducing
> | the memory usage in repair.  I've split it up, fixed a few small bugs
> | and added two preparatory cleanups - but all the real work is Barry's.
> | There has been lots of heavy testing on large filesystems by Barry
> | on the original patches, and quite a lot of testing on slightly smaller
> | filesystems by me.  These were all ad-hoc tests as XFSQA coverage is
> | rather low on repair.  My plan is to add various additional testcase
> | for XFSQA both for intentional corruptions as well as reproducing past
> | reported bugs before we'll release these patches in xfsprogs.  But I think
> | it would be good if we could get them into the development git tree to
> | get wider coverage already.
> 
> How do these changes affect xfs_repair I/O performance?  Barry's changes
> were previously withheld within SGI due to a regression in performance.

Christoph asked me to repeat what I said on #xfs w.r.t the regression.

The repair slowdowns were a result of increased CPU usage of the
btree structures used to track free space compared to manipulating
massive bitmaps.  Hence, if you had a disk subsystem fast enough that
prefetching could keep the CPUs 100% busy processing all the
incoming metadata, the memory-optimised repair was about 30% slower
than the existing repair code.

However, getting to being CPU bound with the current
repair code requires having a *lot* of memory, so the more common
case is that you have to add gigabytes of swap space so that repair
can run. In these situations, the current repair will run much, much
slower than the memory optimised repair because the new version does
not have to swap.

Indeed, I recall one of the driving factors for this work was the SGI
customer that needed to connect their 300TB (or was it 600TB?) XFS
filesystem to an Altix with 2TB of RAM to be able to repair it
because the server head connected to the filesystem did not have 2TB
of storage available to assign as swap space. That is, XFS
scalability is limited by the amount of memory needed by repair....

Another mitigating factor is that the worst regressions were on
ia64, for which bitmap manipulation is far more friendly than branchy,
cache-miss-causing btree traversals.  Hence the regression will be
less (maybe even not present) on current x86-64 CPUs which handle
branches and cache misses far, far better than Altix/ia64....

With that in mind, I think the memory usage optimisation is far more
important to the majority of XFS users than the CPU usage regression
it causes as the majority of users don't have RAM-rich environments
to run repair in.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] repair memory usage reductions
  2009-09-04  2:57   ` Dave Chinner
@ 2009-09-04 13:37     ` Geoffrey Wehrman
  2009-09-04 14:51       ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Geoffrey Wehrman @ 2009-09-04 13:37 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, xfs

On Fri, Sep 04, 2009 at 12:57:53PM +1000, Dave Chinner wrote:
| Christoph asked me to repeat what I said on #xfs w.r.t the regression.

Thank you for the detailed description.  All I had was a statement from
January 2008, "Barry has completed the memory optimization, but initial
testing shows that performance has regressed."  That was the last update
recorded on Barry's work.

| With that in mind, I think the memory usage optimisation is far more
| important to the majority of XFS users than the CPU usage regression
| it causes as the majority of users don't have RAM-rich environments
| to run repair in.

I agree.


-- 
Geoffrey Wehrman  651-683-5496  gwehrman@sgi.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] repair memory usage reductions
  2009-09-04 13:37     ` Geoffrey Wehrman
@ 2009-09-04 14:51       ` Christoph Hellwig
  2009-09-04 17:24         ` Michael Monnerie
  0 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-09-04 14:51 UTC (permalink / raw)
  To: Geoffrey Wehrman; +Cc: Christoph Hellwig, xfs

On Fri, Sep 04, 2009 at 08:37:37AM -0500, Geoffrey Wehrman wrote:
> On Fri, Sep 04, 2009 at 12:57:53PM +1000, Dave Chinner wrote:
> | Christoph asked me to repeat what I said on #xfs w.r.t the regression.
> 
> Thank you for the detailed description.  All I had was a statement from
> January 2008, "Barry has completed the memory optimization, but initial
> testing shows that performance has regressed."  That was the last update
> recorded on Barry's work.
> 
> | With that in mind, I think the memory usage optimisation is far more
> | important to the majority of XFS users than the CPU usage regression
> | it causes as the majority of users don't have RAM-rich environments
> | to run repair in.
> 
> I agree.

In my testing I haven't seen big differences in performance; it
sometimes got a bit faster and sometimes a bit slower.  I will send out
a more detailed performance report in a few days.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] repair memory usage reductions
  2009-09-04 14:51       ` Christoph Hellwig
@ 2009-09-04 17:24         ` Michael Monnerie
  0 siblings, 0 replies; 50+ messages in thread
From: Michael Monnerie @ 2009-09-04 17:24 UTC (permalink / raw)
  To: xfs

On Freitag 04 September 2009 Christoph Hellwig wrote:
> In my testing I haven't seen big differences in performance, it
> sometimes got a bit faster and sometimes a bit slower.  I will send
> out a more detailed performace report in a few days.

From what I've read, it should be faster on a machine with 2GB RAM and
10TB of storage, while it may be slower on a 64GB RAM machine with 1TB
of xfs storage.  Given that disks grow faster than RAM sizes, and that
with virtualization a single machine typically does not have much RAM
these days, I guess speed will improve overall with the patches.

mfg zmi
-- 
// Michael Monnerie, Ing.BSc    -----      http://it-management.at
// Tel: 0660 / 415 65 31                      .network.your.ideas.
// PGP Key:         "curl -s http://zmi.at/zmi.asc | gpg --import"
// Fingerprint: AC19 F9D5 36ED CD8A EF38  500E CE14 91F7 1C12 09B4
// Keyserver: wwwkeys.eu.pgp.net                  Key-ID: 1C1209B4

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 01/14] repair: merge scanfunc_bno and scanfunc_cnt
  2009-09-02 17:55 ` [PATCH 01/14] repair: merge scanfunc_bno and scanfunc_cnt Christoph Hellwig
@ 2009-10-12 16:53   ` Eric Sandeen
  2009-10-13 22:13     ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Eric Sandeen @ 2009-10-12 16:53 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

Christoph Hellwig wrote:

> Those two functions are almost identical.  The big difference is that
> we only move blocks from XR_E_FREE1 to XR_E_FREE state when processing
> the cnt btree.
>
> Besides that we print bno vs cnt in the messages and obviously validate a
> slightly different magic number in the header.
>
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
>

Generally seems fine to me; a couple of nitpicks below, take 'em or leave 'em.

> Index: xfsprogs-dev/repair/scan.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/scan.c	2009-08-21 18:24:26.000000000 +0000
> +++ xfsprogs-dev/repair/scan.c	2009-08-21 18:40:59.000000000 +0000
> @@ -439,15 +439,16 @@ _("out-of-order bmap key (file offset) i
>  }
>  
>  void
> -scanfunc_bno(
> +scanfunc_allocbt(
>  	struct xfs_btree_block	*block,
>  	int			level,
>  	xfs_agblock_t		bno,
>  	xfs_agnumber_t		agno,
>  	int			suspect,
> -	int			isroot
> -	)
> +	int			isroot,
> +	__uint32_t		magic)
>  {
> +	const char 		*name;
>  	xfs_agblock_t		b, e;
>  	int			i;
>  	xfs_alloc_ptr_t		*pp;
> @@ -456,16 +457,18 @@ scanfunc_bno(
>  	int			numrecs;
>  	int			state;
>  
> -	if (be32_to_cpu(block->bb_magic) != XFS_ABTB_MAGIC) {
> -		do_warn(_("bad magic # %#x in btbno block %d/%d\n"),
> -			be32_to_cpu(block->bb_magic), agno, bno);
> +	name = (magic == XFS_ABTB_MAGIC) ? "bno" : "cnt";

Should we explicitly test that this is either
XFS_ABTC_MAGIC or XFS_ABTB_MAGIC here to avoid any programming-error-type
problems?
> +
> +	if (be32_to_cpu(block->bb_magic) != magic) {
> +		do_warn(_("bad magic # %#x in bt%s block %d/%d\n"),
> +			be32_to_cpu(block->bb_magic), name, agno, bno);
>  		hdr_errors++;
>  		if (suspect)
>  			return;
>  	}
>  	if (be16_to_cpu(block->bb_level) != level) {
> -		do_warn(_("expected level %d got %d in btbno block %d/%d\n"),
> -			level, be16_to_cpu(block->bb_level), agno, bno);
> +		do_warn(_("expected level %d got %d in bt%s block %d/%d\n"),
> +			level, be16_to_cpu(block->bb_level), name, agno, bno);
>  		hdr_errors++;
>  		if (suspect)
>  			return;
> @@ -483,8 +486,8 @@ scanfunc_bno(
>  	default:
>  		set_agbno_state(mp, agno, bno, XR_E_MULT);
>  		do_warn(
> -_("bno freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
> -				state, agno, bno, suspect);
> +_("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
> +				name, state, agno, bno, suspect);
>  		return;
>  	}
>  
> @@ -520,15 +523,27 @@ _("bno freespace btree block claimed (st
>  				continue;
>  			for (b = be32_to_cpu(rp[i].ar_startblock);
>  			     b < e; b++)  {
> -				if (get_agbno_state(mp, agno, b)
> -							== XR_E_UNKNOWN)
> +				state = get_agbno_state(mp, agno, b);
> +				switch (state) {
> +				case XR_E_UNKNOWN:
>  					set_agbno_state(mp, agno, b,
>  							XR_E_FREE1);
> -				else  {
> +					break;
> +				case XR_E_FREE1:
> +					/*
> +					 * no warning messages -- we'll catch
> +					 * FREE1 blocks later
> +					 */
> +					if (magic != XFS_ABTB_MAGIC) {

Why not make this explicitly "if (magic == XFS_ABTC_MAGIC)"?  I guess it seems potentially
more future-proof to me, though I don't suppose we'll ever get a new type here.  :)
The positive test seems clearer to me but *shrug*.

Rest looks fine.  I suppose we should do the same to the functions in db/* someday.

Thanks,
-Eric

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 02/14] repair: reduce byte swap operations in scanfunc_allocbt
  2009-09-02 17:55 ` [PATCH 02/14] repair: reduce byte swap operations in scanfunc_allocbt Christoph Hellwig
@ 2009-10-12 17:18   ` Eric Sandeen
  2009-10-13 23:37     ` [PATCH 02/14] repair: reduce byte swap operations inscanfunc_allocbt Alex Elder
  0 siblings, 1 reply; 50+ messages in thread
From: Eric Sandeen @ 2009-10-12 17:18 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Barry Naujok, xfs

Christoph Hellwig wrote:

> Store native endian version of the extent startblock and length in
> local variables instead of converting them over and over again.
>
>
> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

I bet there's a lot more places where this could be done too :)

Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
> Index: xfsprogs-dev/repair/scan.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/scan.c	2009-08-21 18:48:01.000000000 +0000
> +++ xfsprogs-dev/repair/scan.c	2009-08-21 18:54:29.000000000 +0000
> @@ -449,7 +449,6 @@ scanfunc_allocbt(
>  	__uint32_t		magic)
>  {
>  	const char 		*name;
> -	xfs_agblock_t		b, e;
>  	int			i;
>  	xfs_alloc_ptr_t		*pp;
>  	xfs_alloc_rec_t		*rp;
> @@ -509,20 +508,21 @@ _("%s freespace btree block claimed (sta
>  
>  		rp = XFS_ALLOC_REC_ADDR(mp, block, 1);
>  		for (i = 0; i < numrecs; i++) {
> -			if (be32_to_cpu(rp[i].ar_blockcount) == 0 ||
> -			    be32_to_cpu(rp[i].ar_startblock) == 0 ||
> -			    !verify_agbno(mp, agno,
> -				be32_to_cpu(rp[i].ar_startblock)) ||
> -			    be32_to_cpu(rp[i].ar_blockcount) >
> -					MAXEXTLEN)
> -				continue;
> +			xfs_agblock_t		b, end;
> +			xfs_extlen_t		len;
> +
> +			b = be32_to_cpu(rp[i].ar_startblock);
> +			len = be32_to_cpu(rp[i].ar_blockcount);
> +			end = b + len;
>  
> -			e = be32_to_cpu(rp[i].ar_startblock) +
> -				be32_to_cpu(rp[i].ar_blockcount);
> -			if (!verify_agbno(mp, agno, e - 1))
> +			if (b == 0 || !verify_agbno(mp, agno, b))
> +				continue;
> +			if (len == 0 || len > MAXEXTLEN)
>  				continue;
> -			for (b = be32_to_cpu(rp[i].ar_startblock);
> -			     b < e; b++)  {
> +			if (!verify_agbno(mp, agno, end - 1))
> +				continue;
> +
> +			for ( ; b < end; b++)  {
>  				state = get_agbno_state(mp, agno, b);
>  				switch (state) {
>  				case XR_E_UNKNOWN:
> @@ -579,6 +579,8 @@ _("%s freespace btree block claimed (sta
>  	}
>  
>  	for (i = 0; i < numrecs; i++)  {
> +		xfs_agblock_t		bno = be32_to_cpu(pp[i]);
> +
>  		/*
>  		 * XXX - put sibling detection right here.
>  		 * we know our sibling chain is good.  So as we go,
> @@ -588,11 +590,11 @@ _("%s freespace btree block claimed (sta
>  		 * pointer mismatch, try and extract as much data
>  		 * as possible.
>  		 */
> -		if (be32_to_cpu(pp[i]) != 0 && verify_agbno(mp, agno,
> -							be32_to_cpu(pp[i])))
> -			scan_sbtree(be32_to_cpu(pp[i]), level, agno, suspect,
> +		if (bno != 0 && verify_agbno(mp, agno, bno)) {
> +			scan_sbtree(bno, level, agno, suspect,
>  				    (magic == XFS_ABTB_MAGIC) ?
>  				    	scanfunc_bno : scanfunc_cnt, 0);
> +		}
>  	}
>  }
>  
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 03/14] repair: kill B_IS_META flag
  2009-09-02 17:55 ` [PATCH 03/14] repair: kill B_IS_META flag Christoph Hellwig
@ 2009-10-12 19:45   ` Eric Sandeen
  2009-10-13 22:16     ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Eric Sandeen @ 2009-10-12 19:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

Christoph Hellwig wrote:

> B_IS_META is the inverse flag of B_IS_INODE which is not really obvious
> from its use.  So just use !B_IS_INODE to make it more clear.
>

Logic-wise it's fine, but is this change really helpful?   The comment says:

/*
 * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
 * the buffer is for an inode or other metadata.
 */

so basically it distinguishes inodes from other metadata, right?

!B_IS_INODE() seems less than helpful...

B_IS_INODE is clear; B_IS_META is pretty clear; "!B_IS_INODE" seems muddy,
since so very many things are "not inodes" :)

-Eric
> Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Index: xfsprogs-dev/repair/prefetch.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/prefetch.c	2009-08-20 00:02:25.000000000 +0000
> +++ xfsprogs-dev/repair/prefetch.c	2009-08-20 00:05:36.000000000 +0000
> @@ -64,7 +64,6 @@
>   * the buffer is for an inode or other metadata.
>   */
>  #define B_IS_INODE(f)	(((f) & 5) == 0)
> -#define B_IS_META(f)	(((f) & 5) != 0)
>  
>  #define DEF_BATCH_BYTES	0x10000
>  
> @@ -131,7 +130,7 @@
>  
>  	if (fsbno > args->last_bno_read) {
>  		radix_tree_insert(&args->primary_io_queue, fsbno, bp);
> -		if (B_IS_META(flag))
> +		if (!B_IS_INODE(flag))
>  			radix_tree_tag_set(&args->primary_io_queue, fsbno, 0);
>  		else {
>  			args->inode_bufs_queued++;
> @@ -153,7 +152,7 @@
>  			(long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
>  			args->last_bno_read);
>  #endif
> -		ASSERT(B_IS_META(flag));
> +		ASSERT(!B_IS_INODE(flag));
>  		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
>  		radix_tree_insert(&args->secondary_io_queue, fsbno, bp);
>  	}
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 04/14] repair: split up scanfunc_ino
  2009-09-02 17:55 ` [PATCH 04/14] repair: split up scanfunc_ino Christoph Hellwig
@ 2009-10-12 20:06   ` Eric Sandeen
  2009-10-13 22:19     ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Eric Sandeen @ 2009-10-12 20:06 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

Christoph Hellwig wrote:


> Split out a helper to scan a single inode chunk for suspect inodes from
> scanfunc_ino to make it more readable.
>
>
> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Index: xfsprogs-dev/repair/scan.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/scan.c	2009-08-21 19:00:15.000000000 +0000
> +++ xfsprogs-dev/repair/scan.c	2009-08-21 19:03:26.000000000 +0000
>
...

> +
> +	/*
> +	 * set state of each block containing inodes
> +	 */
> +	if (off == 0 && !suspect)  {
> +		for (j = 0;
> +		     j < XFS_INODES_PER_CHUNK;
> +		     j += mp->m_sb.sb_inopblock)  {
> +			agbno = XFS_AGINO_TO_AGBNO(mp, ino + j);
> +			state = get_agbno_state(mp, agno, agbno);
> +			if (state == XR_E_UNKNOWN)  {
> +				set_agbno_state(mp, agno, agbno, XR_E_INO);
> +			} else if (state == XR_E_INUSE_FS && agno == 0 &&
> +				   ino + j >= first_prealloc_ino &&
> +				   ino + j < last_prealloc_ino)  {
> +				set_agbno_state(mp, agno, agbno, XR_E_INO);
> +			} else  {
> +				do_warn(
> +_("inode chunk claims used block, inobt block - agno %d, bno %d, inopb %d\n"),
> +					agno, agbno,
> +				mp->m_sb.sb_inopblock);

pretty weird indentation here; can't you just merge w/ previous line?


Also, is the change from bno to agbno intentional in the message?
I guess it's fine.

...


> +		for (i = 0; i < numrecs; i++)
> +			suspect = scan_single_ino_chunk(agno, &rp[i], suspect);
>  
>  		if (suspect)
>  			bad_ino_btree = 1;
It seems like it might be nicer to just do:


+		for (i = 0; i < numrecs; i++)
+			suspect += scan_single_ino_chunk(agno, &rp[i]);

and let scan_single_ino_chunk return 0/1 instead of passing suspect in
and returning an incremented value?

Hm, but I guess the sub-function tests it, doesn't it:

+       /*
+        * set state of each block containing inodes
+        */
+       if (off == 0 && !suspect)  {

so, seems fine as-is, though I'd just fix that indentation.

Thanks,
-Eric

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 05/14] repair: reduce byte swapping in scan_freelist
  2009-09-02 17:55 ` [PATCH 05/14] repair: reduce byte swapping in scan_freelist Christoph Hellwig
@ 2009-10-12 20:43   ` Eric Sandeen
  0 siblings, 0 replies; 50+ messages in thread
From: Eric Sandeen @ 2009-10-12 20:43 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

Christoph Hellwig wrote:

> Store the ag number in a local native endian variable to avoid
> byteswapping it over and over again.
>
>
> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
>

Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
> Index: xfsprogs-dev/repair/scan.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/scan.c	2009-08-21 19:03:26.000000000 +0000
> +++ xfsprogs-dev/repair/scan.c	2009-08-21 19:05:32.000000000 +0000
> @@ -943,23 +943,26 @@ scan_freelist(
>  {
>  	xfs_agfl_t	*agfl;
>  	xfs_buf_t	*agflbuf;
> +	xfs_agnumber_t	agno;
>  	xfs_agblock_t	bno;
>  	int		count;
>  	int		i;
>  
> +	agno = be32_to_cpu(agf->agf_seqno);
> +
>  	if (XFS_SB_BLOCK(mp) != XFS_AGFL_BLOCK(mp) &&
> -			XFS_AGF_BLOCK(mp) != XFS_AGFL_BLOCK(mp) &&
> -			XFS_AGI_BLOCK(mp) != XFS_AGFL_BLOCK(mp))
> -		set_agbno_state(mp, be32_to_cpu(agf->agf_seqno),
> -				XFS_AGFL_BLOCK(mp), XR_E_FS_MAP);
> +	    XFS_AGF_BLOCK(mp) != XFS_AGFL_BLOCK(mp) &&
> +	    XFS_AGI_BLOCK(mp) != XFS_AGFL_BLOCK(mp))
> +		set_agbno_state(mp, agno, XFS_AGFL_BLOCK(mp), XR_E_FS_MAP);
> +
>  	if (be32_to_cpu(agf->agf_flcount) == 0)
>  		return;
> -	agflbuf = libxfs_readbuf(mp->m_dev, XFS_AG_DADDR(mp,
> -				be32_to_cpu(agf->agf_seqno),
> -				XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0);
> +
> +	agflbuf = libxfs_readbuf(mp->m_dev,
> +				 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
> +				 XFS_FSS_TO_BB(mp, 1), 0);
>  	if (!agflbuf)  {
> -		do_abort(_("can't read agfl block for ag %d\n"),
> -			be32_to_cpu(agf->agf_seqno));
> +		do_abort(_("can't read agfl block for ag %d\n"), agno);
>  		return;
>  	}
>  	agfl = XFS_BUF_TO_AGFL(agflbuf);
> @@ -967,12 +970,11 @@ scan_freelist(
>  	count = 0;
>  	for (;;) {
>  		bno = be32_to_cpu(agfl->agfl_bno[i]);
> -		if (verify_agbno(mp, be32_to_cpu(agf->agf_seqno), bno))
> -			set_agbno_state(mp, be32_to_cpu(agf->agf_seqno),
> -					bno, XR_E_FREE);
> +		if (verify_agbno(mp, agno, bno))
> +			set_agbno_state(mp, agno, bno, XR_E_FREE);
>  		else
>  			do_warn(_("bad agbno %u in agfl, agno %d\n"),
> -				bno, be32_to_cpu(agf->agf_seqno));
> +				bno, agno);
>  		count++;
>  		if (i == be32_to_cpu(agf->agf_fllast))
>  			break;
> @@ -981,8 +983,7 @@ scan_freelist(
>  	}
>  	if (count != be32_to_cpu(agf->agf_flcount)) {
>  		do_warn(_("freeblk count %d != flcount %d in ag %d\n"), count,
> -			be32_to_cpu(agf->agf_flcount),
> -			be32_to_cpu(agf->agf_seqno));
> +			be32_to_cpu(agf->agf_flcount), agno);
>  	}
>  	libxfs_putbuf(agflbuf);
>  }
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 01/14] repair: merge scanfunc_bno and scanfunc_cnt
  2009-10-12 16:53   ` Eric Sandeen
@ 2009-10-13 22:13     ` Christoph Hellwig
  2009-10-13 23:36       ` Alex Elder
  0 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-10-13 22:13 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Christoph Hellwig, xfs

On Mon, Oct 12, 2009 at 11:53:00AM -0500, Eric Sandeen wrote:
>
> Should we explicitly test that this is either XFS_ABTC_MAGIC or 
> XFS_ABTB_MAGIC here to avoid any programming-error
> type problems?

We really only have two freespace btrees.  But I'll add an assert
just to be sure.

>> -				else  {
>> +					break;
>> +				case XR_E_FREE1:
>> +					/*
>> +					 * no warning messages -- we'll catch
>> +					 * FREE1 blocks later
>> +					 */
>> +					if (magic != XFS_ABTB_MAGIC) {
>
> Why not make this explicitly "if (magic == XFS_ABTC_MAGIC)" - I guess it seems potentially
> more future-proof to me though I don't suppose we'll ever get a new type here.  :)
> The positive test seems clearer to me but *shrug*.

Ok, changed.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 03/14] repair: kill B_IS_META flag
  2009-10-12 19:45   ` Eric Sandeen
@ 2009-10-13 22:16     ` Christoph Hellwig
  2009-10-13 22:19       ` Eric Sandeen
  0 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-10-13 22:16 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Christoph Hellwig, xfs

On Mon, Oct 12, 2009 at 02:45:08PM -0500, Eric Sandeen wrote:
> Christoph Hellwig wrote:
>
>> B_IS_META is the inverse flag of B_IS_INODE which is not really obvious
>> from its use.  So just use !B_IS_INODE to make it more clear.
>>
>
> Logic-wise it's fine, but is this change really helpful?   The comment says:
>
> /*
> * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
> * the buffer is for an inode or other metadata.
> */
>
> so basically it distinguishes inodes from other metadata right.

Yes, with the key on other.  In my books inodes are meta-data.

> B_IS_INODE is clear; B_IS_META is pretty clear, "!B_IS_INODE" seems muddy; so
> very many things are "not inodes" :)

In a buffercache (and in fact a whole application) that only deals with
metadata at all !B_IS_INODE meaning other metadata seems a lot more
clear to me than B_IS_META.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 03/14] repair: kill B_IS_META flag
  2009-10-13 22:16     ` Christoph Hellwig
@ 2009-10-13 22:19       ` Eric Sandeen
  2009-10-13 23:38         ` Alex Elder
  0 siblings, 1 reply; 50+ messages in thread
From: Eric Sandeen @ 2009-10-13 22:19 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

Christoph Hellwig wrote:
> On Mon, Oct 12, 2009 at 02:45:08PM -0500, Eric Sandeen wrote:
>> Christoph Hellwig wrote:
>>
>>> B_IS_META is the inverse flag of B_IS_INODE which is not really obvious
>>> from its use.  So just use !B_IS_INODE to make it more clear.
>>>
>> Logic-wise it's fine, but is this change really helpful?   The comment says:
>>
>> /*
>> * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
>> * the buffer is for an inode or other metadata.
>> */
>>
>> so basically it distinguishes inodes from other metadata right.
> 
> Yes, with the key on other.  In my books inodes are meta-data.
> 
>> B_IS_INODE is clear; B_IS_META is pretty clear, "!B_IS_INODE" seems muddy; so
>> very many things are "not inodes" :)
> 
> In a buffercache (and in fact a whole application) that only deals with
> metadata at all !B_IS_INODE meaning other metadata seems a lot more
> clear to me than B_IS_META.
> 

Ok, I'm fine with that I suppose.

Thanks,
-Eric

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 04/14] repair: split up scanfunc_ino
  2009-10-12 20:06   ` Eric Sandeen
@ 2009-10-13 22:19     ` Christoph Hellwig
  2009-10-13 22:22       ` Eric Sandeen
  0 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-10-13 22:19 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Christoph Hellwig, xfs

On Mon, Oct 12, 2009 at 03:06:40PM -0500, Eric Sandeen wrote:
>> +					agno, agbno,
>> +				mp->m_sb.sb_inopblock);
>
> pretty weird indentation here; can't you just merge w/ previous line?

XFS (especially userspace code) uses this in lots of places.  Gives
more space to messages but keeps the remaining arguments aligned as usual.

> Also is the change from bno to agbno intentional in the message?
> I guess it's fine.

That version is more correct.  Don't remember how it got in, though.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 04/14] repair: split up scanfunc_ino
  2009-10-13 22:19     ` Christoph Hellwig
@ 2009-10-13 22:22       ` Eric Sandeen
  2009-10-13 22:23         ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Eric Sandeen @ 2009-10-13 22:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

Christoph Hellwig wrote:
> On Mon, Oct 12, 2009 at 03:06:40PM -0500, Eric Sandeen wrote:
>>> +					agno, agbno,
>>> +				mp->m_sb.sb_inopblock);
>> pretty weird indentation here; can't you just merge w/ previous line?

I get that...


+_("inode chunk claims used block, inobt block - agno %d, bno %d, inopb
%d\n"),
+					agno, agbno,
+				mp->m_sb.sb_inopblock);

the unindented string is fine but the 3rd line in the paste above
could/should be merged w/ the 2nd.
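
Merged, that would read something like:

+_("inode chunk claims used block, inobt block - agno %d, bno %d, inopb %d\n"),
+					agno, agbno, mp->m_sb.sb_inopblock);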

-Eric


> XFS (especially userspace code) uses this in lots of places.  Gives
> more space to messages but keeps the normal arguments normally aligned.
> 
>> Also is the change from bno to agbno intentional in the message?
>> I guess it's fine.
> 
> That version is more correct.  Don't remember how it got in, though.
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 04/14] repair: split up scanfunc_ino
  2009-10-13 22:22       ` Eric Sandeen
@ 2009-10-13 22:23         ` Christoph Hellwig
  0 siblings, 0 replies; 50+ messages in thread
From: Christoph Hellwig @ 2009-10-13 22:23 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Christoph Hellwig, xfs

On Tue, Oct 13, 2009 at 05:22:32PM -0500, Eric Sandeen wrote:
> Christoph Hellwig wrote:
> > On Mon, Oct 12, 2009 at 03:06:40PM -0500, Eric Sandeen wrote:
> >>> +					agno, agbno,
> >>> +				mp->m_sb.sb_inopblock);
> >> pretty weird indentation here; can't you just merge w/ previous line?
> 
> I get that...
> 
> 
> +_("inode chunk claims used block, inobt block - agno %d, bno %d, inopb
> %d\n"),
> +					agno, agbno,
> +				mp->m_sb.sb_inopblock);

Yeah, you're right.   I'll fix it up before committing.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 01/14] repair: merge scanfunc_bno and scanfunc_cnt
  2009-10-13 22:13     ` Christoph Hellwig
@ 2009-10-13 23:36       ` Alex Elder
  0 siblings, 0 replies; 50+ messages in thread
From: Alex Elder @ 2009-10-13 23:36 UTC (permalink / raw)
  To: Christoph Hellwig, Eric Sandeen; +Cc: xfs

On , Christoph Hellwig wrote:
> On Mon, Oct 12, 2009 at 11:53:00AM -0500, Eric Sandeen wrote:
>> 
>> Should we explicitly test that this is either XFS_ABTC_MAGIC or
>> XFS_ABTB_MAGIC here to avoid any programming-error
>> type problems?
> 
> We really only have two freespace btrees.  But I'll add an assert
> just to be sure.
> 
>>> -				else  {
>>> +					break;
>>> +				case XR_E_FREE1:
>>> +					/*
>>> +					 * no warning messages -- we'll catch
>>> +					 * FREE1 blocks later
>>> +					 */
>>> +					if (magic != XFS_ABTB_MAGIC) {
>> 
>> Why not make this explicitly "if (magic == XFS_ABTC_MAGIC)" - I guess it seems potentially
>> more future-proof to me though I don't suppose we'll ever get a new type here.  :)
>> The positive test seems clearer to me but *shrug*.
> 
> Ok, changed.

With changes as described, looks good.

Reviewed-by: Alex Elder <aelder@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 02/14] repair: reduce byte swap operations in scanfunc_allocbt
  2009-10-12 17:18   ` Eric Sandeen
@ 2009-10-13 23:37     ` Alex Elder
  0 siblings, 0 replies; 50+ messages in thread
From: Alex Elder @ 2009-10-13 23:37 UTC (permalink / raw)
  To: Eric Sandeen, Christoph Hellwig; +Cc: Barry Naujok, xfs

On , Eric Sandeen wrote:
> Christoph Hellwig wrote:
> 
>> Store native endian version of the extent startblock and length in
>> local variables instead of converting them over and over again.
>> 
>> 
>> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> I bet there's a lot more places where this could be done too :)

Lots more...

> Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Alex Elder <aelder@sgi.com>

>> Index: xfsprogs-dev/repair/scan.c
>> ===================================================================
>> --- xfsprogs-dev.orig/repair/scan.c	2009-08-21 18:48:01.000000000 +0000
>> +++ xfsprogs-dev/repair/scan.c	2009-08-21 18:54:29.000000000 +0000
>> @@ -449,7 +449,6 @@ scanfunc_allocbt(
>>  	__uint32_t		magic)
>>  {
>>  	const char 		*name;
>> -	xfs_agblock_t		b, e;
>>  	int			i;
>>  	xfs_alloc_ptr_t		*pp;
>>  	xfs_alloc_rec_t		*rp;
>> @@ -509,20 +508,21 @@ _("%s freespace btree block claimed (sta
>> 
>>  		rp = XFS_ALLOC_REC_ADDR(mp, block, 1);
>>  		for (i = 0; i < numrecs; i++) {
>> -			if (be32_to_cpu(rp[i].ar_blockcount) == 0 ||
>> -			    be32_to_cpu(rp[i].ar_startblock) == 0 ||
>> -			    !verify_agbno(mp, agno,
>> -				be32_to_cpu(rp[i].ar_startblock)) ||
>> -			    be32_to_cpu(rp[i].ar_blockcount) >
>> -					MAXEXTLEN)
>> -				continue;
>> +			xfs_agblock_t		b, end;
>> +			xfs_extlen_t		len;
>> +
>> +			b = be32_to_cpu(rp[i].ar_startblock);
>> +			len = be32_to_cpu(rp[i].ar_blockcount);
>> +			end = b + len;
>> 
>> -			e = be32_to_cpu(rp[i].ar_startblock) +
>> -				be32_to_cpu(rp[i].ar_blockcount);
>> -			if (!verify_agbno(mp, agno, e - 1))
>> +			if (b == 0 || !verify_agbno(mp, agno, b))
>> +				continue;
>> +			if (len == 0 || len > MAXEXTLEN)
>>  				continue;
>> -			for (b = be32_to_cpu(rp[i].ar_startblock);
>> -			     b < e; b++)  {
>> +			if (!verify_agbno(mp, agno, end - 1))
>> +				continue;
>> +
>> +			for ( ; b < end; b++)  {
>>  				state = get_agbno_state(mp, agno, b);
>>  				switch (state) {
>>  				case XR_E_UNKNOWN:
>> @@ -579,6 +579,8 @@ _("%s freespace btree block claimed (sta  	}
>> 
>>  	for (i = 0; i < numrecs; i++)  {
>> +		xfs_agblock_t		bno = be32_to_cpu(pp[i]);
>> +
>>  		/*
>>  		 * XXX - put sibling detection right here.
>>  		 * we know our sibling chain is good.  So as we go,
>> @@ -588,11 +590,11 @@ _("%s freespace btree block claimed (sta
>>  		 * pointer mismatch, try and extract as much data
>>  		 * as possible.
>>  		 */
>> -		if (be32_to_cpu(pp[i]) != 0 && verify_agbno(mp, agno,
>> -							be32_to_cpu(pp[i])))
>> -			scan_sbtree(be32_to_cpu(pp[i]), level, agno, suspect,
>> +		if (bno != 0 && verify_agbno(mp, agno, bno)) {
>> +			scan_sbtree(bno, level, agno, suspect,
>>  				    (magic == XFS_ABTB_MAGIC) ?
>>  				    	scanfunc_bno : scanfunc_cnt, 0);
>> +		}
>>  	}
>>  }
>> 
>> 
>> _______________________________________________
>> xfs mailing list
>> xfs@oss.sgi.com
>> http://oss.sgi.com/mailman/listinfo/xfs
>> 
> 
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 03/14] repair: kill B_IS_META flag
  2009-10-13 22:19       ` Eric Sandeen
@ 2009-10-13 23:38         ` Alex Elder
  0 siblings, 0 replies; 50+ messages in thread
From: Alex Elder @ 2009-10-13 23:38 UTC (permalink / raw)
  To: Eric Sandeen, Christoph Hellwig; +Cc: xfs

On , Eric Sandeen wrote:
> Christoph Hellwig wrote:
>> On Mon, Oct 12, 2009 at 02:45:08PM -0500, Eric Sandeen wrote:
>>> Christoph Hellwig wrote:
>>> 
>>>> B_IS_META is the inverse flag of B_IS_INODE which is not really obvious
>>>> from its use.  So just use !B_IS_INODE to make it more clear.
>>>> 
>>> Logic-wise it's fine, but is this change really helpful?   The comment says:
>>> 
>>> /*
>>> * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
>>> * the buffer is for an inode or other metadata.
>>> */
>>> 
>>> so basically it distinguishes inodes from other metadata right.
>> 
>> Yes, with the key on other.  In my books inodes are meta-data.
>> 
>>> B_IS_INODE is clear; B_IS_META is pretty clear, "!B_IS_INODE" seems muddy; so
>>> very many things are "not inodes" :)
>> 
>> In a buffercache (and in fact a whole application) that only deals with
>> metadata at all !B_IS_INODE meaning other metadata seems a lot more clear to me than B_IS_META.
>> 
> 
> Ok, I'm fine with that I suppose.

Reviewed-by: Alex Elder <aelder@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 06/14] repair: use a btree instead of a radix tree for the prefetch queue
  2009-09-02 17:55 ` [PATCH 06/14] repair: use a btree instead of a radix tree for the prefetch queue Christoph Hellwig
@ 2009-10-21 17:12   ` Alex Elder
  2009-11-12 10:04     ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Alex Elder @ 2009-10-21 17:12 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Barry Naujok, xfs

On , Christoph Hellwig wrote:
> Currently the prefetch queue in xfs_repair uses a radix tree implementation
> derived from the Linux kernel one to manage its prefetch queue.
> 
> The radix tree implementation is not very memory efficient for sparse indices,
> so replace it with a btree implementation that is much more efficient.
> This is not that important for the prefetch queue but will be very important
> for the next memory optimization patches which need a tree to store things
> like the block map which are very sparse, and we do not want to deal with
> two tree implementations (or rather three given that we still have avl.c
> around).
> 
> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
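
To make the new interface concrete before diving in, here is a minimal
sketch of driving a sparse, ordered queue with it (the function and
variable names are mine; the btree calls and signatures are the ones
from the btree.c quoted below):

	static struct btree_root	*queue;

	static void
	queue_init(void)
	{
		btree_init(&queue);
	}

	static void
	queue_add(unsigned long blkno, void *bp)
	{
		/* sparse keys carry no per-slot cost, unlike a radix tree */
		btree_insert(queue, blkno, bp);
	}

	static void *
	queue_pop_lowest(void)
	{
		unsigned long	key;
		void		*bp;

		/* btree_find() returns the first item with key >= 0 */
		bp = btree_find(queue, 0, &key);
		if (!bp)
			return NULL;		/* queue is empty */
		btree_delete(queue, key);
		return bp;
	}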

I set up some code around this to test it, and ended up going
on a wild goose chase for a bug that it turns out only happens
if you happen to simplify things so that BTREE_KEY_MAX is 2.

So, well, don't do that, in case you were thinking of it...

Beyond that the btree code generally looks fine, but
I'll probably have some suggested changes once it's in.
I have a few comments below.  However...

Reviewed-by: Alex Elder <aelder@sgi.com>

General comments/questions
- Is this code worthy of putting into a library, so other
  XFS user space can use it?
- Is the radix tree code worth putting into a library and
  saving, for similar reasons (I didn't look at that code).
- I accept that this uses less memory for sparsely populated
  key space than radix trees; but it would be nice if that
  could be characterized a bit more precisely (i.e., what
  will the range of values that'll be represented be, just
  how sparse is it, and at what point does this really
  pay off?).  A rough back-of-envelope follows this list.
- Related to the previous one--it would be good to have
  a little info about why the value 7 was chosen as the
  number of keys per node.  Perhaps I just don't know
  enough of the history (or content of the upcoming
  patches).
- A number of external interfaces defined are not used
  in the (current) xfs_repair code.  (That's OK, but
  they could be removed if they're never expected to
  be used--e.g. btree_peek_prev() and btree_peek_next()).
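
As a rough back-of-envelope on the pay-off question (my numbers, not
the patch author's): the bitmap always spends XR_BB = 4 bits per block,
so an AG of 2^20 blocks costs 512 KB no matter what.  The btree stores
one item per state boundary; with BTREE_KEY_MAX = 7 keys per node and a
node on the order of 128 bytes on a 64-bit host, a map that collapses
to ~1,000 extents takes roughly 1000 / 7 ~ 150 leaf nodes, i.e. about
20 KB.  It only approaches break-even on a pathologically fragmented AG
where nearly every block starts a new extent.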


. . .

> Index: xfsprogs-dev/repair/btree.c
> ===================================================================
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ xfsprogs-dev/repair/btree.c	2009-08-20 00:06:44.000000000 +0000
> @@ -0,0 +1,1234 @@
> +/*
> + * Copyright (c) 2007, Silicon Graphics, Inc. Barry Naujok <bnaujok@sgi.com>
> + * All Rights Reserved.

. . .

> +#include <libxfs.h>
> +#include "btree.h"
> +
> +

Note that the value of BTREE_KEY_MAX *must* be greater than 2, or
this code will not work.  Maybe nobody should be trying to use 2
here anyway, but I would like to see a comment, e.g., /* Must be 3 or more */

> +#define BTREE_KEY_MAX		7
> +#define BTREE_KEY_MIN		(BTREE_KEY_MAX / 2)
> +
> +#define BTREE_PTR_MAX		(BTREE_KEY_MAX + 1)
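
i.e. something like:

#define BTREE_KEY_MAX		7	/* Must be 3 or more */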

. . .

> Index: xfsprogs-dev/repair/prefetch.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/prefetch.c	2009-08-20 00:05:36.000000000 +0000
> +++ xfsprogs-dev/repair/prefetch.c	2009-08-20 00:14:08.000000000 +0000
. . .

> @@ -570,7 +566,7 @@
>  		return NULL;
> 
>  	pthread_mutex_lock(&args->lock);
> -	while (!args->queuing_done || args->primary_io_queue.height) {

There is a btree_is_empty(btree_root) function, you might
as well use it here (it is clearer what you're doing anyway).

> +	while (!args->queuing_done || btree_find(args->primary_io_queue, 0, NULL)) {
> 
>  #ifdef XR_PF_TRACE
>  		pftrace("waiting to start prefetch I/O for AG %d", args->agno);
> @@ -696,8 +692,8 @@
>  #endif
>  	pthread_mutex_lock(&args->lock);
> 
> -	ASSERT(args->primary_io_queue.height == 0);
> -	ASSERT(args->secondary_io_queue.height == 0);

btree_is_empty() would be better here also.

> +	ASSERT(btree_find(args->primary_io_queue, 0, NULL) == NULL);
> +	ASSERT(btree_find(args->secondary_io_queue, 0, NULL) == NULL);
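
i.e.:

	ASSERT(btree_is_empty(args->primary_io_queue));
	ASSERT(btree_is_empty(args->secondary_io_queue));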
> 
>  	args->prefetch_done = 1;
>  	if (args->next_args)

. . .

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 07/14] repair: use single prefetch queue
  2009-09-02 17:55 ` [PATCH 07/14] repair: use single prefetch queue Christoph Hellwig
@ 2009-10-21 17:48   ` Alex Elder
  2009-11-12 10:09     ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Alex Elder @ 2009-10-21 17:48 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Barry Naujok, xfs

Christoph Hellwig wrote:
> We don't need two prefetch queues as we guarantee execution in order anyway.
> 
> XXX: description could use some more details.
> 
> 
> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

One nit-picky comment below, but looks good.

Reviewed-by: Alex Elder <aelder@sgi.com>

> Index: xfsprogs-dev/repair/prefetch.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/prefetch.c	2009-08-20 00:14:08.000000000 +0000
> +++ xfsprogs-dev/repair/prefetch.c	2009-08-20 00:16:01.000000000 +0000

. . .

The following hunk doesn't really do anything but change whitespace.
It'd be nice if those changes (when there's a bunch like this) were
limited to a separate no-op patch.

> @@ -440,21 +442,22 @@
>  		 */
>  		first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
>  		last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
> -			XFS_BUF_SIZE(bplist[num-1]);
> +						XFS_BUF_SIZE(bplist[num-1]);
>  		while (last_off - first_off > pf_max_bytes) {
>  			num--;
> -			last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
> -				XFS_BUF_SIZE(bplist[num-1]);
> +			last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(
> +				bplist[num-1])) + XFS_BUF_SIZE(bplist[num-1]);
>  		}
> -		if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
> +		if (num < ((last_off - first_off) >>
> +						(mp->m_sb.sb_blocklog + 3))) {
>  			/*
>  			 * not enough blocks for one big read, so determine
>  			 * the number of blocks that are close enough.
>  			 */
>  			last_off = first_off + XFS_BUF_SIZE(bplist[0]);
>  			for (i = 1; i < num; i++) {
> -				next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
> -						XFS_BUF_SIZE(bplist[i]);
> +				next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(
> +					bplist[i])) + XFS_BUF_SIZE(bplist[i]);
>  				if (next_off - last_off > pf_batch_bytes)
>  					break;
>  				last_off = next_off;

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 08/14] repair: clean up prefetch tracing
  2009-09-02 17:55 ` [PATCH 08/14] repair: clean up prefetch tracing Christoph Hellwig
@ 2009-10-21 17:53   ` Alex Elder
  0 siblings, 0 replies; 50+ messages in thread
From: Alex Elder @ 2009-10-21 17:53 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Barry Naujok, xfs

Christoph Hellwig wrote:
> Define a dummy pftrace macro for the non-tracing case to reduce the ifdef hell,
> clean up a few trace calls and add proper init/exit handlers for the tracing
> setup and teardown.
> 
> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Yay!!!

Reviewed-by: Alex Elder <aelder@sgi.com>


> Index: xfsprogs-dev/repair/dino_chunks.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/dino_chunks.c	2009-08-19 23:42:32.000000000 +0000
> +++ xfsprogs-dev/repair/dino_chunks.c	2009-08-20 00:16:53.000000000 +0000
> @@ -629,10 +629,9 @@
>  			cluster_count * sizeof(xfs_buf_t*));

. . .

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 09/14] repair: track logical to physical block mapping more efficiently
  2009-09-02 17:55 ` [PATCH 09/14] repair: track logical to physical block mapping more efficiently Christoph Hellwig
@ 2009-10-21 19:06   ` Alex Elder
  2009-11-12 10:18     ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Alex Elder @ 2009-10-21 19:06 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Barry Naujok, xfs

Christoph Hellwig wrote:
> Currently we track the logical to physical block mapping by a structure which
> contains an array of physicial blocks.  This is extremly efficient and is

Should this be "extremely inefficient?"

> replaced with the normal starblock storage we use in the kernel and on disk
> in this patch.

While you're at fixing the above comment, maybe just re-word this
sentence because I don't really grok it very well...

. . .

> [hch: added a small fix in blkmap_set_ext to not call memmove unless needed]
> 
> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

A few minor nits mentioned above and below.  But
overall this looks good.

Reviewed-by: Alex Elder <aelder@sgi.com>

> Index: xfsprogs-dev/repair/bmap.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/bmap.c	2009-08-20 02:32:34.000000000 +0000
> +++ xfsprogs-dev/repair/bmap.c	2009-08-20 02:32:45.000000000 +0000

> @@ -21,106 +21,46 @@
>  #include "bmap.h"
> 

. . .

> +	ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK);
> +
>  	if (nex < 1)
>  		nex = 1;
> -	if ((blkmap = malloc(BLKMAP_SIZE(nex))) == NULL) {
> -		do_warn(_("malloc failed in blkmap_alloc (%u bytes)\n"),
> -			BLKMAP_SIZE(nex));
> -		return blkmap;
> +
> +	key = whichfork ? ablkmap_key : dblkmap_key;
> +	blkmap = pthread_getspecific(key);
> +	if (!blkmap || blkmap->naexts < nex) {
> +		blkmap = realloc(blkmap, BLKMAP_SIZE(nex));

Does the above really have to be a realloc() call, or can
it simply be a free()/malloc() instead?  Also, could the
> existing ts_alloc() function be adjusted to accommodate the
usage here?

> +		if (!blkmap) {
> +			do_warn(_("malloc failed in blkmap_alloc (%u bytes)\n"),
> +				BLKMAP_SIZE(nex));

. . .

> @@ -131,14 +71,7 @@ void
>  blkmap_free(
>  	blkmap_t	*blkmap)
>  {
> -	blkent_t	**entp;
> -	xfs_extnum_t	i;
> -
> -	if (blkmap == NULL)
> -		return;
> -	for (i = 0, entp = blkmap->ents; i < blkmap->nents; i++, entp++)
> -		free(*entp);
> -	free(blkmap);
> +	/* nothing to do! - keep the memory around for the next inode */

Nobody ever frees it though, either.  I guess it gets done at
exit but I like things tidy (could arrange for a destructor
function to be called, at pthread_key_create() time).
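
A minimal sketch of that, reusing the dblkmap_key/ablkmap_key from the
patch (the init hook name is made up; free() is the right destructor
since the per-thread value is a plain malloc'd/realloc'd blkmap):

	static void
	blkmap_keys_init(void)		/* hypothetical one-time init hook */
	{
		/* the destructor runs on each thread's value at thread exit */
		pthread_key_create(&dblkmap_key, free);
		pthread_key_create(&ablkmap_key, free);
	}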

>  }
> 
>  /*

. . .

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 10/14] repair: cleanup helpers for tracking block usage
  2009-09-02 17:55 ` [PATCH 10/14] repair: cleanup helpers for tracking block usage Christoph Hellwig
@ 2009-10-21 19:33   ` Alex Elder
  2009-11-12 10:21     ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Alex Elder @ 2009-10-21 19:33 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Barry Naujok, xfs

Christoph Hellwig wrote:
> Rename get_agbno_state/set_agbno_state to get_bmap/set_bmap because
> those names are more self-descriptive.  Remove the superfluous mount
> argument, as the current filesystem is a global in repair.
> Remove the fsbno-taking variants, as they just complicated the code.
> Bring all uses of them into the canonical form.

Another big pile o' whitespace changes here, wish they were
separated so it was clearer to see what *really* changed...

One bad bug indicated in comments below, but easy to fix.

But this looks good otherwise.

> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Alex Elder <aelder@sgi.com>


> Index: xfsprogs-dev/repair/scan.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/scan.c	2009-08-21 19:05:32.000000000 +0000
> +++ xfsprogs-dev/repair/scan.c	2009-08-21 19:06:51.000000000 +0000

. . .

> @@ -476,19 +480,15 @@ scanfunc_allocbt(
>  	/*
>  	 * check for btree blocks multiply claimed
>  	 */
> -	state = get_agbno_state(mp, agno, bno);
> -
> -	switch (state)  {
> -	case XR_E_UNKNOWN:
> -		set_agbno_state(mp, agno, bno, XR_E_FS_MAP);
> -		break;
> -	default:
> -		set_agbno_state(mp, agno, bno, XR_E_MULT);
> +	state = get_bmap(agno, bno);
> +	switch (state != XR_E_UNKNOWN)  {

BUG.      You mean "if (state != XR_E_UNKNOWN)"???

> +		set_bmap(agno, bno, XR_E_MULT);
>  		do_warn(
>  _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
>  				name, state, agno, bno, suspect);
>  		return;
>  	}
> +	set_bmap(agno, bno, XR_E_FS_MAP);
> 
>  	numrecs = be16_to_cpu(block->bb_numrecs);
> 
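Presumably the intended form is:

	state = get_bmap(agno, bno);
	if (state != XR_E_UNKNOWN)  {
		set_bmap(agno, bno, XR_E_MULT);
		do_warn(
_("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
			name, state, agno, bno, suspect);
		return;
	}
	set_bmap(agno, bno, XR_E_FS_MAP);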

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 11/14] repair: cleanup alloc/free/reset of the block usage tracking
  2009-09-02 17:55 ` [PATCH 11/14] repair: cleanup alloc/free/reset of the block usage tracking Christoph Hellwig
@ 2009-10-21 20:22   ` Alex Elder
  0 siblings, 0 replies; 50+ messages in thread
From: Alex Elder @ 2009-10-21 20:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Barry Naujok, xfs

Christoph Hellwig wrote:
> Currently the code to allocate, free and reset the block usage bitmaps
> is a complete mess.  This patch reorganizes it into logical helpers.
> 
> Details:
> 
>  - the current incore_init code is called just before phase2 is called,
>    which then marks the log and the AG headers used.
>  - we get rid of incore_init, and replace it with direct calls to the
>    unchanged incore_ino_init/incore_ext_init functions and our new init_bmaps
>    which does all the allocations for the block usage tracking, as well
>    as a call to reset_bmaps to initialize it to the default values.
>  - reset_bmaps is also called from early phase4 code to reset all state
>    instead of opencoding it.
>  - there is a new free_bmaps helper which we call to free our block usage
>    bitmaps when we don't need them anymore after phase5.  The current
>    code frees some of it a bit early in phase5, but needs to take care of it
>    in phase6 in case we didn't call phase5 due to nomodify mode, and leaks
>    it if we don't call phase 6, which might happen in case of a bad inode
>    allocation btree.
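
So in short the life cycle becomes:

	init_bmaps(mp);		/* before phase 2: allocate, then reset_bmaps() */
	...
	reset_bmaps(mp);	/* early in phase 4: back to the default states */
	...
	free_bmaps(mp);		/* after phase 5: tracking no longer needed */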


Looks good.		-Alex


> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Alex Elder <aelder@sgi.com>


> Index: xfsprogs-dev/repair/phase4.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/phase4.c	2009-08-21 01:59:26.000000000 +0000
> +++ xfsprogs-dev/repair/phase4.c	2009-08-21 02:41:44.000000000 +0000
> @@ -355,19 +355,7 @@ phase4(xfs_mount_t *mp)

. . .

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 12/14] repair: switch block usage bitmap to a btree
  2009-09-02 17:55 ` [PATCH 12/14] repair: switch block usage bitmap to a btree Christoph Hellwig
@ 2009-10-22 16:22   ` Alex Elder
  2009-11-12 10:25     ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Alex Elder @ 2009-10-22 16:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Barry Naujok, xfs

Christoph Hellwig wrote:
> Using a btree representing the extents is much more space efficient than
> using a bitmap tracking every single block.  In addition it also allows
> for more optimal algorithms checking range overlaps instead of walking
> every block in various places.
> 
> Also move the RT tracking bitmap into incore.c instead of leaving it
> as macros - this keeps the implementation contained.

This is a really good change, and I'm now seeing the benefit
of the whole series.

I have a few minor things I would like to see changed at some
point, but nothing looks obviously incorrect so I'll wait and
post a patch against this once it's committed.

Here are a couple other general thoughts:
- One minor concern is that the btree code has cases in which
  the peek routines don't work (when the keys_valid flag in the
  btree root is zero) and this code doesn't check for that.
  I'll just assume for now that never happens here.
- The bitfield code used for the real-time volume map is
  generally useful and could be separated into its own module.
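
To make the representation concrete (derived from the reset_bmaps()
comment in the patch below; the block numbers are mine): right after a
reset, an AG's tree holds just three items,

	0            -> &states[XR_E_INUSE_FS]
	ag_hdr_block -> &states[XR_E_UNKNOWN]
	ag_size      -> &states[XR_E_BAD_STATE]

and a later set_bmap_ext(agno, 100, 100, XR_E_INO) (assuming
ag_hdr_block < 100) just inserts 100 -> &states[XR_E_INO] and
200 -> &states[XR_E_UNKNOWN] (case #2 in update_bmap()), so memory
scales with the number of state boundaries, not with blocks.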

					-Alex

> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Alex Elder <aelder@sgi.com>

> Index: xfsprogs-dev/repair/dino_chunks.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/dino_chunks.c	2009-09-02 14:51:09.449268859 -0300
> +++ xfsprogs-dev/repair/dino_chunks.c	2009-09-02 14:51:18.593298964 -0300
> @@ -118,6 +118,7 @@ verify_inode_chunk(xfs_mount_t		*mp,
>  	int		i;
>  	int		j;
>  	int		state;
> +	xfs_extlen_t	blen;
> 
>  	agno = XFS_INO_TO_AGNO(mp, ino);
>  	agino = XFS_INO_TO_AGINO(mp, ino);
> @@ -433,9 +434,10 @@ verify_inode_chunk(xfs_mount_t		*mp,
>  	 * entry or an iunlinked pointer
>  	 */
>  	pthread_mutex_lock(&ag_locks[agno]);
> -	for (j = 0, cur_agbno = chunk_start_agbno;
> -			cur_agbno < chunk_stop_agbno; cur_agbno++)  {
> -		state = get_bmap(agno, cur_agbno);
> +	for (cur_agbno = chunk_start_agbno;
> +	     cur_agbno < chunk_stop_agbno;
> +	     cur_agbno += blen)  {
> +		state = get_bmap_ext(agno, cur_agbno, chunk_stop_agbno, &blen);
>  		switch (state) {
>  		case XR_E_MULT:
>  		case XR_E_INUSE:
> @@ -444,9 +446,9 @@ verify_inode_chunk(xfs_mount_t		*mp,
>  			do_warn(
>  		_("inode block %d/%d multiply claimed, (state %d)\n"),
>  				agno, cur_agbno, state);
> -			set_bmap(agno, cur_agbno, XR_E_MULT);
> -			j = 1;
> -			break;
> +			set_bmap_ext(agno, cur_agbno, blen, XR_E_MULT);
> +			pthread_mutex_unlock(&ag_locks[agno]);
> +			return 0;
>  		case XR_E_INO:
>  			do_error(
>  		_("uncertain inode block overlap, agbno = %d, ino = %llu\n"),
> @@ -455,11 +457,6 @@ verify_inode_chunk(xfs_mount_t		*mp,
>  		default:
>  			break;
>  		}
> -
> -		if (j) {
> -			pthread_mutex_unlock(&ag_locks[agno]);
> -			return(0);
> -		}
>  	}
>  	pthread_mutex_unlock(&ag_locks[agno]);
> 
> @@ -487,8 +484,9 @@ verify_inode_chunk(xfs_mount_t		*mp,
>  	pthread_mutex_lock(&ag_locks[agno]);
> 
>  	for (cur_agbno = chunk_start_agbno;
> -			cur_agbno < chunk_stop_agbno; cur_agbno++)  {
> -		state = get_bmap(agno, cur_agbno);
> +	     cur_agbno < chunk_stop_agbno;
> +	     cur_agbno += blen)  {
> +		state = get_bmap_ext(agno, cur_agbno, chunk_stop_agbno, &blen);
>  		switch (state) {
>  		case XR_E_INO:
>  			do_error(
> @@ -498,7 +496,7 @@ verify_inode_chunk(xfs_mount_t		*mp,
>  		case XR_E_UNKNOWN:
>  		case XR_E_FREE1:
>  		case XR_E_FREE:
> -			set_bmap(agno, cur_agbno, XR_E_INO);
> +			set_bmap_ext(agno, cur_agbno, blen, XR_E_INO);
>  			break;
>  		case XR_E_MULT:
>  		case XR_E_INUSE:
> @@ -512,7 +510,7 @@ verify_inode_chunk(xfs_mount_t		*mp,
>  			do_warn(
>  		_("inode block %d/%d bad state, (state %d)\n"),
>  				agno, cur_agbno, state);
> -			set_bmap(agno, cur_agbno, XR_E_INO);
> +			set_bmap_ext(agno, cur_agbno, blen, XR_E_INO);
>  			break;
>  		}
>  	}
> Index: xfsprogs-dev/repair/dinode.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/dinode.c	2009-09-02 14:51:09.457268829 -0300
> +++ xfsprogs-dev/repair/dinode.c	2009-09-02 14:51:18.593298964 -0300
> @@ -524,6 +524,7 @@ process_rt_rec(
> 
>  	/*
>  	 * set the appropriate number of extents
> +	 * this iterates block by block, this can be optimised using extents
>  	 */
>  	for (b = irec->br_startblock; b < irec->br_startblock +
>  			irec->br_blockcount; b += mp->m_sb.sb_rextsize)  {
> @@ -614,9 +615,10 @@ process_bmbt_reclist_int(
>  	char			*forkname;
>  	int			i;
>  	int			state;
> -	xfs_dfsbno_t		e;
>  	xfs_agnumber_t		agno;
>  	xfs_agblock_t		agbno;
> +	xfs_agblock_t		ebno;
> +	xfs_extlen_t		blen;
>  	xfs_agnumber_t		locked_agno = -1;
>  	int			error = 1;
> 
> @@ -718,7 +720,7 @@ process_bmbt_reclist_int(
>  		 */
>  		agno = XFS_FSB_TO_AGNO(mp, irec.br_startblock);
>  		agbno = XFS_FSB_TO_AGBNO(mp, irec.br_startblock);
> -		e = irec.br_startblock + irec.br_blockcount;
> +		ebno = agbno + irec.br_blockcount;
>  		if (agno != locked_agno) {
>  			if (locked_agno != -1)
>  				pthread_mutex_unlock(&ag_locks[locked_agno]);
> @@ -733,7 +735,9 @@ process_bmbt_reclist_int(
>  			 * checking each entry without setting the
>  			 * block bitmap
>  			 */
> -			for (b = irec.br_startblock; b < e; b++, agbno++)  {
> +			for (b = irec.br_startblock;
> +			     agbno < ebno;
> +			     b++, agbno++)  {
>  				if (search_dup_extent(mp, agno, agbno)) {
>  					do_warn(_("%s fork in ino %llu claims "
>  						"dup extent, off - %llu, "
> @@ -748,22 +752,10 @@ process_bmbt_reclist_int(
>  			continue;
>  		}
> 
> -		for (b = irec.br_startblock; b < e; b++, agbno++)  {
> -			/*
> -			 * Process in chunks of 16 (XR_BB_UNIT/XR_BB)
> -			 * for common XR_E_UNKNOWN to XR_E_INUSE transition
> -			 */
> -			if (((agbno & XR_BB_MASK) == 0) && ((irec.br_startblock + irec.br_blockcount - b) >=
> (XR_BB_UNIT/XR_BB))) { 
> -				if (ba_bmap[agno][agbno>>XR_BB] == XR_E_UNKNOWN_LL) {
> -					ba_bmap[agno][agbno>>XR_BB] = XR_E_INUSE_LL;
> -					agbno += (XR_BB_UNIT/XR_BB) - 1;
> -					b += (XR_BB_UNIT/XR_BB) - 1;
> -					continue;
> -				}
> -
> -			}
> -
> -			state = get_bmap(agno, agbno);
> +		for (b = irec.br_startblock;
> +		     agbno < ebno;
> +		     b += blen, agbno += blen) {
> +			state = get_bmap_ext(agno, agbno, ebno, &blen);
>  			switch (state)  {
>  			case XR_E_FREE:
>  			case XR_E_FREE1:
> @@ -772,7 +764,7 @@ process_bmbt_reclist_int(
>  					forkname, ino, (__uint64_t) b);
>  				/* fall through ... */
>  			case XR_E_UNKNOWN:
> -				set_bmap(agno, agbno, XR_E_INUSE);
> +				set_bmap_ext(agno, agbno, blen, XR_E_INUSE);
>  				break;
> 
>  			case XR_E_BAD_STATE:
> @@ -788,7 +780,7 @@ process_bmbt_reclist_int(
> 
>  			case XR_E_INUSE:
>  			case XR_E_MULT:
> -				set_bmap(agno, agbno, XR_E_MULT);
> +				set_bmap_ext(agno, agbno, blen, XR_E_MULT);
>  				do_warn(_("%s fork in %s inode %llu claims "
>  					"used block %llu\n"),
>  					forkname, ftype, ino, (__uint64_t) b);
> Index: xfsprogs-dev/repair/globals.h
> ===================================================================
> --- xfsprogs-dev.orig/repair/globals.h	2009-09-02 14:51:09.461268919 -0300
> +++ xfsprogs-dev/repair/globals.h	2009-09-02 14:51:18.597292070 -0300
> @@ -156,11 +156,6 @@ EXTERN int		chunks_pblock;	/* # of 64-in
>  EXTERN int		max_symlink_blocks;
>  EXTERN __int64_t	fs_max_file_offset;
> 
> -/* block allocation bitmaps */
> -
> -EXTERN __uint64_t	**ba_bmap;	/* see incore.h */
> -EXTERN __uint64_t	*rt_ba_bmap;	/* see incore.h */
> -
>  /* realtime info */
> 
>  EXTERN xfs_rtword_t	*btmcompute;
> Index: xfsprogs-dev/repair/phase2.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/phase2.c	2009-09-02 14:51:09.465298621 -0300
> +++ xfsprogs-dev/repair/phase2.c	2009-09-02 14:51:18.605297206 -0300
> @@ -109,7 +109,6 @@ void
>  phase2(xfs_mount_t *mp)
>  {
>  	xfs_agnumber_t		i;
> -	xfs_agblock_t		b;
>  	int			j;
>  	ino_tree_node_t		*ino_rec;
> 
> @@ -169,11 +168,8 @@ phase2(xfs_mount_t *mp)
>  		/*
>  		 * also mark blocks
>  		 */
> -		for (b = 0; b < mp->m_ialloc_blks; b++)  {
> -			set_bmap(0,
> -				b + XFS_INO_TO_AGBNO(mp, mp->m_sb.sb_rootino),
> -				XR_E_INO);
> -		}
> +		set_bmap_ext(0, XFS_INO_TO_AGBNO(mp, mp->m_sb.sb_rootino),
> +			     mp->m_ialloc_blks, XR_E_INO);
>  	} else  {
>  		do_log(_("        - found root inode chunk\n"));
> 
> Index: xfsprogs-dev/repair/phase4.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/phase4.c	2009-09-02 14:51:09.533268366 -0300
> +++ xfsprogs-dev/repair/phase4.c	2009-09-02 14:51:18.609296598 -0300
> @@ -192,8 +192,7 @@ phase4(xfs_mount_t *mp)
>  	xfs_agnumber_t		i;
>  	xfs_agblock_t		j;
>  	xfs_agblock_t		ag_end;
> -	xfs_agblock_t		extent_start;
> -	xfs_extlen_t		extent_len;
> +	xfs_extlen_t		blen;
>  	int			ag_hdr_len = 4 * mp->m_sb.sb_sectsize;
>  	int			ag_hdr_block;
>  	int			bstate;
> @@ -226,29 +225,13 @@ phase4(xfs_mount_t *mp)
>  		ag_end = (i < mp->m_sb.sb_agcount - 1) ? mp->m_sb.sb_agblocks :
>  			mp->m_sb.sb_dblocks -
>  				(xfs_drfsbno_t) mp->m_sb.sb_agblocks * i;
> -		extent_start = extent_len = 0;
> +
>  		/*
>  		 * set up duplicate extent list for this ag
>  		 */
> -		for (j = ag_hdr_block; j < ag_end; j++)  {
> -
> -			/* Process in chunks of 16 (XR_BB_UNIT/XR_BB) */
> -			if ((extent_start == 0) && ((j & XR_BB_MASK) == 0)) {
> -				switch(ba_bmap[i][j>>XR_BB]) {
> -				case XR_E_UNKNOWN_LL:
> -				case XR_E_FREE1_LL:
> -				case XR_E_FREE_LL:
> -				case XR_E_INUSE_LL:
> -				case XR_E_INUSE_FS_LL:
> -				case XR_E_INO_LL:
> -				case XR_E_FS_MAP_LL:
> -					j += (XR_BB_UNIT/XR_BB) - 1;
> -					continue;
> -				}
> -			}
> -
> -			bstate = get_bmap(i, j);
> -			switch (bstate)  {
> +		for (j = ag_hdr_block; j < ag_end; j += blen)  {
> +			bstate = get_bmap_ext(i, j, ag_end, &blen);
> +			switch (bstate) {
>  			case XR_E_BAD_STATE:
>  			default:
>  				do_warn(
> @@ -262,37 +245,13 @@ phase4(xfs_mount_t *mp)
>  			case XR_E_INUSE_FS:
>  			case XR_E_INO:
>  			case XR_E_FS_MAP:
> -				if (extent_start == 0)
> -					continue;
> -				else  {
> -					/*
> -					 * add extent and reset extent state
> -					 */
> -					add_dup_extent(i, extent_start,
> -							extent_len);
> -					extent_start = 0;
> -					extent_len = 0;
> -				}
>  				break;
>  			case XR_E_MULT:
> -				if (extent_start == 0)  {
> -					extent_start = j;
> -					extent_len = 1;
> -				} else if (extent_len == MAXEXTLEN)  {
> -					add_dup_extent(i, extent_start,
> -							extent_len);
> -					extent_start = j;
> -					extent_len = 1;
> -				} else
> -					extent_len++;
> +				add_dup_extent(i, j, blen);
>  				break;
>  			}
>  		}
> -		/*
> -		 * catch tail-case, extent hitting the end of the ag
> -		 */
> -		if (extent_start != 0)
> -			add_dup_extent(i, extent_start, extent_len);
> +
>  		PROG_RPT_INC(prog_rpt_done[i], 1);
>  	}
>  	print_final_rpt();
> Index: xfsprogs-dev/repair/phase5.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/phase5.c	2009-09-02 14:51:09.561269620 -0300
> +++ xfsprogs-dev/repair/phase5.c	2009-09-02 14:51:18.613269588 -0300
> @@ -88,10 +88,8 @@ mk_incore_fstree(xfs_mount_t *mp, xfs_ag
>  	xfs_agblock_t		agbno;
>  	xfs_agblock_t		ag_end;
>  	uint			free_blocks;
> -#ifdef XR_BLD_FREE_TRACE
> -	int			old_state;
> -	int			state = XR_E_BAD_STATE;
> -#endif
> +	xfs_extlen_t		blen;
> +	int			bstate;
> 
>  	/*
>  	 * scan the bitmap for the ag looking for continuous
> @@ -120,30 +118,10 @@ mk_incore_fstree(xfs_mount_t *mp, xfs_ag
>  	 * ok, now find the number of extents, keep track of the
>  	 * largest extent.
>  	 */
> -	for (agbno = 0; agbno < ag_end; agbno++)  {
> -#if 0
> -		old_state = state;
> -		state = get_bmap(agno, agbno);
> -		if (state != old_state)  {
> -			fprintf(stderr, "agbno %u - new state is %d\n",
> -					agbno, state);
> -		}
> -#endif
> -		/* Process in chunks of 16 (XR_BB_UNIT/XR_BB) */
> -		if ((in_extent == 0) && ((agbno & XR_BB_MASK) == 0)) {
> -			/* testing >= XR_E_INUSE */
> -			switch (ba_bmap[agno][agbno>>XR_BB]) {
> -			case XR_E_INUSE_LL:
> -			case XR_E_INUSE_FS_LL:
> -			case XR_E_INO_LL:
> -			case XR_E_FS_MAP_LL:
> -				agbno += (XR_BB_UNIT/XR_BB) - 1;
> -				continue;
> -			}
> -
> -		}
> -		if (get_bmap(agno, agbno) < XR_E_INUSE)  {
> -			free_blocks++;
> +	for (agbno = 0; agbno < ag_end; agbno += blen) {
> +		bstate = get_bmap_ext(agno, agbno, ag_end, &blen);
> +		if (bstate < XR_E_INUSE)  {
> +			free_blocks += blen;
>  			if (in_extent == 0)  {
>  				/*
>  				 * found the start of a free extent
> @@ -151,9 +129,9 @@ mk_incore_fstree(xfs_mount_t *mp, xfs_ag
>  				in_extent = 1;
>  				num_extents++;
>  				extent_start = agbno;
> -				extent_len = 1;
> +				extent_len = blen;
>  			} else  {
> -				extent_len++;
> +				extent_len += blen;
>  			}
>  		} else   {
>  			if (in_extent)  {
> Index: xfsprogs-dev/repair/incore.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/incore.c	2009-09-02 14:51:09.565269570 -0300
> +++ xfsprogs-dev/repair/incore.c	2009-09-02 14:51:29.072772399 -0300
> @@ -18,6 +18,7 @@
> 
>  #include <libxfs.h>
>  #include "avl.h"
> +#include "btree.h"
>  #include "globals.h"
>  #include "incore.h"
>  #include "agheader.h"
> @@ -52,14 +53,192 @@ free_allocations(ba_rec_t *list)
>  	return;
>  }
> 
> +/*
> + * The following manages the in-core bitmap of the entire filesystem
> + * using extents in a btree.
> + *
> + * The btree items will point to one of the state values below,
> + * rather than storing the value itself in the pointer.
> + */
> +static int states[16] =
> +	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
> +
> +static struct btree_root	**ag_bmap;
> +
> +static void
> +update_bmap(
> +	struct btree_root	*bmap,
> +	unsigned long		offset,
> +	xfs_extlen_t		blen,
> +	void			*new_state)
> +{
> +	unsigned long		end = offset + blen;
> +	int			*cur_state;
> +	unsigned long		cur_key;
> +	int			*next_state;
> +	unsigned long		next_key;
> +	int			*prev_state;
> +
> +	cur_state = btree_find(bmap, offset, &cur_key);
> +	if (!cur_state)
> +		return;
> +
> +	if (offset == cur_key) {
> +		/* if the start is the same as the "item" extent */
> +		if (cur_state == new_state)
> +			return;
> +
> +		/*
> +		 * Note: this may be NULL if we are updating the map for
> +		 * the superblock.
> +		 */
> +		prev_state = btree_peek_prev(bmap, NULL);
> +
> +		next_state = btree_peek_next(bmap, &next_key);
> +		if (next_key > end) {
> +			/* different end */
> +			if (new_state == prev_state) {
> +				/* #1: prev has same state, move offset up */
> +				btree_update_key(bmap, offset, end);
> +				return;
> +			}
> +
> +			/* #4: insert new extent after, update current value */
> +			btree_update_value(bmap, offset, new_state);
> +			btree_insert(bmap, end, cur_state);
> +			return;
> +		}
> +
> +		/* same end (and same start) */
> +		if (new_state == next_state) {
> +			/* next has same state */
> +			if (new_state == prev_state) {
> +				/* #3: merge prev & next */
> +				btree_delete(bmap, offset);
> +				btree_delete(bmap, end);
> +				return;
> +			}
> +
> +			/* #8: merge next */
> +			btree_update_value(bmap, offset, new_state);
> +			btree_delete(bmap, end);
> +			return;
> +		}
> +
> +		/* same start, same end, next has different state */
> +		if (new_state == prev_state) {
> +			/* #5: prev has same state */
> +			btree_delete(bmap, offset);
> +			return;
> +		}
> +
> +		/* #6: update value only */
> +		btree_update_value(bmap, offset, new_state);
> +		return;
> +	}
> +
> +	/* different start, offset is in the middle of "cur" */
> +	prev_state = btree_peek_prev(bmap, NULL);
> +	ASSERT(prev_state != NULL);
> +	if (prev_state == new_state)
> +		return;
> +
> +	if (end == cur_key) {
> +		/* end is at the same point as the current extent */
> +		if (new_state == cur_state) {
> +			/* #7: move next extent down */
> +			btree_update_key(bmap, end, offset);
> +			return;
> +		}
> +
> +		/* #9: different start, same end, add new extent */
> +		btree_insert(bmap, offset, new_state);
> +		return;
> +	}
> +
> +	/* #2: insert an extent into the middle of another extent */
> +	btree_insert(bmap, offset, new_state);
> +	btree_insert(bmap, end, prev_state);
> +}
> +
> +void
> +set_bmap_ext(
> +	xfs_agnumber_t		agno,
> +	xfs_agblock_t		agbno,
> +	xfs_extlen_t		blen,
> +	int			state)
> +{
> +	update_bmap(ag_bmap[agno], agbno, blen, &states[state]);
> +}
> +
> +int
> +get_bmap_ext(
> +	xfs_agnumber_t		agno,
> +	xfs_agblock_t		agbno,
> +	xfs_agblock_t		maxbno,
> +	xfs_extlen_t		*blen)
> +{
> +	int			*statep;
> +	unsigned long		key;
> +
> +	statep = btree_find(ag_bmap[agno], agbno, &key);
> +	if (!statep)
> +		return -1;
> +
> +	if (key == agbno) {
> +		if (blen) {
> +			if (!btree_peek_next(ag_bmap[agno], &key))
> +				return -1;
> +			*blen = MIN(maxbno, key) - agbno;
> +		}
> +		return *statep;
> +	}
> +
> +	statep = btree_peek_prev(ag_bmap[agno], NULL);
> +	if (!statep)
> +		return -1;
> +	if (blen)
> +		*blen = MIN(maxbno, key) - agbno;
> +
> +	return *statep;
> +}
> 
> +static uint64_t		*rt_bmap;
>  static size_t		rt_bmap_size;
> 
> +/* block records fit into __uint64_t's units */
> +#define XR_BB_UNIT	64			/* number of bits/unit */
> +#define XR_BB		4			/* bits per block record */
> +#define XR_BB_NUM	(XR_BB_UNIT/XR_BB)	/* number of records per unit */
> +#define XR_BB_MASK	0xF			/* block record mask */
> +
> +/*
> + * these work in real-time extents (e.g. fsbno == rt extent number)
> + */
> +int
> +get_rtbmap(
> +	xfs_drtbno_t	bno)
> +{
> +	return (*(rt_bmap + bno /  XR_BB_NUM) >>
> +		((bno % XR_BB_NUM) * XR_BB)) & XR_BB_MASK;
> +}
> +
> +void
> +set_rtbmap(
> +	xfs_drtbno_t	bno,
> +	int		state)
> +{
> +	*(rt_bmap + bno / XR_BB_NUM) =
> +	 ((*(rt_bmap + bno / XR_BB_NUM) &
> +	  (~((__uint64_t) XR_BB_MASK << ((bno % XR_BB_NUM) * XR_BB)))) |
> +	 (((__uint64_t) state) << ((bno % XR_BB_NUM) * XR_BB)));
> +}
> +
>  static void
>  reset_rt_bmap(void)
>  {
> -	if (rt_ba_bmap)
> -		memset(rt_ba_bmap, 0x22, rt_bmap_size);	/* XR_E_FREE */
> +	if (rt_bmap)
> +		memset(rt_bmap, 0x22, rt_bmap_size);	/* XR_E_FREE */
>  }
> 
>  static void
> @@ -72,8 +251,8 @@ init_rt_bmap(
>  	rt_bmap_size = roundup(mp->m_sb.sb_rextents / (NBBY / XR_BB),
>  			       sizeof(__uint64_t));
> 
> -	rt_ba_bmap = memalign(sizeof(__uint64_t), rt_bmap_size);
> -	if (!rt_ba_bmap) {
> +	rt_bmap = memalign(sizeof(__uint64_t), rt_bmap_size);
> +	if (!rt_bmap) {
>  		do_error(
>  		_("couldn't allocate realtime block map, size = %llu\n"),
>  			mp->m_sb.sb_rextents);
> @@ -84,8 +263,8 @@ init_rt_bmap(
>  static void
>  free_rt_bmap(xfs_mount_t *mp)
>  {
> -	free(rt_ba_bmap);
> -	rt_ba_bmap = NULL;
> +	free(rt_bmap);
> +	rt_bmap = NULL;
>  }
> 
> 
> @@ -93,28 +272,41 @@ void
>  reset_bmaps(xfs_mount_t *mp)
>  {
>  	xfs_agnumber_t	agno;
> +	xfs_agblock_t	ag_size;
>  	int		ag_hdr_block;
> -	int		i;
> 
>  	ag_hdr_block = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
> +	ag_size = mp->m_sb.sb_agblocks;
> 
> -	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++)  {
> -		memset(ba_bmap[agno], 0,
> -		       roundup((mp->m_sb.sb_agblocks + (NBBY / XR_BB) - 1) /
> -				(NBBY / XR_BB), sizeof(__uint64_t)));
> -		for (i = 0; i < ag_hdr_block; i++)
> -			set_bmap(agno, i, XR_E_INUSE_FS);
> +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> +		if (agno == mp->m_sb.sb_agcount - 1)
> +			ag_size = (xfs_extlen_t)(mp->m_sb.sb_dblocks -
> +				   (xfs_drfsbno_t)mp->m_sb.sb_agblocks * agno);
> +#ifdef BTREE_STATS
> +		if (btree_find(ag_bmap[agno], 0, NULL)) {
> +			printf("ag_bmap[%d] btree stats:\n", agno);
> +			btree_print_stats(ag_bmap[agno], stdout);
> +		}
> +#endif
> +		/*
> +		 * We always insert an item for the first block having a
> +		 * given state.  So the code below means:
> +		 *
> +		 *	block 0..ag_hdr_block-1:	XR_E_INUSE_FS
> +		 *	ag_hdr_block..ag_size:		XR_E_UNKNOWN
> +		 *	ag_size...			XR_E_BAD_STATE
> +		 */
> +		btree_clear(ag_bmap[agno]);
> +		btree_insert(ag_bmap[agno], 0, &states[XR_E_INUSE_FS]);
> +		btree_insert(ag_bmap[agno],
> +				ag_hdr_block, &states[XR_E_UNKNOWN]);
> +		btree_insert(ag_bmap[agno], ag_size, &states[XR_E_BAD_STATE]);
>  	}
> 
>  	if (mp->m_sb.sb_logstart != 0) {
> -		xfs_dfsbno_t	logend;
> -
> -		logend = mp->m_sb.sb_logstart + mp->m_sb.sb_logblocks;
> -
> -		for (i = mp->m_sb.sb_logstart; i < logend ; i++)  {
> -			set_bmap(XFS_FSB_TO_AGNO(mp, i),
> -				 XFS_FSB_TO_AGBNO(mp, i), XR_E_INUSE_FS);
> -		}
> +		set_bmap_ext(XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart),
> +			     XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart),
> +			     mp->m_sb.sb_logblocks, XR_E_INUSE_FS);
>  	}
> 
>  	reset_rt_bmap();
> @@ -123,30 +315,18 @@ reset_bmaps(xfs_mount_t *mp)
>  void
>  init_bmaps(xfs_mount_t *mp)
>  {
> -	xfs_agblock_t numblocks = mp->m_sb.sb_agblocks;
> -	int agcount = mp->m_sb.sb_agcount;
> -	int i;
> -	size_t size = 0;
> -
> -	ba_bmap = calloc(agcount, sizeof(__uint64_t *));
> -	if (!ba_bmap)
> -		do_error(_("couldn't allocate block map pointers\n"));
> +	xfs_agnumber_t i;
> 
> -	ag_locks = calloc(agcount, sizeof(pthread_mutex_t));
> +	ag_bmap = calloc(mp->m_sb.sb_agcount, sizeof(struct btree_root *));
> +	if (!ag_bmap)
> +		do_error(_("couldn't allocate block map btree roots\n"));
> +
> +	ag_locks = calloc(mp->m_sb.sb_agcount, sizeof(pthread_mutex_t));
>  	if (!ag_locks)
>  		do_error(_("couldn't allocate block map locks\n"));
> 
> -	for (i = 0; i < agcount; i++)  {
> -		size = roundup((numblocks+(NBBY/XR_BB)-1) / (NBBY/XR_BB),
> -		       		sizeof(__uint64_t));
> -
> -		ba_bmap[i] = memalign(sizeof(__uint64_t), size);
> -		if (!ba_bmap[i]) {
> -			do_error(_("couldn't allocate block map, size = %d\n"),
> -				numblocks);
> -			return;
> -		}
> -		memset(ba_bmap[i], 0, size);
> +	for (i = 0; i < mp->m_sb.sb_agcount; i++)  {
> +		btree_init(&ag_bmap[i]);
>  		pthread_mutex_init(&ag_locks[i], NULL);
>  	}
> 
> @@ -160,9 +340,9 @@ free_bmaps(xfs_mount_t *mp)
>  	xfs_agnumber_t i;
> 
>  	for (i = 0; i < mp->m_sb.sb_agcount; i++)
> -		free(ba_bmap[i]);
> -	free(ba_bmap);
> -	ba_bmap = NULL;
> +		btree_destroy(ag_bmap[i]);
> +	free(ag_bmap);
> +	ag_bmap = NULL;
> 
>  	free_rt_bmap(mp);
>  }
> Index: xfsprogs-dev/repair/incore.h
> ===================================================================
> --- xfsprogs-dev.orig/repair/incore.h	2009-09-02 14:51:09.573269190 -0300
> +++ xfsprogs-dev/repair/incore.h	2009-09-02 14:51:18.621298890 -0300
> @@ -37,59 +37,32 @@ void			record_allocation(ba_rec_t *addr,
>  void			free_allocations(ba_rec_t *list);
> 
>  /*
> - * block bit map defs -- track state of each filesystem block.
> - * ba_bmap is an array of bitstrings declared in the globals.h file.
> - * the bitstrings are broken up into 64-bit chunks.  one bitstring per AG.
> - */
> -#define BA_BMAP_SIZE(x)		(howmany(x, 4))
> -
> -void			init_bmaps(xfs_mount_t *mp);
> -void			reset_bmaps(xfs_mount_t *mp);
> -void			free_bmaps(xfs_mount_t *mp);
> -
> -
> -/* blocks are numbered from zero */
> -
> -/* block records fit into __uint64_t's units */
> -
> -#define XR_BB_UNIT	64			/* number of bits/unit */
> -#define XR_BB		4			/* bits per block record */
> -#define XR_BB_NUM	(XR_BB_UNIT/XR_BB)	/* number of records per unit */
> -#define XR_BB_MASK	0xF			/* block record mask */
> -
> -/*
> - * bitstring ops -- set/get block states, either in filesystem
> - * bno's or in agbno's.  turns out that fsbno addressing is
> - * more convenient when dealing with bmap extracted addresses
> - * and agbno addressing is more convenient when dealing with
> - * meta-data extracted addresses.  So the fsbno versions use
> - * mtype (which can be one of the block map types above) to
> - * set the correct block map while the agbno versions assume
> - * you want to use the regular block map.
> - */
> -
> -#define get_bmap(agno, ag_blockno) \
> -			((int) (*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) \
> -				 >> (((ag_blockno)%XR_BB_NUM)*XR_BB)) \
> -				& XR_BB_MASK)
> -#define set_bmap(agno, ag_blockno, state) \
> -	*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) = \
> -		((*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) & \
> -	  (~((__uint64_t) XR_BB_MASK << (((ag_blockno)%XR_BB_NUM)*XR_BB)))) | \
> -	 (((__uint64_t) (state)) << (((ag_blockno)%XR_BB_NUM)*XR_BB)))
> -
> -/*
> - * these work in real-time extents (e.g. fsbno == rt extent number)
> - */
> -#define get_rtbmap(fsbno) \
> -			((*(rt_ba_bmap + (fsbno)/XR_BB_NUM) >> \
> -			(((fsbno)%XR_BB_NUM)*XR_BB)) & XR_BB_MASK)
> -#define set_rtbmap(fsbno, state) \
> -	*(rt_ba_bmap + (fsbno)/XR_BB_NUM) = \
> -	 ((*(rt_ba_bmap + (fsbno)/XR_BB_NUM) & \
> -	  (~((__uint64_t) XR_BB_MASK << (((fsbno)%XR_BB_NUM)*XR_BB)))) | \
> -	 (((__uint64_t) (state)) << (((fsbno)%XR_BB_NUM)*XR_BB)))
> + * block map -- track state of each filesystem block.
> + */
> +
> +void		init_bmaps(xfs_mount_t *mp);
> +void		reset_bmaps(xfs_mount_t *mp);
> +void		free_bmaps(xfs_mount_t *mp);
> +
> +void		set_bmap_ext(xfs_agnumber_t agno, xfs_agblock_t agbno,
> +			     xfs_extlen_t blen, int state);
> +int		get_bmap_ext(xfs_agnumber_t agno, xfs_agblock_t agbno,
> +			     xfs_agblock_t maxbno, xfs_extlen_t *blen);
> 
> +void		set_rtbmap(xfs_drtbno_t bno, int state);
> +int		get_rtbmap(xfs_drtbno_t bno);
> +
> +static inline void
> +set_bmap(xfs_agnumber_t agno, xfs_agblock_t agbno, int state)
> +{
> +	set_bmap_ext(agno, agbno, 1, state);
> +}
> +
> +static inline int
> +get_bmap(xfs_agnumber_t agno, xfs_agblock_t agbno)
> +{
> +	return get_bmap_ext(agno, agbno, agbno + 1, NULL);
> +}
> 
>  /*
>   * extent tree definitions
> Index: xfsprogs-dev/repair/scan.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/scan.c	2009-09-02 14:51:09.577269000 -0300
> +++ xfsprogs-dev/repair/scan.c	2009-09-02 14:51:18.629269735 -0300
> @@ -509,7 +509,7 @@ _("%s freespace btree block claimed (sta
>  		rp = XFS_ALLOC_REC_ADDR(mp, block, 1);
>  		for (i = 0; i < numrecs; i++) {
>  			xfs_agblock_t		b, end;
> -			xfs_extlen_t		len;
> +			xfs_extlen_t		len, blen;
> 
>  			b = be32_to_cpu(rp[i].ar_startblock);
>  			len = be32_to_cpu(rp[i].ar_blockcount);
> @@ -522,8 +522,8 @@ _("%s freespace btree block claimed (sta
>  			if (!verify_agbno(mp, agno, end - 1))
>  				continue;
> 
> -			for ( ; b < end; b++)  {
> -				state = get_bmap(agno, b);
> +			for ( ; b < end; b += blen)  {
> +				state = get_bmap_ext(agno, b, end, &blen);
>  				switch (state) {
>  				case XR_E_UNKNOWN:
>  					set_bmap(agno, b, XR_E_FREE1);
> @@ -534,13 +534,15 @@ _("%s freespace btree block claimed (sta
>  					 * FREE1 blocks later
>  					 */
>  					if (magic != XFS_ABTB_MAGIC) {
> -						set_bmap(agno, b, XR_E_FREE);
> +						set_bmap_ext(agno, b, blen,
> +							     XR_E_FREE);
>  						break;
>  					}
>  				default:
>  					do_warn(
> -	_("block (%d,%d) multiply claimed by %s space tree, state - %d\n"),
> -						agno, b, name, state);
> +	_("block (%d,%d-%d) multiply claimed by %s space tree, state - %d\n"),
> +						agno, b, b + blen - 1,
> +						name, state);
>  					break;
>  				}
>  			}
> 

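As a sanity check on the new interface, here is a minimal sketch of how
a per-block scan collapses into a per-extent scan (illustration only;
agno and ag_size stand in for the AG being scanned and its block
count):

	xfs_agblock_t	agbno = 0;
	xfs_extlen_t	blen;
	int		state;

	while (agbno < ag_size) {
		/*
		 * get_bmap_ext() returns the state at agbno and, via
		 * blen, how many following blocks share that state
		 * (capped at ag_size), so the scan advances a whole
		 * extent at a time instead of one block at a time.
		 */
		state = get_bmap_ext(agno, agbno, ag_size, &blen);
		if (state == XR_E_UNKNOWN)
			set_bmap_ext(agno, agbno, blen, XR_E_FREE1);
		agbno += blen;
	}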
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 13/14] repair: optimize duplicate extent tracking
  2009-09-02 17:55 ` [PATCH 13/14] repair: optimize duplicate extent tracking Christoph Hellwig
@ 2009-10-22 16:41   ` Alex Elder
  0 siblings, 0 replies; 50+ messages in thread
From: Alex Elder @ 2009-10-22 16:41 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Barry Naujok, xfs

Christoph Hellwig wrote:
> Switch the duplicate extent tracking from an avl tree to our new btree
> implementation.  Modify search_dup_extent to find overlapping extents
> with differening start blocks instead of having the caller walk every
> possible start block.

Looks good.		-Alex

> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Alex Elder <aelder@sgi.com>

> Index: xfsprogs-dev/repair/incore_ext.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/incore_ext.c	2009-08-21 15:11:16.000000000 +0000
> +++ xfsprogs-dev/repair/incore_ext.c	2009-08-21 15:24:07.000000000 +0000

...

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH 14/14] repair: add missing locking in scanfunc_bmap
  2009-09-02 17:55 ` [PATCH 14/14] repair: add missing locking in scanfunc_bmap Christoph Hellwig
@ 2009-10-22 16:42   ` Alex Elder
  0 siblings, 0 replies; 50+ messages in thread
From: Alex Elder @ 2009-10-22 16:42 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Barry Naujok, xfs

Christoph Hellwig wrote:
> Make sure to protect access to the block usage tracking btree with
> the ag_lock.


Well that one was easy.  Looks good.	-Alex

> Signed-off-by: Barry Naujok <bnaujok@sgi.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Alex Elder <aelder@sgi.com>

> Index: xfsprogs-dev/repair/scan.c
> ===================================================================
> --- xfsprogs-dev.orig/repair/scan.c	2009-08-20 03:16:13.000000000 +0000
> +++ xfsprogs-dev/repair/scan.c	2009-08-20 03:18:17.000000000 +0000
> @@ -235,6 +235,7 @@
>  		agno = XFS_FSB_TO_AGNO(mp, bno);
>  		agbno = XFS_FSB_TO_AGBNO(mp, bno);
> 
> +		pthread_mutex_lock(&ag_locks[agno]);
>  		state = get_bmap(agno, agbno);
>  		switch (state) {
>  		case XR_E_UNKNOWN:
> @@ -280,6 +281,7 @@
>  				state, ino, (__uint64_t) bno);
>  			break;
>  		}
> +		pthread_mutex_unlock(&ag_locks[agno]);
>  	} else  {
>  		/*
>  		 * attribute fork for realtime files is in the regular
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 06/14] repair: use a btree instead of a radix tree for the prefetch queue
  2009-10-21 17:12   ` [PATCH 06/14] repair: use a btree instead of a radix tree for the prefetch queue Alex Elder
@ 2009-11-12 10:04     ` Christoph Hellwig
  2009-11-12 23:46       ` Dave Chinner
  0 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2009-11-12 10:04 UTC (permalink / raw)
  To: Alex Elder; +Cc: Barry Naujok, xfs

On Wed, Oct 21, 2009 at 12:12:33PM -0500, Alex Elder wrote:
> - Is this code worthy of putting into a library, so other
>   XFS user space can use it?

We can do this if we need it - git even tracks history over renames.

> - Is the radix tree code worth putting into a library and
>   saving, for similar reasons (I didn't look at that code).

I don't think we should keep dead code around.  We can resurrect it from
git history if needed (don't think we'll need it though).

> - I accept that this uses less memory for sparsely populated
>   key space than radix trees; but it would be nice if that
>   could be characterized a bit more precisely (i.e., what
>   will the range of values that'll be represented be, just
>   how sparse is it, and at what point does this really
>   pay off?).
> - Related to the previous one--it would be good to have
>   a little info about why the value 7 was chosen as the
>   number of keys per node.  Perhaps I just don't know
>   enough of the history (or content of the upcoming
>   patches).

Maybe Barry still remembers some of this.  I've added his current
personal address to the Cc list.
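
One property that may have mattered (a guess on my part, not from
Barry): with BTREE_KEY_MAX = 7 a node packs into exactly two cache
lines on an LP64 platform:

	/* sketch only -- mirrors struct btree_node in the patch below */
	struct btree_node_sketch {
		unsigned long	num_keys;	/*  8 bytes */
		unsigned long	keys[7];	/* 56 bytes */
		void		*ptrs[8];	/* 64 bytes */
	};	/* sizeof == 128 on LP64, i.e. two 64-byte cache lines */

A leaf holds between BTREE_KEY_MIN = 3 and 7 items, i.e. roughly 18 to
43 bytes of node per item, whereas a 64-slot radix tree node (over 500
bytes) stays nearly empty for a sparse key space.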

> - A number of external interfaces defined are not used
>   in the (current) xfs_repair code.  (That's OK, but
>   they could be removed if they're never expected
>   to be used--e.g. btree_peek_prev() and btree_peek_next()).

True.  Although having them committed first and then removed would at
least ensure we have them in the git history so we can easily find them
again.

> > +#include <libxfs.h>
> > +#include "btree.h"
> > +
> > +
> 
> Note that the value of BTREE_KEY_MAX *must* be greater than 2, or
> this code will not work.  Maybe nobody should be trying to use 2
> here anyway, but I would like to see a comment, e.g., /* Must be 3 or more */

Thanks, I've added a comment to the updated version of the patch.

> >  	pthread_mutex_lock(&args->lock);
> > -	while (!args->queuing_done || args->primary_io_queue.height) {
> 
> There is a btree_is_empty(btree_root) function, you might
> as well use it here (it is clearer what you're doing anyway).

Indeed, changed.

> > -	ASSERT(args->primary_io_queue.height == 0);
> > -	ASSERT(args->secondary_io_queue.height == 0);
> 
> btree_is_empty() would be better here also.
> 
> > +	ASSERT(btree_find(args->primary_io_queue, 0, NULL) == NULL);
> > +	ASSERT(btree_find(args->secondary_io_queue, 0, NULL) == NULL);

Fixed up as well.
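
For reference while reading the code, a minimal sketch of the interface
in use (not part of the patch; the dummy values and use_entry() are
stand-ins):

	#include "btree.h"

	extern void use_entry(unsigned long key, void *value);

	static void
	btree_example(void)
	{
		struct btree_root	*bt;
		unsigned long		key;
		void			*value;

		btree_init(&bt);
		btree_insert(bt, 10, (void *)0x1);	/* values must be non-NULL */
		btree_insert(bt, 42, (void *)0x2);

		/* find the entry with the smallest key >= 11 */
		value = btree_find(bt, 11, &key);	/* key is now 42 */

		/* ordered walk, the way pf_batch_read uses it */
		for (value = btree_find(bt, 0, &key); value != NULL;
		     value = btree_lookup_next(bt, &key))
			use_entry(key, value);

		btree_destroy(bt);
	}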

Current version of the patch below:

-- 

Subject: repair: use a btree instead of a radix tree for the prefetch queue
From: Barry Naujok <bnaujok@sgi.com>

Currently xfs_repair uses a radix tree implementation derived from the
Linux kernel one to manage its prefetch queue.

The radix tree implementation is not very memory efficient for sparse
indices, so replace it with a btree implementation that is much more
efficient.  This is not that important for the prefetch queue itself,
but will be very important for the next memory optimization patches,
which need a tree to store things like the block map that are very
sparse, and we do not want to deal with two tree implementations (or
rather three, given that we still have avl.c around).

Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>

Index: xfsprogs-dev/repair/Makefile
===================================================================
--- xfsprogs-dev.orig/repair/Makefile	2009-11-12 10:49:49.029254650 +0100
+++ xfsprogs-dev/repair/Makefile	2009-11-12 10:53:01.087004222 +0100
@@ -9,15 +9,15 @@ LSRCFILES = README
 
 LTCOMMAND = xfs_repair
 
-HFILES = agheader.h attr_repair.h avl.h avl64.h bmap.h dinode.h dir.h \
-	dir2.h err_protos.h globals.h incore.h protos.h rt.h \
-	progress.h scan.h versions.h prefetch.h radix-tree.h threads.h
+HFILES = agheader.h attr_repair.h avl.h avl64.h bmap.h btree.h \
+	dinode.h dir.h dir2.h err_protos.h globals.h incore.h protos.h rt.h \
+	progress.h scan.h versions.h prefetch.h threads.h
 
-CFILES = agheader.c attr_repair.c avl.c avl64.c bmap.c dino_chunks.c \
-	dinode.c dir.c dir2.c globals.c incore.c \
+CFILES = agheader.c attr_repair.c avl.c avl64.c bmap.c btree.c \
+	dino_chunks.c dinode.c dir.c dir2.c globals.c incore.c \
 	incore_bmc.c init.c incore_ext.c incore_ino.c phase1.c \
 	phase2.c phase3.c phase4.c phase5.c phase6.c phase7.c \
-	progress.c prefetch.c radix-tree.c rt.c sb.c scan.c threads.c \
+	progress.c prefetch.c rt.c sb.c scan.c threads.c \
 	versions.c xfs_repair.c
 
 LLDLIBS = $(LIBXFS) $(LIBXLOG) $(LIBUUID) $(LIBRT) $(LIBPTHREAD)
Index: xfsprogs-dev/repair/btree.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ xfsprogs-dev/repair/btree.c	2009-11-12 10:54:07.733011893 +0100
@@ -0,0 +1,1238 @@
+/*
+ * Copyright (c) 2007, Silicon Graphics, Inc. Barry Naujok <bnaujok@sgi.com>
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <libxfs.h>
+#include "btree.h"
+
+/*
+ * Maximum number of keys per node.  Must be greater than 2 for the code
+ * to work.
+ */
+#define BTREE_KEY_MAX		7
+#define BTREE_KEY_MIN		(BTREE_KEY_MAX / 2)
+
+#define BTREE_PTR_MAX		(BTREE_KEY_MAX + 1)
+
+struct btree_node {
+	unsigned long		num_keys;
+	unsigned long		keys[BTREE_KEY_MAX];
+	struct btree_node	*ptrs[BTREE_PTR_MAX];
+};
+
+struct btree_cursor {
+	struct btree_node	*node;
+	int			index;
+};
+
+struct btree_root {
+	struct btree_node	*root_node;
+	struct btree_cursor	*cursor;	/* track path to end leaf */
+	int			height;
+	/* lookup cache */
+	int			keys_valid;	/* set if the cache is valid */
+	unsigned long		cur_key;
+	unsigned long		next_key;
+	void			*next_value;
+	unsigned long		prev_key;
+	void			*prev_value;
+#ifdef BTREE_STATS
+	struct btree_stats {
+		unsigned long	num_items;
+		unsigned long	max_items;
+		int		alloced;
+		int		cache_hits;
+		int		cache_misses;
+		int		lookup;
+		int		find;
+		int		key_update;
+		int		value_update;
+		int		insert;
+		int		delete;
+		int		inc_height;
+		int		dec_height;
+		int		shift_prev;
+		int		shift_next;
+		int		split;
+		int		merge_prev;
+		int		merge_next;
+		int		balance_prev;
+		int		balance_next;
+	} stats;
+#endif
+};
+
+
+static struct btree_node *
+btree_node_alloc(void)
+{
+	return calloc(1, sizeof(struct btree_node));
+}
+
+static void
+btree_node_free(
+	struct btree_node 	*node)
+{
+	free(node);
+}
+
+static void
+btree_free_nodes(
+	struct btree_node	*node,
+	int			level)
+{
+	int			i;
+
+	if (level)
+		for (i = 0; i <= node->num_keys; i++)
+			btree_free_nodes(node->ptrs[i], level - 1);
+	btree_node_free(node);
+}
+
+static void
+__btree_init(
+	struct btree_root	*root)
+{
+	memset(root, 0, sizeof(struct btree_root));
+	root->height = 1;
+	root->cursor = calloc(1, sizeof(struct btree_cursor));
+	root->root_node = btree_node_alloc();
+	ASSERT(root->root_node);
+#ifdef BTREE_STATS
+	root->stats.max_items = 1;
+	root->stats.alloced += 1;
+#endif
+}
+
+static void
+__btree_free(
+	struct btree_root	*root)
+{
+	btree_free_nodes(root->root_node, root->height - 1);
+	free(root->cursor);
+	root->height = 0;
+	root->cursor = NULL;
+	root->root_node = NULL;
+}
+
+void
+btree_init(
+	struct btree_root	**root)
+{
+	*root = calloc(1, sizeof(struct btree_root));
+	__btree_init(*root);
+}
+
+void
+btree_clear(
+	struct btree_root	*root)
+{
+	__btree_free(root);
+	__btree_init(root);
+}
+
+void
+btree_destroy(
+	struct btree_root	*root)
+{
+	__btree_free(root);
+	free(root);
+}
+
+int
+btree_is_empty(
+	struct btree_root	*root)
+{
+	return root->root_node->num_keys == 0;
+}
+
+static inline void
+btree_invalidate_cursor(
+	struct btree_root	*root)
+{
+	root->cursor[0].node = NULL;
+	root->keys_valid = 0;
+}
+
+static inline unsigned long
+btree_key_of_cursor(
+	struct btree_cursor	*cursor,
+	int			height)
+{
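+	/*
+	 * A cursor index one past the last key of a node takes its key
+	 * from the nearest ancestor that still has a key at its index.
+	 */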
+	while (cursor->node->num_keys == cursor->index && --height > 0)
+		cursor++;
+	return cursor->node->keys[cursor->index];
+}
+
+static void *
+btree_get_prev(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	struct btree_cursor	*cur = root->cursor;
+	int			level = 0;
+	struct btree_node	*node;
+
+	if (cur->index > 0) {
+		if (key)
+			*key = cur->node->keys[cur->index - 1];
+		return cur->node->ptrs[cur->index - 1];
+	}
+
+	/* else need to go up and back down the tree to find the previous */
+
+	while (cur->index == 0) {
+		if (++level == root->height)
+			return NULL;
+		cur++;
+	}
+
+	/* the key is in the current level */
+	if (key)
+		*key = cur->node->keys[cur->index - 1];
+
+	/* descend back down the right side to get the pointer */
+	node = cur->node->ptrs[cur->index - 1];
+	while (level--)
+		node = node->ptrs[node->num_keys];
+	return node;
+}
+
+static void *
+btree_get_next(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	struct btree_cursor	*cur = root->cursor;
+	int			level = 0;
+	struct btree_node	*node;
+
+	while (cur->index == cur->node->num_keys) {
+		if (++level == root->height)
+			return NULL;
+		cur++;
+	}
+	if (level == 0) {
+		if (key) {
+			cur->index++;
+			*key = btree_key_of_cursor(cur, root->height);
+			cur->index--;
+		}
+		return cur->node->ptrs[cur->index + 1];
+	}
+
+	node = cur->node->ptrs[cur->index + 1];
+	while (--level > 0)
+		node = node->ptrs[0];
+	if (key)
+		*key = node->keys[0];
+	return node->ptrs[0];
+}
+
+/*
+ * Lookup/Search functions
+ */
+
+static int
+btree_do_search(
+	struct btree_root	*root,
+	unsigned long		key)
+{
+	unsigned long		k = 0;
+	struct btree_cursor	*cur = root->cursor + root->height;
+	struct btree_node	*node = root->root_node;
+	int			height = root->height;
+	int			key_found = 0;
+	int			i;
+
+	while (--height >= 0) {
+		cur--;
+		for (i = 0; i < node->num_keys; i++)
+			if (node->keys[i] >= key) {
+				k = node->keys[i];
+				key_found = 1;
+				break;
+			}
+		cur->node = node;
+		cur->index = i;
+		node = node->ptrs[i];
+	}
+	root->keys_valid = key_found;
+	if (!key_found)
+		return 0;
+
+	root->cur_key = k;
+	root->next_value = NULL;	/* do on-demand next value lookup */
+	root->prev_value = btree_get_prev(root, &root->prev_key);
+	return 1;
+}
+
+static int
+btree_search(
+	struct btree_root	*root,
+	unsigned long		key)
+{
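+	/*
+	 * Cache hit: the cached cursor still points at the right slot
+	 * as long as key falls within (prev_key, cur_key].
+	 */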
+	if (root->keys_valid && key <= root->cur_key &&
+				(!root->prev_value || key > root->prev_key)) {
+#ifdef BTREE_STATS
+		root->stats.cache_hits++;
+#endif
+		return 1;
+	}
+#ifdef BTREE_STATS
+	root->stats.cache_misses++;
+#endif
+	return btree_do_search(root, key);
+}
+
+void *
+btree_find(
+	struct btree_root	*root,
+	unsigned long		key,
+	unsigned long		*actual_key)
+{
+#ifdef BTREE_STATS
+	root->stats.find += 1;
+#endif
+	if (!btree_search(root, key))
+		return NULL;
+
+	if (actual_key)
+		*actual_key = root->cur_key;
+	return root->cursor->node->ptrs[root->cursor->index];
+}
+
+void *
+btree_lookup(
+	struct btree_root	*root,
+	unsigned long		key)
+{
+#ifdef BTREE_STATS
+	root->stats.lookup += 1;
+#endif
+	if (!btree_search(root, key) || root->cur_key != key)
+		return NULL;
+	return root->cursor->node->ptrs[root->cursor->index];
+}
+
+void *
+btree_peek_prev(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	if (!root->keys_valid)
+		return NULL;
+	if (key)
+		*key = root->prev_key;
+	return root->prev_value;
+}
+
+void *
+btree_peek_next(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	if (!root->keys_valid)
+		return NULL;
+	if (!root->next_value)
+		root->next_value = btree_get_next(root, &root->next_key);
+	if (key)
+		*key = root->next_key;
+	return root->next_value;
+}
+
+static void *
+btree_move_cursor_to_next(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	struct btree_cursor	*cur = root->cursor;
+	int			level = 0;
+
+	while (cur->index == cur->node->num_keys) {
+		if (++level == root->height)
+			return NULL;
+		cur++;
+	}
+	cur->index++;
+	if (level == 0) {
+		if (key)
+			*key = btree_key_of_cursor(cur, root->height);
+		return cur->node->ptrs[cur->index];
+	}
+
+	while (--level >= 0) {
+		root->cursor[level].node = cur->node->ptrs[cur->index];
+		root->cursor[level].index = 0;
+		cur--;
+	}
+	if (key)
+		*key = cur->node->keys[0];
+	return cur->node->ptrs[0];
+}
+
+void *
+btree_lookup_next(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	void			*value;
+
+	if (!root->keys_valid)
+		return NULL;
+
+	root->prev_key = root->cur_key;
+	root->prev_value = root->cursor->node->ptrs[root->cursor->index];
+
+	value = btree_move_cursor_to_next(root, &root->cur_key);
+	if (!value) {
+		btree_invalidate_cursor(root);
+		return NULL;
+	}
+	root->next_value = NULL;	/* on-demand next value fetch */
+	if (key)
+		*key = root->cur_key;
+	return value;
+}
+
+static void *
+btree_move_cursor_to_prev(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	struct btree_cursor	*cur = root->cursor;
+	int			level = 0;
+
+	while (cur->index == 0) {
+		if (++level == root->height)
+			return NULL;
+		cur++;
+	}
+	cur->index--;
+	if (key)	/* the key is in the current level */
+		*key = cur->node->keys[cur->index];
+	while (level > 0) {
+		level--;
+		root->cursor[level].node = cur->node->ptrs[cur->index];
+		root->cursor[level].index = root->cursor[level].node->num_keys;
+		cur--;
+	}
+	return cur->node->ptrs[cur->index];
+}
+
+void *
+btree_lookup_prev(
+	struct btree_root	*root,
+	unsigned long		*key)
+{
+	void			*value;
+
+	if (!root->keys_valid)
+		return NULL;
+
+	value = btree_move_cursor_to_prev(root, &root->cur_key);
+	if (!value)
+		return NULL;
+	root->prev_value = btree_get_prev(root, &root->prev_key);
+	root->next_value = NULL;	/* on-demand next value fetch */
+	if (key)
+		*key = root->cur_key;
+	return value;
+}
+
+void *
+btree_uncached_lookup(
+	struct btree_root	*root,
+	unsigned long		key)
+{
+	/* cursor-less (ie. uncached) lookup */
+	int			height = root->height - 1;
+	struct btree_node	*node = root->root_node;
+	int			i;
+	int			key_found = 0;
+
+	while (height >= 0) {
+		for (i = 0; i < node->num_keys; i++)
+			if (node->keys[i] >= key) {
+				key_found = node->keys[i] == key;
+				break;
+			}
+		node = node->ptrs[i];
+		height--;
+	}
+	return key_found ? node : NULL;
+}
+
+/* Update functions */
+
+static inline void
+btree_update_node_key(
+	struct btree_root	*root,
+	struct btree_cursor	*cursor,
+	int			level,
+	unsigned long		new_key)
+{
+	int			i;
+
+#ifdef BTREE_STATS
+	root->stats.key_update += 1;
+#endif
+
+	cursor += level;
+	for (i = level; i < root->height; i++) {
+		if (cursor->index < cursor->node->num_keys) {
+			cursor->node->keys[cursor->index] = new_key;
+			break;
+		}
+		cursor++;
+	}
+}
+
+int
+btree_update_key(
+	struct btree_root	*root,
+	unsigned long		old_key,
+	unsigned long		new_key)
+{
+	if (!btree_search(root, old_key) || root->cur_key != old_key)
+		return ENOENT;
+
+	if (root->next_value && new_key >= root->next_key)
+		return EINVAL;
+
+	if (root->prev_value && new_key <= root->prev_key)
+		return EINVAL;
+
+	btree_update_node_key(root, root->cursor, 0, new_key);
+
+	return 0;
+}
+
+int
+btree_update_value(
+	struct btree_root	*root,
+	unsigned long		key,
+	void			*new_value)
+{
+	if (!new_value)
+		return EINVAL;
+
+	if (!btree_search(root, key) || root->cur_key != key)
+		return ENOENT;
+
+#ifdef BTREE_STATS
+	root->stats.value_update += 1;
+#endif
+	root->cursor->node->ptrs[root->cursor->index] = new_value;
+
+	return 0;
+}
+
+/*
+ * Cursor modification functions - used for inserting and deleting
+ */
+
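+/*
+ * btree_copy_cursor_prev/next copy the root cursor into dest_cursor,
+ * moved one position left/right at the given level: walk up until a
+ * sibling subtree exists, step across, then walk back down its near
+ * edge.  Both return NULL if the cursor is already at the tree's edge.
+ */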
+static struct btree_cursor *
+btree_copy_cursor_prev(
+	struct btree_root	*root,
+	struct btree_cursor	*dest_cursor,
+	int			level)
+{
+	struct btree_cursor	*src_cur = root->cursor + level;
+	struct btree_cursor	*dst_cur;
+	int			l = level;
+	int			i;
+
+	if (level >= root->height)
+		return NULL;
+
+	while (src_cur->index == 0) {
+		if (++l >= root->height)
+			return NULL;
+		src_cur++;
+	}
+	for (i = l; i < root->height; i++)
+		dest_cursor[i] = *src_cur++;
+
+	dst_cur = dest_cursor + l;
+	dst_cur->index--;
+	while (l-- >= level) {
+		dest_cursor[l].node = dst_cur->node->ptrs[dst_cur->index];
+		dest_cursor[l].index = dest_cursor[l].node->num_keys;
+		dst_cur--;
+	}
+	return dest_cursor;
+}
+
+static struct btree_cursor *
+btree_copy_cursor_next(
+	struct btree_root	*root,
+	struct btree_cursor	*dest_cursor,
+	int			level)
+{
+	struct btree_cursor	*src_cur = root->cursor + level;
+	struct btree_cursor	*dst_cur;
+	int			l = level;
+	int			i;
+
+	if (level >= root->height)
+		return NULL;
+
+	while (src_cur->index == src_cur->node->num_keys) {
+		if (++l >= root->height)
+			return NULL;
+		src_cur++;
+	}
+	for (i = l; i < root->height; i++)
+		dest_cursor[i] = *src_cur++;
+
+	dst_cur = dest_cursor + l;
+	dst_cur->index++;
+	while (l-- >= level) {
+		dest_cursor[l].node = dst_cur->node->ptrs[dst_cur->index];
+		dest_cursor[l].index = 0;
+		dst_cur--;
+	}
+	return dest_cursor;
+}
+
+/*
+ * Shift functions
+ *
+ * Tries to move items in the current leaf to its sibling if it has space.
+ * Used in both insert and delete functions.
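+ * Note that num_children counts child pointers, so it is one more than
+ * the number of keys when an entire node is being moved.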
+ * Returns the number of items shifted.
+ */
+
+static int
+btree_shift_to_prev(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*prev_cursor,
+	int			num_children)
+{
+	struct btree_node	*node;
+	struct btree_node	*prev_node;
+	int			num_remain;	/* # of keys left in "node" */
+	unsigned long		key;
+	int			i;
+
+	if (!prev_cursor || !num_children)
+		return 0;
+
+	prev_node = prev_cursor[level].node;
+	node = root->cursor[level].node;
+
+	ASSERT(num_children > 0 && num_children <= node->num_keys + 1);
+
+	if ((prev_node->num_keys + num_children) > BTREE_KEY_MAX)
+		return 0;
+
+#ifdef BTREE_STATS
+	root->stats.shift_prev += 1;
+#endif
+
+	num_remain = node->num_keys - num_children;
+	ASSERT(num_remain == -1 || num_remain >= BTREE_KEY_MIN);
+
+	/* shift parent keys around */
+	level++;
+	if (num_remain > 0)
+		key = node->keys[num_children - 1];
+	else
+		key = btree_key_of_cursor(root->cursor + level,
+						root->height - level);
+	while (prev_cursor[level].index == prev_cursor[level].node->num_keys) {
+		level++;
+		ASSERT(level < root->height);
+	}
+	prev_node->keys[prev_node->num_keys] =
+			prev_cursor[level].node->keys[prev_cursor[level].index];
+	prev_cursor[level].node->keys[prev_cursor[level].index] = key;
+
+	/* copy pointers and keys to the end of the prev node */
+	for (i = 0; i < num_children - 1; i++) {
+		prev_node->keys[prev_node->num_keys + 1 + i] = node->keys[i];
+		prev_node->ptrs[prev_node->num_keys + 1 + i] = node->ptrs[i];
+	}
+	prev_node->ptrs[prev_node->num_keys + 1 + i] = node->ptrs[i];
+	prev_node->num_keys += num_children;
+
+	/* move remaining pointers/keys to start of node */
+	if (num_remain >= 0) {
+		for (i = 0; i < num_remain; i++) {
+			node->keys[i] = node->keys[num_children + i];
+			node->ptrs[i] = node->ptrs[num_children + i];
+		}
+		node->ptrs[i] = node->ptrs[num_children + i];
+		node->num_keys = num_remain;
+	} else
+		node->num_keys = 0;
+
+	return num_children;
+}
+
+static int
+btree_shift_to_next(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*next_cursor,
+	int			num_children)
+{
+	struct btree_node	*node;
+	struct btree_node	*next_node;
+	int			num_remain;	/* # of children left in node */
+	int			i;
+
+	if (!next_cursor || !num_children)
+		return 0;
+
+	node = root->cursor[level].node;
+	next_node = next_cursor[level].node;
+
+	ASSERT(num_children > 0 && num_children <= node->num_keys + 1);
+
+	if ((next_node->num_keys + num_children) > BTREE_KEY_MAX)
+		return 0;
+
+	num_remain = node->num_keys + 1 - num_children;
+	ASSERT(num_remain == 0 || num_remain > BTREE_KEY_MIN);
+
+#ifdef BTREE_STATS
+	root->stats.shift_next += 1;
+#endif
+
+	/* make space for "num_children" items at beginning of next-leaf */
+	i = next_node->num_keys;
+	next_node->ptrs[num_children + i] = next_node->ptrs[i];
+	while (--i >= 0) {
+		next_node->keys[num_children + i] = next_node->keys[i];
+		next_node->ptrs[num_children + i] = next_node->ptrs[i];
+	}
+
+	/* update keys in parent and next node from parent */
+	do {
+		level++;
+		ASSERT(level < root->height);
+	} while (root->cursor[level].index ==
+		 root->cursor[level].node->num_keys);
+
+	next_node->keys[num_children - 1] =
+		root->cursor[level].node->keys[root->cursor[level].index];
+	root->cursor[level].node->keys[root->cursor[level].index] =
+		node->keys[node->num_keys - num_children];
+
+	/* copy last "num_children" items from node into start of next-node */
+	for (i = 0; i < num_children - 1; i++) {
+		next_node->keys[i] = node->keys[num_remain + i];
+		next_node->ptrs[i] = node->ptrs[num_remain + i];
+	}
+	next_node->ptrs[i] = node->ptrs[num_remain + i];
+	next_node->num_keys += num_children;
+
+	if (num_remain > 0)
+		node->num_keys -= num_children;
+	else
+		node->num_keys = 0;
+
+	return num_children;
+}
+
+/*
+ * Insertion functions
+ */
+
+static struct btree_node *
+btree_increase_height(
+	struct btree_root	*root)
+{
+	struct btree_node	*new_root;
+	struct btree_cursor	*new_cursor;
+
+	new_cursor = realloc(root->cursor, (root->height + 1) *
+				sizeof(struct btree_cursor));
+	if (!new_cursor)
+		return NULL;
+	root->cursor = new_cursor;
+
+	new_root = btree_node_alloc();
+	if (!new_root)
+		return NULL;
+
+#ifdef BTREE_STATS
+	root->stats.alloced += 1;
+	root->stats.inc_height += 1;
+	root->stats.max_items *= BTREE_PTR_MAX;
+#endif
+
+	new_root->ptrs[0] = root->root_node;
+	root->root_node = new_root;
+
+	root->cursor[root->height].node = new_root;
+	root->cursor[root->height].index = 0;
+
+	root->height++;
+
+	return new_root;
+}
+
+static int
+btree_insert_item(
+	struct btree_root	*root,
+	int			level,
+	unsigned long		key,
+	void			*value);
+
+
+static struct btree_node *
+btree_split(
+	struct btree_root	*root,
+	int			level,
+	unsigned long		key,
+	int			*index)
+{
+	struct btree_node	*node = root->cursor[level].node;
+	struct btree_node	*new_node;
+	int			i;
+
+	new_node = btree_node_alloc();
+	if (!new_node)
+		return NULL;
+
+	if (btree_insert_item(root, level + 1, node->keys[BTREE_KEY_MIN],
+							new_node) != 0) {
+		btree_node_free(new_node);
+		return NULL;
+	}
+
+#ifdef BTREE_STATS
+	root->stats.alloced += 1;
+	root->stats.split += 1;
+#endif
+
+	for (i = 0; i < BTREE_KEY_MAX - BTREE_KEY_MIN - 1; i++) {
+		new_node->keys[i] = node->keys[BTREE_KEY_MIN + 1 + i];
+		new_node->ptrs[i] = node->ptrs[BTREE_KEY_MIN + 1 + i];
+	}
+	new_node->ptrs[i] = node->ptrs[BTREE_KEY_MIN + 1 + i];
+	new_node->num_keys = BTREE_KEY_MAX - BTREE_KEY_MIN - 1;
+
+	node->num_keys = BTREE_KEY_MIN;
+	if (key < node->keys[BTREE_KEY_MIN])
+		return node;	/* index doesn't change */
+
+	/* insertion point is in new node... */
+	*index -= BTREE_KEY_MIN + 1;
+	return new_node;
+}
+
+static int
+btree_insert_shift_to_prev(
+	struct btree_root	*root,
+	int			level,
+	int			*index)
+{
+	struct btree_cursor	tmp_cursor[root->height];
+	int			n;
+
+	if (*index <= 0)
+		return -1;
+
+	if (!btree_copy_cursor_prev(root, tmp_cursor, level + 1))
+		return -1;
+
+	n = MIN(*index, (BTREE_PTR_MAX - tmp_cursor[level].node->num_keys) / 2);
+	if (!n || !btree_shift_to_prev(root, level, tmp_cursor, n))
+		return -1;
+
+	*index -= n;
+	return 0;
+}
+
+static int
+btree_insert_shift_to_next(
+	struct btree_root	*root,
+	int			level,
+	int			*index)
+{
+	struct btree_cursor	tmp_cursor[root->height];
+	int			n;
+
+	if (*index >= BTREE_KEY_MAX)
+		return -1;
+
+	if (!btree_copy_cursor_next(root, tmp_cursor, level + 1))
+		return -1;
+
+	n = MIN(BTREE_KEY_MAX - *index,
+		(BTREE_PTR_MAX - tmp_cursor[level].node->num_keys) / 2);
+	if (!n || !btree_shift_to_next(root, level, tmp_cursor, n))
+		return -1;
+	return 0;
+}
+
+static int
+btree_insert_item(
+	struct btree_root	*root,
+	int			level,
+	unsigned long		key,
+	void			*value)
+{
+	struct btree_node	*node = root->cursor[level].node;
+	int			index = root->cursor[level].index;
+	int			i;
+
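+	/*
+	 * On a full node, first try to make room by shifting entries
+	 * into a sibling; only split (growing the tree if necessary)
+	 * when both siblings are full as well.
+	 */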
+	if (node->num_keys == BTREE_KEY_MAX) {
+		if (btree_insert_shift_to_prev(root, level, &index) == 0)
+			goto insert;
+		if (btree_insert_shift_to_next(root, level, &index) == 0)
+			goto insert;
+		if (level == root->height - 1) {
+			if (!btree_increase_height(root))
+				return ENOMEM;
+		}
+		node = btree_split(root, level, key, &index);
+		if (!node)
+			return ENOMEM;
+	}
+insert:
+	ASSERT(index <= node->num_keys);
+
+	i = node->num_keys;
+	node->ptrs[i + 1] = node->ptrs[i];
+	while (--i >= index) {
+		node->keys[i + 1] = node->keys[i];
+		node->ptrs[i + 1] = node->ptrs[i];
+	}
+
+	node->num_keys++;
+	node->keys[index] = key;
+
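+	/*
+	 * Leaf values sit at their key's index; interior child
+	 * pointers go to the right of their key.
+	 */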
+	if (level == 0)
+		node->ptrs[index] = value;
+	else
+		node->ptrs[index + 1] = value;
+
+	return 0;
+}
+
+
+
+int
+btree_insert(
+	struct btree_root	*root,
+	unsigned long		key,
+	void			*value)
+{
+	int			result;
+
+	if (!value)
+		return EINVAL;
+
+	if (btree_search(root, key) && root->cur_key == key)
+		return EEXIST;
+
+#ifdef BTREE_STATS
+	root->stats.insert += 1;
+	root->stats.num_items += 1;
+#endif
+
+	result = btree_insert_item(root, 0, key, value);
+
+	btree_invalidate_cursor(root);
+
+	return result;
+}
+
+
+/*
+ * Deletion functions
+ *
+ * Rather more complicated, as a deletion has 4 ways to go once a node
+ * ends up with less than the minimum number of keys:
+ *   - move remainder to previous node
+ *   - move remainder to next node
+ *       (both will involve a parent deletion which may recurse)
+ *   - balance by moving some items from previous node
+ *   - balance by moving some items from next node
+ */
+
+static void
+btree_decrease_height(
+	struct btree_root	*root)
+{
+	struct btree_node	*old_root = root->root_node;
+
+	ASSERT(old_root->num_keys == 0);
+
+#ifdef BTREE_STATS
+	root->stats.alloced -= 1;
+	root->stats.dec_height += 1;
+	root->stats.max_items /= BTREE_PTR_MAX;
+#endif
+	root->root_node = old_root->ptrs[0];
+	btree_node_free(old_root);
+	root->height--;
+}
+
+static int
+btree_merge_with_prev(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*prev_cursor)
+{
+	if (!prev_cursor)
+		return 0;
+
+	if (!btree_shift_to_prev(root, level, prev_cursor,
+					root->cursor[level].node->num_keys + 1))
+		return 0;
+
+#ifdef BTREE_STATS
+	root->stats.merge_prev += 1;
+#endif
+	return 1;
+}
+
+static int
+btree_merge_with_next(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*next_cursor)
+{
+	if (!next_cursor)
+		return 0;
+
+	if (!btree_shift_to_next(root, level, next_cursor,
+					root->cursor[level].node->num_keys + 1))
+		return 0;
+
+#ifdef BTREE_STATS
+	root->stats.merge_next += 1;
+#endif
+	return 1;
+}
+
+static int
+btree_balance_with_prev(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*prev_cursor)
+{
+	struct btree_cursor	*root_cursor = root->cursor;
+
+	if (!prev_cursor)
+		return 0;
+	ASSERT(prev_cursor[level].node->num_keys > BTREE_KEY_MIN);
+
+#ifdef BTREE_STATS
+	root->stats.balance_prev += 1;
+#endif
+	/*
+	 * Move some nodes from the prev node into the current node.
+	 * As the shift operation is a right shift and is relative to
+	 * the root cursor, make the root cursor the prev cursor and
+	 * pass in the root cursor as the next cursor.
+	 */
+
+	root->cursor = prev_cursor;
+	if (!btree_shift_to_next(root, level, root_cursor,
+		(prev_cursor[level].node->num_keys + 1 - BTREE_KEY_MIN) / 2))
+			abort();
+	root->cursor = root_cursor;
+
+	return 1;
+}
+
+static int
+btree_balance_with_next(
+	struct btree_root	*root,
+	int			level,
+	struct btree_cursor	*next_cursor)
+{
+	struct btree_cursor	*root_cursor = root->cursor;
+
+	if (!next_cursor)
+		return 0;
+	ASSERT(next_cursor[level].node->num_keys > BTREE_KEY_MIN);
+
+#ifdef BTREE_STATS
+	root->stats.balance_next += 1;
+#endif
+	/*
+	 * Move some nodes from the next node into the current node.
+	 * As the shift operation is a left shift and is relative to
+	 * the root cursor, make the root cursor the next cursor and
+	 * pass in the root cursor as the prev cursor.
+	 */
+
+	root->cursor = next_cursor;
+	if (!btree_shift_to_prev(root, level, root_cursor,
+		(next_cursor[level].node->num_keys + 1 - BTREE_KEY_MIN) / 2))
+			abort();
+	root->cursor = root_cursor;
+
+	return 1;
+
+}
+
+static void
+btree_delete_key(
+	struct btree_root	*root,
+	int			level);
+
+/*
+ * btree_delete_node:
+ *
+ * Merge or rebalance the underflowed node at the given level with a
+ * sibling, recursing up a level when a merge removes a key from the
+ * parent.
+ */
+static void
+btree_delete_node(
+	struct btree_root	*root,
+	int			level)
+{
+	struct btree_cursor	prev_cursor[root->height];
+	struct btree_cursor	next_cursor[root->height];
+	struct btree_cursor	*pc;
+	struct btree_cursor	*nc;
+
+	/*
+	 * the node has underflowed, grab or merge keys/items from a
+	 * neighbouring node.
+	 */
+
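+	/* the root has no siblings; when it empties, drop a tree level */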
+	if (level == root->height - 1) {
+		if (level > 0 && root->root_node->num_keys == 0)
+			btree_decrease_height(root);
+		return;
+	}
+
+	pc = btree_copy_cursor_prev(root, prev_cursor, level + 1);
+	if (!btree_merge_with_prev(root, level, pc)) {
+		nc = btree_copy_cursor_next(root, next_cursor, level + 1);
+		if (!btree_merge_with_next(root, level, nc)) {
+			/* merging failed, try redistribution */
+			if (!btree_balance_with_prev(root, level, pc) &&
+			    !btree_balance_with_next(root, level, nc))
+				abort();
+			return;	/* when balancing, the node isn't freed */
+		}
+	}
+
+#ifdef BTREE_STATS
+	root->stats.alloced -= 1;
+#endif
+	btree_node_free(root->cursor[level].node);
+
+	btree_delete_key(root, level + 1);
+}
+
+static void
+btree_delete_key(
+	struct btree_root	*root,
+	int			level)
+{
+	struct btree_node	*node = root->cursor[level].node;
+	int			index = root->cursor[level].index;
+
+	node->num_keys--;
+	if (index <= node->num_keys) {
+		/*
+		 * if not deleting the last item, shift higher items down
+		 * to cover the item being deleted
+		 */
+		while (index < node->num_keys) {
+			node->keys[index] = node->keys[index + 1];
+			node->ptrs[index] = node->ptrs[index + 1];
+			index++;
+		}
+		node->ptrs[index] = node->ptrs[index + 1];
+	} else {
+		/*
+		 * else update the associated parent key as the last key
+		 * in the leaf has changed
+		 */
+		btree_update_node_key(root, root->cursor, level + 1,
+						node->keys[node->num_keys]);
+	}
+	/*
+	 * if node underflows, either merge with sibling or rebalance
+	 * with sibling.
+	 */
+	if (node->num_keys < BTREE_KEY_MIN)
+		btree_delete_node(root, level);
+}
+
+void *
+btree_delete(
+	struct btree_root	*root,
+	unsigned long		key)
+{
+	void			*value;
+
+	value = btree_lookup(root, key);
+	if (!value)
+		return NULL;
+
+#ifdef BTREE_STATS
+	root->stats.delete += 1;
+	root->stats.num_items -= 1;
+#endif
+
+	btree_delete_key(root, 0);
+
+	btree_invalidate_cursor(root);
+
+	return value;
+}
+
+#ifdef BTREE_STATS
+void
+btree_print_stats(
+	struct btree_root	*root,
+	FILE			*f)
+{
+	unsigned long		max_items = root->stats.max_items *
+						(root->root_node->num_keys + 1);
+
+	fprintf(f, "\tnum_items = %lu, max_items = %lu (%lu%%)\n",
+			root->stats.num_items, max_items,
+			root->stats.num_items * 100 / max_items);
+	fprintf(f, "\talloced = %d nodes, %lu bytes, %lu bytes per item\n",
+			root->stats.alloced,
+			root->stats.alloced * sizeof(struct btree_node),
+			root->stats.alloced * sizeof(struct btree_node) /
+							root->stats.num_items);
+	fprintf(f, "\tlookup = %d\n", root->stats.lookup);
+	fprintf(f, "\tfind = %d\n", root->stats.find);
+	fprintf(f, "\tcache_hits = %d\n", root->stats.cache_hits);
+	fprintf(f, "\tcache_misses = %d\n", root->stats.cache_misses);
+	fprintf(f, "\tkey_update = %d\n", root->stats.key_update);
+	fprintf(f, "\tvalue_update = %d\n", root->stats.value_update);
+	fprintf(f, "\tinsert = %d\n", root->stats.insert);
+	fprintf(f, "\tshift_prev = %d\n", root->stats.shift_prev);
+	fprintf(f, "\tshift_next = %d\n", root->stats.shift_next);
+	fprintf(f, "\tsplit = %d\n", root->stats.split);
+	fprintf(f, "\tinc_height = %d\n", root->stats.inc_height);
+	fprintf(f, "\tdelete = %d\n", root->stats.delete);
+	fprintf(f, "\tmerge_prev = %d\n", root->stats.merge_prev);
+	fprintf(f, "\tmerge_next = %d\n", root->stats.merge_next);
+	fprintf(f, "\tbalance_prev = %d\n", root->stats.balance_prev);
+	fprintf(f, "\tbalance_next = %d\n", root->stats.balance_next);
+	fprintf(f, "\tdec_height = %d\n", root->stats.dec_height);
+}
+#endif
Index: xfsprogs-dev/repair/btree.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ xfsprogs-dev/repair/btree.h	2009-11-12 10:53:01.091025964 +0100
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef _BTREE_H
+#define _BTREE_H
+
+
+struct btree_root;
+
+void
+btree_init(
+	struct btree_root	**root);
+
+void
+btree_destroy(
+	struct btree_root	*root);
+
+int
+btree_is_empty(
+	struct btree_root	*root);
+
+void *
+btree_lookup(
+	struct btree_root	*root,
+	unsigned long		key);
+
+void *
+btree_find(
+	struct btree_root	*root,
+	unsigned long		key,
+	unsigned long		*actual_key);
+
+void *
+btree_peek_prev(
+	struct btree_root	*root,
+	unsigned long		*key);
+
+void *
+btree_peek_next(
+	struct btree_root	*root,
+	unsigned long		*key);
+
+void *
+btree_lookup_next(
+	struct btree_root	*root,
+	unsigned long		*key);
+
+void *
+btree_lookup_prev(
+	struct btree_root	*root,
+	unsigned long		*key);
+
+int
+btree_insert(
+	struct btree_root	*root,
+	unsigned long		key,
+	void			*value);
+
+void *
+btree_delete(
+	struct btree_root	*root,
+	unsigned long		key);
+
+int
+btree_update_key(
+	struct btree_root	*root,
+	unsigned long		old_key,
+	unsigned long		new_key);
+
+int
+btree_update_value(
+	struct btree_root	*root,
+	unsigned long		key,
+	void 			*new_value);
+
+void
+btree_clear(
+	struct btree_root	*root);
+
+#ifdef BTREE_STATS
+void
+btree_print_stats(
+	struct btree_root	*root,
+	FILE			*f);
+#endif
+
+#endif /* _BTREE_H */
Index: xfsprogs-dev/repair/init.c
===================================================================
--- xfsprogs-dev.orig/repair/init.c	2009-11-12 10:49:49.051263879 +0100
+++ xfsprogs-dev/repair/init.c	2009-11-12 10:53:01.092013310 +0100
@@ -26,7 +26,6 @@
 #include "dir.h"
 #include "incore.h"
 #include "prefetch.h"
-#include "radix-tree.h"
 #include <sys/resource.h>
 
 static pthread_key_t dirbuf_key;
@@ -151,5 +150,4 @@ xfs_init(libxfs_init_t *args)
 	ts_create();
 	ts_init();
 	increase_rlimit();
-	radix_tree_init();
 }
Index: xfsprogs-dev/repair/prefetch.c
===================================================================
--- xfsprogs-dev.orig/repair/prefetch.c	2009-11-12 10:49:49.056254319 +0100
+++ xfsprogs-dev/repair/prefetch.c	2009-11-12 11:01:42.533256209 +0100
@@ -1,6 +1,7 @@
 #include <libxfs.h>
 #include <pthread.h>
 #include "avl.h"
+#include "btree.h"
 #include "globals.h"
 #include "agheader.h"
 #include "incore.h"
@@ -14,7 +15,6 @@
 #include "threads.h"
 #include "prefetch.h"
 #include "progress.h"
-#include "radix-tree.h"
 
 int do_prefetch = 1;
 
@@ -129,10 +129,8 @@ pf_queue_io(
 	pthread_mutex_lock(&args->lock);
 
 	if (fsbno > args->last_bno_read) {
-		radix_tree_insert(&args->primary_io_queue, fsbno, bp);
-		if (!B_IS_INODE(flag))
-			radix_tree_tag_set(&args->primary_io_queue, fsbno, 0);
-		else {
+		btree_insert(args->primary_io_queue, fsbno, bp);
+		if (B_IS_INODE(flag)) {
 			args->inode_bufs_queued++;
 			if (args->inode_bufs_queued == IO_THRESHOLD)
 				pf_start_io_workers(args);
@@ -154,7 +152,7 @@ pf_queue_io(
 #endif
 		ASSERT(!B_IS_INODE(flag));
 		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
-		radix_tree_insert(&args->secondary_io_queue, fsbno, bp);
+		btree_insert(args->secondary_io_queue, fsbno, bp);
 	}
 
 	pf_start_processing(args);
@@ -407,7 +405,7 @@ pf_batch_read(
 	pf_which_t		which,
 	void			*buf)
 {
-	struct radix_tree_root	*queue;
+	struct btree_root	*queue;
 	xfs_buf_t		*bplist[MAX_BUFS];
 	unsigned int		num;
 	off64_t			first_off, last_off, next_off;
@@ -415,27 +413,25 @@ pf_batch_read(
 	int			i;
 	int			inode_bufs;
 	unsigned long		fsbno;
+	unsigned long		max_fsbno;
 	char			*pbuf;
 
-	queue = (which != PF_SECONDARY) ? &args->primary_io_queue
-				: &args->secondary_io_queue;
+	queue = (which != PF_SECONDARY) ? args->primary_io_queue
+				: args->secondary_io_queue;
 
-	while (radix_tree_lookup_first(queue, &fsbno) != NULL) {
-
-		if (which != PF_META_ONLY) {
-			num = radix_tree_gang_lookup_ex(queue,
-					(void**)&bplist[0], fsbno,
-					fsbno + pf_max_fsbs, MAX_BUFS);
-			ASSERT(num > 0);
-			ASSERT(XFS_FSB_TO_DADDR(mp, fsbno) ==
-				XFS_BUF_ADDR(bplist[0]));
-		} else {
-			num = radix_tree_gang_lookup_tag(queue,
-					(void**)&bplist[0], fsbno,
-					MAX_BUFS / 4, 0);
-			if (num == 0)
-				return;
+	while (btree_find(queue, 0, &fsbno) != NULL) {
+		max_fsbno = fsbno + pf_max_fsbs;
+		num = 0;
+
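+		/*
+		 * Gather up to MAX_BUFS queued buffers that fall within
+		 * pf_max_fsbs blocks of the first one found.
+		 */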
+		bplist[0] = btree_lookup(queue, fsbno);
+		while (num < MAX_BUFS && bplist[num] && fsbno < max_fsbno) {
+			if (which != PF_META_ONLY ||
+			    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
+				num++;
+			if (num == MAX_BUFS)
+				break;
+			bplist[num] = btree_lookup_next(queue, &fsbno);
+		}
+		if (!num)
+			return;
 
 		/*
 		 * do a big read if 25% of the potential buffer is useful,
@@ -467,7 +463,7 @@ pf_batch_read(
 		}
 
 		for (i = 0; i < num; i++) {
-			if (radix_tree_delete(queue, XFS_DADDR_TO_FSB(mp,
+			if (btree_delete(queue, XFS_DADDR_TO_FSB(mp,
 					XFS_BUF_ADDR(bplist[i]))) == NULL)
 				do_error(_("prefetch corruption\n"));
 		}
@@ -570,8 +566,7 @@ pf_io_worker(
 		return NULL;
 
 	pthread_mutex_lock(&args->lock);
-	while (!args->queuing_done || args->primary_io_queue.height) {
-
+	while (!args->queuing_done || !btree_is_empty(args->primary_io_queue)) {
 #ifdef XR_PF_TRACE
 		pftrace("waiting to start prefetch I/O for AG %d", args->agno);
 #endif
@@ -696,8 +691,8 @@ pf_queuing_worker(
 #endif
 	pthread_mutex_lock(&args->lock);
 
-	ASSERT(args->primary_io_queue.height == 0);
-	ASSERT(args->secondary_io_queue.height == 0);
+	ASSERT(btree_is_empty(args->primary_io_queue));
+	ASSERT(btree_is_empty(args->secondary_io_queue));
 
 	args->prefetch_done = 1;
 	if (args->next_args)
@@ -755,8 +750,8 @@ start_inode_prefetch(
 
 	args = calloc(1, sizeof(prefetch_args_t));
 
-	INIT_RADIX_TREE(&args->primary_io_queue, 0);
-	INIT_RADIX_TREE(&args->secondary_io_queue, 0);
+	btree_init(&args->primary_io_queue);
+	btree_init(&args->secondary_io_queue);
 	if (pthread_mutex_init(&args->lock, NULL) != 0)
 		do_error(_("failed to initialize prefetch mutex\n"));
 	if (pthread_cond_init(&args->start_reading, NULL) != 0)
@@ -835,6 +830,8 @@ cleanup_inode_prefetch(
 	pthread_cond_destroy(&args->start_reading);
 	pthread_cond_destroy(&args->start_processing);
 	sem_destroy(&args->ra_count);
+	btree_destroy(args->primary_io_queue);
+	btree_destroy(args->secondary_io_queue);
 
 	free(args);
 }
Index: xfsprogs-dev/repair/prefetch.h
===================================================================
--- xfsprogs-dev.orig/repair/prefetch.h	2009-11-12 10:49:49.062253827 +0100
+++ xfsprogs-dev/repair/prefetch.h	2009-11-12 10:53:01.098026017 +0100
@@ -3,7 +3,6 @@
 
 #include <semaphore.h>
 #include "incore.h"
-#include "radix-tree.h"
 
 
 extern int 	do_prefetch;
@@ -14,8 +13,8 @@ typedef struct prefetch_args {
 	pthread_mutex_t		lock;
 	pthread_t		queuing_thread;
 	pthread_t		io_threads[PF_THREAD_COUNT];
-	struct radix_tree_root	primary_io_queue;
-	struct radix_tree_root	secondary_io_queue;
+	struct btree_root	*primary_io_queue;
+	struct btree_root	*secondary_io_queue;
 	pthread_cond_t		start_reading;
 	pthread_cond_t		start_processing;
 	int			agno;
Index: xfsprogs-dev/repair/radix-tree.c
===================================================================
--- xfsprogs-dev.orig/repair/radix-tree.c	2009-11-12 10:49:49.070276497 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,805 +0,0 @@
-/*
- * Copyright (C) 2001 Momchil Velikov
- * Portions Copyright (C) 2001 Christoph Hellwig
- * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <libxfs.h>
-#include "radix-tree.h"
-
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
-#define RADIX_TREE_MAP_SHIFT	6
-#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
-#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
-
-#ifdef RADIX_TREE_TAGS
-#define RADIX_TREE_TAG_LONGS	\
-	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
-#endif
-
-struct radix_tree_node {
-	unsigned int	count;
-	void		*slots[RADIX_TREE_MAP_SIZE];
-#ifdef RADIX_TREE_TAGS
-	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
-#endif
-};
-
-struct radix_tree_path {
-	struct radix_tree_node *node;
-	int offset;
-};
-
-#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
-
-static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH];
-
-/*
- * Radix tree node cache.
- */
-
-#define radix_tree_node_alloc(r) 	((struct radix_tree_node *) \
-		calloc(1, sizeof(struct radix_tree_node)))
-#define radix_tree_node_free(n) 	free(n)
-
-#ifdef RADIX_TREE_TAGS
-
-static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	*((__uint32_t *)node->tags[tag] + (offset >> 5)) |= (1 << (offset & 31));
-}
-
-static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	__uint32_t 	*p = (__uint32_t*)node->tags[tag] + (offset >> 5);
-	__uint32_t 	m = 1 << (offset & 31);
-	*p &= ~m;
-}
-
-static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	return 1 & (((const __uint32_t *)node->tags[tag])[offset >> 5] >> (offset & 31));
-}
-
-/*
- * Returns 1 if any slot in the node has this tag set.
- * Otherwise returns 0.
- */
-static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
-{
-	int idx;
-	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
-		if (node->tags[tag][idx])
-			return 1;
-	}
-	return 0;
-}
-
-#endif
-
-/*
- *	Return the maximum key which can be store into a
- *	radix tree with height HEIGHT.
- */
-static inline unsigned long radix_tree_maxindex(unsigned int height)
-{
-	return height_to_maxindex[height];
-}
-
-/*
- *	Extend a radix tree so it can store key @index.
- */
-static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
-{
-	struct radix_tree_node *node;
-	unsigned int height;
-#ifdef RADIX_TREE_TAGS
-	char tags[RADIX_TREE_MAX_TAGS];
-	int tag;
-#endif
-
-	/* Figure out what the height should be.  */
-	height = root->height + 1;
-	while (index > radix_tree_maxindex(height))
-		height++;
-
-	if (root->rnode == NULL) {
-		root->height = height;
-		goto out;
-	}
-
-#ifdef RADIX_TREE_TAGS
-	/*
-	 * Prepare the tag status of the top-level node for propagation
-	 * into the newly-pushed top-level node(s)
-	 */
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		tags[tag] = 0;
-		if (any_tag_set(root->rnode, tag))
-			tags[tag] = 1;
-	}
-#endif
-	do {
-		if (!(node = radix_tree_node_alloc(root)))
-			return -ENOMEM;
-
-		/* Increase the height.  */
-		node->slots[0] = root->rnode;
-
-#ifdef RADIX_TREE_TAGS
-		/* Propagate the aggregated tag info into the new root */
-		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-			if (tags[tag])
-				tag_set(node, tag, 0);
-		}
-#endif
-		node->count = 1;
-		root->rnode = node;
-		root->height++;
-	} while (height > root->height);
-out:
-	return 0;
-}
-
-/**
- *	radix_tree_insert    -    insert into a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *	@item:		item to insert
- *
- *	Insert an item into the radix tree at position @index.
- */
-int radix_tree_insert(struct radix_tree_root *root,
-			unsigned long index, void *item)
-{
-	struct radix_tree_node *node = NULL, *slot;
-	unsigned int height, shift;
-	int offset;
-	int error;
-
-	/* Make sure the tree is high enough.  */
-	if ((!index && !root->rnode) ||
-			index > radix_tree_maxindex(root->height)) {
-		error = radix_tree_extend(root, index);
-		if (error)
-			return error;
-	}
-
-	slot = root->rnode;
-	height = root->height;
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-
-	offset = 0;			/* uninitialised var warning */
-	do {
-		if (slot == NULL) {
-			/* Have to add a child node.  */
-			if (!(slot = radix_tree_node_alloc(root)))
-				return -ENOMEM;
-			if (node) {
-				node->slots[offset] = slot;
-				node->count++;
-			} else
-				root->rnode = slot;
-		}
-
-		/* Go a level down */
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		node = slot;
-		slot = node->slots[offset];
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	} while (height > 0);
-
-	if (slot != NULL)
-		return -EEXIST;
-
-	ASSERT(node);
-	node->count++;
-	node->slots[offset] = item;
-#ifdef RADIX_TREE_TAGS
-	ASSERT(!tag_get(node, 0, offset));
-	ASSERT(!tag_get(node, 1, offset));
-#endif
-	return 0;
-}
-
-static inline void **__lookup_slot(struct radix_tree_root *root,
-				   unsigned long index)
-{
-	unsigned int height, shift;
-	struct radix_tree_node **slot;
-
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		return NULL;
-
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-	slot = &root->rnode;
-
-	while (height > 0) {
-		if (*slot == NULL)
-			return NULL;
-
-		slot = (struct radix_tree_node **)
-			((*slot)->slots +
-				((index >> shift) & RADIX_TREE_MAP_MASK));
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	}
-
-	return (void **)slot;
-}
-
-/**
- *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *
- *	Lookup the slot corresponding to the position @index in the radix tree
- *	@root. This is useful for update-if-exists operations.
- */
-void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
-{
-	return __lookup_slot(root, index);
-}
-
-/**
- *	radix_tree_lookup    -    perform lookup operation on a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *
- *	Lookup the item at the position @index in the radix tree @root.
- */
-void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
-{
-	void **slot;
-
-	slot = __lookup_slot(root, index);
-	return slot != NULL ? *slot : NULL;
-}
-
-/**
- *	raid_tree_first_key - find the first index key in the radix tree
- *	@root:		radix tree root
- *	@index:		where the first index will be placed
- *
- *	Returns the first entry and index key in the radix tree @root.
- */
-void *radix_tree_lookup_first(struct radix_tree_root *root, unsigned long *index)
-{
-	unsigned int height, shift;
-	struct radix_tree_node *slot;
-	unsigned long i;
-
-	height = root->height;
-	*index = 0;
-	if (height == 0)
-		return NULL;
-
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	for (; height > 1; height--) {
-		for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
-			if (slot->slots[i] != NULL)
-				break;
-		}
-		ASSERT(i < RADIX_TREE_MAP_SIZE);
-
-		*index |= (i << shift);
-		shift -= RADIX_TREE_MAP_SHIFT;
-		slot = slot->slots[i];
-	}
-	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
-		if (slot->slots[i] != NULL) {
-			*index |= i;
-			return slot->slots[i];
-		}
-	}
-	return NULL;
-}
-
-#ifdef RADIX_TREE_TAGS
-
-/**
- *	radix_tree_tag_set - set a tag on a radix tree node
- *	@root:		radix tree root
- *	@index:		index key
- *	@tag: 		tag index
- *
- *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
- *	corresponding to @index in the radix tree.  From
- *	the root all the way down to the leaf node.
- *
- *	Returns the address of the tagged item.   Setting a tag on a not-present
- *	item is a bug.
- */
-void *radix_tree_tag_set(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag)
-{
-	unsigned int height, shift;
-	struct radix_tree_node *slot;
-
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		return NULL;
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	while (height > 0) {
-		int offset;
-
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		if (!tag_get(slot, tag, offset))
-			tag_set(slot, tag, offset);
-		slot = slot->slots[offset];
-		ASSERT(slot != NULL);
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	}
-
-	return slot;
-}
-
-/**
- *	radix_tree_tag_clear - clear a tag on a radix tree node
- *	@root:		radix tree root
- *	@index:		index key
- *	@tag: 		tag index
- *
- *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
- *	corresponding to @index in the radix tree.  If
- *	this causes the leaf node to have no tags set then clear the tag in the
- *	next-to-leaf node, etc.
- *
- *	Returns the address of the tagged item on success, else NULL.  ie:
- *	has the same return value and semantics as radix_tree_lookup().
- */
-void *radix_tree_tag_clear(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag)
-{
-	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
-	struct radix_tree_node *slot;
-	unsigned int height, shift;
-	void *ret = NULL;
-
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		goto out;
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
-	slot = root->rnode;
-
-	while (height > 0) {
-		int offset;
-
-		if (slot == NULL)
-			goto out;
-
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		pathp[1].offset = offset;
-		pathp[1].node = slot;
-		slot = slot->slots[offset];
-		pathp++;
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	}
-
-	ret = slot;
-	if (ret == NULL)
-		goto out;
-
-	do {
-		if (!tag_get(pathp->node, tag, pathp->offset))
-			goto out;
-		tag_clear(pathp->node, tag, pathp->offset);
-		if (any_tag_set(pathp->node, tag))
-			goto out;
-		pathp--;
-	} while (pathp->node);
-out:
-	return ret;
-}
-
-#endif
-
-static unsigned int
-__lookup(struct radix_tree_root *root, void **results, unsigned long index,
-	unsigned int max_items, unsigned long *next_index)
-{
-	unsigned int nr_found = 0;
-	unsigned int shift, height;
-	struct radix_tree_node *slot;
-	unsigned long i;
-
-	height = root->height;
-	if (height == 0)
-		goto out;
-
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	for ( ; height > 1; height--) {
-
-		for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
-				i < RADIX_TREE_MAP_SIZE; i++) {
-			if (slot->slots[i] != NULL)
-				break;
-			index &= ~((1UL << shift) - 1);
-			index += 1UL << shift;
-			if (index == 0)
-				goto out;	/* 32-bit wraparound */
-		}
-		if (i == RADIX_TREE_MAP_SIZE)
-			goto out;
-
-		shift -= RADIX_TREE_MAP_SHIFT;
-		slot = slot->slots[i];
-	}
-
-	/* Bottom level: grab some items */
-	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
-		index++;
-		if (slot->slots[i]) {
-			results[nr_found++] = slot->slots[i];
-			if (nr_found == max_items)
-				goto out;
-		}
-	}
-out:
-	*next_index = index;
-	return nr_found;
-}
-
-/**
- *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
- *	@root:		radix tree root
- *	@results:	where the results of the lookup are placed
- *	@first_index:	start the lookup from this key
- *	@max_items:	place up to this many items at *results
- *
- *	Performs an index-ascending scan of the tree for present items.  Places
- *	them at *@results and returns the number of items which were placed at
- *	*@results.
- *
- *	The implementation is naive.
- */
-unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned int max_items)
-{
-	const unsigned long max_index = radix_tree_maxindex(root->height);
-	unsigned long cur_index = first_index;
-	unsigned int ret = 0;
-
-	while (ret < max_items) {
-		unsigned int nr_found;
-		unsigned long next_index;	/* Index of next search */
-
-		if (cur_index > max_index)
-			break;
-		nr_found = __lookup(root, results + ret, cur_index,
-					max_items - ret, &next_index);
-		ret += nr_found;
-		if (next_index == 0)
-			break;
-		cur_index = next_index;
-	}
-	return ret;
-}
-
-/**
- *	radix_tree_gang_lookup_ex - perform multiple lookup on a radix tree
- *	@root:		radix tree root
- *	@results:	where the results of the lookup are placed
- *	@first_index:	start the lookup from this key
- *	@last_index:	don't lookup past this key
- *	@max_items:	place up to this many items at *results
- *
- *	Performs an index-ascending scan of the tree for present items starting
- *	@first_index until @last_index up to as many as @max_items.  Places
- *	them at *@results and returns the number of items which were placed
- *	at *@results.
- *
- *	The implementation is naive.
- */
-unsigned int
-radix_tree_gang_lookup_ex(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned long last_index,
-			unsigned int max_items)
-{
-	const unsigned long max_index = radix_tree_maxindex(root->height);
-	unsigned long cur_index = first_index;
-	unsigned int ret = 0;
-
-	while (ret < max_items && cur_index < last_index) {
-		unsigned int nr_found;
-		unsigned long next_index;	/* Index of next search */
-
-		if (cur_index > max_index)
-			break;
-		nr_found = __lookup(root, results + ret, cur_index,
-					max_items - ret, &next_index);
-		ret += nr_found;
-		if (next_index == 0)
-			break;
-		cur_index = next_index;
-	}
-	return ret;
-}
-
-#ifdef RADIX_TREE_TAGS
-
-static unsigned int
-__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
-	unsigned int max_items, unsigned long *next_index, unsigned int tag)
-{
-	unsigned int nr_found = 0;
-	unsigned int shift;
-	unsigned int height = root->height;
-	struct radix_tree_node *slot;
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	while (height > 0) {
-		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
-
-		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
-			if (tag_get(slot, tag, i)) {
-				ASSERT(slot->slots[i] != NULL);
-				break;
-			}
-			index &= ~((1UL << shift) - 1);
-			index += 1UL << shift;
-			if (index == 0)
-				goto out;	/* 32-bit wraparound */
-		}
-		if (i == RADIX_TREE_MAP_SIZE)
-			goto out;
-		height--;
-		if (height == 0) {	/* Bottom level: grab some items */
-			unsigned long j = index & RADIX_TREE_MAP_MASK;
-
-			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
-				index++;
-				if (tag_get(slot, tag, j)) {
-					ASSERT(slot->slots[j] != NULL);
-					results[nr_found++] = slot->slots[j];
-					if (nr_found == max_items)
-						goto out;
-				}
-			}
-		}
-		shift -= RADIX_TREE_MAP_SHIFT;
-		slot = slot->slots[i];
-	}
-out:
-	*next_index = index;
-	return nr_found;
-}
-
-/**
- *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
- *	                             based on a tag
- *	@root:		radix tree root
- *	@results:	where the results of the lookup are placed
- *	@first_index:	start the lookup from this key
- *	@max_items:	place up to this many items at *results
- *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
- *
- *	Performs an index-ascending scan of the tree for present items which
- *	have the tag indexed by @tag set.  Places the items at *@results and
- *	returns the number of items which were placed at *@results.
- */
-unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
-		unsigned long first_index, unsigned int max_items,
-		unsigned int tag)
-{
-	const unsigned long max_index = radix_tree_maxindex(root->height);
-	unsigned long cur_index = first_index;
-	unsigned int ret = 0;
-
-	while (ret < max_items) {
-		unsigned int nr_found;
-		unsigned long next_index;	/* Index of next search */
-
-		if (cur_index > max_index)
-			break;
-		nr_found = __lookup_tag(root, results + ret, cur_index,
-					max_items - ret, &next_index, tag);
-		ret += nr_found;
-		if (next_index == 0)
-			break;
-		cur_index = next_index;
-	}
-	return ret;
-}
-
-#endif
-
-/**
- *	radix_tree_shrink    -    shrink height of a radix tree to minimal
- *	@root		radix tree root
- */
-static inline void radix_tree_shrink(struct radix_tree_root *root)
-{
-	/* try to shrink tree height */
-	while (root->height > 1 &&
-			root->rnode->count == 1 &&
-			root->rnode->slots[0]) {
-		struct radix_tree_node *to_free = root->rnode;
-
-		root->rnode = to_free->slots[0];
-		root->height--;
-		/* must only free zeroed nodes into the slab */
-#ifdef RADIX_TREE_TAGS
-		tag_clear(to_free, 0, 0);
-		tag_clear(to_free, 1, 0);
-#endif
-		to_free->slots[0] = NULL;
-		to_free->count = 0;
-		radix_tree_node_free(to_free);
-	}
-}
-
-/**
- *	radix_tree_delete    -    delete an item from a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *
- *	Remove the item at @index from the radix tree rooted at @root.
- *
- *	Returns the address of the deleted item, or NULL if it was not present.
- */
-void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
-{
-	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
-	struct radix_tree_path *orig_pathp;
-	struct radix_tree_node *slot;
-	unsigned int height, shift;
-	void *ret = NULL;
-#ifdef RADIX_TREE_TAGS
-	char tags[RADIX_TREE_MAX_TAGS];
-	int nr_cleared_tags;
-	int tag;
-#endif
-	int offset;
-
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		goto out;
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
-	slot = root->rnode;
-
-	for ( ; height > 0; height--) {
-		if (slot == NULL)
-			goto out;
-
-		pathp++;
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		pathp->offset = offset;
-		pathp->node = slot;
-		slot = slot->slots[offset];
-		shift -= RADIX_TREE_MAP_SHIFT;
-	}
-
-	ret = slot;
-	if (ret == NULL)
-		goto out;
-
-	orig_pathp = pathp;
-
-#ifdef RADIX_TREE_TAGS
-	/*
-	 * Clear all tags associated with the just-deleted item
-	 */
-	nr_cleared_tags = 0;
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		tags[tag] = 1;
-		if (tag_get(pathp->node, tag, pathp->offset)) {
-			tag_clear(pathp->node, tag, pathp->offset);
-			if (!any_tag_set(pathp->node, tag)) {
-				tags[tag] = 0;
-				nr_cleared_tags++;
-			}
-		}
-	}
-
-	for (pathp--; nr_cleared_tags && pathp->node; pathp--) {
-		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-			if (tags[tag])
-				continue;
-
-			tag_clear(pathp->node, tag, pathp->offset);
-			if (any_tag_set(pathp->node, tag)) {
-				tags[tag] = 1;
-				nr_cleared_tags--;
-			}
-		}
-	}
-#endif
-	/* Now free the nodes we do not need anymore */
-	for (pathp = orig_pathp; pathp->node; pathp--) {
-		pathp->node->slots[pathp->offset] = NULL;
-		pathp->node->count--;
-
-		if (pathp->node->count) {
-			if (pathp->node == root->rnode)
-				radix_tree_shrink(root);
-			goto out;
-		}
-
-		/* Node with zero slots in use so free it */
-		radix_tree_node_free(pathp->node);
-	}
-	root->rnode = NULL;
-	root->height = 0;
-out:
-	return ret;
-}
-
-#ifdef RADIX_TREE_TAGS
-/**
- *	radix_tree_tagged - test whether any items in the tree are tagged
- *	@root:		radix tree root
- *	@tag:		tag to test
- */
-int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
-{
-  	struct radix_tree_node *rnode;
-  	rnode = root->rnode;
-  	if (!rnode)
-  		return 0;
-	return any_tag_set(rnode, tag);
-}
-#endif
-
-static unsigned long __maxindex(unsigned int height)
-{
-	unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
-	unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
-
-	if (tmp >= RADIX_TREE_INDEX_BITS)
-		index = ~0UL;
-	return index;
-}
-
-static void radix_tree_init_maxindex(void)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
-		height_to_maxindex[i] = __maxindex(i);
-}
-
-void radix_tree_init(void)
-{
-	radix_tree_init_maxindex();
-}
Index: xfsprogs-dev/repair/radix-tree.h
===================================================================
--- xfsprogs-dev.orig/repair/radix-tree.h	2009-11-12 10:49:49.077254201 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,76 +0,0 @@
-/*
- * Copyright (C) 2001 Momchil Velikov
- * Portions Copyright (C) 2001 Christoph Hellwig
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#ifndef __XFS_SUPPORT_RADIX_TREE_H__
-#define __XFS_SUPPORT_RADIX_TREE_H__
-
-#define RADIX_TREE_TAGS
-
-struct radix_tree_root {
-	unsigned int		height;
-	struct radix_tree_node	*rnode;
-};
-
-#define RADIX_TREE_INIT(mask)	{					\
-	.height = 0,							\
-	.rnode = NULL,							\
-}
-
-#define RADIX_TREE(name, mask) \
-	struct radix_tree_root name = RADIX_TREE_INIT(mask)
-
-#define INIT_RADIX_TREE(root, mask)					\
-do {									\
-	(root)->height = 0;						\
-	(root)->rnode = NULL;						\
-} while (0)
-
-#ifdef RADIX_TREE_TAGS
-#define RADIX_TREE_MAX_TAGS 2
-#endif
-
-int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
-void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
-void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
-void *radix_tree_lookup_first(struct radix_tree_root *, unsigned long *);
-void *radix_tree_delete(struct radix_tree_root *, unsigned long);
-unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned int max_items);
-unsigned int
-radix_tree_gang_lookup_ex(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned long last_index,
-			unsigned int max_items);
-
-void radix_tree_init(void);
-
-#ifdef RADIX_TREE_TAGS
-void *radix_tree_tag_set(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag);
-void *radix_tree_tag_clear(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag);
-int radix_tree_tag_get(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag);
-unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned int max_items,
-			unsigned int tag);
-int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
-#endif
-
-#endif /* __XFS_SUPPORT_RADIX_TREE_H__ */

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 07/14] repair: use single prefetch queue
  2009-10-21 17:48   ` Alex Elder
@ 2009-11-12 10:09     ` Christoph Hellwig
  0 siblings, 0 replies; 50+ messages in thread
From: Christoph Hellwig @ 2009-11-12 10:09 UTC (permalink / raw)
  To: Alex Elder; +Cc: xfs

On Wed, Oct 21, 2009 at 12:48:35PM -0500, Alex Elder wrote:
> The following hunk doesn't really do anything but change whitespace.
> It'd be nice if those changes (when there's a bunch like this) were
> limited to a separate no-op patch.

Agreed.  There were lots of hunks like that in Barry's original patches
and I thought I caught them all, but this one was left over.

Updated patch below:

--

Subject: repair: use single prefetch queue
From: Barry Naujok <bnaujok@sgi.com>

We don't need two prefetch queues as we guarantee execution in order anyway.


Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>

Index: xfsprogs-dev/repair/prefetch.c
===================================================================
--- xfsprogs-dev.orig/repair/prefetch.c	2009-11-12 11:01:42.533256209 +0100
+++ xfsprogs-dev/repair/prefetch.c	2009-11-12 11:07:32.068006860 +0100
@@ -128,8 +128,9 @@ pf_queue_io(
 
 	pthread_mutex_lock(&args->lock);
 
+	btree_insert(args->io_queue, fsbno, bp);
+
 	if (fsbno > args->last_bno_read) {
-		btree_insert(args->primary_io_queue, fsbno, bp);
 		if (B_IS_INODE(flag)) {
 			args->inode_bufs_queued++;
 			if (args->inode_bufs_queued == IO_THRESHOLD)
@@ -152,7 +153,6 @@ pf_queue_io(
 #endif
 		ASSERT(!B_IS_INODE(flag));
 		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
-		btree_insert(args->secondary_io_queue, fsbno, bp);
 	}
 
 	pf_start_processing(args);
@@ -405,7 +405,6 @@ pf_batch_read(
 	pf_which_t		which,
 	void			*buf)
 {
-	struct btree_root	*queue;
 	xfs_buf_t		*bplist[MAX_BUFS];
 	unsigned int		num;
 	off64_t			first_off, last_off, next_off;
@@ -416,19 +415,22 @@ pf_batch_read(
 	unsigned long		max_fsbno;
 	char			*pbuf;
 
-	queue = (which != PF_SECONDARY) ? args->primary_io_queue
-				: args->secondary_io_queue;
-
-	while (btree_find(queue, 0, &fsbno) != NULL) {
-		max_fsbno = fsbno + pf_max_fsbs;
+	for (;;) {
 		num = 0;
-
-		bplist[0] = btree_lookup(queue, fsbno);
+		if (which == PF_SECONDARY) {
+			bplist[0] = btree_find(args->io_queue, 0, &fsbno);
+			max_fsbno = MIN(fsbno + pf_max_fsbs,
+							args->last_bno_read);
+		} else {
+			bplist[0] = btree_find(args->io_queue,
+						args->last_bno_read, &fsbno);
+			max_fsbno = fsbno + pf_max_fsbs;
+		}
 		while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
 			if (which != PF_META_ONLY ||
 			    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
 				num++;
-			bplist[num] = btree_lookup_next(queue, &fsbno);
+			bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
 		}
 		if (!num)
 			return;
@@ -463,7 +465,7 @@ pf_batch_read(
 		}
 
 		for (i = 0; i < num; i++) {
-			if (btree_delete(queue, XFS_DADDR_TO_FSB(mp,
+			if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
 					XFS_BUF_ADDR(bplist[i]))) == NULL)
 				do_error(_("prefetch corruption\n"));
 		}
@@ -566,7 +568,7 @@ pf_io_worker(
 		return NULL;
 
 	pthread_mutex_lock(&args->lock);
-	while (!args->queuing_done || !btree_is_empty(args->primary_io_queue)) {
+	while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
 #ifdef XR_PF_TRACE
 		pftrace("waiting to start prefetch I/O for AG %d", args->agno);
 #endif
@@ -691,8 +693,7 @@ pf_queuing_worker(
 #endif
 	pthread_mutex_lock(&args->lock);
 
-	ASSERT(btree_is_empty(args->primary_io_queue));
-	ASSERT(btree_is_empty(args->secondary_io_queue));
+	ASSERT(btree_is_empty(args->io_queue));
 
 	args->prefetch_done = 1;
 	if (args->next_args)
@@ -750,8 +751,7 @@ start_inode_prefetch(
 
 	args = calloc(1, sizeof(prefetch_args_t));
 
-	btree_init(&args->primary_io_queue);
-	btree_init(&args->secondary_io_queue);
+	btree_init(&args->io_queue);
 	if (pthread_mutex_init(&args->lock, NULL) != 0)
 		do_error(_("failed to initialize prefetch mutex\n"));
 	if (pthread_cond_init(&args->start_reading, NULL) != 0)
@@ -830,8 +830,7 @@ cleanup_inode_prefetch(
 	pthread_cond_destroy(&args->start_reading);
 	pthread_cond_destroy(&args->start_processing);
 	sem_destroy(&args->ra_count);
-	btree_destroy(args->primary_io_queue);
-	btree_destroy(args->secondary_io_queue);
+	btree_destroy(args->io_queue);
 
 	free(args);
 }
Index: xfsprogs-dev/repair/prefetch.h
===================================================================
--- xfsprogs-dev.orig/repair/prefetch.h	2009-11-12 10:53:01.098026017 +0100
+++ xfsprogs-dev/repair/prefetch.h	2009-11-12 11:07:01.807004122 +0100
@@ -13,8 +13,7 @@ typedef struct prefetch_args {
 	pthread_mutex_t		lock;
 	pthread_t		queuing_thread;
 	pthread_t		io_threads[PF_THREAD_COUNT];
-	struct btree_root	*primary_io_queue;
-	struct btree_root	*secondary_io_queue;
+	struct btree_root	*io_queue;
 	pthread_cond_t		start_reading;
 	pthread_cond_t		start_processing;
 	int			agno;

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 09/14] repair: track logical to physical block mapping more efficiently
  2009-10-21 19:06   ` [PATCH 09/14] repair: track logical to physical block mapping more efficiently Alex Elder
@ 2009-11-12 10:18     ` Christoph Hellwig
  0 siblings, 0 replies; 50+ messages in thread
From: Christoph Hellwig @ 2009-11-12 10:18 UTC (permalink / raw)
  To: Alex Elder; +Cc: Christoph Hellwig, Barry Naujok, xfs

On Wed, Oct 21, 2009 at 02:06:19PM -0500, Alex Elder wrote:
> Christoph Hellwig wrote:
> > Currently we track the logical to physical block mapping by a structure which
> > contains an array of physicial blocks.  This is extremly efficient and is
> 
> Should this be "extremely inefficient?"
> 
> > replaced with the normal starblock storage we use in the kernel and on disk
> > in this patch.
> 
> While you're at fixing the above comment, maybe just re-word this
> sentence because I don't really grok it very well...

Thanks, updated in the version below.

> > +	ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK);
> > +
> >  	if (nex < 1)
> >  		nex = 1;
> > -	if ((blkmap = malloc(BLKMAP_SIZE(nex))) == NULL) {
> > -		do_warn(_("malloc failed in blkmap_alloc (%u bytes)\n"),
> > -			BLKMAP_SIZE(nex));
> > -		return blkmap;
> > +
> > +	key = whichfork ? ablkmap_key : dblkmap_key;
> > +	blkmap = pthread_getspecific(key);
> > +	if (!blkmap || blkmap->naexts < nex) {
> > +		blkmap = realloc(blkmap, BLKMAP_SIZE(nex));
> 
> Does the above really have to be a realloc() call, or can
> it simply be a free()/malloc() instead?  Also, could the
> existing ts_alloc() function be adjusted to accomodate the
> usage here?

It has to be a realloc; we need to keep the existing content.  We really
need to do the growing based on the existing size, so ts_alloc doesn't
fit.  We could try to introduce a ts_realloc, but I'm not sure it's
worth it.
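
Such a helper would be a minimal variation on the existing ts_alloc()
pattern in init.c.  A sketch only - ts_realloc does not exist in the
tree, and it assumes the usual <stdlib.h>/<pthread.h> includes plus
repair's do_error() and _() helpers:

	static void *
	ts_realloc(pthread_key_t key, size_t size)
	{
		void	*voidp;

		/*
		 * Grow the thread-local buffer, keeping its contents.
		 * realloc(NULL, size) acts as malloc on first use.
		 */
		voidp = realloc(pthread_getspecific(key), size);
		if (voidp == NULL)
			do_error(
	_("ts_realloc: cannot allocate thread specific storage\n"));
		pthread_setspecific(key, voidp);
		return voidp;
	}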

> >  {
> > -	blkent_t	**entp;
> > -	xfs_extnum_t	i;
> > -
> > -	if (blkmap == NULL)
> > -		return;
> > -	for (i = 0, entp = blkmap->ents; i < blkmap->nents; i++, entp++)
> > -		free(*entp);
> > -	free(blkmap);
> > +	/* nothing to do! - keep the memory around for the next inode */
> 
> Nobody ever frees it though, either.  I guess it gets done at
> exit but I like things tidy (could arrange for a destructor
> function to be called, at pthread_key_create() time).

This would complicate things quite a bit, and it would actually cause a
lot more malloc/free cycles, which could slow repair down.  Right
now we only have to allocate the map if the next inode has a larger
extent map than the previously processed one, which means we save
a lot of malloc/free cycles; those can still be quite slow in
multi-threaded programs.

-- 

Subject: repair: track logical to physical block mapping more efficiently
From: Barry Naujok <bnaujok@sgi.com>

Currently we track the logical to physical block mapping with a structure that
contains an array of physical blocks.  This is extremely inefficient and is
replaced with normal (startblock, length) extent descriptors.

In addition, use thread-local storage for the block map.  This is possible
because repair only processes one inode at a time per thread, and the
block map does not have to outlive the processing of a single inode.

The combination of those factors means we can use pthread thread-local
storage to store the block map, and we can re-use the allocation over
and over again.

This should be ported over to xfs_db eventually, or even better we could try
to share the code.

[hch: added a small fix in blkmap_set_ext to not call memmove unless needed]

Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>

Index: xfsprogs-dev/repair/bmap.c
===================================================================
--- xfsprogs-dev.orig/repair/bmap.c	2009-10-19 01:55:18.807285612 +0200
+++ xfsprogs-dev/repair/bmap.c	2009-11-12 11:17:04.371006486 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2001,2005,2008 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -21,106 +21,46 @@
 #include "bmap.h"
 
 /*
- * Block mapping code taken from xfs_db.
- */
-
-/*
- * Append an extent to the block entry.
- */
-void
-blkent_append(
-	blkent_t	**entp,
-	xfs_dfsbno_t	b,
-	xfs_dfilblks_t	c)
-{
-	blkent_t	*ent;
-	size_t		size;
-	int		i;
-
-	ent = *entp;
-	size = BLKENT_SIZE(c + ent->nblks);
-	if ((*entp = ent = realloc(ent, size)) == NULL) {
-		do_warn(_("realloc failed in blkent_append (%u bytes)\n"),
-			size);
-		return;
-	}
-	for (i = 0; i < c; i++)
-		ent->blks[ent->nblks + i] = b + i;
-	ent->nblks += c;
-}
-
-/*
- * Make a new block entry.
- */
-blkent_t *
-blkent_new(
-	xfs_dfiloff_t	o,
-	xfs_dfsbno_t	b,
-	xfs_dfilblks_t	c)
-{
-	blkent_t	*ent;
-	int		i;
-
-	if ((ent = malloc(BLKENT_SIZE(c))) == NULL) {
-		do_warn(_("malloc failed in blkent_new (%u bytes)\n"),
-			BLKENT_SIZE(c));
-		return ent;
-	}
-	ent->nblks = c;
-	ent->startoff = o;
-	for (i = 0; i < c; i++)
-		ent->blks[i] = b + i;
-	return ent;
-}
-
-/*
- * Prepend an extent to the block entry.
+ * Track the logical to physical block mapping for inodes.
+ *
+ * Repair only processes one inode at a given time per thread, and the
+ * block map does not have to outlive the processing of a single inode.
+ *
+ * The combination of those factors means we can use pthreads thread-local
+ * storage to store the block map, and we can re-use the allocation over
+ * and over again.
  */
-void
-blkent_prepend(
-	blkent_t	**entp,
-	xfs_dfsbno_t	b,
-	xfs_dfilblks_t	c)
-{
-	int		i;
-	blkent_t	*newent;
-	blkent_t	*oldent;
 
-	oldent = *entp;
-	if ((newent = malloc(BLKENT_SIZE(oldent->nblks + c))) == NULL) {
-		do_warn(_("malloc failed in blkent_prepend (%u bytes)\n"),
-			BLKENT_SIZE(oldent->nblks + c));
-		*entp = newent;
-		return;
-	}
-	newent->nblks = oldent->nblks + c;
-	newent->startoff = oldent->startoff - c;
-	for (i = 0; i < c; i++)
-		newent->blks[i] = b + c;
-	for (; i < oldent->nblks + c; i++)
-		newent->blks[i] = oldent->blks[i - c];
-	free(oldent);
-	*entp = newent;
-}
+pthread_key_t	dblkmap_key;
+pthread_key_t	ablkmap_key;
 
-/*
- * Allocate a block map.
- */
 blkmap_t *
 blkmap_alloc(
-	xfs_extnum_t	nex)
+	xfs_extnum_t	nex,
+	int		whichfork)
 {
+	pthread_key_t	key;
 	blkmap_t	*blkmap;
 
+	ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK);
+
 	if (nex < 1)
 		nex = 1;
-	if ((blkmap = malloc(BLKMAP_SIZE(nex))) == NULL) {
-		do_warn(_("malloc failed in blkmap_alloc (%u bytes)\n"),
-			BLKMAP_SIZE(nex));
-		return blkmap;
+
+	key = whichfork ? ablkmap_key : dblkmap_key;
+	blkmap = pthread_getspecific(key);
+	if (!blkmap || blkmap->naexts < nex) {
+		blkmap = realloc(blkmap, BLKMAP_SIZE(nex));
+		if (!blkmap) {
+			do_warn(_("malloc failed in blkmap_alloc (%u bytes)\n"),
+				BLKMAP_SIZE(nex));
+			return NULL;
+		}
+		pthread_setspecific(key, blkmap);
+		blkmap->naexts = nex;
 	}
-	blkmap->naents = nex;
-	blkmap->nents = 0;
+
+	blkmap->nexts = 0;
 	return blkmap;
 }
 
@@ -131,14 +71,7 @@ void
 blkmap_free(
 	blkmap_t	*blkmap)
 {
-	blkent_t	**entp;
-	xfs_extnum_t	i;
-
-	if (blkmap == NULL)
-		return;
-	for (i = 0, entp = blkmap->ents; i < blkmap->nents; i++, entp++)
-		free(*entp);
-	free(blkmap);
+	/* nothing to do! - keep the memory around for the next inode */
 }
 
 /*
@@ -149,20 +82,18 @@ blkmap_get(
 	blkmap_t	*blkmap,
 	xfs_dfiloff_t	o)
 {
-	blkent_t	*ent;
-	blkent_t	**entp;
+	bmap_ext_t	*ext = blkmap->exts;
 	int		i;
 
-	for (i = 0, entp = blkmap->ents; i < blkmap->nents; i++, entp++) {
-		ent = *entp;
-		if (o >= ent->startoff && o < ent->startoff + ent->nblks)
-			return ent->blks[o - ent->startoff];
+	for (i = 0; i < blkmap->nexts; i++, ext++) {
+		if (o >= ext->startoff && o < ext->startoff + ext->blockcount)
+			return ext->startblock + (o - ext->startoff);
 	}
 	return NULLDFSBNO;
 }
 
 /*
- * Get a chunk of entries from a block map.
+ * Get a chunk of entries from a block map - only used for reading dirv2 blocks
  */
 int
 blkmap_getn(
@@ -172,93 +103,62 @@ blkmap_getn(
 	bmap_ext_t	**bmpp,
 	bmap_ext_t	*bmpp_single)
 {
-	bmap_ext_t	*bmp;
-	blkent_t	*ent;
-	xfs_dfiloff_t	ento;
-	blkent_t	**entp;
+	bmap_ext_t	*bmp = NULL;
+	bmap_ext_t	*ext;
 	int		i;
 	int		nex;
 
 	if (nb == 1) {
-		/* 
+		/*
 		 * in the common case, when mp->m_dirblkfsbs == 1,
 		 * avoid additional malloc/free overhead
 		 */
 		bmpp_single->startblock = blkmap_get(blkmap, o);
-		bmpp_single->blockcount = 1;
-		bmpp_single->startoff = 0;
-		bmpp_single->flag = 0;
-		*bmpp = bmpp_single;
-		return (bmpp_single->startblock != NULLDFSBNO) ? 1 : 0;
+		goto single_ext;
 	}
-	for (i = nex = 0, bmp = NULL, entp = blkmap->ents;
-	     i < blkmap->nents;
-	     i++, entp++) {
-		ent = *entp;
-		if (ent->startoff >= o + nb)
+	ext = blkmap->exts;
+	nex = 0;
+	for (i = 0; i < blkmap->nexts; i++, ext++) {
+
+		if (ext->startoff >= o + nb)
 			break;
-		if (ent->startoff + ent->nblks <= o)
+		if (ext->startoff + ext->blockcount <= o)
 			continue;
-		for (ento = ent->startoff;
-		     ento < ent->startoff + ent->nblks && ento < o + nb;
-		     ento++) {
-			if (ento < o)
-				continue;
-			if (bmp &&
-			    bmp[nex - 1].startoff + bmp[nex - 1].blockcount ==
-				    ento &&
-			    bmp[nex - 1].startblock + bmp[nex - 1].blockcount ==
-				    ent->blks[ento - ent->startoff])
-				bmp[nex - 1].blockcount++;
-			else {
-				bmp = realloc(bmp, ++nex * sizeof(*bmp));
-				if (bmp == NULL) {
-					do_warn(_("blkmap_getn realloc failed"
-						" (%u bytes)\n"),
-						nex * sizeof(*bmp));
-					continue;
-				}
-				bmp[nex - 1].startoff = ento;
-				bmp[nex - 1].startblock =
-					ent->blks[ento - ent->startoff];
-				bmp[nex - 1].blockcount = 1;
-				bmp[nex - 1].flag = 0;
-			}
+
+		/*
+		 * if all the requested blocks are in one extent (also common),
+		 * use the bmpp_single option as well
+		 */
+		if (!bmp && o >= ext->startoff &&
+		    o + nb <= ext->startoff + ext->blockcount) {
+			bmpp_single->startblock =
+				 ext->startblock + (o - ext->startoff);
+			goto single_ext;
 		}
+
+		/*
+		 * rare case - multiple extents for a single dir block
+		 */
+		bmp = malloc(nb * sizeof(bmap_ext_t));
+		if (!bmp)
+			do_error(_("blkmap_getn malloc failed (%u bytes)\n"),
+						nb * sizeof(bmap_ext_t));
+
+		bmp[nex].startblock = ext->startblock + (o - ext->startoff);
+		bmp[nex].blockcount = MIN(nb, ext->blockcount -
+				(bmp[nex].startblock - ext->startblock));
+		o += bmp[nex].blockcount;
+		nb -= bmp[nex].blockcount;
+		nex++;
 	}
 	*bmpp = bmp;
 	return nex;
-}
-
-/*
- * Make a block map larger.
- */
-void
-blkmap_grow(
-	blkmap_t	**blkmapp,
-	blkent_t	**entp,
-	blkent_t	*newent)
-{
-	blkmap_t	*blkmap;
-	size_t		size;
-	int		i;
-	int		idx;
 
-	blkmap = *blkmapp;
-	idx = (int)(entp - blkmap->ents);
-	if (blkmap->naents == blkmap->nents) {
-		size = BLKMAP_SIZE(blkmap->nents + 1);
-		if ((*blkmapp = blkmap = realloc(blkmap, size)) == NULL) {
-			do_warn(_("realloc failed in blkmap_grow (%u bytes)\n"),
-				size);
-			return;
-		}
-		blkmap->naents++;
-	}
-	for (i = blkmap->nents; i > idx; i--)
-		blkmap->ents[i] = blkmap->ents[i - 1];
-	blkmap->ents[idx] = newent;
-	blkmap->nents++;
+single_ext:
+	bmpp_single->blockcount = nb;
+	bmpp_single->startoff = 0;	/* not even used by caller! */
+	*bmpp = bmpp_single;
+	return (bmpp_single->startblock != NULLDFSBNO) ? 1 : 0;
 }
 
 /*
@@ -268,12 +168,12 @@ xfs_dfiloff_t
 blkmap_last_off(
 	blkmap_t	*blkmap)
 {
-	blkent_t	*ent;
+	bmap_ext_t	*ext;
 
-	if (!blkmap->nents)
+	if (!blkmap->nexts)
 		return NULLDFILOFF;
-	ent = blkmap->ents[blkmap->nents - 1];
-	return ent->startoff + ent->nblks;
+	ext = blkmap->exts + blkmap->nexts - 1;
+	return ext->startoff + ext->blockcount;
 }
 
 /*
@@ -285,73 +185,45 @@ blkmap_next_off(
 	xfs_dfiloff_t	o,
 	int		*t)
 {
-	blkent_t	*ent;
-	blkent_t	**entp;
+	bmap_ext_t	*ext;
 
-	if (!blkmap->nents)
+	if (!blkmap->nexts)
 		return NULLDFILOFF;
 	if (o == NULLDFILOFF) {
 		*t = 0;
-		ent = blkmap->ents[0];
-		return ent->startoff;
+		return blkmap->exts[0].startoff;
 	}
-	entp = &blkmap->ents[*t];
-	ent = *entp;
-	if (o < ent->startoff + ent->nblks - 1)
+	ext = blkmap->exts + *t;
+	if (o < ext->startoff + ext->blockcount - 1)
 		return o + 1;
-	entp++;
-	if (entp >= &blkmap->ents[blkmap->nents])
+	if (*t >= blkmap->nexts - 1)
 		return NULLDFILOFF;
 	(*t)++;
-	ent = *entp;
-	return ent->startoff;
+	return ext[1].startoff;
 }
 
 /*
- * Set a block value in a block map.
+ * Make a block map larger.
  */
-void
-blkmap_set_blk(
-	blkmap_t	**blkmapp,
-	xfs_dfiloff_t	o,
-	xfs_dfsbno_t	b)
+static blkmap_t *
+blkmap_grow(
+	blkmap_t	**blkmapp)
 {
-	blkmap_t	*blkmap;
-	blkent_t	*ent;
-	blkent_t	**entp;
-	blkent_t	*nextent;
-
-	blkmap = *blkmapp;
-	for (entp = blkmap->ents; entp < &blkmap->ents[blkmap->nents]; entp++) {
-		ent = *entp;
-		if (o < ent->startoff - 1) {
-			ent = blkent_new(o, b, 1);
-			blkmap_grow(blkmapp, entp, ent);
-			return;
-		}
-		if (o == ent->startoff - 1) {
-			blkent_prepend(entp, b, 1);
-			return;
-		}
-		if (o >= ent->startoff && o < ent->startoff + ent->nblks) {
-			ent->blks[o - ent->startoff] = b;
-			return;
-		}
-		if (o > ent->startoff + ent->nblks)
-			continue;
-		blkent_append(entp, b, 1);
-		if (entp == &blkmap->ents[blkmap->nents - 1])
-			return;
-		ent = *entp;
-		nextent = entp[1];
-		if (ent->startoff + ent->nblks < nextent->startoff)
-			return;
-		blkent_append(entp, nextent->blks[0], nextent->nblks);
-		blkmap_shrink(blkmap, &entp[1]);
-		return;
+	pthread_key_t	key = dblkmap_key;
+	blkmap_t	*blkmap = *blkmapp;
+
+	if (pthread_getspecific(key) != blkmap) {
+		key = ablkmap_key;
+		ASSERT(pthread_getspecific(key) == blkmap);
 	}
-	ent = blkent_new(o, b, 1);
-	blkmap_grow(blkmapp, entp, ent);
+
+	blkmap->naexts += 4;
+	blkmap = realloc(blkmap, BLKMAP_SIZE(blkmap->naexts));
+	if (blkmap == NULL)
+		do_error(_("realloc failed in blkmap_grow\n"));
+	*blkmapp = blkmap;
+	pthread_setspecific(key, blkmap);
+	return blkmap;
 }
 
 /*
@@ -364,46 +236,23 @@ blkmap_set_ext(
 	xfs_dfsbno_t	b,
 	xfs_dfilblks_t	c)
 {
-	blkmap_t	*blkmap;
-	blkent_t	*ent;
-	blkent_t	**entp;
+	blkmap_t	*blkmap = *blkmapp;
 	xfs_extnum_t	i;
 
-	blkmap = *blkmapp;
-	if (!blkmap->nents) {
-		blkmap->ents[0] = blkent_new(o, b, c);
-		blkmap->nents = 1;
-		return;
-	}
-	entp = &blkmap->ents[blkmap->nents - 1];
-	ent = *entp;
-	if (ent->startoff + ent->nblks == o) {
-		blkent_append(entp, b, c);
-		return;
-	}
-	if (ent->startoff + ent->nblks < o) {
-		ent = blkent_new(o, b, c);
-		blkmap_grow(blkmapp, &blkmap->ents[blkmap->nents], ent);
-		return;
-	}
-	for (i = 0; i < c; i++)
-		blkmap_set_blk(blkmapp, o + i, b + i);
-}
+	if (blkmap->nexts == blkmap->naexts)
+		blkmap = blkmap_grow(blkmapp);
 
-/*
- * Make a block map smaller.
- */
-void
-blkmap_shrink(
-	blkmap_t	*blkmap,
-	blkent_t	**entp)
-{
-	int		i;
-	int		idx;
+	for (i = 0; i < blkmap->nexts; i++) {
+		if (blkmap->exts[i].startoff > o) {
+			memmove(blkmap->exts + i + 1,
+				blkmap->exts + i,
+				sizeof(bmap_ext_t) * (blkmap->nexts - i));
+			break;
+		}
+	}
 
-	free(*entp);
-	idx = (int)(entp - blkmap->ents);
-	for (i = idx + 1; i < blkmap->nents; i++)
-		blkmap->ents[i] = blkmap->ents[i - 1];
-	blkmap->nents--;
+	blkmap->exts[i].startoff = o;
+	blkmap->exts[i].startblock = b;
+	blkmap->exts[i].blockcount = c;
+	blkmap->nexts++;
 }
Index: xfsprogs-dev/repair/bmap.h
===================================================================
--- xfsprogs-dev.orig/repair/bmap.h	2009-10-19 01:55:18.824256628 +0200
+++ xfsprogs-dev/repair/bmap.h	2009-11-12 11:12:12.138274565 +0100
@@ -16,59 +16,41 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-/*
- * Block mapping code taken from xfs_db.
- */
+#ifndef _XFS_REPAIR_BMAP_H
+#define _XFS_REPAIR_BMAP_H
 
 /*
- * Block map entry.
+ * Extent descriptor.
  */
-typedef struct blkent {
+typedef struct bmap_ext {
 	xfs_dfiloff_t	startoff;
-	xfs_dfilblks_t	nblks;
-	xfs_dfsbno_t	blks[1];
-} blkent_t;
-#define	BLKENT_SIZE(n)	\
-	(offsetof(blkent_t, blks) + (sizeof(xfs_dfsbno_t) * (n)))
+	xfs_dfsbno_t	startblock;
+	xfs_dfilblks_t	blockcount;
+} bmap_ext_t;
 
 /*
  * Block map.
  */
 typedef	struct blkmap {
-	int		naents;
-	int		nents;
-	blkent_t	*ents[1];
+	int		naexts;
+	int		nexts;
+	bmap_ext_t	exts[1];
 } blkmap_t;
-#define	BLKMAP_SIZE(n)	\
-	(offsetof(blkmap_t, ents) + (sizeof(blkent_t *) * (n)))
 
-/*
- * Extent descriptor.
- */
-typedef struct bmap_ext {
-	xfs_dfiloff_t	startoff;
-	xfs_dfsbno_t	startblock;
-	xfs_dfilblks_t	blockcount;
-	int		flag;
-} bmap_ext_t;
+#define	BLKMAP_SIZE(n)	\
+	(offsetof(blkmap_t, exts) + (sizeof(bmap_ext_t) * (n)))
 
-void		blkent_append(blkent_t **entp, xfs_dfsbno_t b,
-			      xfs_dfilblks_t c);
-blkent_t	*blkent_new(xfs_dfiloff_t o, xfs_dfsbno_t b, xfs_dfilblks_t c);
-void		blkent_prepend(blkent_t **entp, xfs_dfsbno_t b,
-			       xfs_dfilblks_t c);
-blkmap_t	*blkmap_alloc(xfs_extnum_t);
+blkmap_t	*blkmap_alloc(xfs_extnum_t nex, int whichfork);
 void		blkmap_free(blkmap_t *blkmap);
+
+void		blkmap_set_ext(blkmap_t **blkmapp, xfs_dfiloff_t o,
+			       xfs_dfsbno_t b, xfs_dfilblks_t c);
+
 xfs_dfsbno_t	blkmap_get(blkmap_t *blkmap, xfs_dfiloff_t o);
 int		blkmap_getn(blkmap_t *blkmap, xfs_dfiloff_t o,
-			    xfs_dfilblks_t nb, bmap_ext_t **bmpp, 
+			    xfs_dfilblks_t nb, bmap_ext_t **bmpp,
 			    bmap_ext_t *bmpp_single);
-void		blkmap_grow(blkmap_t **blkmapp, blkent_t **entp,
-			    blkent_t *newent);
 xfs_dfiloff_t	blkmap_last_off(blkmap_t *blkmap);
 xfs_dfiloff_t	blkmap_next_off(blkmap_t *blkmap, xfs_dfiloff_t o, int *t);
-void		blkmap_set_blk(blkmap_t **blkmapp, xfs_dfiloff_t o,
-			       xfs_dfsbno_t b);
-void		blkmap_set_ext(blkmap_t **blkmapp, xfs_dfiloff_t o,
-			       xfs_dfsbno_t b, xfs_dfilblks_t c);
-void		blkmap_shrink(blkmap_t *blkmap, blkent_t **entp);
+
+#endif /* _XFS_REPAIR_BMAP_H */
Index: xfsprogs-dev/repair/dinode.c
===================================================================
--- xfsprogs-dev.orig/repair/dinode.c	2009-10-19 01:55:18.842284064 +0200
+++ xfsprogs-dev/repair/dinode.c	2009-11-12 11:12:12.143274713 +0100
@@ -2050,7 +2050,7 @@ process_inode_data_fork(
 		*nextents = 1;
 
 	if (dinoc->di_format != XFS_DINODE_FMT_LOCAL && type != XR_INO_RTDATA)
-		*dblkmap = blkmap_alloc(*nextents);
+		*dblkmap = blkmap_alloc(*nextents, XFS_DATA_FORK);
 	*nextents = 0;
 
 	switch (dinoc->di_format) {
@@ -2172,14 +2172,14 @@ process_inode_attr_fork(
 		err = process_lclinode(mp, agno, ino, dino, XFS_ATTR_FORK);
 		break;
 	case XFS_DINODE_FMT_EXTENTS:
-		ablkmap = blkmap_alloc(*anextents);
+		ablkmap = blkmap_alloc(*anextents, XFS_ATTR_FORK);
 		*anextents = 0;
 		err = process_exinode(mp, agno, ino, dino, type, dirty,
 				atotblocks, anextents, &ablkmap,
 				XFS_ATTR_FORK, check_dups);
 		break;
 	case XFS_DINODE_FMT_BTREE:
-		ablkmap = blkmap_alloc(*anextents);
+		ablkmap = blkmap_alloc(*anextents, XFS_ATTR_FORK);
 		*anextents = 0;
 		err = process_btinode(mp, agno, ino, dino, type, dirty,
 				atotblocks, anextents, &ablkmap,
Index: xfsprogs-dev/repair/init.c
===================================================================
--- xfsprogs-dev.orig/repair/init.c	2009-11-12 11:11:41.025026345 +0100
+++ xfsprogs-dev/repair/init.c	2009-11-12 11:12:12.143274713 +0100
@@ -24,19 +24,24 @@
 #include "pthread.h"
 #include "avl.h"
 #include "dir.h"
+#include "bmap.h"
 #include "incore.h"
 #include "prefetch.h"
 #include <sys/resource.h>
 
+/* TODO: dirbuf/freemap key usage is completely b0rked - only used for dirv1 */
 static pthread_key_t dirbuf_key;
 static pthread_key_t dir_freemap_key;
 static pthread_key_t attr_freemap_key;
 
+extern pthread_key_t dblkmap_key;
+extern pthread_key_t ablkmap_key;
+
 static void
 ts_alloc(pthread_key_t key, unsigned n, size_t size)
 {
 	void *voidp;
-	voidp = malloc((n)*(size));
+	voidp = calloc(n, size);
 	if (voidp == NULL) {
 		do_error(_("ts_alloc: cannot allocate thread specific storage\n"));
 		/* NO RETURN */
@@ -52,6 +57,9 @@ ts_create(void)
 	pthread_key_create(&dirbuf_key, NULL);
 	pthread_key_create(&dir_freemap_key, NULL);
 	pthread_key_create(&attr_freemap_key, NULL);
+
+	pthread_key_create(&dblkmap_key, NULL);
+	pthread_key_create(&ablkmap_key, NULL);
 }
 
 void

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 10/14] repair: cleanup helpers for tracking block usage
  2009-10-21 19:33   ` Alex Elder
@ 2009-11-12 10:21     ` Christoph Hellwig
  0 siblings, 0 replies; 50+ messages in thread
From: Christoph Hellwig @ 2009-11-12 10:21 UTC (permalink / raw)
  To: Alex Elder; +Cc: Christoph Hellwig, Barry Naujok, xfs

On Wed, Oct 21, 2009 at 02:33:05PM -0500, Alex Elder wrote:
> > +	switch (state != XR_E_UNKNOWN)  {
> 
> BUG.      You mean "if (state != XR_E_UNKNOWN)"???

Oops, yes.
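
For what it's worth, the reason this builds without complaint is that a
switch on a comparison is perfectly legal C: the controlling expression
is just the int 0 or 1, so with matching case labels it can even behave
exactly like the intended if/else, and compilers of the day had nothing
to warn about.  A made-up standalone illustration, not the repair code
(the enum values and function name are hypothetical):

	enum { XR_E_UNKNOWN, XR_E_FREE };	/* values made up */

	static void
	process_state(int state)
	{
		switch (state != XR_E_UNKNOWN) {  /* evaluates to 0 or 1 */
		case 1:		/* acts as the intended "if" branch */
			/* ...handle a known state... */
			break;
		case 0:		/* acts as the intended "else" branch */
			break;
		}
	}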

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 12/14] repair: switch block usage bitmap to a btree
  2009-10-22 16:22   ` Alex Elder
@ 2009-11-12 10:25     ` Christoph Hellwig
  0 siblings, 0 replies; 50+ messages in thread
From: Christoph Hellwig @ 2009-11-12 10:25 UTC (permalink / raw)
  To: Alex Elder; +Cc: Christoph Hellwig, Barry Naujok, xfs

On Thu, Oct 22, 2009 at 11:22:42AM -0500, Alex Elder wrote:
> I have a few minor things I would like to see changed at some
> point, but nothing looks obviously incorrect so I'll wait and
> post a patch against this once it's committed.
> 
> Here are a couple other general thoughts:
> - One minor concern is that the btree code has cases in which
>   the peek routines don't work (when the keys_valid flag in the
>   btree root is zero) and this code doesn't check for that.
>   I'll just assume for now that never happens here.

It should not happen, but yes - making this explicit would be good.

> - The bitfield code used for the real-time volume map is
>   generally useful and could be separated into its own module.

Not sure.  It's really a crufty leftover.  It should probably also use
a btree instead, but I fear any effort spent on the RT subvolume is
pretty much wasted anyway.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] repair memory usage reductions
  2009-09-02 17:55 [PATCH 00/14] repair memory usage reductions Christoph Hellwig
                   ` (14 preceding siblings ...)
  2009-09-03 20:49 ` [PATCH 00/14] repair memory usage reductions Geoffrey Wehrman
@ 2009-11-12 15:58 ` Christoph Hellwig
  15 siblings, 0 replies; 50+ messages in thread
From: Christoph Hellwig @ 2009-11-12 15:58 UTC (permalink / raw)
  To: xfs

The remaining patches with the updates suggested by Alex are now
available in the repair-speedup-20091112 branch of the kernel.org
xfsprogs-dev git repository:

	http://git.kernel.org/?p=fs/xfs/xfsprogs-dev.git
	git://git.kernel.org/pub/scm/fs/xfs/xfsprogs-dev.git

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 06/14] repair: use a btree instead of a radix tree for the prefetch queue
  2009-11-12 10:04     ` Christoph Hellwig
@ 2009-11-12 23:46       ` Dave Chinner
  0 siblings, 0 replies; 50+ messages in thread
From: Dave Chinner @ 2009-11-12 23:46 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs, Barry Naujok, Alex Elder

On Thu, Nov 12, 2009 at 05:04:08AM -0500, Christoph Hellwig wrote:
> On Wed, Oct 21, 2009 at 12:12:33PM -0500, Alex Elder wrote:
> > - Related to the previous one--it would be good to have
> >   a little info about why the value 7 was chosen as the
> >   number of keys per node.  Perhaps I just don't know
> >   enough of the history (or content of the upcoming
> >   patches).

I think I can answer this one:

> +/*
> + * Maximum number of keys per node.  Must be greater than 2 for the code
> + * to work.
> + */
> +#define BTREE_KEY_MAX		7
> +#define BTREE_KEY_MIN		(BTREE_KEY_MAX / 2)
> +
> +#define BTREE_PTR_MAX		(BTREE_KEY_MAX + 1)
> +
> +struct btree_node {
> +	unsigned long		num_keys;
> +	unsigned long		keys[BTREE_KEY_MAX];
> +	struct btree_node	*ptrs[BTREE_PTR_MAX];
> +};

BTREE_KEY_MAX = 7 results in a btree_node exactly filling
a 64-byte cacheline on 32 bit, and 2 cachelines on 64 bit.
Cacheline-aligned and -sized nodes minimise the number of cache
misses when traversing/searching the btree....
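
The arithmetic is easy to verify with the struct quoted above (a
standalone check; it assumes the usual ILP32/LP64 type sizes, i.e.
4-byte longs and pointers on 32 bit, 8-byte on 64 bit):

	#include <stdio.h>

	#define BTREE_KEY_MAX	7
	#define BTREE_PTR_MAX	(BTREE_KEY_MAX + 1)

	struct btree_node {
		unsigned long		num_keys;
		unsigned long		keys[BTREE_KEY_MAX];
		struct btree_node	*ptrs[BTREE_PTR_MAX];
	};

	int
	main(void)
	{
		/*
		 * ILP32: 4 + 7*4 + 8*4 = 64 bytes, one cacheline.
		 * LP64:  8 + 7*8 + 8*8 = 128 bytes, two cachelines.
		 * No padding: all members share the same alignment.
		 */
		printf("btree_node is %zu bytes\n",
			sizeof(struct btree_node));
		return 0;
	}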

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread

Thread overview: 50+ messages
2009-09-02 17:55 [PATCH 00/14] repair memory usage reductions Christoph Hellwig
2009-09-02 17:55 ` [PATCH 01/14] repair: merge scanfunc_bno and scanfunc_cnt Christoph Hellwig
2009-10-12 16:53   ` Eric Sandeen
2009-10-13 22:13     ` Christoph Hellwig
2009-10-13 23:36       ` Alex Elder
2009-09-02 17:55 ` [PATCH 02/14] repair: reduce byte swap operations in scanfunc_allocbt Christoph Hellwig
2009-10-12 17:18   ` Eric Sandeen
2009-10-13 23:37     ` [PATCH 02/14] repair: reduce byte swap operations in scanfunc_allocbt Alex Elder
2009-09-02 17:55 ` [PATCH 03/14] repair: kill B_IS_META flag Christoph Hellwig
2009-10-12 19:45   ` Eric Sandeen
2009-10-13 22:16     ` Christoph Hellwig
2009-10-13 22:19       ` Eric Sandeen
2009-10-13 23:38         ` Alex Elder
2009-09-02 17:55 ` [PATCH 04/14] repair: split up scanfunc_ino Christoph Hellwig
2009-10-12 20:06   ` Eric Sandeen
2009-10-13 22:19     ` Christoph Hellwig
2009-10-13 22:22       ` Eric Sandeen
2009-10-13 22:23         ` Christoph Hellwig
2009-09-02 17:55 ` [PATCH 05/14] repair: reduce byte swapping in scan_freelist Christoph Hellwig
2009-10-12 20:43   ` Eric Sandeen
2009-09-02 17:55 ` [PATCH 06/14] repair: use a btree instead of a radix tree for the prefetch queue Christoph Hellwig
2009-10-21 17:12   ` [PATCH 06/14] repair: use a btree instead of a radix tree for the prefetch queue Alex Elder
2009-11-12 10:04     ` Christoph Hellwig
2009-11-12 23:46       ` Dave Chinner
2009-09-02 17:55 ` [PATCH 07/14] repair: use single prefetch queue Christoph Hellwig
2009-10-21 17:48   ` Alex Elder
2009-11-12 10:09     ` Christoph Hellwig
2009-09-02 17:55 ` [PATCH 08/14] repair: clean up prefetch tracing Christoph Hellwig
2009-10-21 17:53   ` Alex Elder
2009-09-02 17:55 ` [PATCH 09/14] repair: track logical to physical block mapping more efficiently Christoph Hellwig
2009-10-21 19:06   ` [PATCH 09/14] repair: track logical to physical block mapping more efficiently Alex Elder
2009-11-12 10:18     ` Christoph Hellwig
2009-09-02 17:55 ` [PATCH 10/14] repair: cleanup helpers for tracking block usage Christoph Hellwig
2009-10-21 19:33   ` Alex Elder
2009-11-12 10:21     ` Christoph Hellwig
2009-09-02 17:55 ` [PATCH 11/14] repair: cleanup alloc/free/reset of the block usage tracking Christoph Hellwig
2009-10-21 20:22   ` [PATCH 11/14] repair: cleanup alloc/free/reset of the block usage tracking Alex Elder
2009-09-02 17:55 ` [PATCH 12/14] repair: switch block usage bitmap to a btree Christoph Hellwig
2009-10-22 16:22   ` Alex Elder
2009-11-12 10:25     ` Christoph Hellwig
2009-09-02 17:55 ` [PATCH 13/14] repair: optimize duplicate extent tracking Christoph Hellwig
2009-10-22 16:41   ` Alex Elder
2009-09-02 17:55 ` [PATCH 14/14] repair: add missing locking in scanfunc_bmap Christoph Hellwig
2009-10-22 16:42   ` Alex Elder
2009-09-03 20:49 ` [PATCH 00/14] repair memory usage reductions Geoffrey Wehrman
2009-09-04  2:57   ` Dave Chinner
2009-09-04 13:37     ` Geoffrey Wehrman
2009-09-04 14:51       ` Christoph Hellwig
2009-09-04 17:24         ` Michael Monnerie
2009-11-12 15:58 ` Christoph Hellwig
