* [RFC] [PATCH 0/4] Replace buffer cache hash with rbtrees
@ 2010-09-08 15:12 Dave Chinner
  2010-09-08 15:12 ` [PATCH 1/4] xfs: kill XBF_FS_MANAGED buffers Dave Chinner
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Dave Chinner @ 2010-09-08 15:12 UTC (permalink / raw)
  To: xfs

This patch series cleans up several unconventional buffer uses so
that they avoid cached buffers, and then converts the buffer cache
indexing to use a set of per-ag rbtrees rather than a hash. This
version shows no performance degradation on my 1.1TB filesystem under
parallel create and unlink compared to the existing enlarged hash
cache. In both cases _xfs_buf_find() consumes 5-6% of the entire
CPU time of the workload.

This patchset prepares for moving away from using the page cache
as the backing cache and instead maintaining an LRU of cached
buffers. This is necessary because on larger filesystems we are
completely unable to keep the working set of metadata buffers hot
in cache, as we cannot control page cache reclaim. That means we
need to be able to hold many more buffers in memory than we
currently do, and so the hash-based indexing needs replacing first.
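
As a rough illustration of the indexing change (patch 4 below has
the real code; this is only a sketch of the per-AG tree selection,
using the pagbuf_* names that patch introduces):

	/* select the index by AG instead of by global hash bucket */
	agno = xfs_daddr_to_agno(mp, blkno);
	pag = xfs_perag_get(mp, agno);

	spin_lock(&pag->pagbuf_lock);
	/* ... walk pag->pagbuf_tree keyed on the block offset ... */
	spin_unlock(&pag->pagbuf_lock);
	xfs_perag_put(pag);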


* [PATCH 1/4] xfs: kill XBF_FS_MANAGED buffers
  2010-09-08 15:12 [RFC] [PATCH 0/4] Replace buffer cache hash with rbtrees Dave Chinner
@ 2010-09-08 15:12 ` Dave Chinner
  2010-09-09  1:33   ` Christoph Hellwig
  2010-09-10 21:17   ` Alex Elder
  2010-09-08 15:12 ` [PATCH 2/4] xfs: use unhashed buffers for size checks Dave Chinner
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 17+ messages in thread
From: Dave Chinner @ 2010-09-08 15:12 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

Filesystem-level managed buffers are buffers that have their
lifecycle controlled by the filesystem layer, not the buffer cache.
We currently hash these buffers, which makes cleanup and hash
walking somewhat troublesome. Convert the fs-managed buffers to
unhashed buffers obtained via xfs_buf_get_noaddr(), and remove the
XBF_FS_MANAGED special cases from the buffer cache.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/linux-2.6/xfs_buf.c |   20 +++-------------
 fs/xfs/linux-2.6/xfs_buf.h |    4 ---
 fs/xfs/xfs_mount.c         |   54 +++++++++++++++++++++++++------------------
 3 files changed, 35 insertions(+), 43 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 998bfb3..860a23c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -791,8 +791,6 @@ xfs_buf_rele(
 			atomic_inc(&bp->b_hold);
 			spin_unlock(&hash->bh_lock);
 			(*(bp->b_relse)) (bp);
-		} else if (bp->b_flags & XBF_FS_MANAGED) {
-			spin_unlock(&hash->bh_lock);
 		} else {
 			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
 			list_del_init(&bp->b_hash_list);
@@ -1401,26 +1399,16 @@ void
 xfs_wait_buftarg(
 	xfs_buftarg_t	*btp)
 {
-	xfs_buf_t	*bp, *n;
 	xfs_bufhash_t	*hash;
 	uint		i;
 
 	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
 		hash = &btp->bt_hash[i];
-again:
 		spin_lock(&hash->bh_lock);
-		list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-			ASSERT(btp == bp->b_target);
-			if (!(bp->b_flags & XBF_FS_MANAGED)) {
-				spin_unlock(&hash->bh_lock);
-				/*
-				 * Catch superblock reference count leaks
-				 * immediately
-				 */
-				BUG_ON(bp->b_bn == 0);
-				delay(100);
-				goto again;
-			}
+		while (!list_empty(&hash->bh_list)) {
+			spin_unlock(&hash->bh_lock);
+			delay(100);
+			spin_lock(&hash->bh_lock);
 		}
 		spin_unlock(&hash->bh_lock);
 	}
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 2a05614..72215d8 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -51,7 +51,6 @@ typedef enum {
 #define XBF_DONE	(1 << 5) /* all pages in the buffer uptodate */
 #define XBF_DELWRI	(1 << 6) /* buffer has dirty pages */
 #define XBF_STALE	(1 << 7) /* buffer has been staled, do not find it */
-#define XBF_FS_MANAGED	(1 << 8) /* filesystem controls freeing memory */
 #define XBF_ORDERED	(1 << 11)/* use ordered writes */
 #define XBF_READ_AHEAD	(1 << 12)/* asynchronous read-ahead */
 #define XBF_LOG_BUFFER	(1 << 13)/* this is a buffer used for the log */
@@ -104,7 +103,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_DONE,		"DONE" }, \
 	{ XBF_DELWRI,		"DELWRI" }, \
 	{ XBF_STALE,		"STALE" }, \
-	{ XBF_FS_MANAGED,	"FS_MANAGED" }, \
 	{ XBF_ORDERED,		"ORDERED" }, \
 	{ XBF_READ_AHEAD,	"READ_AHEAD" }, \
 	{ XBF_LOCK,		"LOCK" },  	/* should never be set */\
@@ -276,8 +274,6 @@ extern void xfs_buf_terminate(void);
 					XFS_BUF_DONE(bp);	\
 				} while (0)
 
-#define XFS_BUF_UNMANAGE(bp)	((bp)->b_flags &= ~XBF_FS_MANAGED)
-
 #define XFS_BUF_DELAYWRITE(bp)		((bp)->b_flags |= XBF_DELWRI)
 #define XFS_BUF_UNDELAYWRITE(bp)	xfs_buf_delwri_dequeue(bp)
 #define XFS_BUF_ISDELAYWRITE(bp)	((bp)->b_flags & XBF_DELWRI)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aeb9d72..b2009e3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -639,7 +639,6 @@ int
 xfs_readsb(xfs_mount_t *mp, int flags)
 {
 	unsigned int	sector_size;
-	unsigned int	extra_flags;
 	xfs_buf_t	*bp;
 	int		error;
 
@@ -652,17 +651,29 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	 * access to the superblock.
 	 */
 	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-	extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
 
-	bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
-			  extra_flags);
+reread:
+	bp = xfs_buf_get_noaddr(sector_size, mp->m_ddev_targp);
+
 	if (!bp || XFS_BUF_ISERROR(bp)) {
-		xfs_fs_mount_cmn_err(flags, "SB read failed");
+		xfs_fs_mount_cmn_err(flags, "SB buffer alloc failed");
 		error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
 		goto fail;
 	}
-	ASSERT(XFS_BUF_ISBUSY(bp));
-	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+
+	/* set up the buffer for a read IO */
+	xfs_buf_lock(bp);
+	XFS_BUF_SET_ADDR(bp, XFS_SB_DADDR);
+        XFS_BUF_READ(bp);
+        XFS_BUF_BUSY(bp);
+
+	xfsbdstrat(mp, bp);
+	error = xfs_iowait(bp);
+	if (error || XFS_BUF_ISERROR(bp)) {
+		xfs_fs_mount_cmn_err(flags, "SB read failed");
+		error = error ? error : XFS_BUF_GETERROR(bp);
+		goto fail;
+	}
 
 	/*
 	 * Initialize the mount structure from the superblock.
@@ -692,33 +703,25 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	 * re-read the superblock so the buffer is correctly sized.
 	 */
 	if (sector_size < mp->m_sb.sb_sectsize) {
-		XFS_BUF_UNMANAGE(bp);
 		xfs_buf_relse(bp);
 		sector_size = mp->m_sb.sb_sectsize;
-		bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR,
-				  BTOBB(sector_size), extra_flags);
-		if (!bp || XFS_BUF_ISERROR(bp)) {
-			xfs_fs_mount_cmn_err(flags, "SB re-read failed");
-			error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
-			goto fail;
-		}
-		ASSERT(XFS_BUF_ISBUSY(bp));
-		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+		goto reread;
 	}
 
 	/* Initialize per-cpu counters */
 	xfs_icsb_reinit_counters(mp);
 
+	/* grab a reference for caching the buffer */
+        XFS_BUF_HOLD(bp);
 	mp->m_sb_bp = bp;
+
 	xfs_buf_relse(bp);
 	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
 	return 0;
 
- fail:
-	if (bp) {
-		XFS_BUF_UNMANAGE(bp);
+fail:
+	if (bp)
 		xfs_buf_relse(bp);
-	}
 	return error;
 }
 
@@ -2007,9 +2010,14 @@ xfs_freesb(
 	 * when we call xfs_buf_relse().
 	 */
 	bp = xfs_getsb(mp, 0);
-	XFS_BUF_UNMANAGE(bp);
-	xfs_buf_relse(bp);
 	mp->m_sb_bp = NULL;
+
+	/*
+	 * need to release the buffer twice to free it because we hold an extra
+	 * reference count on it.
+	 */
+	xfs_buf_relse(bp);
+	xfs_buf_relse(bp);
 }
 
 /*
-- 
1.7.1


* [PATCH 2/4] xfs: use unhashed buffers for size checks
  2010-09-08 15:12 [RFC] [PATCH 0/4] Replace buffer cache hash with rbtrees Dave Chinner
  2010-09-08 15:12 ` [PATCH 1/4] xfs: kill XBF_FS_MANAGED buffers Dave Chinner
@ 2010-09-08 15:12 ` Dave Chinner
  2010-09-09  1:38   ` Christoph Hellwig
  2010-09-10 21:33   ` Alex Elder
  2010-09-08 15:12 ` [PATCH 3/4] xfs: remove buftarg hash for external devices Dave Chinner
  2010-09-08 15:12 ` [PATCH 4/4] xfs: convert buffer cache hash to rbtree Dave Chinner
  3 siblings, 2 replies; 17+ messages in thread
From: Dave Chinner @ 2010-09-08 15:12 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

When we are checking that we can access the last block of each device, we
do not need to use hashed buffers as they will be tossed away
immediately. Use unhashed buffers for size checks so that all IO
prior to full in-memory structure initialisation does not use the
buffer cache hashes.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/linux-2.6/xfs_buf.c |   36 ++++++++++++++++++++++++
 fs/xfs/linux-2.6/xfs_buf.h |    4 +++
 fs/xfs/xfs_fsops.c         |   11 +++----
 fs/xfs/xfs_mount.c         |   65 ++++++++++++++-----------------------------
 fs/xfs/xfs_rtalloc.c       |   29 +++++++++-----------
 5 files changed, 79 insertions(+), 66 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 860a23c..59df94c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -638,6 +638,42 @@ xfs_buf_readahead(
 	xfs_buf_read(target, ioff, isize, flags);
 }
 
+/*
+ * Read an uncached buffer from disk. Allocates and returns a locked
+ * buffer containing the disk contents or nothing.
+ */
+struct xfs_buf *
+xfs_buf_read_uncached(
+	struct xfs_mount	*mp,
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		daddr,
+	size_t			length)
+{
+	xfs_buf_t	*bp;
+	int		error;
+
+	bp = xfs_buf_get_noaddr(length, target);
+	if (!bp || XFS_BUF_ISERROR(bp))
+		goto fail;
+
+	/* set up the buffer for a read IO */
+	xfs_buf_lock(bp);
+	XFS_BUF_SET_ADDR(bp, daddr);
+	XFS_BUF_READ(bp);
+	XFS_BUF_BUSY(bp);
+
+	xfsbdstrat(mp, bp);
+	error = xfs_iowait(bp);
+	if (error || XFS_BUF_ISERROR(bp))
+		goto fail;
+
+	return bp;
+fail:
+	if (bp)
+		xfs_buf_relse(bp);
+	return NULL;
+}
+
 xfs_buf_t *
 xfs_buf_get_empty(
 	size_t			len,
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 72215d8..802dc5e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -217,6 +217,10 @@ extern void xfs_buf_hold(xfs_buf_t *);
 extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
 				xfs_buf_flags_t);
 
+struct xfs_buf * xfs_buf_read_uncached(struct xfs_mount *mp,
+				struct xfs_buftarg *target,
+				xfs_daddr_t daddr, size_t length);
+
 /* Releasing Buffers */
 extern void xfs_buf_free(xfs_buf_t *);
 extern void xfs_buf_rele(xfs_buf_t *);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 43b1d56..158d5ab 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -144,12 +144,11 @@ xfs_growfs_data_private(
 	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
 		return error;
 	dpct = pct - mp->m_sb.sb_imax_pct;
-	error = xfs_read_buf(mp, mp->m_ddev_targp,
-			XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-			XFS_FSS_TO_BB(mp, 1), 0, &bp);
-	if (error)
-		return error;
-	ASSERT(bp);
+	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+				BBTOB(XFS_FSS_TO_BB(mp, 1)));
+	if (!bp)
+		return EIO;
 	xfs_buf_relse(bp);
 
 	new = nb;	/* use new as a temporary here */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b2009e3..2c8dd69 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -653,26 +653,11 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
 
 reread:
-	bp = xfs_buf_get_noaddr(sector_size, mp->m_ddev_targp);
-
-	if (!bp || XFS_BUF_ISERROR(bp)) {
-		xfs_fs_mount_cmn_err(flags, "SB buffer alloc failed");
-		error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
-		goto fail;
-	}
-
-	/* set up the buffer for a read IO */
-	xfs_buf_lock(bp);
-	XFS_BUF_SET_ADDR(bp, XFS_SB_DADDR);
-        XFS_BUF_READ(bp);
-        XFS_BUF_BUSY(bp);
-
-	xfsbdstrat(mp, bp);
-	error = xfs_iowait(bp);
-	if (error || XFS_BUF_ISERROR(bp)) {
+	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+					XFS_SB_DADDR, sector_size);
+	if (!bp) {
 		xfs_fs_mount_cmn_err(flags, "SB read failed");
-		error = error ? error : XFS_BUF_GETERROR(bp);
-		goto fail;
+		return EIO;
 	}
 
 	/*
@@ -720,8 +705,7 @@ reread:
 	return 0;
 
 fail:
-	if (bp)
-		xfs_buf_relse(bp);
+	xfs_buf_relse(bp);
 	return error;
 }
 
@@ -994,42 +978,35 @@ xfs_check_sizes(xfs_mount_t *mp)
 {
 	xfs_buf_t	*bp;
 	xfs_daddr_t	d;
-	int		error;
 
 	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
 	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
-		cmn_err(CE_WARN, "XFS: size check 1 failed");
+		cmn_err(CE_WARN, "XFS: filesystem size mismatch detected");
 		return XFS_ERROR(EFBIG);
 	}
-	error = xfs_read_buf(mp, mp->m_ddev_targp,
-			     d - XFS_FSS_TO_BB(mp, 1),
-			     XFS_FSS_TO_BB(mp, 1), 0, &bp);
-	if (!error) {
-		xfs_buf_relse(bp);
-	} else {
-		cmn_err(CE_WARN, "XFS: size check 2 failed");
-		if (error == ENOSPC)
-			error = XFS_ERROR(EFBIG);
-		return error;
+	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+					d - XFS_FSS_TO_BB(mp, 1),
+					BBTOB(XFS_FSS_TO_BB(mp, 1)));
+	if (!bp) {
+		cmn_err(CE_WARN, "XFS: last sector read failed");
+		return EIO;
 	}
+	xfs_buf_relse(bp);
 
 	if (mp->m_logdev_targp != mp->m_ddev_targp) {
 		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
 		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
-			cmn_err(CE_WARN, "XFS: size check 3 failed");
+			cmn_err(CE_WARN, "XFS: log size mismatch detected");
 			return XFS_ERROR(EFBIG);
 		}
-		error = xfs_read_buf(mp, mp->m_logdev_targp,
-				     d - XFS_FSB_TO_BB(mp, 1),
-				     XFS_FSB_TO_BB(mp, 1), 0, &bp);
-		if (!error) {
-			xfs_buf_relse(bp);
-		} else {
-			cmn_err(CE_WARN, "XFS: size check 3 failed");
-			if (error == ENOSPC)
-				error = XFS_ERROR(EFBIG);
-			return error;
+		bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
+					d - XFS_FSB_TO_BB(mp, 1),
+					XFS_FSB_TO_B(mp, 1));
+		if (!bp) {
+			cmn_err(CE_WARN, "XFS: log device read failed");
+			return EIO;
 		}
+		xfs_buf_relse(bp);
 	}
 	return 0;
 }
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 891260f..5c5a4c4 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -39,6 +39,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_utils.h"
 #include "xfs_trace.h"
+#include "xfs_buf.h"
 
 
 /*
@@ -1883,13 +1884,13 @@ xfs_growfs_rt(
 	/*
 	 * Read in the last block of the device, make sure it exists.
 	 */
-	error = xfs_read_buf(mp, mp->m_rtdev_targp,
-			XFS_FSB_TO_BB(mp, nrblocks - 1),
-			XFS_FSB_TO_BB(mp, 1), 0, &bp);
-	if (error)
-		return error;
-	ASSERT(bp);
+	bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+				XFS_FSB_TO_BB(mp, nrblocks - 1),
+				XFS_FSB_TO_B(mp, 1));
+	if (!bp)
+		return EIO;
 	xfs_buf_relse(bp);
+
 	/*
 	 * Calculate new parameters.  These are the final values to be reached.
 	 */
@@ -2215,7 +2216,6 @@ xfs_rtmount_init(
 {
 	xfs_buf_t	*bp;	/* buffer for last block of subvolume */
 	xfs_daddr_t	d;	/* address of last block of subvolume */
-	int		error;	/* error return value */
 	xfs_sb_t	*sbp;	/* filesystem superblock copy in mount */
 
 	sbp = &mp->m_sb;
@@ -2242,15 +2242,12 @@ xfs_rtmount_init(
 			(unsigned long long) mp->m_sb.sb_rblocks);
 		return XFS_ERROR(EFBIG);
 	}
-	error = xfs_read_buf(mp, mp->m_rtdev_targp,
-				d - XFS_FSB_TO_BB(mp, 1),
-				XFS_FSB_TO_BB(mp, 1), 0, &bp);
-	if (error) {
-		cmn_err(CE_WARN,
-	"XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
-		if (error == ENOSPC)
-			return XFS_ERROR(EFBIG);
-		return error;
+	bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+					d - XFS_FSB_TO_BB(mp, 1),
+					XFS_FSB_TO_B(mp, 1));
+	if (!bp) {
+		cmn_err(CE_WARN, "XFS: realtime device size check failed");
+		return EIO;
 	}
 	xfs_buf_relse(bp);
 	return 0;
-- 
1.7.1


* [PATCH 3/4] xfs: remove buftarg hash for external devices
  2010-09-08 15:12 [RFC] [PATCH 0/4] Replace buffer cache hash with rbtrees Dave Chinner
  2010-09-08 15:12 ` [PATCH 1/4] xfs: kill XBF_FS_MANAGED buffers Dave Chinner
  2010-09-08 15:12 ` [PATCH 2/4] xfs: use unhashed buffers for size checks Dave Chinner
@ 2010-09-08 15:12 ` Dave Chinner
  2010-09-09  1:39   ` Christoph Hellwig
  2010-09-08 15:12 ` [PATCH 4/4] xfs: convert buffer cache hash to rbtree Dave Chinner
  3 siblings, 1 reply; 17+ messages in thread
From: Dave Chinner @ 2010-09-08 15:12 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

We no longer use hashed buffers on RT and external log devices.
Remove the buftarg hash tables that are set up for them.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/linux-2.6/xfs_buf.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 59df94c..b2b5dea 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1462,7 +1462,11 @@ xfs_alloc_bufhash(
 {
 	unsigned int		i;
 
-	btp->bt_hashshift = external ? 3 : 12;	/* 8 or 4096 buckets */
+	if (external) {
+		btp->bt_hash = NULL;
+		return;
+	}
+	btp->bt_hashshift = 12;	/* 4096 buckets */
 	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
 					 sizeof(xfs_bufhash_t));
 	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-- 
1.7.1


* [PATCH 4/4] xfs: convert buffer cache hash to rbtree
  2010-09-08 15:12 [RFC] [PATCH 0/4] Replace buffer cache hash with rbtrees Dave Chinner
                   ` (2 preceding siblings ...)
  2010-09-08 15:12 ` [PATCH 3/4] xfs: remove buftarg hash for external devices Dave Chinner
@ 2010-09-08 15:12 ` Dave Chinner
  2010-09-09  1:51   ` Christoph Hellwig
  2010-09-13 16:53   ` Alex Elder
  3 siblings, 2 replies; 17+ messages in thread
From: Dave Chinner @ 2010-09-08 15:12 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

The buffer cache hash is starting to show typical hash scalability
problems. Large-scale testing shows the number of cached items
growing far larger than the hash can handle efficiently. Hence we
need to move to a self-scaling cache indexing mechanism.

I have selected rbtrees for indexing because they can have O(log n)
search scalability, and insert and remove cost is not excessive,
even on large trees. Hence we should be able to cache large numbers
of buffers without incurring the excessive cache miss search
penalties that the hash is imposing on us.

To ensure we still have parallel access to the cache, we need
multiple trees. Rather than hashing the buffers by disk address to
select a tree, it seems more sensible to separate trees by typical
access patterns. Most operations use buffers from within a single AG
at a time, so rather than searching lots of different lists,
separate the buffer indexes out into per-AG rbtrees. This means that
searches during metadata operation have a much higher chance of
hitting cache resident nodes, and that updates of the tree are less
likely to disturb trees being accessed on other CPUs doing
independent operations.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/linux-2.6/xfs_buf.c   |  139 +++++++++++++++++++++--------------------
 fs/xfs/linux-2.6/xfs_buf.h   |   14 ++--
 fs/xfs/linux-2.6/xfs_super.c |    6 +-
 fs/xfs/xfs_ag.h              |    4 +
 fs/xfs/xfs_mount.c           |    4 +-
 5 files changed, 87 insertions(+), 80 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b2b5dea..facd37e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -188,7 +188,7 @@ _xfs_buf_initialize(
 	atomic_set(&bp->b_hold, 1);
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_list);
-	INIT_LIST_HEAD(&bp->b_hash_list);
+	RB_CLEAR_NODE(&bp->b_rbnode);
 	init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
 	XB_SET_OWNER(bp);
 	bp->b_target = target;
@@ -422,8 +422,10 @@ _xfs_buf_find(
 {
 	xfs_off_t		range_base;
 	size_t			range_length;
-	xfs_bufhash_t		*hash;
-	xfs_buf_t		*bp, *n;
+	struct xfs_perag	*pag;
+	struct rb_node		**rbp;
+	struct rb_node		*parent;
+	xfs_buf_t		*bp;
 
 	range_base = (ioff << BBSHIFT);
 	range_length = (isize << BBSHIFT);
@@ -432,14 +434,36 @@ _xfs_buf_find(
 	ASSERT(!(range_length < (1 << btp->bt_sshift)));
 	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 
-	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
-	spin_lock(&hash->bh_lock);
-
-	list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-		ASSERT(btp == bp->b_target);
-		if (bp->b_file_offset == range_base &&
-		    bp->b_buffer_length == range_length) {
+	/* get tree root */
+	pag = xfs_perag_get(btp->bt_mp, xfs_daddr_to_agno(btp->bt_mp, ioff));
+
+	/* walk tree */
+	spin_lock(&pag->pagbuf_lock);
+	rbp = &pag->pagbuf_tree.rb_node;
+	parent = NULL;
+	bp = NULL;
+	while (*rbp) {
+		parent = *rbp;
+		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+		if (bp->b_file_offset < range_base)
+			rbp = &(*rbp)->rb_left;
+		else if (bp->b_file_offset > range_base)
+			rbp = &(*rbp)->rb_right;
+		else {
+			/*
+			 * found a block offset match. If the range doesn't
+			 * match, the only way this is allowed is if the buffer
+			 * in the cache is stale and the transaction that made
+			 * it stale has not yet committed. i.e. we are
+			 * reallocating a busy extent. Skip this buffer and
+			 * continue searching to the right for an exact match.
+			 */
+			if (bp->b_buffer_length != range_length) {
+				ASSERT(bp->b_flags & XBF_STALE);
+				rbp = &(*rbp)->rb_right;
+				continue;
+			}
 			atomic_inc(&bp->b_hold);
 			goto found;
 		}
@@ -449,17 +473,20 @@ _xfs_buf_find(
 	if (new_bp) {
 		_xfs_buf_initialize(new_bp, btp, range_base,
 				range_length, flags);
-		new_bp->b_hash = hash;
-		list_add(&new_bp->b_hash_list, &hash->bh_list);
+		new_bp->b_pag = pag;
+		rb_link_node(&new_bp->b_rbnode, parent, rbp);
+		rb_insert_color(&new_bp->b_rbnode, &pag->pagbuf_tree);
+		spin_unlock(&pag->pagbuf_lock);
 	} else {
 		XFS_STATS_INC(xb_miss_locked);
+		spin_unlock(&pag->pagbuf_lock);
+		xfs_perag_put(pag);
 	}
-
-	spin_unlock(&hash->bh_lock);
 	return new_bp;
 
 found:
-	spin_unlock(&hash->bh_lock);
+	spin_unlock(&pag->pagbuf_lock);
+	xfs_perag_put(pag);
 
 	/* Attempt to get the semaphore without sleeping,
 	 * if this does not work then we need to drop the
@@ -810,27 +837,30 @@ void
 xfs_buf_rele(
 	xfs_buf_t		*bp)
 {
-	xfs_bufhash_t		*hash = bp->b_hash;
+	struct xfs_perag	*pag = bp->b_pag;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	if (unlikely(!hash)) {
+	if (!pag) {
 		ASSERT(!bp->b_relse);
+		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold))
 			xfs_buf_free(bp);
 		return;
 	}
 
+	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
 	ASSERT(atomic_read(&bp->b_hold) > 0);
-	if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
+	if (atomic_dec_and_lock(&bp->b_hold, &pag->pagbuf_lock)) {
 		if (bp->b_relse) {
 			atomic_inc(&bp->b_hold);
-			spin_unlock(&hash->bh_lock);
+			spin_unlock(&pag->pagbuf_lock);
 			(*(bp->b_relse)) (bp);
 		} else {
 			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-			list_del_init(&bp->b_hash_list);
-			spin_unlock(&hash->bh_lock);
+			rb_erase(&bp->b_rbnode, &pag->pagbuf_tree);
+			spin_unlock(&pag->pagbuf_lock);
+			xfs_perag_put(pag);
 			xfs_buf_free(bp);
 		}
 	}
@@ -1433,57 +1463,25 @@ xfs_buf_iomove(
  */
 void
 xfs_wait_buftarg(
-	xfs_buftarg_t	*btp)
+	struct xfs_buftarg	*btp)
 {
-	xfs_bufhash_t	*hash;
-	uint		i;
+	struct xfs_perag	*pag;
+	uint			i;
 
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		hash = &btp->bt_hash[i];
-		spin_lock(&hash->bh_lock);
-		while (!list_empty(&hash->bh_list)) {
-			spin_unlock(&hash->bh_lock);
+	for (i = 0; i < btp->bt_mp->m_sb.sb_agcount; i++) {
+		pag = xfs_perag_get(btp->bt_mp, i);
+		spin_lock(&pag->pagbuf_lock);
+		while (rb_first(&pag->pagbuf_tree)) {
+			spin_unlock(&pag->pagbuf_lock);
 			delay(100);
-			spin_lock(&hash->bh_lock);
+			spin_lock(&pag->pagbuf_lock);
 		}
-		spin_unlock(&hash->bh_lock);
+		spin_unlock(&pag->pagbuf_lock);
+		xfs_perag_put(pag);
 	}
 }
 
 /*
- *	Allocate buffer hash table for a given target.
- *	For devices containing metadata (i.e. not the log/realtime devices)
- *	we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-	xfs_buftarg_t		*btp,
-	int			external)
-{
-	unsigned int		i;
-
-	if (external) {
-		btp->bt_hash = NULL;
-		return;
-	}
-	btp->bt_hashshift = 12;	/* 4096 buckets */
-	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
-					 sizeof(xfs_bufhash_t));
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		spin_lock_init(&btp->bt_hash[i].bh_lock);
-		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
-	}
-}
-
-STATIC void
-xfs_free_bufhash(
-	xfs_buftarg_t		*btp)
-{
-	kmem_free_large(btp->bt_hash);
-	btp->bt_hash = NULL;
-}
-
-/*
  *	buftarg list for delwrite queue processing
  */
 static LIST_HEAD(xfs_buftarg_list);
@@ -1515,7 +1513,6 @@ xfs_free_buftarg(
 	xfs_flush_buftarg(btp, 1);
 	if (mp->m_flags & XFS_MOUNT_BARRIER)
 		xfs_blkdev_issue_flush(btp);
-	xfs_free_bufhash(btp);
 	iput(btp->bt_mapping->host);
 
 	/* Unregister the buftarg first so that we don't get a
@@ -1637,6 +1634,7 @@ out_error:
 
 xfs_buftarg_t *
 xfs_alloc_buftarg(
+	struct xfs_mount	*mp,
 	struct block_device	*bdev,
 	int			external,
 	const char		*fsname)
@@ -1645,6 +1643,12 @@ xfs_alloc_buftarg(
 
 	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
 
+	/*
+	 * The buftarg cache should never be used by external devices.
+	 * Ensure we catch any users with extreme prejudice.
+	 */
+	btp->bt_mp = external ? NULL : mp;
+
 	btp->bt_dev =  bdev->bd_dev;
 	btp->bt_bdev = bdev;
 	if (xfs_setsize_buftarg_early(btp, bdev))
@@ -1653,7 +1657,6 @@ xfs_alloc_buftarg(
 		goto error;
 	if (xfs_alloc_delwrite_queue(btp, fsname))
 		goto error;
-	xfs_alloc_bufhash(btp, external);
 	return btp;
 
 error:
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 802dc5e..3797ee8 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -126,18 +126,17 @@ typedef struct xfs_bufhash {
 	spinlock_t		bh_lock;
 } xfs_bufhash_t;
 
+struct xfs_mount;
+
 typedef struct xfs_buftarg {
 	dev_t			bt_dev;
 	struct block_device	*bt_bdev;
 	struct address_space	*bt_mapping;
+	struct xfs_mount	*bt_mp;
 	unsigned int		bt_bsize;
 	unsigned int		bt_sshift;
 	size_t			bt_smask;
 
-	/* per device buffer hash table */
-	uint			bt_hashshift;
-	xfs_bufhash_t		*bt_hash;
-
 	/* per device delwri queue */
 	struct task_struct	*bt_task;
 	struct list_head	bt_list;
@@ -171,8 +170,8 @@ typedef struct xfs_buf {
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	xfs_buf_flags_t		b_flags;	/* status flags */
-	struct list_head	b_hash_list;	/* hash table list */
-	xfs_bufhash_t		*b_hash;	/* hash table list start */
+	struct rb_node		b_rbnode;	/* rbtree node */
+	struct xfs_perag	*b_pag;		/* rbtree root */
 	xfs_buftarg_t		*b_target;	/* buffer target (device) */
 	atomic_t		b_hold;		/* reference count */
 	xfs_daddr_t		b_bn;		/* block number for I/O */
@@ -374,7 +373,8 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
 /*
  *	Handling of buftargs.
  */
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
+				struct block_device *, int, const char *);
 extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e0797..7426319 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -758,18 +758,18 @@ xfs_open_devices(
 	 * Setup xfs_mount buffer target pointers
 	 */
 	error = ENOMEM;
-	mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
+	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
 	if (!mp->m_ddev_targp)
 		goto out_close_rtdev;
 
 	if (rtdev) {
-		mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
+		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1, mp->m_fsname);
 		if (!mp->m_rtdev_targp)
 			goto out_free_ddev_targ;
 	}
 
 	if (logdev && logdev != ddev) {
-		mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
+		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1, mp->m_fsname);
 		if (!mp->m_logdev_targp)
 			goto out_free_rtdev_targ;
 	} else {
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4917d4e..e01d4cf 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -230,6 +230,10 @@ typedef struct xfs_perag {
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
+
+	/* buffer cache index */
+	spinlock_t	pagbuf_lock;	/* lock for pagbuf_tree */
+	struct rb_root	pagbuf_tree;	/* ordered tree of active buffers */
 #endif
 	int		pagb_count;	/* pagb slots in use */
 } xfs_perag_t;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 2c8dd69..5d9f49c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -210,8 +210,6 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
 	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
 	if (pag) {
 		ASSERT(atomic_read(&pag->pag_ref) >= 0);
-		/* catch leaks in the positive direction during testing */
-		ASSERT(atomic_read(&pag->pag_ref) < 1000);
 		ref = atomic_inc_return(&pag->pag_ref);
 	}
 	spin_unlock(&mp->m_perag_lock);
@@ -445,6 +443,8 @@ xfs_initialize_perag(
 		pag->pag_mount = mp;
 		rwlock_init(&pag->pag_ici_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+		spin_lock_init(&pag->pagbuf_lock);
+		pag->pagbuf_tree = RB_ROOT;
 
 		if (radix_tree_preload(GFP_NOFS))
 			goto out_unwind;
-- 
1.7.1


* Re: [PATCH 1/4] xfs: kill XBF_FS_MANAGED buffers
  2010-09-08 15:12 ` [PATCH 1/4] xfs: kill XBF_FS_MANAGED buffers Dave Chinner
@ 2010-09-09  1:33   ` Christoph Hellwig
  2010-09-10  3:10     ` Dave Chinner
  2010-09-10 21:17   ` Alex Elder
  1 sibling, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2010-09-09  1:33 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

Looks good, but a few comments below:

> +	bp = xfs_buf_get_noaddr(sector_size, mp->m_ddev_targp);
> +
>  	if (!bp || XFS_BUF_ISERROR(bp)) {

xfs_buf_get_noaddr will never return a buffer with an error set.

> -	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
> +
> +	/* set up the buffer for a read IO */
> +	xfs_buf_lock(bp);
> +	XFS_BUF_SET_ADDR(bp, XFS_SB_DADDR);
> +        XFS_BUF_READ(bp);
> +        XFS_BUF_BUSY(bp);

Various indentation problems.

> +	/* grab a reference for caching the buffer */
> +        XFS_BUF_HOLD(bp);
>  	mp->m_sb_bp = bp;
> +
>  	xfs_buf_relse(bp);

Grabbing the reference just to drop it three lines later is rather
pointless; just remove both.

>  	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);

Given that we took the lock a few lines above, this one also feels
rather pointless.

> +fail:
> +	if (bp)
>  		xfs_buf_relse(bp);
> -	}
>  	return error;

I'd rather see this split into a fail_buf_relse label that puts the
buffer, and a fail label that just returns the error.

>  	 * when we call xfs_buf_relse().
>  	 */
>  	bp = xfs_getsb(mp, 0);
> -	XFS_BUF_UNMANAGE(bp);
> -	xfs_buf_relse(bp);
>  	mp->m_sb_bp = NULL;
> +
> +	/*
> +	 * need to release the buffer twice to free it because we hold an extra
> +	 * reference count on it.
> +	 */
> +	xfs_buf_relse(bp);
> +	xfs_buf_relse(bp);

I'd rather rewrite xfs_freesb to not use xfs_getsb and thus avoid taking
the superfluous reference:

void
xfs_freesb(
	struct xfs_mount	*mp)
{
	struct xfs_buf		*bp = mp->m_sb_bp;

	mp->m_sb_bp = NULL;
	if (xfs_buf_cond_lock(bp))
		BUG();
	xfs_buf_relse(bp);
}


* Re: [PATCH 2/4] xfs: use unhashed buffers for size checks
  2010-09-08 15:12 ` [PATCH 2/4] xfs: use unhashed buffers for size checks Dave Chinner
@ 2010-09-09  1:38   ` Christoph Hellwig
  2010-09-10  3:14     ` Dave Chinner
  2010-09-10 21:33   ` Alex Elder
  1 sibling, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2010-09-09  1:38 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

> +struct xfs_buf *
> +xfs_buf_read_uncached(
> +	struct xfs_mount	*mp,
> +	struct xfs_buftarg	*target,
> +	xfs_daddr_t		daddr,
> +	size_t			length)
> +{
> +	xfs_buf_t	*bp;
> +	int		error;

struct xfs_buf and the same indentation as the parameters, please.

> +
> +	bp = xfs_buf_get_noaddr(length, target);

I think both the buf_get and buf_read interfaces for the non-hash
buffers should have the same name.  Either your uncached or maybe better
unhashed?  (And certainly no noaddr, which is not very useful)

> +	if (!bp || XFS_BUF_ISERROR(bp))
> +		goto fail;

xfs_buf_get_noaddr never returns an error in the buffer.

> +	/* set up the buffer for a read IO */
> +	xfs_buf_lock(bp);
> +	XFS_BUF_SET_ADDR(bp, daddr);
> +	XFS_BUF_READ(bp);
> +	XFS_BUF_BUSY(bp);
> +
> +	xfsbdstrat(mp, bp);
> +	error = xfs_iowait(bp);
> +	if (error || XFS_BUF_ISERROR(bp))
> +		goto fail;
> +
> +	return bp;
> +fail:
> +	if (bp)
> +		xfs_buf_relse(bp);

again, different labels please.

Also this one returns the buffer locked, while buf_get_noaddr doesn't.
I suspect we should also change buf_get_noaddr to return a locked buffer
to make it consistent with all other buf_read/get interfaces.
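
For reference, the convention being appealed to looks roughly like
this (an illustrative sketch, not code from either patch):

	bp = xfs_buf_read(target, blkno, len, flags);	/* returned locked */
	/* ... use the buffer contents ... */
	xfs_buf_relse(bp);	/* drops both the lock and the reference */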

> +struct xfs_buf * xfs_buf_read_uncached(struct xfs_mount *mp,
> +				struct xfs_buftarg *target,
> +				xfs_daddr_t daddr, size_t length);

wrong placement of the *


This patch, or at least the introduction of the new read helper should
be moved before patch 1 so that we don't add code that gets removed a
little later.


* Re: [PATCH 3/4] xfs: remove buftarg hash for external devices
  2010-09-08 15:12 ` [PATCH 3/4] xfs: remove buftarg hash for external devices Dave Chinner
@ 2010-09-09  1:39   ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2010-09-09  1:39 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Thu, Sep 09, 2010 at 01:12:57AM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> For RT and external log devices, we never use hashed buffers on them now.
> Remove the buftarg hash tables that are set up for them.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Looks good,


Reviewed-by: Christoph Hellwig <hch@lst.de>


* Re: [PATCH 4/4] xfs: convert buffer cache hash to rbtree
  2010-09-08 15:12 ` [PATCH 4/4] xfs: convert buffer cache hash to rbtree Dave Chinner
@ 2010-09-09  1:51   ` Christoph Hellwig
  2010-09-10  3:22     ` Dave Chinner
  2010-09-13 16:59     ` Alex Elder
  2010-09-13 16:53   ` Alex Elder
  1 sibling, 2 replies; 17+ messages in thread
From: Christoph Hellwig @ 2010-09-09  1:51 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Thu, Sep 09, 2010 at 01:12:58AM +1000, Dave Chinner wrote:
> I have selected rbtrees for indexing because they can have O(log n)
> search scalability, and insert and remove cost is not excessive,
> even on large trees. Hence we should be able to cache large numbers
> of buffers without incurring the excessive cache miss search
> penalties that the hash is imposing on us.

One thing that worries me about the rbtrees is that the Linux
implementation doesn't allow for lockless readers.  But in the end the
buffer cache implementation is very well encapsulated, so if the need
arises we could easily change the underlying data structure.

> +	/*
> +	 * The buftarg cache should never be used by external devices.
> +	 * Ensure we catch any users with extreme prejudice.
> +	 */
> +	btp->bt_mp = external ? NULL : mp;

I'd much prefer to always initialize this field.  We currently have a
b_mount field in struct xfs_buf which is used only in a few places
and initialized rather, ehmm, lazily.  If we could replace it with
->b_target->bt_mount we can shrink struct buf and make the information
available much more consistently.  Just adding the mount argument
to the buftarg and removing it from the buf would be a nice little
preparatory patch.

And yes, I think bt_mount would be a much nicer name than bt_mp.
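
The shape of that preparatory patch would be roughly the following
(a sketch using the bt_mount name suggested above, not the actual
commit):

	typedef struct xfs_buftarg {
		dev_t			bt_dev;
		struct block_device	*bt_bdev;
		struct xfs_mount	*bt_mount;	/* set for all targets */
		/* ... */
	} xfs_buftarg_t;

	/* and users of bp->b_mount become: */
	struct xfs_mount	*mp = bp->b_target->bt_mount;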

> @@ -210,8 +210,6 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
>  	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
>  	if (pag) {
>  		ASSERT(atomic_read(&pag->pag_ref) >= 0);
> -		/* catch leaks in the positive direction during testing */
> -		ASSERT(atomic_read(&pag->pag_ref) < 1000);

Did you manage to hit this during testing?  Either way it should probably
be a separate patch.


* Re: [PATCH 1/4] xfs: kill XBF_FS_MANAGED buffers
  2010-09-09  1:33   ` Christoph Hellwig
@ 2010-09-10  3:10     ` Dave Chinner
  0 siblings, 0 replies; 17+ messages in thread
From: Dave Chinner @ 2010-09-10  3:10 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Wed, Sep 08, 2010 at 09:33:13PM -0400, Christoph Hellwig wrote:
> Looks good, but a few comments below:
> 
> > +	bp = xfs_buf_get_noaddr(sector_size, mp->m_ddev_targp);
> > +
> >  	if (!bp || XFS_BUF_ISERROR(bp)) {
> 
> xfs_buf_get_noaddr will never return a buffer with an error set.

OK, will fix.

> > -	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
> > +
> > +	/* set up the buffer for a read IO */
> > +	xfs_buf_lock(bp);
> > +	XFS_BUF_SET_ADDR(bp, XFS_SB_DADDR);
> > +        XFS_BUF_READ(bp);
> > +        XFS_BUF_BUSY(bp);
> 
> Various indentation problems.

I'll fix all these by putting the uncached read function factoring
into an initial patch.

> > +	/* grab a reference for caching the buffer */
> > +        XFS_BUF_HOLD(bp);
> >  	mp->m_sb_bp = bp;
> > +
> >  	xfs_buf_relse(bp);
> 
> Grabbing the reference just to drop it three lines later is rather
> pointless; just remove both.

Except that xfs_buf_relse(bp) also unlocks bp. Maybe I could just
make it do an explicit unlock....
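
i.e. something along these lines (a sketch of the alternative being
suggested, assuming xfs_buf_unlock() as the explicit unlock):

	mp->m_sb_bp = bp;	/* keep the cached reference... */
	xfs_buf_unlock(bp);	/* ...but drop the buffer lock */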

> >  	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
> 
> Given that we took the lock a few lines above, this one also feels
> rather pointless.

That's checking the buffer is unlocked, which could be removed if
there is an explicit unlock call. I'll do that.

> 
> > +fail:
> > +	if (bp)
> >  		xfs_buf_relse(bp);
> > -	}
> >  	return error;
> 
> I'd rather see this split into a fail_buf_relse label that puts the
> buffer, and a fail label that just returns the error.
> 
> >  	 * when we call xfs_buf_relse().
> >  	 */
> >  	bp = xfs_getsb(mp, 0);
> > -	XFS_BUF_UNMANAGE(bp);
> > -	xfs_buf_relse(bp);
> >  	mp->m_sb_bp = NULL;
> > +
> > +	/*
> > +	 * need to release the buffer twice to free it because we hold an extra
> > +	 * reference count on it.
> > +	 */
> > +	xfs_buf_relse(bp);
> > +	xfs_buf_relse(bp);
> 
> I'd rather rewrite xfs_freesb to not use xfs_getsb and thus avoid taking
> the superfluous reference:
> 
> void
> xfs_freesb(
> 	struct xfs_mount	*mp)
> {
> 	struct xfs_buf		*bp = mp->m_sb_bp;
> 
> 	mp->m_sb_bp = NULL;
> 	if (xfs_buf_cond_lock(bp))
> 		BUG();
> 	xfs_buf_relse(bp);
> }

Yeah, that's better. I'll do that.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 2/4] xfs: use unhashed buffers for size checks
  2010-09-09  1:38   ` Christoph Hellwig
@ 2010-09-10  3:14     ` Dave Chinner
  0 siblings, 0 replies; 17+ messages in thread
From: Dave Chinner @ 2010-09-10  3:14 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Wed, Sep 08, 2010 at 09:38:07PM -0400, Christoph Hellwig wrote:
> > +struct xfs_buf *
> > +xfs_buf_read_uncached(
> > +	struct xfs_mount	*mp,
> > +	struct xfs_buftarg	*target,
> > +	xfs_daddr_t		daddr,
> > +	size_t			length)
> > +{
> > +	xfs_buf_t	*bp;
> > +	int		error;
> 
> struct xfs_buf and the same indentation as the parameters, please.
> 
> > +
> > +	bp = xfs_buf_get_noaddr(length, target);
> 
> I think both the buf_get and buf_read interfaces for the non-hash
> buffers should have the same name.  Either your uncached or maybe better
> unhashed?  (And certainly no noaddr, which is not very useful)

I'll rename it *_uncached, because the hash is going away ;)

> 
> > +	if (!bp || XFS_BUF_ISERROR(bp))
> > +		goto fail;
> 
> xfs_buf_get_noaddr never returns an error in the buffer.

I'll fix all these - they are just cut'n'paste from the previous patch.
> 
> Also this one returns the buffer locked, while buf_get_noaddr doesn't.
> I suspect we should also change buf_get_noaddr to return a locked buffer
> to make it consistent with all other buf_read/get interfaces.

None of the other callers require locked buffers. I'll leave this
for a separate patch set for the moment.

> > +struct xfs_buf * xfs_buf_read_uncached(struct xfs_mount *mp,
> > +				struct xfs_buftarg *target,
> > +				xfs_daddr_t daddr, size_t length);
> 
> wrong placement of the *
> 
> 
> This patch, or at least the introduction of the new read helper should
> be moved before patch 1 so that we don't add code that gets removed a
> little later.

Yes, I plan to do that.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 4/4] xfs: convert buffer cache hash to rbtree
  2010-09-09  1:51   ` Christoph Hellwig
@ 2010-09-10  3:22     ` Dave Chinner
  2010-09-13 16:59     ` Alex Elder
  1 sibling, 0 replies; 17+ messages in thread
From: Dave Chinner @ 2010-09-10  3:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Wed, Sep 08, 2010 at 09:51:50PM -0400, Christoph Hellwig wrote:
> On Thu, Sep 09, 2010 at 01:12:58AM +1000, Dave Chinner wrote:
> > I have selected rbtrees for indexing because they can have O(log n)
> > search scalability, and insert and remove cost is not excessive,
> > even on large trees. Hence we should be able to cache large numbers
> > of buffers without incurring the excessive cache miss search
> > penalties that the hash is imposing on us.
> 
> One thing that worries me about the rbtrees is that the Linux
> implementation doesn't allow for lockless readers.  But in the end the
> buffer cache implementation is very well encapsulated, so if the need
> arises we could easily change the underlying data structure.

Agreed. I'm going for simplicity of implementation first - list to
rbtree conversion is pretty trivial and relatively easy to verify.
We can revisit the choice of rbtrees later on if/when we need to.

> > +	/*
> > +	 * The buftarg cache should never be used by external devices.
> > +	 * Ensure we catch any users with extreme prejudice.
> > +	 */
> > +	btp->bt_mp = external ? NULL : mp;
> 
> I'd much prefer to always initialize this field.  We currently have a
> b_mount field in struct xfs_buf which is used only in a few places
> and initialized rather, ehmm, lazily.  If we could replace it with
> ->b_target->bt_mount we can shrink struct buf and make the information
> available much more consistently.  Just adding the mount argument
> to the buftarg and removing it from the buf would be a nice little
> preparatory patch.

Good idea. I'll run up a patch to do that - if we've got more
buffers around, giving them a diet makes sense.

> And yes, I think bt_mount would be a much nicer name than bt_mp.

Agreed. call me lazy ;)

> > @@ -210,8 +210,6 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
> >  	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
> >  	if (pag) {
> >  		ASSERT(atomic_read(&pag->pag_ref) >= 0);
> > -		/* catch leaks in the positive direction during testing */
> > -		ASSERT(atomic_read(&pag->pag_ref) < 1000);
> 
> Did you manage to hit this during testing?  Either way it should probably
> be a separate patch.

Not with xfstests. Takes about 0.5s for fsmark to hit it, though. ;)
I'll put it in a separate patch, too.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 1/4] xfs: kill XBF_FS_MANAGED buffers
  2010-09-08 15:12 ` [PATCH 1/4] xfs: kill XBF_FS_MANAGED buffers Dave Chinner
  2010-09-09  1:33   ` Christoph Hellwig
@ 2010-09-10 21:17   ` Alex Elder
  1 sibling, 0 replies; 17+ messages in thread
From: Alex Elder @ 2010-09-10 21:17 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Thu, 2010-09-09 at 01:12 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Filesystem-level managed buffers are buffers that have their
> lifecycle controlled by the filesystem layer, not the buffer cache.
> We currently hash these buffers, which makes cleanup and hash
> walking somewhat troublesome. Convert the fs-managed buffers to
> unhashed buffers obtained via xfs_buf_get_noaddr(), and remove the
> XBF_FS_MANAGED special cases from the buffer cache.

I've looked this over and it looks good to me.
I agree with all of Christoph's comments from
the other day.  I'll look for your update.

					-Alex

> Signed-off-by: Dave Chinner <dchinner@redhat.com>

. . .


* Re: [PATCH 2/4] xfs: use unhashed buffers for size checks
  2010-09-08 15:12 ` [PATCH 2/4] xfs: use unhashed buffers for size checks Dave Chinner
  2010-09-09  1:38   ` Christoph Hellwig
@ 2010-09-10 21:33   ` Alex Elder
  1 sibling, 0 replies; 17+ messages in thread
From: Alex Elder @ 2010-09-10 21:33 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Thu, 2010-09-09 at 01:12 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> When we are checking that we can access the last block of each device, we
> do not need to use hashed buffers as they will be tossed away
> immediately. Use unhashed buffers for size checks so that all IO
> prior to full in-memory structure initialisation does not use the
> buffer cache hashes.

This one also looks good, and again I agree with the
comments Christoph made.  I'll look for your update.
Patch 3 in the series is good too; I'll look at the
last patch (the switch to rbtrees) over the weekend.

> Signed-off-by: Dave Chinner <dchinner@redhat.com>



* Re: [PATCH 4/4] xfs: convert buffer cache hash to rbtree
  2010-09-08 15:12 ` [PATCH 4/4] xfs: convert buffer cache hash to rbtree Dave Chinner
  2010-09-09  1:51   ` Christoph Hellwig
@ 2010-09-13 16:53   ` Alex Elder
  2010-09-14  7:13     ` Dave Chinner
  1 sibling, 1 reply; 17+ messages in thread
From: Alex Elder @ 2010-09-13 16:53 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Thu, 2010-09-09 at 01:12 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> The buffer cache hash is starting to show typical hash scalability
> problems. Large-scale testing shows the number of cached items
> growing far larger than the hash can handle efficiently. Hence we
> need to move to a self-scaling cache indexing mechanism.
> 
> I have selected rbtrees for indexing because they can have O(log n)
> search scalability, and insert and remove cost is not excessive,
> even on large trees. Hence we should be able to cache large numbers
> of buffers without incurring the excessive cache miss search
> penalties that the hash is imposing on us.
> 
> To ensure we still have parallel access to the cache, we need
> multiple trees. Rather than hashing the buffers by disk address to
> select a tree, it seems more sensible to separate trees by typical
> access patterns. Most operations use buffers from within a single AG
> at a time, so rather than searching lots of different lists,
> separate the buffer indexes out into per-AG rbtrees. This means that
> searches during metadata operation have a much higher chance of
> hitting cache resident nodes, and that updates of the tree are less
> likely to disturb trees being accessed on other CPUs doing
> independent operations.

I found a bug in this, and have a bunch of other less
important comments for you to consider.  I haven't spent
any time contemplating your decision to use RB trees on
AGs, but it seems quite reasonable.

> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/linux-2.6/xfs_buf.c   |  139 +++++++++++++++++++++--------------------
>  fs/xfs/linux-2.6/xfs_buf.h   |   14 ++--
>  fs/xfs/linux-2.6/xfs_super.c |    6 +-
>  fs/xfs/xfs_ag.h              |    4 +
>  fs/xfs/xfs_mount.c           |    4 +-
>  5 files changed, 87 insertions(+), 80 deletions(-)
> 
> diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
> index b2b5dea..facd37e 100644
> --- a/fs/xfs/linux-2.6/xfs_buf.c
> +++ b/fs/xfs/linux-2.6/xfs_buf.c

. . .

> @@ -432,14 +434,36 @@ _xfs_buf_find(
>  	ASSERT(!(range_length < (1 << btp->bt_sshift)));
>  	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
>  
> -	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
> -
> -	spin_lock(&hash->bh_lock);
> -
> -	list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
> -		ASSERT(btp == bp->b_target);
> -		if (bp->b_file_offset == range_base &&
> -		    bp->b_buffer_length == range_length) {
> +	/* get tree root */
> +	pag = xfs_perag_get(btp->bt_mp, xfs_daddr_to_agno(btp->bt_mp, ioff));
> +
> +	/* walk tree */
> +	spin_lock(&pag->pagbuf_lock);
> +	rbp = &pag->pagbuf_tree.rb_node;
> +	parent = NULL;

Could drop the above assignment and change the while
statement to read:  while ((parent = *rbp)) {
(I know that leads to a mildly controversial style.)

> +	bp = NULL;
> +	while (*rbp) {
> +		parent = *rbp;
> +		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

Below here you might as well make use of the
value of "parent" in places where (*rbp) is used.
I realize you're just mimicking what's in the
rbtree.h file though...  If the result seems to
read funny, maybe you could rename "parent" to
be "entry" or something.

Here's the BUG:

> +		if (bp->b_file_offset < range_base)
> +			rbp = &(*rbp)->rb_left;
> +		else if (bp->b_file_offset > range_base)
> +			rbp = &(*rbp)->rb_right;

Your comparisons here are reversed.  In other words,
I believe it should be:

	if (range_base < bp->b_file_offset)	
		rbp = &parent->rb_left;
	else if (range_base > bp->b_file_offset)
		rbp = &parent->rb_right;

Maybe it mostly works in the order you have it,
but I'm pretty sure it's wrong.  (The "continue
searching to the right" below would be wrong though.)


> +		else {
> +			/*
> +			 * found a block offset match. If the range doesn't
> +			 * match, the only way this is allowed is if the buffer
> +			 * in the cache is stale and the transaction that made
> +			 * it stale has not yet committed. i.e. we are
> +			 * reallocating a busy extent. Skip this buffer and
> +			 * continue searching to the right for an exact match.
> +			 */
> +			if (bp->b_buffer_length != range_length) {
> +				ASSERT(bp->b_flags & XBF_STALE);

This assertion is new.  I trust it's correct, but
maybe it should go in separately (first).

> +				rbp = &(*rbp)->rb_right;
> +				continue;
> +			}
>  			atomic_inc(&bp->b_hold);
>  			goto found;
>  		}

When I first read the next hunk I thought you were
mistakenly not dropping the perag reference.  Later
I realized it was intentional--that the buffer holds
a reference on the perag until it (the buffer) is
released.  This is a minor point but I think it would
be helpful to see a comment explaining that.
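
Something along these lines, say (an illustrative comment, not one
taken from the patch):

	/*
	 * The buffer holds a reference on the perag it was found in
	 * for its entire life; xfs_buf_rele() drops that reference
	 * when the buffer is torn down.
	 */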

> @@ -449,17 +473,20 @@ _xfs_buf_find(
>  	if (new_bp) {
>  		_xfs_buf_initialize(new_bp, btp, range_base,
>  				range_length, flags);
> -		new_bp->b_hash = hash;
> -		list_add(&new_bp->b_hash_list, &hash->bh_list);
> +		new_bp->b_pag = pag;
> +		rb_link_node(&new_bp->b_rbnode, parent, rbp);
> +		rb_insert_color(&new_bp->b_rbnode, &pag->pagbuf_tree);
> +		spin_unlock(&pag->pagbuf_lock);
>  	} else {
>  		XFS_STATS_INC(xb_miss_locked);
> +		spin_unlock(&pag->pagbuf_lock);
> +		xfs_perag_put(pag);
>  	}
> -
> -	spin_unlock(&hash->bh_lock);
>  	return new_bp;
>  
>  found:
> -	spin_unlock(&hash->bh_lock);
> +	spin_unlock(&pag->pagbuf_lock);
> +	xfs_perag_put(pag);
>  
>  	/* Attempt to get the semaphore without sleeping,
>  	 * if this does not work then we need to drop the
> @@ -810,27 +837,30 @@ void
>  xfs_buf_rele(
>  	xfs_buf_t		*bp)
>  {
> -	xfs_bufhash_t		*hash = bp->b_hash;
> +	struct xfs_perag	*pag = bp->b_pag;
>  
>  	trace_xfs_buf_rele(bp, _RET_IP_);
>  
> -	if (unlikely(!hash)) {
> +	if (!pag) {

I know this is not related to your change, but I'm
curious whether this can even happen, and if so, when?

>  		ASSERT(!bp->b_relse);
> +		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
>  		if (atomic_dec_and_test(&bp->b_hold))
>  			xfs_buf_free(bp);
>  		return;
>  	}
>  
> +	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
>  	ASSERT(atomic_read(&bp->b_hold) > 0);
> -	if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
> +	if (atomic_dec_and_lock(&bp->b_hold, &pag->pagbuf_lock)) {
>  		if (bp->b_relse) {
>  			atomic_inc(&bp->b_hold);
> -			spin_unlock(&hash->bh_lock);
> +			spin_unlock(&pag->pagbuf_lock);
>  			(*(bp->b_relse)) (bp);

Drop the excess parentheses here (above) while you're at it.

>  		} else {
>  			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
> -			list_del_init(&bp->b_hash_list);
> -			spin_unlock(&hash->bh_lock);
> +			rb_erase(&bp->b_rbnode, &pag->pagbuf_tree);
> +			spin_unlock(&pag->pagbuf_lock);
> +			xfs_perag_put(pag);
>  			xfs_buf_free(bp);
>  		}
>  	}
> @@ -1433,57 +1463,25 @@ xfs_buf_iomove(
>   */
>  void
>  xfs_wait_buftarg(
> -	xfs_buftarg_t	*btp)
> +	struct xfs_buftarg	*btp)
>  {
> -	xfs_bufhash_t	*hash;
> -	uint		i;
> +	struct xfs_perag	*pag;
> +	uint			i;
>  
> -	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
> -		hash = &btp->bt_hash[i];
> -		spin_lock(&hash->bh_lock);
> -		while (!list_empty(&hash->bh_list)) {
> -			spin_unlock(&hash->bh_lock);
> +	for (i = 0; i < btp->bt_mp->m_sb.sb_agcount; i++) {
> +		pag = xfs_perag_get(btp->bt_mp, i);
> +		spin_lock(&pag->pagbuf_lock);
> +		while (rb_first(&pag->pagbuf_tree)) {
> +			spin_unlock(&pag->pagbuf_lock);
>  			delay(100);
> -			spin_lock(&hash->bh_lock);
> +			spin_lock(&pag->pagbuf_lock);
>  		}
> -		spin_unlock(&hash->bh_lock);
> +		spin_unlock(&pag->pagbuf_lock);
> +		xfs_perag_put(pag);
>  	}
>  }

This suggestion is again not related to your change, but...
Would this basic structure (not tested) be better than
the above?

	int more;

	do {
		more = 0;
		for (i = 0; i < btp->bt_mp->m_sb.sb_agcount; i++) {
			pag = xfs_perag_get(btp->bt_mp, i);
			spin_lock(&pag->pagbuf_lock);
			if (rb_first(&pag->pagbuf_tree))
				more++;
			spin_unlock(&pag->pagbuf_lock);
			xfs_perag_put(pag);
		}
		if (more)
			delay(100);
	} while (more);
>  
>  /*
> - *	Allocate buffer hash table for a given target.
> - *	For devices containing metadata (i.e. not the log/realtime devices)
> - *	we need to allocate a much larger hash table.

. . . 

> @@ -1645,6 +1643,12 @@ xfs_alloc_buftarg(
>  
>  	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
>  
> +	/*
> +	 * The buftarg cache should never be used by external devices.

I suggest that "buftarg cache" is not a good name for the
(now per-AG) buffer cache.

> +	 * Ensure we catch any users with extreme prejudice.
> +	 */
> +	btp->bt_mp = external ? NULL : mp;
> +
>  	btp->bt_dev =  bdev->bd_dev;
>  	btp->bt_bdev = bdev;
>  	if (xfs_setsize_buftarg_early(btp, bdev))
> @@ -1653,7 +1657,6 @@ xfs_alloc_buftarg(
>  		goto error;
>  	if (xfs_alloc_delwrite_queue(btp, fsname))
>  		goto error;
> -	xfs_alloc_bufhash(btp, external);
>  	return btp;
>  
>  error:
> diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
> index 802dc5e..3797ee8 100644
> --- a/fs/xfs/linux-2.6/xfs_buf.h
> +++ b/fs/xfs/linux-2.6/xfs_buf.h
. . .
> @@ -171,8 +170,8 @@ typedef struct xfs_buf {
>  	wait_queue_head_t	b_waiters;	/* unpin waiters */
>  	struct list_head	b_list;
>  	xfs_buf_flags_t		b_flags;	/* status flags */
> -	struct list_head	b_hash_list;	/* hash table list */
> -	xfs_bufhash_t		*b_hash;	/* hash table list start */
> +	struct rb_node		b_rbnode;	/* rbtree node */
> +	struct xfs_perag	*b_pag;		/* rbtree root */

Rather, "contains rbtree root" (in the comment).

>  	xfs_buftarg_t		*b_target;	/* buffer target (device) */
>  	atomic_t		b_hold;		/* reference count */
>  	xfs_daddr_t		b_bn;		/* block number for I/O */

. . . 

> diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
> index 4917d4e..e01d4cf 100644
> --- a/fs/xfs/xfs_ag.h
> +++ b/fs/xfs/xfs_ag.h
> @@ -230,6 +230,10 @@ typedef struct xfs_perag {
>  	rwlock_t	pag_ici_lock;	/* incore inode lock */
>  	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
>  	int		pag_ici_reclaimable;	/* reclaimable inodes */
> +
> +	/* buffer cache index */
> +	spinlock_t	pagbuf_lock;	/* lock for pagbuf_tree */
> +	struct rb_root	pagbuf_tree;	/* ordered tree of active buffers */

I know it's in some way consistent with the rest of the
naming scheme for fields in this structure, but maybe these
could be named pag_buf_lock and pag_buf_tree.  (The rest
of the fields here have a sort of strange convention and
maybe they've got strong enough history that they should
be left alone.)

>  #endif
>  	int		pagb_count;	/* pagb slots in use */
>  } xfs_perag_t;

. . .

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] xfs: convert buffer cache hash to rbtree
  2010-09-09  1:51   ` Christoph Hellwig
  2010-09-10  3:22     ` Dave Chinner
@ 2010-09-13 16:59     ` Alex Elder
  1 sibling, 0 replies; 17+ messages in thread
From: Alex Elder @ 2010-09-13 16:59 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs


> > @@ -210,8 +210,6 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
> >  	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
> >  	if (pag) {
> >  		ASSERT(atomic_read(&pag->pag_ref) >= 0);
> > -		/* catch leaks in the positive direction during testing */
> > -		ASSERT(atomic_read(&pag->pag_ref) < 1000);
> 
> Did you manage to hit this during testing?  Either way it should probably
> be a separate patch.

I'm pretty sure this is due to each buffer now holding
a perag reference.  It might be nice to preserve this
functionality by checking against the number of blocks
in the AG or something similar.  But it's debug code, so it's
not quite as critical.
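
Something along these lines, perhaps (an untested sketch; the
slack value and the use of sb_agblocks are just my guesses at what
such a check could look like):

	/*
	 * Catch perag reference leaks: each cached buffer now
	 * legitimately holds a reference, so bound the count by the
	 * AG size plus some slack for the other reference holders.
	 */
	ASSERT(atomic_read(&pag->pag_ref) <
	       mp->m_sb.sb_agblocks + 1000);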

					-Alex

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] xfs: convert buffer cache hash to rbtree
  2010-09-13 16:53   ` Alex Elder
@ 2010-09-14  7:13     ` Dave Chinner
  0 siblings, 0 replies; 17+ messages in thread
From: Dave Chinner @ 2010-09-14  7:13 UTC (permalink / raw)
  To: Alex Elder; +Cc: xfs

On Mon, Sep 13, 2010 at 11:53:49AM -0500, Alex Elder wrote:
> On Thu, 2010-09-09 at 01:12 +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > The buffer cache hash is starting to show typical hash scalability
> > problems.  Large-scale testing is showing the number of cached items
> > growing far larger than the hash can efficiently handle. Hence we
> > need to move to a self-scaling cache indexing mechanism.
> > 
> > I have selected rbtrees for indexing because they can have O(log n)
> > search scalability, and insert and remove cost is not excessive,
> > even on large trees. Hence we should be able to cache large numbers
> > of buffers without incurring the excessive cache miss search
> > penalties that the hash is imposing on us.
> > 
> > To ensure we still have parallel access to the cache, we need
> > multiple trees. Rather than hashing the buffers by disk address to
> > select a tree, it seems more sensible to separate trees by typical
> > access patterns. Most operations use buffers from within a single AG
> > at a time, so rather than searching lots of different lists,
> > separate the buffer indexes out into per-AG rbtrees. This means that
> > searches during metadata operation have a much higher chance of
> > hitting cache resident nodes, and that updates of the tree are less
> > likely to disturb trees being accessed on other CPUs doing
> > independent operations.
> 
> I found a bug in this, and have a bunch of other less
> important comments for you to consider.  I haven't spent
> any time contemplating your decision to use RB trees per AG,
> but it seems quite reasonable.
> 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/linux-2.6/xfs_buf.c   |  139 +++++++++++++++++++++--------------------
> >  fs/xfs/linux-2.6/xfs_buf.h   |   14 ++--
> >  fs/xfs/linux-2.6/xfs_super.c |    6 +-
> >  fs/xfs/xfs_ag.h              |    4 +
> >  fs/xfs/xfs_mount.c           |    4 +-
> >  5 files changed, 87 insertions(+), 80 deletions(-)
> > 
> > diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
> > index b2b5dea..facd37e 100644
> > --- a/fs/xfs/linux-2.6/xfs_buf.c
> > +++ b/fs/xfs/linux-2.6/xfs_buf.c
> 
> . . .
> 
> > @@ -432,14 +434,36 @@ _xfs_buf_find(
> >  	ASSERT(!(range_length < (1 << btp->bt_sshift)));
> >  	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
> >  
> > -	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
> > -
> > -	spin_lock(&hash->bh_lock);
> > -
> > -	list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
> > -		ASSERT(btp == bp->b_target);
> > -		if (bp->b_file_offset == range_base &&
> > -		    bp->b_buffer_length == range_length) {
> > +	/* get tree root */
> > +	pag = xfs_perag_get(btp->bt_mp, xfs_daddr_to_agno(btp->bt_mp, ioff));
> > +
> > +	/* walk tree */
> > +	spin_lock(&pag->pagbuf_lock);
> > +	rbp = &pag->pagbuf_tree.rb_node;
> > +	parent = NULL;
> 
> Could drop the above assignment and change the while
> statement to read:  while ((parent = *rbp)) {
> (I know that leads to a mildly controversial style.)

That doesn't work. When we walk off the end of the tree, *rbp ==
NULL, but we need parent to point to the previous node we searched,
as that is where we will insert the new node. The above code would
change it so that on a search miss parent == NULL as well, which
would panic or corrupt the tree on insert.
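
To illustrate, here is the canonical walk as a minimal sketch
(root, key, key_of() and new are placeholders, not real code from
the patch):

	struct rb_root	*root;		/* e.g. &pag->pagbuf_tree */
	struct rb_node	**rbp = &root->rb_node;
	struct rb_node	*parent = NULL;

	while (*rbp) {
		parent = *rbp;	/* trails *rbp by one node */
		if (key < key_of(parent))
			rbp = &(*rbp)->rb_left;
		else
			rbp = &(*rbp)->rb_right;
	}
	/*
	 * Search miss: *rbp is NULL, but parent still points at the
	 * leaf we walked off, which is exactly where the new node
	 * must be linked.  Folding the assignment into the loop
	 * condition would leave parent NULL here as well.
	 */
	rb_link_node(&new->node, parent, rbp);
	rb_insert_color(&new->node, root);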

> > +	bp = NULL;
> > +	while (*rbp) {
> > +		parent = *rbp;
> > +		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
> 
> Below here you might as well make use of the
> value of "parent" in places where (*rbp) is used.
> I realize you're just mimicking what's in the
> rbtree.h file though...

And the same code that is in xfs_alloc.c for the busy extent list.
I'd prefer to keep the code in a common form.

> If the result seems to
> read funny, maybe you could rename "parent" to
> be "entry" or something.

If we fall through to the insert case, it is the parent node
we insert at. Hence parent is appropriate....

> Here's the BUG:
> 
> > +		if (bp->b_file_offset < range_base)
> > +			rbp = &(*rbp)->rb_left;
> > +		else if (bp->b_file_offset > range_base)
> > +			rbp = &(*rbp)->rb_right;
> 
> Your comparisons here are reversed. 

Hmmm, yes, good catch, but I don't think it matters because the
left/right node relationship of the rbtree is maintained through
rotations. Hence as long as we always search the same way as we do
for insert, then we will always find the node we are looking for
in the same number of steps.

Having just tested it, there is zero difference in performance,
cache behaviour or CPU usage. I'll swap it anyway, as it makes more
sense to order the tree as people expect....

> In other words,
> I believe it should be:
> 
> 	if (range_base < bp->b_file_offset)	
> 		rbp = &parent->rb_left;
> 	else if (range_base > bp->b_file_offset)
> 		rbp = &parent->rb_right;
> 
> Maybe it mostly works in the order you have it,
> but I'm pretty sure it's wrong. (The "continue
> searching to the right" below would be wrong though.)

Once again, as long as we always do things the same way (i.e.
duplicate entries are always stored on the same side) it doesn't
matter if we choose to continue the search to the left or right....

> > +		else {
> > +			/*
> > +			 * found a block offset match. If the range doesn't
> > +			 * match, the only way this is allowed is if the buffer
> > +			 * in the cache is stale and the transaction that made
> > +			 * it stale has not yet committed. i.e. we are
> > +			 * reallocating a busy extent. Skip this buffer and
> > +			 * continue searching to the right for an exact match.
> > +			 */
> > +			if (bp->b_buffer_length != range_length) {
> > +				ASSERT(bp->b_flags & XBF_STALE);
> 
> This assertion is new.  I trust it's correct, but
> maybe it should go in separately (first).

It's there to ensure that my assumption about why we can get
duplicates in the rbtree is correct. In a hash, duplicate keys are
not a big deal, and I want to ensure i understand why they are
occurring. I'd prefer to leave it in this patch...

> > +				rbp = &(*rbp)->rb_right;
> > +				continue;
> > +			}
> >  			atomic_inc(&bp->b_hold);
> >  			goto found;
> >  		}
> 
> When I first read the next hunk I thought you were
> mistakenly not dropping the perag reference.  Later
> I realized it was intentional--that the buffer holds
> a reference on the perag until it (the buffer) is
> released.  This is a minor point but I think it would
> be helpful to see a comment explaining that.

Ok.
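
Something like this, perhaps (my wording, not from the patch):

	/*
	 * The perag reference taken in _xfs_buf_find() is now owned
	 * by the buffer; it is not dropped here, but released in
	 * xfs_buf_rele() when the buffer is torn down.
	 */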

> >  xfs_buf_rele(
> >  	xfs_buf_t		*bp)
> >  {
> > -	xfs_bufhash_t		*hash = bp->b_hash;
> > +	struct xfs_perag	*pag = bp->b_pag;
> >  
> >  	trace_xfs_buf_rele(bp, _RET_IP_);
> >  
> > -	if (unlikely(!hash)) {
> > +	if (!pag) {
> 
> I know this is not related to your change, but I'm
> curious whether this can even happen, and if so, when?

All the uncached buffers obtained through xfs_buf_get_uncached() and
xfs_buf_read_uncached() still use xfs_brelse()/xfs_buf_rele() to
drop references.
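
In other words (a sketch only, with argument lists elided):

	bp = xfs_buf_get_uncached(...);	/* never indexed, so
					 * bp->b_pag stays NULL */
	/* ... use the buffer ... */
	xfs_buf_rele(bp);		/* takes the !pag branch: drop
					 * b_hold and free directly */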


> >  void
> >  xfs_wait_buftarg(
> > -	xfs_buftarg_t	*btp)
> > +	struct xfs_buftarg	*btp)
> >  {
> > -	xfs_bufhash_t	*hash;
> > -	uint		i;
> > +	struct xfs_perag	*pag;
> > +	uint			i;
> >  
> > -	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
> > -		hash = &btp->bt_hash[i];
> > -		spin_lock(&hash->bh_lock);
> > -		while (!list_empty(&hash->bh_list)) {
> > -			spin_unlock(&hash->bh_lock);
> > +	for (i = 0; i < btp->bt_mp->m_sb.sb_agcount; i++) {
> > +		pag = xfs_perag_get(btp->bt_mp, i);
> > +		spin_lock(&pag->pagbuf_lock);
> > +		while (rb_first(&pag->pagbuf_tree)) {
> > +			spin_unlock(&pag->pagbuf_lock);
> >  			delay(100);
> > -			spin_lock(&hash->bh_lock);
> > +			spin_lock(&pag->pagbuf_lock);
> >  		}
> > -		spin_unlock(&hash->bh_lock);
> > +		spin_unlock(&pag->pagbuf_lock);
> > +		xfs_perag_put(pag);
> >  	}
> >  }
> 
> This suggestion is again not related to your change, but...
> Would this basic structure (not tested) be better than
> the above?
> 
> 	int more;
> 
> 	do {
> 		more = 0;
> 		for (i = 0; i < btp->bt_mp->m_sb.sb_agcount; i++) {
> 			pag = xfs_perag_get(btp->bt_mp, i);
> 			spin_lock(&pag->pagbuf_lock);
> 			if (rb_first(&pag->pagbuf_tree))
> 				more++;
> 			spin_unlock(&pag->pagbuf_lock);
> 			xfs_perag_put(pag);
> 		}
> 		if (more)
> 			delay(100);
> 	} while (more);

Once we've found one AG that we have to wait for, checking the
rest to see if we have to do a wait is unnecessary. We may as well
wait first, then continue.


> >  
> >  /*
> > - *	Allocate buffer hash table for a given target.
> > - *	For devices containing metadata (i.e. not the log/realtime devices)
> > - *	we need to allocate a much larger hash table.
> 
> . . . 
> 
> > @@ -1645,6 +1643,12 @@ xfs_alloc_buftarg(
> >  
> >  	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
> >  
> > +	/*
> > +	 * The buftarg cache should never be used by external devices.
> 
> I suggest that "buftarg cache" is not a good name for the
> (now per-AG) buffer cache.

That's already gone.

> 
> > +	 * Ensure we catch any users with extreme prejudice.
> > +	 */
> > +	btp->bt_mp = external ? NULL : mp;
> > +
> >  	btp->bt_dev =  bdev->bd_dev;
> >  	btp->bt_bdev = bdev;
> >  	if (xfs_setsize_buftarg_early(btp, bdev))
> > @@ -1653,7 +1657,6 @@ xfs_alloc_buftarg(
> >  		goto error;
> >  	if (xfs_alloc_delwrite_queue(btp, fsname))
> >  		goto error;
> > -	xfs_alloc_bufhash(btp, external);
> >  	return btp;
> >  
> >  error:
> > diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
> > index 802dc5e..3797ee8 100644
> > --- a/fs/xfs/linux-2.6/xfs_buf.h
> > +++ b/fs/xfs/linux-2.6/xfs_buf.h
> . . .
> > @@ -171,8 +170,8 @@ typedef struct xfs_buf {
> >  	wait_queue_head_t	b_waiters;	/* unpin waiters */
> >  	struct list_head	b_list;
> >  	xfs_buf_flags_t		b_flags;	/* status flags */
> > -	struct list_head	b_hash_list;	/* hash table list */
> > -	xfs_bufhash_t		*b_hash;	/* hash table list start */
> > +	struct rb_node		b_rbnode;	/* rbtree node */
> > +	struct xfs_perag	*b_pag;		/* rbtree root */
> 
> Rather, "contains rbtree root" (in the comment).
> 
> >  	xfs_buftarg_t		*b_target;	/* buffer target (device) */
> >  	atomic_t		b_hold;		/* reference count */
> >  	xfs_daddr_t		b_bn;		/* block number for I/O */
> 
> . . . 
> 
> > diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
> > index 4917d4e..e01d4cf 100644
> > --- a/fs/xfs/xfs_ag.h
> > +++ b/fs/xfs/xfs_ag.h
> > @@ -230,6 +230,10 @@ typedef struct xfs_perag {
> >  	rwlock_t	pag_ici_lock;	/* incore inode lock */
> >  	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
> >  	int		pag_ici_reclaimable;	/* reclaimable inodes */
> > +
> > +	/* buffer cache index */
> > +	spinlock_t	pagbuf_lock;	/* lock for pagbuf_tree */
> > +	struct rb_root	pagbuf_tree;	/* ordered tree of active buffers */
> 
> I know it's in some way consistent with the rest of the
> naming scheme for fields in this structure, but maybe these
> could be named pag_buf_lock and pag_buf_tree.  (The rest
> of the fields here have a sort of strange convention and
> maybe they've got strong enough history that they should
> be left alone.)

OK. I'll change it to pag_buf_...

FYI:
pagi = per-ag AGI data
pagf = per-ag AGF data
pagb = per-ag busy extent
pagl = per-ag inode lookup
pag_ici = per-ag inode cache index

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2010-09-14  7:13 UTC | newest]

Thread overview: 17+ messages
2010-09-08 15:12 [RFC] [PATCH 0/4] Replace buffer cache hash with rbtrees Dave Chinner
2010-09-08 15:12 ` [PATCH 1/4] xfs: kill XBF_FS_MANAGED buffers Dave Chinner
2010-09-09  1:33   ` Christoph Hellwig
2010-09-10  3:10     ` Dave Chinner
2010-09-10 21:17   ` Alex Elder
2010-09-08 15:12 ` [PATCH 2/4] xfs: use unhashed buffers for size checks Dave Chinner
2010-09-09  1:38   ` Christoph Hellwig
2010-09-10  3:14     ` Dave Chinner
2010-09-10 21:33   ` Alex Elder
2010-09-08 15:12 ` [PATCH 3/4] xfs: remove buftarg hash for external devices Dave Chinner
2010-09-09  1:39   ` Christoph Hellwig
2010-09-08 15:12 ` [PATCH 4/4] xfs: convert buffer cache hash to rbtree Dave Chinner
2010-09-09  1:51   ` Christoph Hellwig
2010-09-10  3:22     ` Dave Chinner
2010-09-13 16:59     ` Alex Elder
2010-09-13 16:53   ` Alex Elder
2010-09-14  7:13     ` Dave Chinner
