* [PATCH V2 00/13] xfs: remove the xfssyncd mess
@ 2012-08-30 12:00 Dave Chinner
  2012-08-30 12:00 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
                   ` (14 more replies)
  0 siblings, 15 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

Version 2 of the patchset I described here:

http://oss.sgi.com/archives/xfs/2012-06/msg00064.html

This version has run through xfstests completely once, so it's
less likely to let smoke out....

Version 2:
- fix writeback_inodes_sb_if_idle call in xfs_create()
- refreshed patch 13 before sending.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* [PATCH 01/13] xfs: xfs_syncd_stop must die
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:15   ` Christoph Hellwig
  2012-09-04 16:10   ` Mark Tinguely
  2012-08-30 12:00 ` [PATCH 02/13] xfs: rename the xfs_syncd workqueue Dave Chinner
                   ` (13 subsequent siblings)
  14 siblings, 2 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

xfs_syncd_init and xfs_syncd_stop tie together a bunch of unrelated
functionality that has different start and stop requirements. Kill
these functions and open code the start/stop methods for each of the
background functions.

Subsequent patches will move the start/stop functions around to the
correct places to avoid races and shutdown issues.
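
For reference, the open-coded calls rely on the stock delayed-work
primitives. A rough, generic sketch of that pattern (struct foo and the
foo_* names are made up for illustration, not XFS code):

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

struct foo {
        struct delayed_work     work;
};

static void foo_worker(struct work_struct *work)
{
        struct foo *foo = container_of(to_delayed_work(work),
                                        struct foo, work);

        pr_debug("periodic work for %p\n", foo);
}

static void foo_start(struct foo *foo)
{
        INIT_DELAYED_WORK(&foo->work, foo_worker);
        queue_delayed_work(system_wq, &foo->work, msecs_to_jiffies(30000));
}

static void foo_stop(struct foo *foo)
{
        /* removes a pending timer and waits for a running instance */
        cancel_delayed_work_sync(&foo->work);
}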

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_super.c |   25 ++++++++++++++++++-------
 fs/xfs/xfs_sync.c  |   30 ++++--------------------------
 fs/xfs/xfs_sync.h  |    6 ++++--
 3 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bdaf4cb..116fcb8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -920,7 +920,11 @@ xfs_fs_put_super(
 
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-	xfs_syncd_stop(mp);
+
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
+
 	xfs_freesb(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_destroy_mount_workqueues(mp);
@@ -1290,9 +1294,11 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	error = xfs_syncd_init(mp);
-	if (error)
-		goto out_filestream_unmount;
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+	xfs_syncd_queue_sync(mp);
 
 	error = xfs_mountfs(mp);
 	if (error)
@@ -1315,8 +1321,10 @@ xfs_fs_fill_super(
 
 	return 0;
  out_syncd_stop:
-	xfs_syncd_stop(mp);
- out_filestream_unmount:
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
+
 	xfs_filestream_unmount(mp);
  out_free_sb:
 	xfs_freesb(mp);
@@ -1335,7 +1343,10 @@ out_destroy_workqueues:
  out_unmount:
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-	xfs_syncd_stop(mp);
+
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
 	goto out_free_sb;
 }
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 9654817..13830e4 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -370,7 +370,7 @@ xfs_quiesce_attr(
 	xfs_buf_unlock(mp->m_sb_bp);
 }
 
-static void
+void
 xfs_syncd_queue_sync(
 	struct xfs_mount        *mp)
 {
@@ -383,7 +383,7 @@ xfs_syncd_queue_sync(
  * disk quotas.  We might need to cover the log to indicate that the
  * filesystem is idle and not frozen.
  */
-STATIC void
+void
 xfs_sync_worker(
 	struct work_struct *work)
 {
@@ -445,7 +445,7 @@ xfs_syncd_queue_reclaim(
  * goes low. It scans as quickly as possible avoiding locked inodes or those
  * already being flushed, and once done schedules a future pass.
  */
-STATIC void
+void
 xfs_reclaim_worker(
 	struct work_struct *work)
 {
@@ -478,7 +478,7 @@ xfs_flush_inodes(
 	flush_work_sync(&mp->m_flush_work);
 }
 
-STATIC void
+void
 xfs_flush_worker(
 	struct work_struct *work)
 {
@@ -489,28 +489,6 @@ xfs_flush_worker(
 	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
 }
 
-int
-xfs_syncd_init(
-	struct xfs_mount	*mp)
-{
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
-	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
-	xfs_syncd_queue_sync(mp);
-
-	return 0;
-}
-
-void
-xfs_syncd_stop(
-	struct xfs_mount	*mp)
-{
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	cancel_work_sync(&mp->m_flush_work);
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 941202e..3f59e5b 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,8 +26,10 @@ struct xfs_perag;
 
 extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
 
-int xfs_syncd_init(struct xfs_mount *mp);
-void xfs_syncd_stop(struct xfs_mount *mp);
+void xfs_syncd_queue_sync(struct xfs_mount *mp);
+void xfs_sync_worker(struct work_struct *work);
+void xfs_flush_worker(struct work_struct *work);
+void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 02/13] xfs: rename the xfs_syncd workqueue
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
  2012-08-30 12:00 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:17   ` Christoph Hellwig
  2012-08-30 12:00 ` [PATCH 03/13] xfs: rationalise xfs_mount_wq users Dave Chinner
                   ` (12 subsequent siblings)
  14 siblings, 1 reply; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

There is nothing "sync" realted to this work queue any more. It is a general
purpose per-filesystem work queue. Rename it appropriately, and remove the
"syncd" naming from various functions.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_mount.c |    1 +
 fs/xfs/xfs_mount.h |    2 ++
 fs/xfs/xfs_super.c |   14 +++++++-------
 fs/xfs/xfs_sync.c  |   19 +++++++++----------
 fs/xfs/xfs_sync.h  |    4 +---
 5 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 29c2f83..e2979ee 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,6 +43,7 @@
 #include "xfs_utils.h"
 #include "xfs_trace.h"
 
+struct workqueue_struct	*xfs_mount_wq;
 
 #ifdef HAVE_PERCPU_SB
 STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index deee09e..a0113dd 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -60,6 +60,8 @@ struct xfs_nameops;
 struct xfs_ail;
 struct xfs_quotainfo;
 
+extern struct workqueue_struct	*xfs_mount_wq;
+
 #ifdef HAVE_PERCPU_SB
 
 /*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 116fcb8..d75fdf3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1298,7 +1298,7 @@ xfs_fs_fill_super(
 	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
-	xfs_syncd_queue_sync(mp);
+	xfs_sync_work_queue(mp);
 
 	error = xfs_mountfs(mp);
 	if (error)
@@ -1542,8 +1542,8 @@ xfs_init_workqueues(void)
 	 * competing for ressources.  Use the default large max_active value
 	 * so that even lots of filesystems can perform these task in parallel.
 	 */
-	xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
-	if (!xfs_syncd_wq)
+	xfs_mount_wq = alloc_workqueue("xfsmount", WQ_NON_REENTRANT, 0);
+	if (!xfs_mount_wq)
 		return -ENOMEM;
 
 	/*
@@ -1554,12 +1554,12 @@ xfs_init_workqueues(void)
 	 */
 	xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
 	if (!xfs_alloc_wq)
-		goto out_destroy_syncd;
+		goto out_destroy_mount;
 
 	return 0;
 
-out_destroy_syncd:
-	destroy_workqueue(xfs_syncd_wq);
+out_destroy_mount:
+	destroy_workqueue(xfs_mount_wq);
 	return -ENOMEM;
 }
 
@@ -1567,7 +1567,7 @@ STATIC void
 xfs_destroy_workqueues(void)
 {
 	destroy_workqueue(xfs_alloc_wq);
-	destroy_workqueue(xfs_syncd_wq);
+	destroy_workqueue(xfs_mount_wq);
 }
 
 STATIC int __init
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 13830e4..7744ffe 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -39,7 +39,6 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
-struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
 
 /*
  * The inode lookup is done in batches to keep the amount of lock traffic and
@@ -371,10 +370,10 @@ xfs_quiesce_attr(
 }
 
 void
-xfs_syncd_queue_sync(
+xfs_sync_work_queue(
 	struct xfs_mount        *mp)
 {
-	queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
+	queue_delayed_work(xfs_mount_wq, &mp->m_sync_work,
 				msecs_to_jiffies(xfs_syncd_centisecs * 10));
 }
 
@@ -415,7 +414,7 @@ xfs_sync_worker(
 	}
 
 	/* queue us up again */
-	xfs_syncd_queue_sync(mp);
+	xfs_sync_work_queue(mp);
 }
 
 /*
@@ -426,13 +425,13 @@ xfs_sync_worker(
  * aggressive.
  */
 static void
-xfs_syncd_queue_reclaim(
+xfs_reclaim_queue_work(
 	struct xfs_mount        *mp)
 {
 
 	rcu_read_lock();
 	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+		queue_delayed_work(xfs_mount_wq, &mp->m_reclaim_work,
 			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 	}
 	rcu_read_unlock();
@@ -453,7 +452,7 @@ xfs_reclaim_worker(
 					struct xfs_mount, m_reclaim_work);
 
 	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-	xfs_syncd_queue_reclaim(mp);
+	xfs_reclaim_queue_work(mp);
 }
 
 /*
@@ -474,7 +473,7 @@ xfs_flush_inodes(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 
-	queue_work(xfs_syncd_wq, &mp->m_flush_work);
+	queue_work(xfs_mount_wq, &mp->m_flush_work);
 	flush_work_sync(&mp->m_flush_work);
 }
 
@@ -507,7 +506,7 @@ __xfs_inode_set_reclaim_tag(
 		spin_unlock(&ip->i_mount->m_perag_lock);
 
 		/* schedule periodic background inode reclaim */
-		xfs_syncd_queue_reclaim(ip->i_mount);
+		xfs_reclaim_queue_work(ip->i_mount);
 
 		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
 							-1, _RET_IP_);
@@ -923,7 +922,7 @@ xfs_reclaim_inodes_nr(
 	int			nr_to_scan)
 {
 	/* kick background reclaimer and push the AIL */
-	xfs_syncd_queue_reclaim(mp);
+	xfs_reclaim_queue_work(mp);
 	xfs_ail_push_all(mp->m_ail);
 
 	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 3f59e5b..3c22f3d 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -24,9 +24,7 @@ struct xfs_perag;
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
 #define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
 
-extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
-
-void xfs_syncd_queue_sync(struct xfs_mount *mp);
+void xfs_sync_work_queue(struct xfs_mount *mp);
 void xfs_sync_worker(struct work_struct *work);
 void xfs_flush_worker(struct work_struct *work);
 void xfs_reclaim_worker(struct work_struct *work);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 03/13] xfs: rationalise xfs_mount_wq users
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
  2012-08-30 12:00 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
  2012-08-30 12:00 ` [PATCH 02/13] xfs: rename the xfs_syncd workqueue Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-04 15:48   ` Mark Tinguely
  2012-09-11 21:25   ` Mark Tinguely
  2012-08-30 12:00 ` [PATCH 04/13] xfs: don't run the sync work if the filesystem is read-only Dave Chinner
                   ` (11 subsequent siblings)
  14 siblings, 2 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

Instead of starting and stopping all the background work on the
xfs_mount_wq at the same time, separate the work items so that each is
started and stopped where it is actually needed.

The xfs_sync_worker only needs to be started after all the mount
processing has completed successfully, and it needs to be stopped
before the log is unmounted.

The xfs_reclaim_worker is started on demand, and can be
stopped before the unmount process does its own inode reclaim pass.

The xfs_flush_inodes work is run on demand, and so we really only
need to ensure that it has stopped running before we start
processing an unmount, freeze or remount,ro.
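
Moving the stop calls around like this is safe because of the cancel
semantics: cancel_delayed_work_sync() waits until the work item is
neither pending nor running, so it copes with workers that re-queue
themselves. Roughly (generic sketch, the bar_* names are illustrative
only, not XFS code):

#include <linux/jiffies.h>
#include <linux/workqueue.h>

struct bar {
        struct delayed_work     work;
};

static struct workqueue_struct *bar_wq;  /* assumed allocated elsewhere */

static void bar_worker(struct work_struct *work)
{
        struct bar *bar = container_of(to_delayed_work(work),
                                        struct bar, work);

        /* ... one pass of background work ... */

        /* rearm for the next period */
        queue_delayed_work(bar_wq, &bar->work, 30 * HZ);
}

static void bar_teardown(struct bar *bar)
{
        /*
         * Safe even though bar_worker() rearms itself: this waits
         * until the work is neither pending nor executing anywhere.
         */
        cancel_delayed_work_sync(&bar->work);
}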

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_mount.c |    6 ++++--
 fs/xfs/xfs_super.c |   32 +++++++++++++-------------------
 fs/xfs/xfs_sync.c  |   21 +++++----------------
 3 files changed, 22 insertions(+), 37 deletions(-)

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e2979ee..62106e2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1484,9 +1484,11 @@ xfs_unmountfs(
 
 	/*
 	 * And reclaim all inodes.  At this point there should be no dirty
-	 * inode, and none should be pinned or locked, but use synchronous
-	 * reclaim just to be sure.
+	 * inodes and none should be pinned or locked, but use synchronous
+	 * reclaim just to be sure. We can stop background inode reclaim
+	 * here as well if it is still running.
 	 */
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
 	xfs_qm_unmount(mp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d75fdf3..a08a648 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -918,13 +918,11 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	xfs_filestream_unmount(mp);
-	xfs_unmountfs(mp);
-
 	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	cancel_work_sync(&mp->m_flush_work);
 
+	xfs_filestream_unmount(mp);
+	xfs_unmountfs(mp);
 	xfs_freesb(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_destroy_mount_workqueues(mp);
@@ -1231,6 +1229,9 @@ xfs_fs_fill_super(
 	spin_lock_init(&mp->m_sb_lock);
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
 	mp->m_super = sb;
 	sb->s_fs_info = mp;
@@ -1294,15 +1295,9 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
-	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
-	xfs_sync_work_queue(mp);
-
 	error = xfs_mountfs(mp);
 	if (error)
-		goto out_syncd_stop;
+		goto out_filestream_unmount;
 
 	root = igrab(VFS_I(mp->m_rootip));
 	if (!root) {
@@ -1319,12 +1314,15 @@ xfs_fs_fill_super(
 		goto out_unmount;
 	}
 
+	/*
+	 * The filesystem is successfully mounted, so we can start background
+	 * sync work now.
+	 */
+	xfs_sync_work_queue(mp);
+
 	return 0;
- out_syncd_stop:
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	cancel_work_sync(&mp->m_flush_work);
 
+ out_filestream_unmount:
 	xfs_filestream_unmount(mp);
  out_free_sb:
 	xfs_freesb(mp);
@@ -1343,10 +1341,6 @@ out_destroy_workqueues:
  out_unmount:
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	cancel_work_sync(&mp->m_flush_work);
 	goto out_free_sb;
 }
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 7744ffe..ea8e0a1 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -378,9 +378,9 @@ xfs_sync_work_queue(
 }
 
 /*
- * Every sync period we need to unpin all items, reclaim inodes and sync
- * disk quotas.  We might need to cover the log to indicate that the
- * filesystem is idle and not frozen.
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle and not frozen.
  */
 void
 xfs_sync_worker(
@@ -390,17 +390,7 @@ xfs_sync_worker(
 					struct xfs_mount, m_sync_work);
 	int		error;
 
-	/*
-	 * We shouldn't write/force the log if we are in the mount/unmount
-	 * process or on a read only filesystem. The workqueue still needs to be
-	 * active in both cases, however, because it is used for inode reclaim
-	 * during these times.  Use the MS_ACTIVE flag to avoid doing anything
-	 * during mount.  Doing work during unmount is avoided by calling
-	 * cancel_delayed_work_sync on this work queue before tearing down
-	 * the ail and the log in xfs_log_unmount.
-	 */
-	if (!(mp->m_super->s_flags & MS_ACTIVE) &&
-	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
 		/* dgc: errors ignored here */
 		if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
 		    xfs_log_need_covered(mp))
@@ -408,8 +398,7 @@ xfs_sync_worker(
 		else
 			xfs_log_force(mp, 0);
 
-		/* start pushing all the metadata that is currently
-		 * dirty */
+		/* start pushing all the metadata that is currently dirty */
 		xfs_ail_push_all(mp->m_ail);
 	}
 
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 04/13] xfs: don't run the sync work if the filesystem is read-only
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (2 preceding siblings ...)
  2012-08-30 12:00 ` [PATCH 03/13] xfs: rationalise xfs_mount_wq users Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-04 16:13   ` Mark Tinguely
  2012-08-30 12:00 ` [PATCH 05/13] xfs: sync work is now only periodic log work Dave Chinner
                   ` (10 subsequent siblings)
  14 siblings, 1 reply; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

If the filesystem is mounted or remounted read-only, stop the sync
worker that tries to flush or cover the log if the filesystem is
dirty. It's read-only, so it isn't dirty. Restart it on a remount,rw
as necessary. This avoids the need for RO checks in the work.

Similarly, stop the sync work when the filesystem is frozen, and
start it again when the filesystem is thawed. This avoids the need
for special freeze checks in the work.
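
In outline, the stop/start pairing hangs off the VFS freeze/thaw and
remount paths, roughly like this (a generic sketch of the shape only;
the example_* and baz names are illustrative, the real hooks are in
xfs_super.c):

#include <linux/fs.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

struct baz {
        struct delayed_work     work;
};

static int example_freeze_fs(struct super_block *sb)
{
        struct baz *baz = sb->s_fs_info;

        /* frozen: nothing to flush or cover, stop the periodic work */
        cancel_delayed_work_sync(&baz->work);
        return 0;
}

static int example_unfreeze_fs(struct super_block *sb)
{
        struct baz *baz = sb->s_fs_info;

        /* thawed: resume the periodic work */
        queue_delayed_work(system_wq, &baz->work, 30 * HZ);
        return 0;
}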

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_super.c |    2 ++
 fs/xfs/xfs_sync.c  |   26 ++++++++++++++------------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index a08a648..619a3ff 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1106,6 +1106,7 @@ xfs_fs_remount(
 		 * value if it is non-zero, otherwise go with the default.
 		 */
 		xfs_restore_resvblks(mp);
+		xfs_sync_queue_work(mp);
 	}
 
 	/* rw -> ro */
@@ -1151,6 +1152,7 @@ xfs_fs_unfreeze(
 	struct xfs_mount	*mp = XFS_M(sb);
 
 	xfs_restore_resvblks(mp);
+	xfs_sync_queue_work(mp);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index ea8e0a1..b0a11cd 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -322,6 +322,9 @@ xfs_quiesce_data(
  * Second stage of a quiesce. The data is already synced, now we have to take
  * care of the metadata. New transactions are already blocked, so we need to
  * wait for any remaining transactions to drain out before proceeding.
+ *
+ * Note: this stops background sync work - the callers must ensure it is started
+ * again when appropriate.
  */
 void
 xfs_quiesce_attr(
@@ -340,6 +343,9 @@ xfs_quiesce_attr(
 	/* flush all pending changes from the AIL */
 	xfs_ail_push_all_sync(mp->m_ail);
 
+	/* stop background sync work */
+	cancel_delayed_work_sync(&mp->m_sync_work);
+
 	/*
 	 * Just warn here till VFS can correctly support
 	 * read-only remount without racing.
@@ -380,7 +386,7 @@ xfs_sync_work_queue(
 /*
  * Every sync period we need to unpin all items in the AIL and push them to
  * disk. If there is nothing dirty, then we might need to cover the log to
- * indicate that the filesystem is idle and not frozen.
+ * indicate that the filesystem is idle.
  */
 void
 xfs_sync_worker(
@@ -388,19 +394,15 @@ xfs_sync_worker(
 {
 	struct xfs_mount *mp = container_of(to_delayed_work(work),
 					struct xfs_mount, m_sync_work);
-	int		error;
 
-	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		/* dgc: errors ignored here */
-		if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
-		    xfs_log_need_covered(mp))
-			error = xfs_fs_log_dummy(mp);
-		else
-			xfs_log_force(mp, 0);
+	/* dgc: errors ignored - not fatal and nowhere to report them */
+	if (xfs_log_need_covered(mp))
+		xfs_fs_log_dummy(mp);
+	else
+		xfs_log_force(mp, 0);
 
-		/* start pushing all the metadata that is currently dirty */
-		xfs_ail_push_all(mp->m_ail);
-	}
+	/* start pushing all the metadata that is currently dirty */
+	xfs_ail_push_all(mp->m_ail);
 
 	/* queue us up again */
 	xfs_sync_work_queue(mp);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 05/13] xfs: sync work is now only periodic log work
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (3 preceding siblings ...)
  2012-08-30 12:00 ` [PATCH 04/13] xfs: don't run the sync work if the filesystem is read-only Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:23   ` Christoph Hellwig
                     ` (2 more replies)
  2012-08-30 12:00 ` [PATCH 06/13] xfs: Bring some sanity to log unmounting Dave Chinner
                   ` (9 subsequent siblings)
  14 siblings, 3 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

The only thing the periodic sync work does now is flush the AIL and
idle the log. These are really functions of the log code, so move
the work to xfs_log.c and rename it appropriately.

The only wart this leaves behind is the xfssyncd_centisecs
sysctl; otherwise xfssyncd is dead. Clean up any comments that
refer to xfssyncd to reflect its passing.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_log.c      |   41 ++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_log.h      |    3 +++
 fs/xfs/xfs_log_priv.h |    1 +
 fs/xfs/xfs_mount.h    |    1 -
 fs/xfs/xfs_super.c    |   16 ++++------------
 fs/xfs/xfs_sync.c     |   38 +++-----------------------------------
 fs/xfs/xfs_sync.h     |    2 --
 7 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7f4f937..598f279 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,7 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trace.h"
+#include "xfs_fsops.h"
 
 kmem_zone_t	*xfs_log_ticket_zone;
 
@@ -698,6 +699,8 @@ xfs_log_mount_finish(xfs_mount_t *mp)
 		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
 	}
 
+	xfs_log_work_queue(mp);
+
 	return error;
 }
 
@@ -858,7 +861,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 void
 xfs_log_unmount(xfs_mount_t *mp)
 {
-	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_log->l_work);
 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
 }
@@ -1161,6 +1164,40 @@ done:
 }	/* xlog_get_iclog_buffer_size */
 
 
+void
+xfs_log_work_queue(
+	struct xfs_mount        *mp)
+{
+	queue_delayed_work(xfs_mount_wq, &mp->m_log->l_work,
+				msecs_to_jiffies(xfs_syncd_centisecs * 10));
+}
+
+/*
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle.
+ */
+void
+xfs_log_worker(
+	struct work_struct	*work)
+{
+	struct xlog		*log = container_of(to_delayed_work(work),
+						struct xlog, l_work);
+	struct xfs_mount	*mp = log->l_mp;
+
+	/* dgc: errors ignored - not fatal and nowhere to report them */
+	if (xfs_log_need_covered(mp))
+		xfs_fs_log_dummy(mp);
+	else
+		xfs_log_force(mp, 0);
+
+	/* start pushing all the metadata that is currently dirty */
+	xfs_ail_push_all(mp->m_ail);
+
+	/* queue us up again */
+	xfs_log_work_queue(mp);
+}
+
 /*
  * This routine initializes some of the log structure for a given mount point.
  * Its primary purpose is to fill in enough, so recovery can occur.  However,
@@ -1195,6 +1232,7 @@ xlog_alloc_log(
 	log->l_logBBsize   = num_bblks;
 	log->l_covered_state = XLOG_STATE_COVER_IDLE;
 	log->l_flags	   |= XLOG_ACTIVE_RECOVERY;
+	INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
 
 	log->l_prev_block  = -1;
 	/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -3700,3 +3738,4 @@ xlog_iclogs_empty(
 	} while (iclog != log->l_iclog);
 	return 1;
 }
+
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 748d312..26ed7de 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,8 @@ int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 				xfs_lsn_t *commit_lsn, int flags);
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
+void	xfs_log_work_queue(struct xfs_mount *mp);
+void	xfs_log_worker(struct work_struct *work);
+
 #endif
 #endif	/* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 18a801d..9a4e0e5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -495,6 +495,7 @@ struct xlog {
 	struct xfs_buf		*l_xbuf;        /* extra buffer for log
 						 * wrapping */
 	struct xfs_buftarg	*l_targ;        /* buftarg of log */
+	struct delayed_work	l_work;		/* background flush work */
 	uint			l_flags;
 	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
 	struct list_head	*l_buf_cancel_table;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a0113dd..10e17d5 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -199,7 +199,6 @@ typedef struct xfs_mount {
 	struct mutex		m_icsb_mutex;	/* balancer sync lock */
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
-	struct delayed_work	m_sync_work;	/* background sync work */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
 	struct work_struct	m_flush_work;	/* background inode flush */
 	__int64_t		m_update_flags;	/* sb flags we need to update
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 619a3ff..15946a9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -918,7 +918,6 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	cancel_delayed_work_sync(&mp->m_sync_work);
 	cancel_work_sync(&mp->m_flush_work);
 
 	xfs_filestream_unmount(mp);
@@ -952,10 +951,10 @@ xfs_fs_sync_fs(
 	if (laptop_mode) {
 		/*
 		 * The disk must be active because we're syncing.
-		 * We schedule xfssyncd now (now that the disk is
+		 * We schedule log work now (now that the disk is
 		 * active) instead of later (when it might not be).
 		 */
-		flush_delayed_work_sync(&mp->m_sync_work);
+		flush_delayed_work_sync(&mp->m_log->l_work);
 	}
 
 	return 0;
@@ -1106,7 +1105,7 @@ xfs_fs_remount(
 		 * value if it is non-zero, otherwise go with the default.
 		 */
 		xfs_restore_resvblks(mp);
-		xfs_sync_queue_work(mp);
+		xfs_log_work_queue(mp);
 	}
 
 	/* rw -> ro */
@@ -1152,7 +1151,7 @@ xfs_fs_unfreeze(
 	struct xfs_mount	*mp = XFS_M(sb);
 
 	xfs_restore_resvblks(mp);
-	xfs_sync_queue_work(mp);
+	xfs_log_work_queue(mp);
 	return 0;
 }
 
@@ -1232,7 +1231,6 @@ xfs_fs_fill_super(
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
 	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
 	mp->m_super = sb;
@@ -1316,12 +1314,6 @@ xfs_fs_fill_super(
 		goto out_unmount;
 	}
 
-	/*
-	 * The filesystem is successfully mounted, so we can start background
-	 * sync work now.
-	 */
-	xfs_sync_work_queue(mp);
-
 	return 0;
 
  out_filestream_unmount:
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index b0a11cd..c4c9301 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -19,6 +19,7 @@
 #include "xfs_fs.h"
 #include "xfs_types.h"
 #include "xfs_log.h"
+#include "xfs_log_priv.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
@@ -343,8 +344,8 @@ xfs_quiesce_attr(
 	/* flush all pending changes from the AIL */
 	xfs_ail_push_all_sync(mp->m_ail);
 
-	/* stop background sync work */
-	cancel_delayed_work_sync(&mp->m_sync_work);
+	/* stop background log work */
+	cancel_delayed_work_sync(&mp->m_log->l_work);
 
 	/*
 	 * Just warn here till VFS can correctly support
@@ -375,39 +376,6 @@ xfs_quiesce_attr(
 	xfs_buf_unlock(mp->m_sb_bp);
 }
 
-void
-xfs_sync_work_queue(
-	struct xfs_mount        *mp)
-{
-	queue_delayed_work(xfs_mount_wq, &mp->m_sync_work,
-				msecs_to_jiffies(xfs_syncd_centisecs * 10));
-}
-
-/*
- * Every sync period we need to unpin all items in the AIL and push them to
- * disk. If there is nothing dirty, then we might need to cover the log to
- * indicate that the filesystem is idle.
- */
-void
-xfs_sync_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(to_delayed_work(work),
-					struct xfs_mount, m_sync_work);
-
-	/* dgc: errors ignored - not fatal and nowhere to report them */
-	if (xfs_log_need_covered(mp))
-		xfs_fs_log_dummy(mp);
-	else
-		xfs_log_force(mp, 0);
-
-	/* start pushing all the metadata that is currently dirty */
-	xfs_ail_push_all(mp->m_ail);
-
-	/* queue us up again */
-	xfs_sync_work_queue(mp);
-}
-
 /*
  * Queue a new inode reclaim pass if there are reclaimable inodes and there
  * isn't a reclaim pass already in progress. By default it runs every 5s based
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 3c22f3d..707c46e 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -24,8 +24,6 @@ struct xfs_perag;
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
 #define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
 
-void xfs_sync_work_queue(struct xfs_mount *mp);
-void xfs_sync_worker(struct work_struct *work);
 void xfs_flush_worker(struct work_struct *work);
 void xfs_reclaim_worker(struct work_struct *work);
 
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 06/13] xfs: Bring some sanity to log unmounting
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (4 preceding siblings ...)
  2012-08-30 12:00 ` [PATCH 05/13] xfs: sync work is now only periodic log work Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:28   ` Christoph Hellwig
  2012-09-04 19:11   ` Mark Tinguely
  2012-08-30 12:00 ` [PATCH 07/13] xfs: xfs_sync_data is redundant Dave Chinner
                   ` (8 subsequent siblings)
  14 siblings, 2 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

When unmounting the filesystem, there are lots of operations that
need to be done in a specific order, and they are spread across a
couple of functions. We have to drain the AIL before we
write the unmount record, and we have to shut down the background
log work before we do either of them.

But this is all split haphazardly across xfs_unmountfs() and
xfs_log_unmount(). Move all the AIL flushing and log manipulations
to xfs_log_unmount() so that the responsibilities of each function
are clear and the operations it performs are obvious.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_log.c   |   29 ++++++++++++++++++++++++++---
 fs/xfs/xfs_mount.c |   24 ------------------------
 2 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 598f279..4de160a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -853,15 +853,38 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 }	/* xfs_log_unmount_write */
 
 /*
- * Deallocate log structures for unmount/relocation.
+ * Shut down and release the AIL and Log.
  *
- * We need to stop the aild from running before we destroy
- * and deallocate the log as the aild references the log.
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log.
+ *
+ * To do this, we first need to shut down the background log work so it is not
+ * trying to cover the log as we clean up. We then need to unpin all objects in
+ * the log so we can then flush them out. Once they have completed their IO and
+ * run the callbacks removing themselves from the AIL, we can write the unmount
+ * record, tear down the AIL and finally free the log.
  */
 void
 xfs_log_unmount(xfs_mount_t *mp)
 {
 	cancel_delayed_work_sync(&mp->m_log->l_work);
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * The superblock buffer is uncached and while xfs_ail_push_all_sync()
+	 * will push it, xfs_wait_buftarg() will not wait for it. Further,
+	 * xfs_buf_iowait() cannot be used because it was pushed with the
+	 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
+	 * the IO to complete.
+	 */
+	xfs_ail_push_all_sync(mp->m_ail);
+	xfs_wait_buftarg(mp->m_ddev_targp);
+	xfs_buf_lock(mp->m_sb_bp);
+	xfs_buf_unlock(mp->m_sb_bp);
+
+	xfs_log_unmount_write(mp);
+
 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 62106e2..9b56511 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1494,13 +1494,6 @@ xfs_unmountfs(
 	xfs_qm_unmount(mp);
 
 	/*
-	 * Flush out the log synchronously so that we know for sure
-	 * that nothing is pinned.  This is important because bflush()
-	 * will skip pinned buffers.
-	 */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
-	/*
 	 * Unreserve any blocks we have so that when we unmount we don't account
 	 * the reserved free space as used. This is really only necessary for
 	 * lazy superblock counting because it trusts the incore superblock
@@ -1525,23 +1518,6 @@ xfs_unmountfs(
 		xfs_warn(mp, "Unable to update superblock counters. "
 				"Freespace may not be correct on next mount.");
 
-	/*
-	 * At this point we might have modified the superblock again and thus
-	 * added an item to the AIL, thus flush it again.
-	 */
-	xfs_ail_push_all_sync(mp->m_ail);
-	xfs_wait_buftarg(mp->m_ddev_targp);
-
-	/*
-	 * The superblock buffer is uncached and xfsaild_push() will lock and
-	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-	 * here but a lock on the superblock buffer will block until iodone()
-	 * has completed.
-	 */
-	xfs_buf_lock(mp->m_sb_bp);
-	xfs_buf_unlock(mp->m_sb_bp);
-
-	xfs_log_unmount_write(mp);
 	xfs_log_unmount(mp);
 	xfs_uuid_unmount(mp);
 
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 07/13] xfs: xfs_sync_data is redundant.
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (5 preceding siblings ...)
  2012-08-30 12:00 ` [PATCH 06/13] xfs: Bring some sanity to log unmounting Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:24   ` Christoph Hellwig
  2012-09-04 20:48   ` Mark Tinguely
  2012-08-30 12:00 ` [PATCH 08/13] xfs: xfs_sync_fsdata " Dave Chinner
                   ` (7 subsequent siblings)
  14 siblings, 2 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

We don't do any data writeback from XFS any more - the VFS is
completely responsible for that, including for freeze. We can
replace the remaining caller with the VFS level function that
achieves the same thing, but without conflicting with current
writeback work - writeback_inodes_sb_if_idle().

This means we can remove the flush_work and xfs_flush_inodes() - the
VFS functionality completely replaces the internal flush queue for
doing this writeback work in a separate context to avoid stack
overruns.
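
For context, writeback_inodes_sb_if_idle() only starts writeback for
the superblock if none is already in progress, and waits for the pass
it starts. The ENOSPC retry then has roughly this shape (sketch only;
example_try_alloc() is hypothetical, the real call sites are in the
hunks below):

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/writeback.h>

static int example_try_alloc(struct super_block *sb);  /* hypothetical */

static int example_alloc_retry(struct super_block *sb)
{
        int error;

        error = example_try_alloc(sb);
        if (error == -ENOSPC) {
                /* flush delalloc data to free reserved space, retry */
                writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
                error = example_try_alloc(sb);
        }
        return error;
}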

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_iomap.c    |    3 +-
 fs/xfs/xfs_mount.h    |    1 -
 fs/xfs/xfs_super.c    |    3 --
 fs/xfs/xfs_sync.c     |   78 -------------------------------------------------
 fs/xfs/xfs_sync.h     |    3 --
 fs/xfs/xfs_vnodeops.c |    3 +-
 6 files changed, 4 insertions(+), 87 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 973dff6..b3351d6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -446,7 +446,8 @@ retry:
 
 		if (error == ENOSPC) {
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			xfs_flush_inodes(ip);
+			writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb,
+						    WB_REASON_FS_FREE_SPACE);
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
 		}
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 10e17d5..4959c5c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -200,7 +200,6 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
-	struct work_struct	m_flush_work;	/* background inode flush */
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
 	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 15946a9..787dd79 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -918,8 +918,6 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	cancel_work_sync(&mp->m_flush_work);
-
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
 	xfs_freesb(mp);
@@ -1230,7 +1228,6 @@ xfs_fs_fill_super(
 	spin_lock_init(&mp->m_sb_lock);
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
 	mp->m_super = sb;
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index c4c9301..e5ee24a 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -216,51 +216,6 @@ xfs_inode_ag_iterator(
 }
 
 STATIC int
-xfs_sync_inode_data(
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag,
-	int			flags)
-{
-	struct inode		*inode = VFS_I(ip);
-	struct address_space *mapping = inode->i_mapping;
-	int			error = 0;
-
-	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-		return 0;
-
-	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
-		if (flags & SYNC_TRYLOCK)
-			return 0;
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-
-	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-				0 : XBF_ASYNC, FI_NONE);
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-	return error;
-}
-
-/*
- * Write out pagecache data for the whole filesystem.
- */
-STATIC int
-xfs_sync_data(
-	struct xfs_mount	*mp,
-	int			flags)
-{
-	int			error;
-
-	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
-
-	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
-	if (error)
-		return XFS_ERROR(error);
-
-	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-	return 0;
-}
-
-STATIC int
 xfs_sync_fsdata(
 	struct xfs_mount	*mp)
 {
@@ -414,39 +369,6 @@ xfs_reclaim_worker(
 	xfs_reclaim_queue_work(mp);
 }
 
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations.  At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room.
- *
- * Queue a new data flush if there isn't one already in progress and
- * wait for completion of the flush. This means that we only ever have one
- * inode flush in progress no matter how many ENOSPC events are occurring and
- * so will prevent the system from bogging down due to every concurrent
- * ENOSPC event scanning all the active inodes in the system for writeback.
- */
-void
-xfs_flush_inodes(
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-
-	queue_work(xfs_mount_wq, &mp->m_flush_work);
-	flush_work_sync(&mp->m_flush_work);
-}
-
-void
-xfs_flush_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(work,
-					struct xfs_mount, m_flush_work);
-
-	xfs_sync_data(mp, SYNC_TRYLOCK);
-	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 707c46e..0beabea 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -24,14 +24,11 @@ struct xfs_perag;
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
 #define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
 
-void xfs_flush_worker(struct work_struct *work);
 void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
-void xfs_flush_inodes(struct xfs_inode *ip);
-
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2a5c6373..dcb4de3 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -777,7 +777,8 @@ xfs_create(
 			XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
-		xfs_flush_inodes(dp);
+		writeback_inodes_sb_if_idle(VFS_I(dp)->i_sb,
+					    WB_REASON_FS_FREE_SPACE);
 		error = xfs_trans_reserve(tp, resblks, log_res, 0,
 				XFS_TRANS_PERM_LOG_RES, log_count);
 	}
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 08/13] xfs: xfs_sync_fsdata is redundant
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (6 preceding siblings ...)
  2012-08-30 12:00 ` [PATCH 07/13] xfs: xfs_sync_data is redundant Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:27   ` Christoph Hellwig
  2012-09-04 20:59   ` Mark Tinguely
  2012-08-30 12:00 ` [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c Dave Chinner
                   ` (6 subsequent siblings)
  14 siblings, 2 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

Why do we need to write the superblock to disk once we've written
all the data?  We don't actually - the reasons for doing this are
lost in the mists of time, and go back to the way Irix used to drive
VFS flushing.

On linux, this code is only called from two contexts: remount and
.sync_fs. In the remount case, the call is followed by a metadata
sync, which unpins and writes the superblock.  In the sync_fs case,
we only need to force the log to disk to ensure that the superblock
is correctly on disk, so we don't actually need to write it. Hence
the functionality is either redundant or superfluous and thus can be
removed.

Seeing as xfs_quiesce_data is essentially now just a log force,
remove it as well and fold the code back into the two callers.
Neither of them needs the log covering check, as that is
redundant for the remount case, and unnecessary for the .sync_fs
case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_super.c |   19 +++++----------
 fs/xfs/xfs_sync.c  |   67 +++++++---------------------------------------------
 2 files changed, 14 insertions(+), 72 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 787dd79..2d053fb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -934,7 +934,6 @@ xfs_fs_sync_fs(
 	int			wait)
 {
 	struct xfs_mount	*mp = XFS_M(sb);
-	int			error;
 
 	/*
 	 * Doing anything during the async pass would be counterproductive.
@@ -942,10 +941,7 @@ xfs_fs_sync_fs(
 	if (!wait)
 		return 0;
 
-	error = xfs_quiesce_data(mp);
-	if (error)
-		return -error;
-
+	xfs_log_force(mp, XFS_LOG_SYNC);
 	if (laptop_mode) {
 		/*
 		 * The disk must be active because we're syncing.
@@ -1109,15 +1105,12 @@ xfs_fs_remount(
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
 		/*
-		 * After we have synced the data but before we sync the
-		 * metadata, we need to free up the reserve block pool so that
-		 * the used block count in the superblock on disk is correct at
-		 * the end of the remount. Stash the current reserve pool size
-		 * so that if we get remounted rw, we can return it to the same
-		 * size.
+		 * Before we sync the metadata, we need to free up the reserve
+		 * block pool so that the used block count in the superblock on
+		 * disk is correct at the end of the remount. Stash the current
+		 * reserve pool size so that if we get remounted rw, we can
+		 * return it to the same size.
 		 */
-
-		xfs_quiesce_data(mp);
 		xfs_save_resvblks(mp);
 		xfs_quiesce_attr(mp);
 		mp->m_flags |= XFS_MOUNT_RDONLY;
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index e5ee24a..7092276 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -215,70 +215,16 @@ xfs_inode_ag_iterator(
 	return XFS_ERROR(last_error);
 }
 
-STATIC int
-xfs_sync_fsdata(
-	struct xfs_mount	*mp)
-{
-	struct xfs_buf		*bp;
-	int			error;
-
-	/*
-	 * If the buffer is pinned then push on the log so we won't get stuck
-	 * waiting in the write for someone, maybe ourselves, to flush the log.
-	 *
-	 * Even though we just pushed the log above, we did not have the
-	 * superblock buffer locked at that point so it can become pinned in
-	 * between there and here.
-	 */
-	bp = xfs_getsb(mp, 0);
-	if (xfs_buf_ispinned(bp))
-		xfs_log_force(mp, 0);
-	error = xfs_bwrite(bp);
-	xfs_buf_relse(bp);
-	return error;
-}
-
-/*
- * When remounting a filesystem read-only or freezing the filesystem, we have
- * two phases to execute. This first phase is syncing the data before we
- * quiesce the filesystem, and the second is flushing all the inodes out after
- * we've waited for all the transactions created by the first phase to
- * complete. The second phase ensures that the inodes are written to their
- * location on disk rather than just existing in transactions in the log. This
- * means after a quiesce there is no log replay required to write the inodes to
- * disk (this is the main difference between a sync and a quiesce).
- */
-/*
- * First stage of freeze - no writers will make progress now we are here,
- * so we flush delwri and delalloc buffers here, then wait for all I/O to
- * complete.  Data is frozen at that point. Metadata is not frozen,
- * transactions can still occur here so don't bother emptying the AIL
- * because it'll just get dirty again.
- */
-int
-xfs_quiesce_data(
-	struct xfs_mount	*mp)
-{
-	int			error, error2 = 0;
-
-	/* force out the log */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
-	/* write superblock and hoover up shutdown errors */
-	error = xfs_sync_fsdata(mp);
-
-	/* mark the log as covered if needed */
-	if (xfs_log_need_covered(mp))
-		error2 = xfs_fs_log_dummy(mp);
-
-	return error ? error : error2;
-}
-
 /*
  * Second stage of a quiesce. The data is already synced, now we have to take
  * care of the metadata. New transactions are already blocked, so we need to
  * wait for any remaining transactions to drain out before proceeding.
  *
+ * The second phase ensures that the inodes are written to their
+ * location on disk rather than just existing in transactions in the log. This
+ * means after a quiesce there is no log replay required to write the inodes to
+ * disk (this is the main difference between a sync and a quiesce).
+ *
  * Note: this stops background sync work - the callers must ensure it is started
  * again when appropriate.
  */
@@ -292,6 +238,9 @@ xfs_quiesce_attr(
 	while (atomic_read(&mp->m_active_trans) > 0)
 		delay(100);
 
+	/* force the log to unpin objects from the now complete transactions */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
 	/* reclaim inodes to do any IO before the freeze completes */
 	xfs_reclaim_inodes(mp, 0);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (7 preceding siblings ...)
  2012-08-30 12:00 ` [PATCH 08/13] xfs: xfs_sync_fsdata " Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:27   ` Christoph Hellwig
  2012-09-04 21:03   ` Mark Tinguely
  2012-08-30 12:00 ` [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount Dave Chinner
                   ` (5 subsequent siblings)
  14 siblings, 2 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

Both callers of xfs_quiesce_attr() are in xfs_super.c, and there's
nothing really sync-specific about this functionality so it doesn't
really matter where it lives. Move it to be next to its callers, so
all the remount/sync_fs code is in the one place.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_super.c |   67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_sync.c  |   65 --------------------------------------------------
 fs/xfs/xfs_sync.h  |    3 ---
 3 files changed, 67 insertions(+), 68 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2d053fb..7839817 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1025,6 +1025,73 @@ xfs_restore_resvblks(struct xfs_mount *mp)
 	xfs_reserve_blocks(mp, &resblks, NULL);
 }
 
+/*
+ * Trigger writeback of all the dirty metadata in the file system.
+ *
+ * This ensures that the metadata is written to their location on disk rather
+ * than just existing in transactions in the log. This means after a quiesce
+ * there is no log replay required to write the inodes to disk (this is the main
+ * difference between a sync and a quiesce).
+ *
+ * This shoul deffectively mimic the code in xfs_unmountfs() and
+ * xfs_log_umount() but without tearing down any structures.
+ * XXX: bug fixes needed!
+ *
+ * Note: this stops background log work - the callers must ensure it is started
+ * again when appropriate.
+ */
+void
+xfs_quiesce_attr(
+	struct xfs_mount	*mp)
+{
+	int	error = 0;
+
+	/* wait for all modifications to complete */
+	while (atomic_read(&mp->m_active_trans) > 0)
+		delay(100);
+
+	/* force the log to unpin objects from the now complete transactions */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/* reclaim inodes to do any IO before the freeze completes */
+	xfs_reclaim_inodes(mp, 0);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+	/* flush all pending changes from the AIL */
+	xfs_ail_push_all_sync(mp->m_ail);
+
+	/* stop background log work */
+	cancel_delayed_work_sync(&mp->m_log->l_work);
+
+	/*
+	 * Just warn here till VFS can correctly support
+	 * read-only remount without racing.
+	 */
+	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+
+	/* Push the superblock and write an unmount record */
+	error = xfs_log_sbcount(mp);
+	if (error)
+		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
+				"Frozen image may not be consistent.");
+	xfs_log_unmount_write(mp);
+
+	/*
+	 * At this point we might have modified the superblock again and thus
+	 * added an item to the AIL, thus flush it again.
+	 */
+	xfs_ail_push_all_sync(mp->m_ail);
+
+	/*
+	 * The superblock buffer is uncached and xfsaild_push() will lock and
+	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
+	 * here but a lock on the superblock buffer will block until iodone()
+	 * has completed.
+	 */
+	xfs_buf_lock(mp->m_sb_bp);
+	xfs_buf_unlock(mp->m_sb_bp);
+}
+
 STATIC int
 xfs_fs_remount(
 	struct super_block	*sb,
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 7092276..08fc71f 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -216,71 +216,6 @@ xfs_inode_ag_iterator(
 }
 
 /*
- * Second stage of a quiesce. The data is already synced, now we have to take
- * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceeding.
- *
- * The second phase ensures that the inodes are written to their
- * location on disk rather than just existing in transactions in the log. This
- * means after a quiesce there is no log replay required to write the inodes to
- * disk (this is the main difference between a sync and a quiesce).
- *
- * Note: this stops background sync work - the callers must ensure it is started
- * again when appropriate.
- */
-void
-xfs_quiesce_attr(
-	struct xfs_mount	*mp)
-{
-	int	error = 0;
-
-	/* wait for all modifications to complete */
-	while (atomic_read(&mp->m_active_trans) > 0)
-		delay(100);
-
-	/* force the log to unpin objects from the now complete transactions */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
-	/* reclaim inodes to do any IO before the freeze completes */
-	xfs_reclaim_inodes(mp, 0);
-	xfs_reclaim_inodes(mp, SYNC_WAIT);
-
-	/* flush all pending changes from the AIL */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/* stop background log work */
-	cancel_delayed_work_sync(&mp->m_log->l_work);
-
-	/*
-	 * Just warn here till VFS can correctly support
-	 * read-only remount without racing.
-	 */
-	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
-	/* Push the superblock and write an unmount record */
-	error = xfs_log_sbcount(mp);
-	if (error)
-		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
-				"Frozen image may not be consistent.");
-	xfs_log_unmount_write(mp);
-
-	/*
-	 * At this point we might have modified the superblock again and thus
-	 * added an item to the AIL, thus flush it again.
-	 */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/*
-	 * The superblock buffer is uncached and xfsaild_push() will lock and
-	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-	 * here but a lock on the superblock buffer will block until iodone()
-	 * has completed.
-	 */
-	xfs_buf_lock(mp->m_sb_bp);
-	xfs_buf_unlock(mp->m_sb_bp);
-}
-
-/*
  * Queue a new inode reclaim pass if there are reclaimable inodes and there
  * isn't a reclaim pass already in progress. By default it runs every 5s based
  * on the xfs syncd work default of 30s. Perhaps this should have it's own
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 0beabea..0ba9c89 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,9 +26,6 @@ struct xfs_perag;
 
 void xfs_reclaim_worker(struct work_struct *work);
 
-int xfs_quiesce_data(struct xfs_mount *mp);
-void xfs_quiesce_attr(struct xfs_mount *mp);
-
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (8 preceding siblings ...)
  2012-08-30 12:00 ` [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:29   ` Christoph Hellwig
  2012-09-04 21:04   ` Mark Tinguely
  2012-08-30 12:00 ` [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch] Dave Chinner
                   ` (4 subsequent siblings)
  14 siblings, 2 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

xfs_quiesce_attr() is supposed to leave the log empty with an
unmount record written. Right now it does not wait for the AIL to be
emptied before writing the unmount record, nor does it wait for
metadata IO completion. Fix it to use the same method and code as
xfs_log_unmount().
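
In outline, the quiesce path ends up ordered roughly like the sketch
below. This is only a sketch of the call ordering, not the literal
patch; the AIL push and superblock buffer wait in the middle of
xfs_log_quiesce() are assumed from the existing unmount code this
hunk refactors.

	void
	xfs_log_quiesce(
		struct xfs_mount	*mp)
	{
		/* stop background log work so it can't race with us */
		cancel_delayed_work_sync(&mp->m_log->l_work);

		/* unpin everything left in the log */
		xfs_log_force(mp, XFS_LOG_SYNC);

		/* wait for the AIL to empty and metadata IO to complete */
		xfs_ail_push_all_sync(mp->m_ail);
		xfs_buf_lock(mp->m_sb_bp);
		xfs_buf_unlock(mp->m_sb_bp);

		/* the log is now empty - write the unmount record */
		xfs_log_unmount_write(mp);
	}

xfs_quiesce_attr() then finishes by calling xfs_log_quiesce(mp), and
xfs_log_unmount() calls it before tearing down the AIL and the log.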

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_log.c   |   25 ++++++++++++++++++-------
 fs/xfs/xfs_log.h   |    1 +
 fs/xfs/xfs_super.c |   41 ++++++++---------------------------------
 3 files changed, 27 insertions(+), 40 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4de160a..5091754 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -853,20 +853,17 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 }	/* xfs_log_unmount_write */
 
 /*
- * Shut down and release the AIL and Log.
- *
- * During unmount, we need to ensure we flush all the dirty metadata objects
- * from the AIL so that the log is empty before we write the unmount record to
- * the log.
+ * Empty the log for unmount/freeze.
  *
  * To do this, we first need to shut down the background log work so it is not
  * trying to cover the log as we clean up. We then need to unpin all objects in
  * the log so we can then flush them out. Once they have completed their IO and
  * run the callbacks removing themselves from the AIL, we can write the unmount
- * record, tear down the AIL and finally free the log.
+ * record.
  */
 void
-xfs_log_unmount(xfs_mount_t *mp)
+xfs_log_quiesce(
+	struct xfs_mount	*mp)
 {
 	cancel_delayed_work_sync(&mp->m_log->l_work);
 	xfs_log_force(mp, XFS_LOG_SYNC);
@@ -884,6 +881,20 @@ xfs_log_unmount(xfs_mount_t *mp)
 	xfs_buf_unlock(mp->m_sb_bp);
 
 	xfs_log_unmount_write(mp);
+}
+
+/*
+ * Shut down and release the AIL and Log.
+ *
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log. Once this is done, we can tear down the AIL and the log.
+ */
+void
+xfs_log_unmount(
+	struct xfs_mount	*mp)
+{
+	xfs_log_quiesce(mp);
 
 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 26ed7de..5caee96 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -183,6 +183,7 @@ bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
 void	xfs_log_work_queue(struct xfs_mount *mp);
 void	xfs_log_worker(struct work_struct *work);
+void	xfs_log_quiesce(struct xfs_mount *mp);
 
 #endif
 #endif	/* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 7839817..283d587 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1030,15 +1030,11 @@ xfs_restore_resvblks(struct xfs_mount *mp)
  *
  * This ensures that the metadata is written to their location on disk rather
  * than just existing in transactions in the log. This means after a quiesce
- * there is no log replay required to write the inodes to disk (this is the main
- * difference between a sync and a quiesce).
+ * there is no log replay required to write the inodes to disk - this is the
+ * primary difference between a sync and a quiesce.
  *
- * This shoul deffectively mimic the code in xfs_unmountfs() and
- * xfs_log_umount() but without tearing down any structures.
- * XXX: bug fixes needed!
- *
- * Note: this stops background log work - the callers must ensure it is started
- * again when appropriate.
+ * Note: xfs_log_quiesce() stops background log work - the callers must ensure
+ * it is started again when appropriate.
  */
 void
 xfs_quiesce_attr(
@@ -1057,39 +1053,18 @@ xfs_quiesce_attr(
 	xfs_reclaim_inodes(mp, 0);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
-	/* flush all pending changes from the AIL */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/* stop background log work */
-	cancel_delayed_work_sync(&mp->m_log->l_work);
-
-	/*
-	 * Just warn here till VFS can correctly support
-	 * read-only remount without racing.
-	 */
-	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
 	/* Push the superblock and write an unmount record */
 	error = xfs_log_sbcount(mp);
 	if (error)
 		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
 				"Frozen image may not be consistent.");
-	xfs_log_unmount_write(mp);
-
 	/*
-	 * At this point we might have modified the superblock again and thus
-	 * added an item to the AIL, thus flush it again.
+	 * Just warn here till VFS can correctly support
+	 * read-only remount without racing.
 	 */
-	xfs_ail_push_all_sync(mp->m_ail);
+	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
 
-	/*
-	 * The superblock buffer is uncached and xfsaild_push() will lock and
-	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-	 * here but a lock on the superblock buffer will block until iodone()
-	 * has completed.
-	 */
-	xfs_buf_lock(mp->m_sb_bp);
-	xfs_buf_unlock(mp->m_sb_bp);
+	xfs_log_quiesce(mp);
 }
 
 STATIC int
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch]
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (9 preceding siblings ...)
  2012-08-30 12:00 ` [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:30   ` Christoph Hellwig
  2012-09-04 21:06   ` Mark Tinguely
  2012-08-30 12:00 ` [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c Dave Chinner
                   ` (3 subsequent siblings)
  14 siblings, 2 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

xfs_sync.c now only contains inode reclaim functions and inode cache
iteration functions. It is not related to sync operations anymore.
Rename it to xfs_icache.c to reflect its contents, and prepare for
consolidation with the other inode cache file that already exists
(xfs_iget.c).

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/Makefile          |    2 +-
 fs/xfs/xfs_icache.c      |  716 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_icache.h      |   43 +++
 fs/xfs/xfs_iget.c        |    1 +
 fs/xfs/xfs_mount.c       |    1 +
 fs/xfs/xfs_mount.h       |    2 -
 fs/xfs/xfs_qm_syscalls.c |    1 +
 fs/xfs/xfs_super.c       |    2 +-
 fs/xfs/xfs_sync.c        |  715 ---------------------------------------------
 fs/xfs/xfs_sync.h        |   43 ---
 10 files changed, 764 insertions(+), 762 deletions(-)
 create mode 100644 fs/xfs/xfs_icache.c
 create mode 100644 fs/xfs/xfs_icache.h
 delete mode 100644 fs/xfs/xfs_sync.c
 delete mode 100644 fs/xfs/xfs_sync.h

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974..442f256 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -39,6 +39,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_fsops.o \
 				   xfs_fs_subr.o \
 				   xfs_globals.o \
+				   xfs_icache.o \
 				   xfs_iget.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
@@ -47,7 +48,6 @@ xfs-y				+= xfs_aops.o \
 				   xfs_message.o \
 				   xfs_mru_cache.o \
 				   xfs_super.o \
-				   xfs_sync.o \
 				   xfs_xattr.o \
 				   xfs_rename.o \
 				   xfs_utils.o \
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
new file mode 100644
index 0000000..c21a72a
--- /dev/null
+++ b/fs/xfs/xfs_icache.c
@@ -0,0 +1,716 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_fsops.h"
+#include "xfs_icache.h"
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH	32
+
+STATIC int
+xfs_inode_ag_walk_grab(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	ASSERT(rcu_read_lock_held());
+
+	/*
+	 * check for stale RCU freed inode
+	 *
+	 * If the inode has been reallocated, it doesn't matter if it's not in
+	 * the AG we are walking - we are walking for writeback, so if it
+	 * passes all the "valid inode" checks and is dirty, then we'll write
+	 * it back anyway.  If it has been reallocated and still being
+	 * initialised, the XFS_INEW check below will catch it.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock_noent;
+	spin_unlock(&ip->i_flags_lock);
+
+	/* nothing to sync during shutdown */
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return EFSCORRUPTED;
+
+	/* If we can't grab the inode, it must on it's way to reclaim. */
+	if (!igrab(inode))
+		return ENOENT;
+
+	if (is_bad_inode(inode)) {
+		IRELE(ip);
+		return ENOENT;
+	}
+
+	/* inode is valid */
+	return 0;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return ENOENT;
+}
+
+STATIC int
+xfs_inode_ag_walk(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	int			(*execute)(struct xfs_inode *ip,
+					   struct xfs_perag *pag, int flags),
+	int			flags)
+{
+	uint32_t		first_index;
+	int			last_error = 0;
+	int			skipped;
+	int			done;
+	int			nr_found;
+
+restart:
+	done = 0;
+	skipped = 0;
+	first_index = 0;
+	nr_found = 0;
+	do {
+		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+		int		error = 0;
+		int		i;
+
+		rcu_read_lock();
+		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH);
+		if (!nr_found) {
+			rcu_read_unlock();
+			break;
+		}
+
+		/*
+		 * Grab the inodes before we drop the lock. if we found
+		 * nothing, nr == 0 and the loop will be skipped.
+		 */
+		for (i = 0; i < nr_found; i++) {
+			struct xfs_inode *ip = batch[i];
+
+			if (done || xfs_inode_ag_walk_grab(ip))
+				batch[i] = NULL;
+
+			/*
+			 * Update the index for the next lookup. Catch
+			 * overflows into the next AG range which can occur if
+			 * we have inodes in the last block of the AG and we
+			 * are currently pointing to the last inode.
+			 *
+			 * Because we may see inodes that are from the wrong AG
+			 * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that lead
+			 * us to see this inode, so another lookup from the
+			 * same index will not find it again.
+			 */
+			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+				continue;
+			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+				done = 1;
+		}
+
+		/* unlock now we've grabbed the inodes. */
+		rcu_read_unlock();
+
+		for (i = 0; i < nr_found; i++) {
+			if (!batch[i])
+				continue;
+			error = execute(batch[i], pag, flags);
+			IRELE(batch[i]);
+			if (error == EAGAIN) {
+				skipped++;
+				continue;
+			}
+			if (error && last_error != EFSCORRUPTED)
+				last_error = error;
+		}
+
+		/* bail out if the filesystem is corrupted.  */
+		if (error == EFSCORRUPTED)
+			break;
+
+		cond_resched();
+
+	} while (nr_found && !done);
+
+	if (skipped) {
+		delay(1);
+		goto restart;
+	}
+	return last_error;
+}
+
+int
+xfs_inode_ag_iterator(
+	struct xfs_mount	*mp,
+	int			(*execute)(struct xfs_inode *ip,
+					   struct xfs_perag *pag, int flags),
+	int			flags)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+
+	ag = 0;
+	while ((pag = xfs_perag_get(mp, ag))) {
+		ag = pag->pag_agno + 1;
+		error = xfs_inode_ag_walk(mp, pag, execute, flags);
+		xfs_perag_put(pag);
+		if (error) {
+			last_error = error;
+			if (error == EFSCORRUPTED)
+				break;
+		}
+	}
+	return XFS_ERROR(last_error);
+}
+
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs syncd work default of 30s. Perhaps this should have it's own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_queue_work(
+	struct xfs_mount        *mp)
+{
+
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		queue_delayed_work(xfs_mount_wq, &mp->m_reclaim_work,
+			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+
+	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+	xfs_reclaim_queue_work(mp);
+}
+
+void
+__xfs_inode_set_reclaim_tag(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip)
+{
+	radix_tree_tag_set(&pag->pag_ici_root,
+			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+			   XFS_ICI_RECLAIM_TAG);
+
+	if (!pag->pag_ici_reclaimable) {
+		/* propagate the reclaim tag up into the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		/* schedule periodic background inode reclaim */
+		xfs_reclaim_queue_work(ip->i_mount);
+
+		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
+	pag->pag_ici_reclaimable++;
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+	xfs_inode_t	*ip)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	spin_lock(&pag->pag_ici_lock);
+	spin_lock(&ip->i_flags_lock);
+	__xfs_inode_set_reclaim_tag(pag, ip);
+	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
+}
+
+STATIC void
+__xfs_inode_clear_reclaim(
+	xfs_perag_t	*pag,
+	xfs_inode_t	*ip)
+{
+	pag->pag_ici_reclaimable--;
+	if (!pag->pag_ici_reclaimable) {
+		/* clear the reclaim tag from the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
+}
+
+void
+__xfs_inode_clear_reclaim_tag(
+	xfs_mount_t	*mp,
+	xfs_perag_t	*pag,
+	xfs_inode_t	*ip)
+{
+	radix_tree_tag_clear(&pag->pag_ici_root,
+			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	__xfs_inode_clear_reclaim(pag, ip);
+}
+
+/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	ASSERT(rcu_read_lock_held());
+
+	/* quick check for stale RCU freed inode */
+	if (!ip->i_ino)
+		return 1;
+
+	/*
+	 * If we are asked for non-blocking operation, do unlocked checks to
+	 * see if the inode already is being flushed or in reclaim to avoid
+	 * lock traffic.
+	 */
+	if ((flags & SYNC_TRYLOCK) &&
+	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
+		return 1;
+
+	/*
+	 * The radix tree lock here protects a thread in xfs_iget from racing
+	 * with us starting reclaim on the inode.  Once we have the
+	 * XFS_IRECLAIM flag set it will not touch us.
+	 *
+	 * Due to RCU lookup, we may find inodes that have been freed and only
+	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+	 * aren't candidates for reclaim at all, so we must check the
+	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* not a reclaim candidate. */
+		spin_unlock(&ip->i_flags_lock);
+		return 1;
+	}
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	return 0;
+}
+
+/*
+ * Inodes in different states need to be treated differently. The following
+ * table lists the inode states and the reclaim actions necessary:
+ *
+ *	inode state	     iflush ret		required action
+ *      ---------------      ----------         ---------------
+ *	bad			-		reclaim
+ *	shutdown		EIO		unpin and reclaim
+ *	clean, unpinned		0		reclaim
+ *	stale, unpinned		0		reclaim
+ *	clean, pinned(*)	0		requeue
+ *	stale, pinned		EAGAIN		requeue
+ *	dirty, async		-		requeue
+ *	dirty, sync		0		reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean.
+ *
+ * Note that because the inode is flushed delayed write by AIL pushing, the
+ * flush lock may already be held here and waiting on it can result in very
+ * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
+ * the caller should push the AIL first before trying to reclaim inodes to
+ * minimise the amount of time spent waiting.  For background relaim, we only
+ * bother to reclaim clean inodes anyway.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *	bad		=> reclaim
+ *	shutdown	=> unpin and reclaim
+ *	pinned, async	=> requeue
+ *	pinned, sync	=> unpin
+ *	stale		=> reclaim
+ *	clean		=> reclaim
+ *	dirty, async	=> requeue
+ *	dirty, sync	=> flush, wait and reclaim
+ */
+STATIC int
+xfs_reclaim_inode(
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	int			sync_mode)
+{
+	struct xfs_buf		*bp = NULL;
+	int			error;
+
+restart:
+	error = 0;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out;
+		xfs_iflock(ip);
+	}
+
+	if (is_bad_inode(VFS_I(ip)))
+		goto reclaim;
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_iunpin_wait(ip);
+		xfs_iflush_abort(ip, false);
+		goto reclaim;
+	}
+	if (xfs_ipincount(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out_ifunlock;
+		xfs_iunpin_wait(ip);
+	}
+	if (xfs_iflags_test(ip, XFS_ISTALE))
+		goto reclaim;
+	if (xfs_inode_clean(ip))
+		goto reclaim;
+
+	/*
+	 * Never flush out dirty data during non-blocking reclaim, as it would
+	 * just contend with AIL pushing trying to do the same job.
+	 */
+	if (!(sync_mode & SYNC_WAIT))
+		goto out_ifunlock;
+
+	/*
+	 * Now we have an inode that needs flushing.
+	 *
+	 * Note that xfs_iflush will never block on the inode buffer lock, as
+	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
+	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
+	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
+	 * result in an ABBA deadlock with xfs_ifree_cluster().
+	 *
+	 * As xfs_ifree_cluser() must gather all inodes that are active in the
+	 * cache to mark them stale, if we hit this case we don't actually want
+	 * to do IO here - we want the inode marked stale so we can simply
+	 * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
+	 * inode, back off and try again.  Hopefully the next pass through will
+	 * see the stale flag set on the inode.
+	 */
+	error = xfs_iflush(ip, &bp);
+	if (error == EAGAIN) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		/* backoff longer than in xfs_ifree_cluster */
+		delay(2);
+		goto restart;
+	}
+
+	if (!error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+	}
+
+	xfs_iflock(ip);
+reclaim:
+	xfs_ifunlock(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	XFS_STATS_INC(xs_ig_reclaims);
+	/*
+	 * Remove the inode from the per-AG radix tree.
+	 *
+	 * Because radix_tree_delete won't complain even if the item was never
+	 * added to the tree assert that it's been there before to catch
+	 * problems with the inode life time early on.
+	 */
+	spin_lock(&pag->pag_ici_lock);
+	if (!radix_tree_delete(&pag->pag_ici_root,
+				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+		ASSERT(0);
+	__xfs_inode_clear_reclaim(pag, ip);
+	spin_unlock(&pag->pag_ici_lock);
+
+	/*
+	 * Here we do an (almost) spurious inode lock in order to coordinate
+	 * with inode cache radix tree lookups.  This is because the lookup
+	 * can reference the inodes in the cache without taking references.
+	 *
+	 * We make that OK here by ensuring that we wait until the inode is
+	 * unlocked after the lookup before we go ahead and free it.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_qm_dqdetach(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	xfs_inode_free(ip);
+	return error;
+
+out_ifunlock:
+	xfs_ifunlock(ip);
+out:
+	xfs_iflags_clear(ip, XFS_IRECLAIM);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * a short while. However, this just burns CPU time scanning the tree
+	 * waiting for IO to complete and xfssyncd never goes back to the idle
+	 * state. Instead, return 0 to let the next scheduled background reclaim
+	 * attempt to reclaim the inode again.
+	 */
+	return 0;
+}
+
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shut down during filesystem unmount reclaim walk leak all the
+ * unreclaimed inodes.
+ */
+int
+xfs_reclaim_inodes_ag(
+	struct xfs_mount	*mp,
+	int			flags,
+	int			*nr_to_scan)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+	int			trylock = flags & SYNC_TRYLOCK;
+	int			skipped;
+
+restart:
+	ag = 0;
+	skipped = 0;
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		unsigned long	first_index = 0;
+		int		done = 0;
+		int		nr_found = 0;
+
+		ag = pag->pag_agno + 1;
+
+		if (trylock) {
+			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+				skipped++;
+				xfs_perag_put(pag);
+				continue;
+			}
+			first_index = pag->pag_ici_reclaim_cursor;
+		} else
+			mutex_lock(&pag->pag_ici_reclaim_lock);
+
+		do {
+			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+			int	i;
+
+			rcu_read_lock();
+			nr_found = radix_tree_gang_lookup_tag(
+					&pag->pag_ici_root,
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH,
+					XFS_ICI_RECLAIM_TAG);
+			if (!nr_found) {
+				done = 1;
+				rcu_read_unlock();
+				break;
+			}
+
+			/*
+			 * Grab the inodes before we drop the lock. if we found
+			 * nothing, nr == 0 and the loop will be skipped.
+			 */
+			for (i = 0; i < nr_found; i++) {
+				struct xfs_inode *ip = batch[i];
+
+				if (done || xfs_reclaim_inode_grab(ip, flags))
+					batch[i] = NULL;
+
+				/*
+				 * Update the index for the next lookup. Catch
+				 * overflows into the next AG range which can
+				 * occur if we have inodes in the last block of
+				 * the AG and we are currently pointing to the
+				 * last inode.
+				 *
+				 * Because we may see inodes that are from the
+				 * wrong AG due to RCU freeing and
+				 * reallocation, only update the index if it
+				 * lies in this AG. It was a race that lead us
+				 * to see this inode, so another lookup from
+				 * the same index will not find it again.
+				 */
+				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+								pag->pag_agno)
+					continue;
+				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+					done = 1;
+			}
+
+			/* unlock now we've grabbed the inodes. */
+			rcu_read_unlock();
+
+			for (i = 0; i < nr_found; i++) {
+				if (!batch[i])
+					continue;
+				error = xfs_reclaim_inode(batch[i], pag, flags);
+				if (error && last_error != EFSCORRUPTED)
+					last_error = error;
+			}
+
+			*nr_to_scan -= XFS_LOOKUP_BATCH;
+
+			cond_resched();
+
+		} while (nr_found && !done && *nr_to_scan > 0);
+
+		if (trylock && !done)
+			pag->pag_ici_reclaim_cursor = first_index;
+		else
+			pag->pag_ici_reclaim_cursor = 0;
+		mutex_unlock(&pag->pag_ici_reclaim_lock);
+		xfs_perag_put(pag);
+	}
+
+	/*
+	 * if we skipped any AG, and we still have scan count remaining, do
+	 * another pass this time using blocking reclaim semantics (i.e
+	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
+	 * ensure that when we get more reclaimers than AGs we block rather
+	 * than spin trying to execute reclaim.
+	 */
+	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
+		trylock = 0;
+		goto restart;
+	}
+	return XFS_ERROR(last_error);
+}
+
+int
+xfs_reclaim_inodes(
+	xfs_mount_t	*mp,
+	int		mode)
+{
+	int		nr_to_scan = INT_MAX;
+
+	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+}
+
+/*
+ * Scan a certain number of inodes for reclaim.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
+ */
+void
+xfs_reclaim_inodes_nr(
+	struct xfs_mount	*mp,
+	int			nr_to_scan)
+{
+	/* kick background reclaimer and push the AIL */
+	xfs_reclaim_queue_work(mp);
+	xfs_ail_push_all(mp->m_ail);
+
+	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+}
+
+/*
+ * Return the number of reclaimable inodes in the filesystem for
+ * the shrinker to determine how much to reclaim.
+ */
+int
+xfs_reclaim_inodes_count(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		ag = 0;
+	int			reclaimable = 0;
+
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		ag = pag->pag_agno + 1;
+		reclaimable += pag->pag_ici_reclaimable;
+		xfs_perag_put(pag);
+	}
+	return reclaimable;
+}
+
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
new file mode 100644
index 0000000..0ba9c89
--- /dev/null
+++ b/fs/xfs/xfs_icache.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef XFS_SYNC_H
+#define XFS_SYNC_H 1
+
+struct xfs_mount;
+struct xfs_perag;
+
+#define SYNC_WAIT		0x0001	/* wait for i/o to complete */
+#define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
+
+void xfs_reclaim_worker(struct work_struct *work);
+
+int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+int xfs_reclaim_inodes_count(struct xfs_mount *mp);
+void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+
+void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
+void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
+				struct xfs_inode *ip);
+
+int xfs_sync_inode_grab(struct xfs_inode *ip);
+int xfs_inode_ag_iterator(struct xfs_mount *mp,
+	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
+	int flags);
+
+#endif
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 784a803..069c5ce 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -38,6 +38,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 
 /*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 9b56511..9f76f8c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 struct workqueue_struct	*xfs_mount_wq;
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4959c5c..d0946ad 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations {
 
 #else /* __KERNEL__ */
 
-#include "xfs_sync.h"
-
 struct xlog;
 struct xfs_inode;
 struct xfs_mru_cache;
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 858a3b1..7a9071f 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -40,6 +40,7 @@
 #include "xfs_utils.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
 STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 283d587..cfc26f0 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,7 +49,7 @@
 #include "xfs_extfree_item.h"
 #include "xfs_mru_cache.h"
 #include "xfs_inode_item.h"
-#include "xfs_sync.h"
+#include "xfs_icache.h"
 #include "xfs_trace.h"
 
 #include <linux/namei.h>
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
deleted file mode 100644
index 08fc71f..0000000
--- a/fs/xfs/xfs_sync.c
+++ /dev/null
@@ -1,715 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_log_priv.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_dinode.h"
-#include "xfs_error.h"
-#include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
-#include "xfs_inode_item.h"
-#include "xfs_quota.h"
-#include "xfs_trace.h"
-#include "xfs_fsops.h"
-
-#include <linux/kthread.h>
-#include <linux/freezer.h>
-
-
-/*
- * The inode lookup is done in batches to keep the amount of lock traffic and
- * radix tree lookups to a minimum. The batch size is a trade off between
- * lookup reduction and stack usage. This is in the reclaim path, so we can't
- * be too greedy.
- */
-#define XFS_LOOKUP_BATCH	32
-
-STATIC int
-xfs_inode_ag_walk_grab(
-	struct xfs_inode	*ip)
-{
-	struct inode		*inode = VFS_I(ip);
-
-	ASSERT(rcu_read_lock_held());
-
-	/*
-	 * check for stale RCU freed inode
-	 *
-	 * If the inode has been reallocated, it doesn't matter if it's not in
-	 * the AG we are walking - we are walking for writeback, so if it
-	 * passes all the "valid inode" checks and is dirty, then we'll write
-	 * it back anyway.  If it has been reallocated and still being
-	 * initialised, the XFS_INEW check below will catch it.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	if (!ip->i_ino)
-		goto out_unlock_noent;
-
-	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-		goto out_unlock_noent;
-	spin_unlock(&ip->i_flags_lock);
-
-	/* nothing to sync during shutdown */
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return EFSCORRUPTED;
-
-	/* If we can't grab the inode, it must on it's way to reclaim. */
-	if (!igrab(inode))
-		return ENOENT;
-
-	if (is_bad_inode(inode)) {
-		IRELE(ip);
-		return ENOENT;
-	}
-
-	/* inode is valid */
-	return 0;
-
-out_unlock_noent:
-	spin_unlock(&ip->i_flags_lock);
-	return ENOENT;
-}
-
-STATIC int
-xfs_inode_ag_walk(
-	struct xfs_mount	*mp,
-	struct xfs_perag	*pag,
-	int			(*execute)(struct xfs_inode *ip,
-					   struct xfs_perag *pag, int flags),
-	int			flags)
-{
-	uint32_t		first_index;
-	int			last_error = 0;
-	int			skipped;
-	int			done;
-	int			nr_found;
-
-restart:
-	done = 0;
-	skipped = 0;
-	first_index = 0;
-	nr_found = 0;
-	do {
-		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
-		int		error = 0;
-		int		i;
-
-		rcu_read_lock();
-		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-					(void **)batch, first_index,
-					XFS_LOOKUP_BATCH);
-		if (!nr_found) {
-			rcu_read_unlock();
-			break;
-		}
-
-		/*
-		 * Grab the inodes before we drop the lock. if we found
-		 * nothing, nr == 0 and the loop will be skipped.
-		 */
-		for (i = 0; i < nr_found; i++) {
-			struct xfs_inode *ip = batch[i];
-
-			if (done || xfs_inode_ag_walk_grab(ip))
-				batch[i] = NULL;
-
-			/*
-			 * Update the index for the next lookup. Catch
-			 * overflows into the next AG range which can occur if
-			 * we have inodes in the last block of the AG and we
-			 * are currently pointing to the last inode.
-			 *
-			 * Because we may see inodes that are from the wrong AG
-			 * due to RCU freeing and reallocation, only update the
-			 * index if it lies in this AG. It was a race that lead
-			 * us to see this inode, so another lookup from the
-			 * same index will not find it again.
-			 */
-			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
-				continue;
-			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-				done = 1;
-		}
-
-		/* unlock now we've grabbed the inodes. */
-		rcu_read_unlock();
-
-		for (i = 0; i < nr_found; i++) {
-			if (!batch[i])
-				continue;
-			error = execute(batch[i], pag, flags);
-			IRELE(batch[i]);
-			if (error == EAGAIN) {
-				skipped++;
-				continue;
-			}
-			if (error && last_error != EFSCORRUPTED)
-				last_error = error;
-		}
-
-		/* bail out if the filesystem is corrupted.  */
-		if (error == EFSCORRUPTED)
-			break;
-
-		cond_resched();
-
-	} while (nr_found && !done);
-
-	if (skipped) {
-		delay(1);
-		goto restart;
-	}
-	return last_error;
-}
-
-int
-xfs_inode_ag_iterator(
-	struct xfs_mount	*mp,
-	int			(*execute)(struct xfs_inode *ip,
-					   struct xfs_perag *pag, int flags),
-	int			flags)
-{
-	struct xfs_perag	*pag;
-	int			error = 0;
-	int			last_error = 0;
-	xfs_agnumber_t		ag;
-
-	ag = 0;
-	while ((pag = xfs_perag_get(mp, ag))) {
-		ag = pag->pag_agno + 1;
-		error = xfs_inode_ag_walk(mp, pag, execute, flags);
-		xfs_perag_put(pag);
-		if (error) {
-			last_error = error;
-			if (error == EFSCORRUPTED)
-				break;
-		}
-	}
-	return XFS_ERROR(last_error);
-}
-
-/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs syncd work default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_reclaim_queue_work(
-	struct xfs_mount        *mp)
-{
-
-	rcu_read_lock();
-	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-		queue_delayed_work(xfs_mount_wq, &mp->m_reclaim_work,
-			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
-	}
-	rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(to_delayed_work(work),
-					struct xfs_mount, m_reclaim_work);
-
-	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-	xfs_reclaim_queue_work(mp);
-}
-
-void
-__xfs_inode_set_reclaim_tag(
-	struct xfs_perag	*pag,
-	struct xfs_inode	*ip)
-{
-	radix_tree_tag_set(&pag->pag_ici_root,
-			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-			   XFS_ICI_RECLAIM_TAG);
-
-	if (!pag->pag_ici_reclaimable) {
-		/* propagate the reclaim tag up into the perag radix tree */
-		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-				XFS_ICI_RECLAIM_TAG);
-		spin_unlock(&ip->i_mount->m_perag_lock);
-
-		/* schedule periodic background inode reclaim */
-		xfs_reclaim_queue_work(ip->i_mount);
-
-		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-							-1, _RET_IP_);
-	}
-	pag->pag_ici_reclaimable++;
-}
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
-	xfs_inode_t	*ip)
-{
-	struct xfs_mount *mp = ip->i_mount;
-	struct xfs_perag *pag;
-
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-	spin_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-	__xfs_inode_set_reclaim_tag(pag, ip);
-	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-	spin_unlock(&ip->i_flags_lock);
-	spin_unlock(&pag->pag_ici_lock);
-	xfs_perag_put(pag);
-}
-
-STATIC void
-__xfs_inode_clear_reclaim(
-	xfs_perag_t	*pag,
-	xfs_inode_t	*ip)
-{
-	pag->pag_ici_reclaimable--;
-	if (!pag->pag_ici_reclaimable) {
-		/* clear the reclaim tag from the perag radix tree */
-		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-				XFS_ICI_RECLAIM_TAG);
-		spin_unlock(&ip->i_mount->m_perag_lock);
-		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
-							-1, _RET_IP_);
-	}
-}
-
-void
-__xfs_inode_clear_reclaim_tag(
-	xfs_mount_t	*mp,
-	xfs_perag_t	*pag,
-	xfs_inode_t	*ip)
-{
-	radix_tree_tag_clear(&pag->pag_ici_root,
-			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-	__xfs_inode_clear_reclaim(pag, ip);
-}
-
-/*
- * Grab the inode for reclaim exclusively.
- * Return 0 if we grabbed it, non-zero otherwise.
- */
-STATIC int
-xfs_reclaim_inode_grab(
-	struct xfs_inode	*ip,
-	int			flags)
-{
-	ASSERT(rcu_read_lock_held());
-
-	/* quick check for stale RCU freed inode */
-	if (!ip->i_ino)
-		return 1;
-
-	/*
-	 * If we are asked for non-blocking operation, do unlocked checks to
-	 * see if the inode already is being flushed or in reclaim to avoid
-	 * lock traffic.
-	 */
-	if ((flags & SYNC_TRYLOCK) &&
-	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
-		return 1;
-
-	/*
-	 * The radix tree lock here protects a thread in xfs_iget from racing
-	 * with us starting reclaim on the inode.  Once we have the
-	 * XFS_IRECLAIM flag set it will not touch us.
-	 *
-	 * Due to RCU lookup, we may find inodes that have been freed and only
-	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
-	 * aren't candidates for reclaim at all, so we must check the
-	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
-	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		/* not a reclaim candidate. */
-		spin_unlock(&ip->i_flags_lock);
-		return 1;
-	}
-	__xfs_iflags_set(ip, XFS_IRECLAIM);
-	spin_unlock(&ip->i_flags_lock);
-	return 0;
-}
-
-/*
- * Inodes in different states need to be treated differently. The following
- * table lists the inode states and the reclaim actions necessary:
- *
- *	inode state	     iflush ret		required action
- *      ---------------      ----------         ---------------
- *	bad			-		reclaim
- *	shutdown		EIO		unpin and reclaim
- *	clean, unpinned		0		reclaim
- *	stale, unpinned		0		reclaim
- *	clean, pinned(*)	0		requeue
- *	stale, pinned		EAGAIN		requeue
- *	dirty, async		-		requeue
- *	dirty, sync		0		reclaim
- *
- * (*) dgc: I don't think the clean, pinned state is possible but it gets
- * handled anyway given the order of checks implemented.
- *
- * Also, because we get the flush lock first, we know that any inode that has
- * been flushed delwri has had the flush completed by the time we check that
- * the inode is clean.
- *
- * Note that because the inode is flushed delayed write by AIL pushing, the
- * flush lock may already be held here and waiting on it can result in very
- * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
- * the caller should push the AIL first before trying to reclaim inodes to
- * minimise the amount of time spent waiting.  For background relaim, we only
- * bother to reclaim clean inodes anyway.
- *
- * Hence the order of actions after gaining the locks should be:
- *	bad		=> reclaim
- *	shutdown	=> unpin and reclaim
- *	pinned, async	=> requeue
- *	pinned, sync	=> unpin
- *	stale		=> reclaim
- *	clean		=> reclaim
- *	dirty, async	=> requeue
- *	dirty, sync	=> flush, wait and reclaim
- */
-STATIC int
-xfs_reclaim_inode(
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag,
-	int			sync_mode)
-{
-	struct xfs_buf		*bp = NULL;
-	int			error;
-
-restart:
-	error = 0;
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	if (!xfs_iflock_nowait(ip)) {
-		if (!(sync_mode & SYNC_WAIT))
-			goto out;
-		xfs_iflock(ip);
-	}
-
-	if (is_bad_inode(VFS_I(ip)))
-		goto reclaim;
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		xfs_iunpin_wait(ip);
-		xfs_iflush_abort(ip, false);
-		goto reclaim;
-	}
-	if (xfs_ipincount(ip)) {
-		if (!(sync_mode & SYNC_WAIT))
-			goto out_ifunlock;
-		xfs_iunpin_wait(ip);
-	}
-	if (xfs_iflags_test(ip, XFS_ISTALE))
-		goto reclaim;
-	if (xfs_inode_clean(ip))
-		goto reclaim;
-
-	/*
-	 * Never flush out dirty data during non-blocking reclaim, as it would
-	 * just contend with AIL pushing trying to do the same job.
-	 */
-	if (!(sync_mode & SYNC_WAIT))
-		goto out_ifunlock;
-
-	/*
-	 * Now we have an inode that needs flushing.
-	 *
-	 * Note that xfs_iflush will never block on the inode buffer lock, as
-	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
-	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
-	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
-	 * result in an ABBA deadlock with xfs_ifree_cluster().
-	 *
-	 * As xfs_ifree_cluser() must gather all inodes that are active in the
-	 * cache to mark them stale, if we hit this case we don't actually want
-	 * to do IO here - we want the inode marked stale so we can simply
-	 * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
-	 * inode, back off and try again.  Hopefully the next pass through will
-	 * see the stale flag set on the inode.
-	 */
-	error = xfs_iflush(ip, &bp);
-	if (error == EAGAIN) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		/* backoff longer than in xfs_ifree_cluster */
-		delay(2);
-		goto restart;
-	}
-
-	if (!error) {
-		error = xfs_bwrite(bp);
-		xfs_buf_relse(bp);
-	}
-
-	xfs_iflock(ip);
-reclaim:
-	xfs_ifunlock(ip);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	XFS_STATS_INC(xs_ig_reclaims);
-	/*
-	 * Remove the inode from the per-AG radix tree.
-	 *
-	 * Because radix_tree_delete won't complain even if the item was never
-	 * added to the tree assert that it's been there before to catch
-	 * problems with the inode life time early on.
-	 */
-	spin_lock(&pag->pag_ici_lock);
-	if (!radix_tree_delete(&pag->pag_ici_root,
-				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
-		ASSERT(0);
-	__xfs_inode_clear_reclaim(pag, ip);
-	spin_unlock(&pag->pag_ici_lock);
-
-	/*
-	 * Here we do an (almost) spurious inode lock in order to coordinate
-	 * with inode cache radix tree lookups.  This is because the lookup
-	 * can reference the inodes in the cache without taking references.
-	 *
-	 * We make that OK here by ensuring that we wait until the inode is
-	 * unlocked after the lookup before we go ahead and free it.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_qm_dqdetach(ip);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	xfs_inode_free(ip);
-	return error;
-
-out_ifunlock:
-	xfs_ifunlock(ip);
-out:
-	xfs_iflags_clear(ip, XFS_IRECLAIM);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	/*
-	 * We could return EAGAIN here to make reclaim rescan the inode tree in
-	 * a short while. However, this just burns CPU time scanning the tree
-	 * waiting for IO to complete and xfssyncd never goes back to the idle
-	 * state. Instead, return 0 to let the next scheduled background reclaim
-	 * attempt to reclaim the inode again.
-	 */
-	return 0;
-}
-
-/*
- * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
- * corrupted, we still want to try to reclaim all the inodes. If we don't,
- * then a shut down during filesystem unmount reclaim walk leak all the
- * unreclaimed inodes.
- */
-int
-xfs_reclaim_inodes_ag(
-	struct xfs_mount	*mp,
-	int			flags,
-	int			*nr_to_scan)
-{
-	struct xfs_perag	*pag;
-	int			error = 0;
-	int			last_error = 0;
-	xfs_agnumber_t		ag;
-	int			trylock = flags & SYNC_TRYLOCK;
-	int			skipped;
-
-restart:
-	ag = 0;
-	skipped = 0;
-	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-		unsigned long	first_index = 0;
-		int		done = 0;
-		int		nr_found = 0;
-
-		ag = pag->pag_agno + 1;
-
-		if (trylock) {
-			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
-				skipped++;
-				xfs_perag_put(pag);
-				continue;
-			}
-			first_index = pag->pag_ici_reclaim_cursor;
-		} else
-			mutex_lock(&pag->pag_ici_reclaim_lock);
-
-		do {
-			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
-			int	i;
-
-			rcu_read_lock();
-			nr_found = radix_tree_gang_lookup_tag(
-					&pag->pag_ici_root,
-					(void **)batch, first_index,
-					XFS_LOOKUP_BATCH,
-					XFS_ICI_RECLAIM_TAG);
-			if (!nr_found) {
-				done = 1;
-				rcu_read_unlock();
-				break;
-			}
-
-			/*
-			 * Grab the inodes before we drop the lock. if we found
-			 * nothing, nr == 0 and the loop will be skipped.
-			 */
-			for (i = 0; i < nr_found; i++) {
-				struct xfs_inode *ip = batch[i];
-
-				if (done || xfs_reclaim_inode_grab(ip, flags))
-					batch[i] = NULL;
-
-				/*
-				 * Update the index for the next lookup. Catch
-				 * overflows into the next AG range which can
-				 * occur if we have inodes in the last block of
-				 * the AG and we are currently pointing to the
-				 * last inode.
-				 *
-				 * Because we may see inodes that are from the
-				 * wrong AG due to RCU freeing and
-				 * reallocation, only update the index if it
-				 * lies in this AG. It was a race that lead us
-				 * to see this inode, so another lookup from
-				 * the same index will not find it again.
-				 */
-				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
-								pag->pag_agno)
-					continue;
-				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-					done = 1;
-			}
-
-			/* unlock now we've grabbed the inodes. */
-			rcu_read_unlock();
-
-			for (i = 0; i < nr_found; i++) {
-				if (!batch[i])
-					continue;
-				error = xfs_reclaim_inode(batch[i], pag, flags);
-				if (error && last_error != EFSCORRUPTED)
-					last_error = error;
-			}
-
-			*nr_to_scan -= XFS_LOOKUP_BATCH;
-
-			cond_resched();
-
-		} while (nr_found && !done && *nr_to_scan > 0);
-
-		if (trylock && !done)
-			pag->pag_ici_reclaim_cursor = first_index;
-		else
-			pag->pag_ici_reclaim_cursor = 0;
-		mutex_unlock(&pag->pag_ici_reclaim_lock);
-		xfs_perag_put(pag);
-	}
-
-	/*
-	 * if we skipped any AG, and we still have scan count remaining, do
-	 * another pass this time using blocking reclaim semantics (i.e
-	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
-	 * ensure that when we get more reclaimers than AGs we block rather
-	 * than spin trying to execute reclaim.
-	 */
-	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
-		trylock = 0;
-		goto restart;
-	}
-	return XFS_ERROR(last_error);
-}
-
-int
-xfs_reclaim_inodes(
-	xfs_mount_t	*mp,
-	int		mode)
-{
-	int		nr_to_scan = INT_MAX;
-
-	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
-}
-
-/*
- * Scan a certain number of inodes for reclaim.
- *
- * When called we make sure that there is a background (fast) inode reclaim in
- * progress, while we will throttle the speed of reclaim via doing synchronous
- * reclaim of inodes. That means if we come across dirty inodes, we wait for
- * them to be cleaned, which we hope will not be very long due to the
- * background walker having already kicked the IO off on those dirty inodes.
- */
-void
-xfs_reclaim_inodes_nr(
-	struct xfs_mount	*mp,
-	int			nr_to_scan)
-{
-	/* kick background reclaimer and push the AIL */
-	xfs_reclaim_queue_work(mp);
-	xfs_ail_push_all(mp->m_ail);
-
-	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
-}
-
-/*
- * Return the number of reclaimable inodes in the filesystem for
- * the shrinker to determine how much to reclaim.
- */
-int
-xfs_reclaim_inodes_count(
-	struct xfs_mount	*mp)
-{
-	struct xfs_perag	*pag;
-	xfs_agnumber_t		ag = 0;
-	int			reclaimable = 0;
-
-	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-		ag = pag->pag_agno + 1;
-		reclaimable += pag->pag_ici_reclaimable;
-		xfs_perag_put(pag);
-	}
-	return reclaimable;
-}
-
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
deleted file mode 100644
index 0ba9c89..0000000
--- a/fs/xfs/xfs_sync.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef XFS_SYNC_H
-#define XFS_SYNC_H 1
-
-struct xfs_mount;
-struct xfs_perag;
-
-#define SYNC_WAIT		0x0001	/* wait for i/o to complete */
-#define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
-
-void xfs_reclaim_worker(struct work_struct *work);
-
-int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
-int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
-
-void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
-				struct xfs_inode *ip);
-
-int xfs_sync_inode_grab(struct xfs_inode *ip);
-int xfs_inode_ag_iterator(struct xfs_mount *mp,
-	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags);
-
-#endif
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (10 preceding siblings ...)
  2012-08-30 12:00 ` [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch] Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:30   ` Christoph Hellwig
  2012-09-04 21:07   ` Mark Tinguely
  2012-08-30 12:00 ` [PATCH 13/13] xfs: remove xfs_iget.c Dave Chinner
                   ` (2 subsequent siblings)
  14 siblings, 2 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

xfs_ilock() and friends really aren't related to the inode cache in
any way, so move them to xfs_inode.c with all the other
inode-related functionality.

While doing this move, move the xfs_ilock() tracepoints to *before*
the lock is taken so that when a hang occurs on a lock we have
events indicating which process was trying to lock which inode. This
is much better than the current silence we get on a hang...
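
As a rough sketch of the reordering (not the literal hunk; the
ASSERTs are elided for brevity), xfs_ilock() ends up looking
something like this:

	void
	xfs_ilock(
		xfs_inode_t		*ip,
		uint			lock_flags)
	{
		/* emit the trace event before we can block on either lock */
		trace_xfs_ilock(ip, lock_flags, _RET_IP_);

		if (lock_flags & XFS_IOLOCK_EXCL)
			mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
		else if (lock_flags & XFS_IOLOCK_SHARED)
			mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));

		if (lock_flags & XFS_ILOCK_EXCL)
			mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
		else if (lock_flags & XFS_ILOCK_SHARED)
			mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
	}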

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_iget.c  |  251 ----------------------------------------------------
 fs/xfs/xfs_inode.c |  250 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 250 insertions(+), 251 deletions(-)

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 069c5ce..ea9a5fa 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -453,254 +453,3 @@ out_error_or_again:
 	return error;
 }
 
-/*
- * This is a wrapper routine around the xfs_ilock() routine
- * used to centralize some grungy code.  It is used in places
- * that wish to lock the inode solely for reading the extents.
- * The reason these places can't just call xfs_ilock(SHARED)
- * is that the inode lock also guards to bringing in of the
- * extents from disk for a file in b-tree format.  If the inode
- * is in b-tree format, then we need to lock the inode exclusively
- * until the extents are read in.  Locking it exclusively all
- * the time would limit our parallelism unnecessarily, though.
- * What we do instead is check to see if the extents have been
- * read in yet, and only lock the inode exclusively if they
- * have not.
- *
- * The function returns a value which should be given to the
- * corresponding xfs_iunlock_map_shared().  This value is
- * the mode in which the lock was actually taken.
- */
-uint
-xfs_ilock_map_shared(
-	xfs_inode_t	*ip)
-{
-	uint	lock_mode;
-
-	if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
-	    ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
-		lock_mode = XFS_ILOCK_EXCL;
-	} else {
-		lock_mode = XFS_ILOCK_SHARED;
-	}
-
-	xfs_ilock(ip, lock_mode);
-
-	return lock_mode;
-}
-
-/*
- * This is simply the unlock routine to go with xfs_ilock_map_shared().
- * All it does is call xfs_iunlock() with the given lock_mode.
- */
-void
-xfs_iunlock_map_shared(
-	xfs_inode_t	*ip,
-	unsigned int	lock_mode)
-{
-	xfs_iunlock(ip, lock_mode);
-}
-
-/*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock.  This routine
- * allows either or both of the locks to be obtained.
- *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- *       to be locked.  It can be:
- *		XFS_IOLOCK_SHARED,
- *		XFS_IOLOCK_EXCL,
- *		XFS_ILOCK_SHARED,
- *		XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
- */
-void
-xfs_ilock(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	/*
-	 * You can't set both SHARED and EXCL for the same lock,
-	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-
-	if (lock_flags & XFS_ILOCK_EXCL)
-		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
-	else if (lock_flags & XFS_ILOCK_SHARED)
-		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
-
-	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * This is just like xfs_ilock(), except that the caller
- * is guaranteed not to sleep.  It returns 1 if it gets
- * the requested locks and 0 otherwise.  If the IO lock is
- * obtained but the inode lock cannot be, then the IO lock
- * is dropped before returning.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks to be
- *       to be locked.  See the comment for xfs_ilock() for a list
- *	 of valid values.
- */
-int
-xfs_ilock_nowait(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	/*
-	 * You can't set both SHARED and EXCL for the same lock,
-	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
-	if (lock_flags & XFS_IOLOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_iolock))
-			goto out;
-	} else if (lock_flags & XFS_IOLOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_iolock))
-			goto out;
-	}
-	if (lock_flags & XFS_ILOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_lock))
-			goto out_undo_iolock;
-	} else if (lock_flags & XFS_ILOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_lock))
-			goto out_undo_iolock;
-	}
-	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
-	return 1;
-
- out_undo_iolock:
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrunlock_excl(&ip->i_iolock);
-	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mrunlock_shared(&ip->i_iolock);
- out:
-	return 0;
-}
-
-/*
- * xfs_iunlock() is used to drop the inode locks acquired with
- * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
- * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
- * that we know which locks to drop.
- *
- * ip -- the inode being unlocked
- * lock_flags -- this parameter indicates the inode's locks to be
- *       to be unlocked.  See the comment for xfs_ilock() for a list
- *	 of valid values for this parameter.
- *
- */
-void
-xfs_iunlock(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	/*
-	 * You can't set both SHARED and EXCL for the same lock,
-	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-	ASSERT(lock_flags != 0);
-
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrunlock_excl(&ip->i_iolock);
-	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mrunlock_shared(&ip->i_iolock);
-
-	if (lock_flags & XFS_ILOCK_EXCL)
-		mrunlock_excl(&ip->i_lock);
-	else if (lock_flags & XFS_ILOCK_SHARED)
-		mrunlock_shared(&ip->i_lock);
-
-	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * give up write locks.  the i/o lock cannot be held nested
- * if it is being demoted.
- */
-void
-xfs_ilock_demote(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
-
-	if (lock_flags & XFS_ILOCK_EXCL)
-		mrdemote(&ip->i_lock);
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrdemote(&ip->i_iolock);
-
-	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
-}
-
-#ifdef DEBUG
-int
-xfs_isilocked(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
-		if (!(lock_flags & XFS_ILOCK_SHARED))
-			return !!ip->i_lock.mr_writer;
-		return rwsem_is_locked(&ip->i_lock.mr_lock);
-	}
-
-	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
-		if (!(lock_flags & XFS_IOLOCK_SHARED))
-			return !!ip->i_iolock.mr_writer;
-		return rwsem_is_locked(&ip->i_iolock.mr_lock);
-	}
-
-	ASSERT(0);
-	return 0;
-}
-#endif
-
-void
-__xfs_iflock(
-	struct xfs_inode	*ip)
-{
-	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
-	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
-	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
-		if (xfs_isiflocked(ip))
-			io_schedule();
-	} while (!xfs_iflock_nowait(ip));
-
-	finish_wait(wq, &wait.wait);
-}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2778258..ba404e4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -74,6 +74,256 @@ xfs_get_extsz_hint(
 	return 0;
 }
 
+/*
+ * This is a wrapper routine around the xfs_ilock() routine used to centralize
+ * some grungy code.  It is used in places that wish to lock the inode solely
+ * for reading the extents.  The reason these places can't just call
+ * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
+ * extents from disk for a file in b-tree format.  If the inode is in b-tree
+ * format, then we need to lock the inode exclusively until the extents are read
+ * in.  Locking it exclusively all the time would limit our parallelism
+ * unnecessarily, though.  What we do instead is check to see if the extents
+ * have been read in yet, and only lock the inode exclusively if they have not.
+ *
+ * The function returns a value which should be given to the corresponding
+ * xfs_iunlock_map_shared().  This value is the mode in which the lock was
+ * actually taken.
+ */
+uint
+xfs_ilock_map_shared(
+	xfs_inode_t	*ip)
+{
+	uint	lock_mode;
+
+	if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
+	    ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+		lock_mode = XFS_ILOCK_EXCL;
+	} else {
+		lock_mode = XFS_ILOCK_SHARED;
+	}
+
+	xfs_ilock(ip, lock_mode);
+
+	return lock_mode;
+}
+
+/*
+ * This is simply the unlock routine to go with xfs_ilock_map_shared().
+ * All it does is call xfs_iunlock() with the given lock_mode.
+ */
+void
+xfs_iunlock_map_shared(
+	xfs_inode_t	*ip,
+	unsigned int	lock_mode)
+{
+	xfs_iunlock(ip, lock_mode);
+}
+
+/*
+ * The xfs inode contains 2 locks: a multi-reader lock called the
+ * i_iolock and a multi-reader lock called the i_lock.  This routine
+ * allows either or both of the locks to be obtained.
+ *
+ * The 2 locks should always be ordered so that the IO lock is
+ * obtained first in order to prevent deadlock.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ *       to be locked.  It can be:
+ *		XFS_IOLOCK_SHARED,
+ *		XFS_IOLOCK_EXCL,
+ *		XFS_ILOCK_SHARED,
+ *		XFS_ILOCK_EXCL,
+ *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
+ *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
+ *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
+ *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ */
+void
+xfs_ilock(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
+
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_ILOCK_SHARED)
+		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep.  It returns 1 if it gets
+ * the requested locks and 0 otherwise.  If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *       to be locked.  See the comment for xfs_ilock() for a list
+ *	 of valid values.
+ */
+int
+xfs_ilock_nowait(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
+
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_iolock))
+			goto out;
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_iolock))
+			goto out;
+	}
+	if (lock_flags & XFS_ILOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_lock))
+			goto out_undo_iolock;
+	} else if (lock_flags & XFS_ILOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_lock))
+			goto out_undo_iolock;
+	}
+	return 1;
+
+ out_undo_iolock:
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrunlock_excl(&ip->i_iolock);
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mrunlock_shared(&ip->i_iolock);
+ out:
+	return 0;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *       to be unlocked.  See the comment for xfs_ilock() for a list
+ *	 of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+	ASSERT(lock_flags != 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrunlock_excl(&ip->i_iolock);
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mrunlock_shared(&ip->i_iolock);
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrunlock_excl(&ip->i_lock);
+	else if (lock_flags & XFS_ILOCK_SHARED)
+		mrunlock_shared(&ip->i_lock);
+
+	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
+}
+
+/*
+ * give up write locks.  the i/o lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrdemote(&ip->i_lock);
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrdemote(&ip->i_iolock);
+
+	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
+}
+
+#ifdef DEBUG
+int
+xfs_isilocked(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+		if (!(lock_flags & XFS_ILOCK_SHARED))
+			return !!ip->i_lock.mr_writer;
+		return rwsem_is_locked(&ip->i_lock.mr_lock);
+	}
+
+	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+		if (!(lock_flags & XFS_IOLOCK_SHARED))
+			return !!ip->i_iolock.mr_writer;
+		return rwsem_is_locked(&ip->i_iolock.mr_lock);
+	}
+
+	ASSERT(0);
+	return 0;
+}
+#endif
+
+void
+__xfs_iflock(
+	struct xfs_inode	*ip)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+	do {
+		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		if (xfs_isiflocked(ip))
+			io_schedule();
+	} while (!xfs_iflock_nowait(ip));
+
+	finish_wait(wq, &wait.wait);
+}
+
 #ifdef DEBUG
 /*
  * Make sure that the extents in the given memory buffer
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* [PATCH 13/13] xfs: remove xfs_iget.c
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (11 preceding siblings ...)
  2012-08-30 12:00 ` [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:31   ` Christoph Hellwig
  2012-09-04 21:11   ` Mark Tinguely
  2012-08-30 12:15 ` [PATCH V2 00/13] xfs: remove the xfssyncd mess Markus Trippelsdorf
  2012-08-31 14:01 ` Mark Tinguely
  14 siblings, 2 replies; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

The inode cache functions remaining in xfs_iget.c can be moved to xfs_icache.c
along with the other inode cache functions. This removes all functionality from
xfs_iget.c, so the file can simply be removed.

This move results in various functions now only having the scope of a single
file (e.g. xfs_inode_free()), so clean up all the definitions and exported
prototypes in xfs_icache.[ch] and xfs_inode.h appropriately.
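
As a quick sanity check that nothing still refers to the removed file,
a grep along these lines (the pattern is only illustrative) should come
back empty, or at worst point at stale comments:

$ git grep -n 'xfs_iget\.[co]' fs/xfs/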

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/Makefile          |    1 -
 fs/xfs/xfs_export.c      |    1 +
 fs/xfs/xfs_icache.c      |  421 +++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_icache.h      |    6 +-
 fs/xfs/xfs_iget.c        |  455 ----------------------------------------------
 fs/xfs/xfs_inode.c       |    1 +
 fs/xfs/xfs_inode.h       |   10 +-
 fs/xfs/xfs_itable.c      |    1 +
 fs/xfs/xfs_log_recover.c |    1 +
 fs/xfs/xfs_qm.c          |    1 +
 fs/xfs/xfs_rtalloc.c     |    1 +
 fs/xfs/xfs_vnodeops.c    |    1 +
 12 files changed, 430 insertions(+), 470 deletions(-)
 delete mode 100644 fs/xfs/xfs_iget.c

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 442f256..e65357b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -40,7 +40,6 @@ xfs-y				+= xfs_aops.o \
 				   xfs_fs_subr.o \
 				   xfs_globals.o \
 				   xfs_icache.o \
-				   xfs_iget.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
 				   xfs_iops.o \
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 4267922..9b6e330 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -29,6 +29,7 @@
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c21a72a..9aa1ed2 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -41,6 +41,421 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
+STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
+				struct xfs_perag *pag, struct xfs_inode *ip);
+
+/*
+ * Allocate and initialise an xfs_inode.
+ */
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	struct xfs_inode	*ip;
+
+	/*
+	 * if this didn't occur in transactions, we could use
+	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+	 * code up to do this anyway.
+	 */
+	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+	if (!ip)
+		return NULL;
+	if (inode_init_always(mp->m_super, VFS_I(ip))) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return NULL;
+	}
+
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(!xfs_isiflocked(ip));
+	ASSERT(ip->i_ino == 0);
+
+	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+	/* initialise the xfs inode */
+	ip->i_ino = ino;
+	ip->i_mount = mp;
+	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+	ip->i_afp = NULL;
+	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+	ip->i_flags = 0;
+	ip->i_delayed_blks = 0;
+	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+
+	return ip;
+}
+
+STATIC void
+xfs_inode_free_callback(
+	struct rcu_head		*head)
+{
+	struct inode		*inode = container_of(head, struct inode, i_rcu);
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
+STATIC void
+xfs_inode_free(
+	struct xfs_inode	*ip)
+{
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
+	}
+
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+	if (ip->i_itemp) {
+		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+		xfs_inode_item_destroy(ip);
+		ip->i_itemp = NULL;
+	}
+
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(!xfs_isiflocked(ip));
+
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always
+	 * appears to be reclaimed with an invalid inode number when in the
+	 * free state. The ip->i_flags_lock provides the barrier against lookup
+	 * races.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
+
+	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+/*
+ * Check the validity of the inode we just found in the cache
+ */
+static int
+xfs_iget_cache_hit(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
+	int			flags,
+	int			lock_flags) __releases(RCU)
+{
+	struct inode		*inode = VFS_I(ip);
+	struct xfs_mount	*mp = ip->i_mount;
+	int			error;
+
+	/*
+	 * check for re-use of an inode within an RCU grace period due to the
+	 * radix tree nodes not being updated yet. We monitor for this by
+	 * setting the inode number to zero before freeing the inode structure.
+	 * If the inode has been reallocated and set up, then the inode number
+	 * will not match, so check for that, too.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (ip->i_ino != ino) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
+
+	/*
+	 * If we are racing with another cache hit that is currently
+	 * instantiating this inode or currently recycling it out of
+	 * reclaimable state, wait for the initialisation to complete
+	 * before continuing.
+	 *
+	 * XXX(hch): eventually we should do something equivalent to
+	 *	     wait_on_inode to wait for these flags to be cleared
+	 *	     instead of polling for it.
+	 */
+	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
+	/*
+	 * If lookup is racing with unlink return an error immediately.
+	 */
+	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		goto out_error;
+	}
+
+	/*
+	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+	 * Need to carefully get it back into useable state.
+	 */
+	if (ip->i_flags & XFS_IRECLAIMABLE) {
+		trace_xfs_iget_reclaim(ip);
+
+		/*
+		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+		 * from stomping over us while we recycle the inode.  We can't
+		 * clear the radix tree reclaimable tag yet as it requires
+		 * pag_ici_lock to be held exclusive.
+		 */
+		ip->i_flags |= XFS_IRECLAIM;
+
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+
+		error = -inode_init_always(mp->m_super, inode);
+		if (error) {
+			/*
+			 * Re-initializing the inode failed, and we are in deep
+			 * trouble.  Try to re-add it to the reclaim list.
+			 */
+			rcu_read_lock();
+			spin_lock(&ip->i_flags_lock);
+
+			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+			trace_xfs_iget_reclaim_fail(ip);
+			goto out_error;
+		}
+
+		spin_lock(&pag->pag_ici_lock);
+		spin_lock(&ip->i_flags_lock);
+
+		/*
+		 * Clear the per-lifetime state in the inode as we are now
+		 * effectively a new inode and need to return to the initial
+		 * state before reuse occurs.
+		 */
+		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+		ip->i_flags |= XFS_INEW;
+		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
+		inode->i_state = I_NEW;
+
+		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+		spin_unlock(&ip->i_flags_lock);
+		spin_unlock(&pag->pag_ici_lock);
+	} else {
+		/* If the VFS inode is being torn down, pause and try again. */
+		if (!igrab(inode)) {
+			trace_xfs_iget_skip(ip);
+			error = EAGAIN;
+			goto out_error;
+		}
+
+		/* We've got a live one. */
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+		trace_xfs_iget_hit(ip);
+	}
+
+	if (lock_flags != 0)
+		xfs_ilock(ip, lock_flags);
+
+	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+	XFS_STATS_INC(xs_ig_found);
+
+	return 0;
+
+out_error:
+	spin_unlock(&ip->i_flags_lock);
+	rcu_read_unlock();
+	return error;
+}
+
+
+static int
+xfs_iget_cache_miss(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	xfs_trans_t		*tp,
+	xfs_ino_t		ino,
+	struct xfs_inode	**ipp,
+	int			flags,
+	int			lock_flags)
+{
+	struct xfs_inode	*ip;
+	int			error;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
+	int			iflags;
+
+	ip = xfs_inode_alloc(mp, ino);
+	if (!ip)
+		return ENOMEM;
+
+	error = xfs_iread(mp, tp, ip, flags);
+	if (error)
+		goto out_destroy;
+
+	trace_xfs_iget_miss(ip);
+
+	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		goto out_destroy;
+	}
+
+	/*
+	 * Preload the radix tree so we can insert safely under the
+	 * write spinlock. Note that we cannot sleep inside the preload
+	 * region. Since we can be called from transaction context, don't
+	 * recurse into the file system.
+	 */
+	if (radix_tree_preload(GFP_NOFS)) {
+		error = EAGAIN;
+		goto out_destroy;
+	}
+
+	/*
+	 * Because the inode hasn't been added to the radix-tree yet it can't
+	 * be found by another thread, so we can do the non-sleeping lock here.
+	 */
+	if (lock_flags) {
+		if (!xfs_ilock_nowait(ip, lock_flags))
+			BUG();
+	}
+
+	/*
+	 * These values must be set before inserting the inode into the radix
+	 * tree as the moment it is inserted a concurrent lookup (allowed by the
+	 * RCU locking mechanism) can find it and that lookup must see that this
+	 * is an inode currently under construction (i.e. that XFS_INEW is set).
+	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+	 * memory barrier that ensures this detection works correctly at lookup
+	 * time.
+	 */
+	iflags = XFS_INEW;
+	if (flags & XFS_IGET_DONTCACHE)
+		iflags |= XFS_IDONTCACHE;
+	ip->i_udquot = ip->i_gdquot = NULL;
+	xfs_iflags_set(ip, iflags);
+
+	/* insert the new inode */
+	spin_lock(&pag->pag_ici_lock);
+	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
+	if (unlikely(error)) {
+		WARN_ON(error != -EEXIST);
+		XFS_STATS_INC(xs_ig_dup);
+		error = EAGAIN;
+		goto out_preload_end;
+	}
+	spin_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+
+	*ipp = ip;
+	return 0;
+
+out_preload_end:
+	spin_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+out_destroy:
+	__destroy_inode(VFS_I(ip));
+	xfs_inode_free(ip);
+	return error;
+}
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.  It points
+ *       to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.  This is
+ *       simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *        within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *		 for xfs_ilock() for a list of valid values.
+ */
+int
+xfs_iget(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	uint		flags,
+	uint		lock_flags,
+	xfs_inode_t	**ipp)
+{
+	xfs_inode_t	*ip;
+	int		error;
+	xfs_perag_t	*pag;
+	xfs_agino_t	agino;
+
+	/*
+	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+	 * doesn't get freed while it's being referenced during a
+	 * radix tree traversal here.  It assumes this function
+	 * acquires only the ILOCK (and therefore it has no need to
+	 * involve the IOLOCK in this synchronization).
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
+	/* reject inode numbers outside existing AGs */
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+		return EINVAL;
+
+	/* get the perag structure and ensure that it's inode capable */
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+	agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+	error = 0;
+	rcu_read_lock();
+	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+	if (ip) {
+		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	} else {
+		rcu_read_unlock();
+		XFS_STATS_INC(xs_ig_missed);
+
+		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
+							flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	}
+	xfs_perag_put(pag);
+
+	*ipp = ip;
+
+	/*
+	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
+	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
+	 */
+	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+		xfs_setup_inode(ip);
+	return 0;
+
+out_error_or_again:
+	if (error == EAGAIN) {
+		delay(1);
+		goto again;
+	}
+	xfs_perag_put(pag);
+	return error;
+}
+
 
 /*
  * The inode lookup is done in batches to keep the amount of lock traffic and
@@ -254,7 +669,7 @@ xfs_reclaim_worker(
 	xfs_reclaim_queue_work(mp);
 }
 
-void
+static void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
 	struct xfs_inode	*ip)
@@ -320,7 +735,7 @@ __xfs_inode_clear_reclaim(
 	}
 }
 
-void
+STATIC void
 __xfs_inode_clear_reclaim_tag(
 	xfs_mount_t	*mp,
 	xfs_perag_t	*pag,
@@ -543,7 +958,7 @@ out:
  * then a shut down during filesystem unmount reclaim walk leak all the
  * unreclaimed inodes.
  */
-int
+STATIC int
 xfs_reclaim_inodes_ag(
 	struct xfs_mount	*mp,
 	int			flags,
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 0ba9c89..222e22f 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -24,6 +24,9 @@ struct xfs_perag;
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
 #define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
 
+int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
+	     uint flags, uint lock_flags, xfs_inode_t **ipp);
+
 void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
@@ -31,9 +34,6 @@ int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
-				struct xfs_inode *ip);
 
 int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644
index ea9a5fa..0000000
--- a/fs/xfs/xfs_iget.c
+++ /dev/null
@@ -1,455 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_acl.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_trans_priv.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_trace.h"
-#include "xfs_icache.h"
-
-
-/*
- * Allocate and initialise an xfs_inode.
- */
-STATIC struct xfs_inode *
-xfs_inode_alloc(
-	struct xfs_mount	*mp,
-	xfs_ino_t		ino)
-{
-	struct xfs_inode	*ip;
-
-	/*
-	 * if this didn't occur in transactions, we could use
-	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
-	 * code up to do this anyway.
-	 */
-	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
-	if (!ip)
-		return NULL;
-	if (inode_init_always(mp->m_super, VFS_I(ip))) {
-		kmem_zone_free(xfs_inode_zone, ip);
-		return NULL;
-	}
-
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(!xfs_isiflocked(ip));
-	ASSERT(ip->i_ino == 0);
-
-	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
-	/* initialise the xfs inode */
-	ip->i_ino = ino;
-	ip->i_mount = mp;
-	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
-	ip->i_afp = NULL;
-	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
-	ip->i_flags = 0;
-	ip->i_delayed_blks = 0;
-	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
-
-	return ip;
-}
-
-STATIC void
-xfs_inode_free_callback(
-	struct rcu_head		*head)
-{
-	struct inode		*inode = container_of(head, struct inode, i_rcu);
-	struct xfs_inode	*ip = XFS_I(inode);
-
-	kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
-	struct xfs_inode	*ip)
-{
-	switch (ip->i_d.di_mode & S_IFMT) {
-	case S_IFREG:
-	case S_IFDIR:
-	case S_IFLNK:
-		xfs_idestroy_fork(ip, XFS_DATA_FORK);
-		break;
-	}
-
-	if (ip->i_afp)
-		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
-	if (ip->i_itemp) {
-		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
-		xfs_inode_item_destroy(ip);
-		ip->i_itemp = NULL;
-	}
-
-	/* asserts to verify all state is correct here */
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(!xfs_isiflocked(ip));
-
-	/*
-	 * Because we use RCU freeing we need to ensure the inode always
-	 * appears to be reclaimed with an invalid inode number when in the
-	 * free state. The ip->i_flags_lock provides the barrier against lookup
-	 * races.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	ip->i_flags = XFS_IRECLAIM;
-	ip->i_ino = 0;
-	spin_unlock(&ip->i_flags_lock);
-
-	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
-}
-
-/*
- * Check the validity of the inode we just found in the cache
- */
-static int
-xfs_iget_cache_hit(
-	struct xfs_perag	*pag,
-	struct xfs_inode	*ip,
-	xfs_ino_t		ino,
-	int			flags,
-	int			lock_flags) __releases(RCU)
-{
-	struct inode		*inode = VFS_I(ip);
-	struct xfs_mount	*mp = ip->i_mount;
-	int			error;
-
-	/*
-	 * check for re-use of an inode within an RCU grace period due to the
-	 * radix tree nodes not being updated yet. We monitor for this by
-	 * setting the inode number to zero before freeing the inode structure.
-	 * If the inode has been reallocated and set up, then the inode number
-	 * will not match, so check for that, too.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	if (ip->i_ino != ino) {
-		trace_xfs_iget_skip(ip);
-		XFS_STATS_INC(xs_ig_frecycle);
-		error = EAGAIN;
-		goto out_error;
-	}
-
-
-	/*
-	 * If we are racing with another cache hit that is currently
-	 * instantiating this inode or currently recycling it out of
-	 * reclaimable state, wait for the initialisation to complete
-	 * before continuing.
-	 *
-	 * XXX(hch): eventually we should do something equivalent to
-	 *	     wait_on_inode to wait for these flags to be cleared
-	 *	     instead of polling for it.
-	 */
-	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
-		trace_xfs_iget_skip(ip);
-		XFS_STATS_INC(xs_ig_frecycle);
-		error = EAGAIN;
-		goto out_error;
-	}
-
-	/*
-	 * If lookup is racing with unlink return an error immediately.
-	 */
-	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
-		goto out_error;
-	}
-
-	/*
-	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
-	 * Need to carefully get it back into useable state.
-	 */
-	if (ip->i_flags & XFS_IRECLAIMABLE) {
-		trace_xfs_iget_reclaim(ip);
-
-		/*
-		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
-		 * from stomping over us while we recycle the inode.  We can't
-		 * clear the radix tree reclaimable tag yet as it requires
-		 * pag_ici_lock to be held exclusive.
-		 */
-		ip->i_flags |= XFS_IRECLAIM;
-
-		spin_unlock(&ip->i_flags_lock);
-		rcu_read_unlock();
-
-		error = -inode_init_always(mp->m_super, inode);
-		if (error) {
-			/*
-			 * Re-initializing the inode failed, and we are in deep
-			 * trouble.  Try to re-add it to the reclaim list.
-			 */
-			rcu_read_lock();
-			spin_lock(&ip->i_flags_lock);
-
-			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
-			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
-			trace_xfs_iget_reclaim_fail(ip);
-			goto out_error;
-		}
-
-		spin_lock(&pag->pag_ici_lock);
-		spin_lock(&ip->i_flags_lock);
-
-		/*
-		 * Clear the per-lifetime state in the inode as we are now
-		 * effectively a new inode and need to return to the initial
-		 * state before reuse occurs.
-		 */
-		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
-		ip->i_flags |= XFS_INEW;
-		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
-		inode->i_state = I_NEW;
-
-		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
-		spin_unlock(&ip->i_flags_lock);
-		spin_unlock(&pag->pag_ici_lock);
-	} else {
-		/* If the VFS inode is being torn down, pause and try again. */
-		if (!igrab(inode)) {
-			trace_xfs_iget_skip(ip);
-			error = EAGAIN;
-			goto out_error;
-		}
-
-		/* We've got a live one. */
-		spin_unlock(&ip->i_flags_lock);
-		rcu_read_unlock();
-		trace_xfs_iget_hit(ip);
-	}
-
-	if (lock_flags != 0)
-		xfs_ilock(ip, lock_flags);
-
-	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
-	XFS_STATS_INC(xs_ig_found);
-
-	return 0;
-
-out_error:
-	spin_unlock(&ip->i_flags_lock);
-	rcu_read_unlock();
-	return error;
-}
-
-
-static int
-xfs_iget_cache_miss(
-	struct xfs_mount	*mp,
-	struct xfs_perag	*pag,
-	xfs_trans_t		*tp,
-	xfs_ino_t		ino,
-	struct xfs_inode	**ipp,
-	int			flags,
-	int			lock_flags)
-{
-	struct xfs_inode	*ip;
-	int			error;
-	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
-	int			iflags;
-
-	ip = xfs_inode_alloc(mp, ino);
-	if (!ip)
-		return ENOMEM;
-
-	error = xfs_iread(mp, tp, ip, flags);
-	if (error)
-		goto out_destroy;
-
-	trace_xfs_iget_miss(ip);
-
-	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
-		goto out_destroy;
-	}
-
-	/*
-	 * Preload the radix tree so we can insert safely under the
-	 * write spinlock. Note that we cannot sleep inside the preload
-	 * region. Since we can be called from transaction context, don't
-	 * recurse into the file system.
-	 */
-	if (radix_tree_preload(GFP_NOFS)) {
-		error = EAGAIN;
-		goto out_destroy;
-	}
-
-	/*
-	 * Because the inode hasn't been added to the radix-tree yet it can't
-	 * be found by another thread, so we can do the non-sleeping lock here.
-	 */
-	if (lock_flags) {
-		if (!xfs_ilock_nowait(ip, lock_flags))
-			BUG();
-	}
-
-	/*
-	 * These values must be set before inserting the inode into the radix
-	 * tree as the moment it is inserted a concurrent lookup (allowed by the
-	 * RCU locking mechanism) can find it and that lookup must see that this
-	 * is an inode currently under construction (i.e. that XFS_INEW is set).
-	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
-	 * memory barrier that ensures this detection works correctly at lookup
-	 * time.
-	 */
-	iflags = XFS_INEW;
-	if (flags & XFS_IGET_DONTCACHE)
-		iflags |= XFS_IDONTCACHE;
-	ip->i_udquot = ip->i_gdquot = NULL;
-	xfs_iflags_set(ip, iflags);
-
-	/* insert the new inode */
-	spin_lock(&pag->pag_ici_lock);
-	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
-	if (unlikely(error)) {
-		WARN_ON(error != -EEXIST);
-		XFS_STATS_INC(xs_ig_dup);
-		error = EAGAIN;
-		goto out_preload_end;
-	}
-	spin_unlock(&pag->pag_ici_lock);
-	radix_tree_preload_end();
-
-	*ipp = ip;
-	return 0;
-
-out_preload_end:
-	spin_unlock(&pag->pag_ici_lock);
-	radix_tree_preload_end();
-	if (lock_flags)
-		xfs_iunlock(ip, lock_flags);
-out_destroy:
-	__destroy_inode(VFS_I(ip));
-	xfs_inode_free(ip);
-	return error;
-}
-
-/*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, initialise the vfs inode
- * if necessary.
- *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and initialise the vfs inode.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system.  It points
- *       to the inode hash table.
- * tp -- a pointer to the current transaction if there is one.  This is
- *       simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired.  This is the unique identifier
- *        within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode.  See the comment
- *		 for xfs_ilock() for a list of valid values.
- */
-int
-xfs_iget(
-	xfs_mount_t	*mp,
-	xfs_trans_t	*tp,
-	xfs_ino_t	ino,
-	uint		flags,
-	uint		lock_flags,
-	xfs_inode_t	**ipp)
-{
-	xfs_inode_t	*ip;
-	int		error;
-	xfs_perag_t	*pag;
-	xfs_agino_t	agino;
-
-	/*
-	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
-	 * doesn't get freed while it's being referenced during a
-	 * radix tree traversal here.  It assumes this function
-	 * acquires only the ILOCK (and therefore it has no need to
-	 * involve the IOLOCK in this synchronization).
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
-
-	/* reject inode numbers outside existing AGs */
-	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
-		return EINVAL;
-
-	/* get the perag structure and ensure that it's inode capable */
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
-	agino = XFS_INO_TO_AGINO(mp, ino);
-
-again:
-	error = 0;
-	rcu_read_lock();
-	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
-
-	if (ip) {
-		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
-		if (error)
-			goto out_error_or_again;
-	} else {
-		rcu_read_unlock();
-		XFS_STATS_INC(xs_ig_missed);
-
-		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
-							flags, lock_flags);
-		if (error)
-			goto out_error_or_again;
-	}
-	xfs_perag_put(pag);
-
-	*ipp = ip;
-
-	/*
-	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
-	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
-	 */
-	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
-		xfs_setup_inode(ip);
-	return 0;
-
-out_error_or_again:
-	if (error == EAGAIN) {
-		delay(1);
-		goto again;
-	}
-	xfs_perag_put(pag);
-	return error;
-}
-
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ba404e4..bba8f37 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,6 +45,7 @@
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 kmem_zone_t *xfs_ifork_zone;
 kmem_zone_t *xfs_inode_zone;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 94b32f9..1fc2065 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 	(((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
 	 ((pip)->i_d.di_mode & S_ISGID))
 
+
 /*
- * xfs_iget.c prototypes.
+ * xfs_inode.c prototypes.
  */
-int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
-			 uint, uint, xfs_inode_t **);
 void		xfs_ilock(xfs_inode_t *, uint);
 int		xfs_ilock_nowait(xfs_inode_t *, uint);
 void		xfs_iunlock(xfs_inode_t *, uint);
@@ -508,11 +507,6 @@ void		xfs_ilock_demote(xfs_inode_t *, uint);
 int		xfs_isilocked(xfs_inode_t *, uint);
 uint		xfs_ilock_map_shared(xfs_inode_t *);
 void		xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void		xfs_inode_free(struct xfs_inode *ip);
-
-/*
- * xfs_inode.c prototypes.
- */
 int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
 			   xfs_nlink_t, xfs_dev_t, prid_t, int,
 			   struct xfs_buf **, xfs_inode_t **);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 01d10a6..3998fd2 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -34,6 +34,7 @@
 #include "xfs_error.h"
 #include "xfs_btree.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int
 xfs_internal_inum(
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5da3ace..651c988 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -42,6 +42,7 @@
 #include "xfs_quota.h"
 #include "xfs_utils.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int
 xlog_find_zeroed(
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2e86fa0..48c750b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -40,6 +40,7 @@
 #include "xfs_utils.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca28a4b..a69e0b4 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -38,6 +38,7 @@
 #include "xfs_utils.h"
 #include "xfs_trace.h"
 #include "xfs_buf.h"
+#include "xfs_icache.h"
 
 
 /*
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index dcb4de3..da4a378 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -47,6 +47,7 @@
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * The maximum pathlen is 1024 bytes. Since the minimum file system
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* Re: [PATCH V2 00/13] xfs: remove the xfssyncd mess
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (12 preceding siblings ...)
  2012-08-30 12:00 ` [PATCH 13/13] xfs: remove xfs_iget.c Dave Chinner
@ 2012-08-30 12:15 ` Markus Trippelsdorf
  2012-08-30 22:51   ` Dave Chinner
  2012-08-31 14:01 ` Mark Tinguely
  14 siblings, 1 reply; 60+ messages in thread
From: Markus Trippelsdorf @ 2012-08-30 12:15 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 2012.08.30 at 22:00 +1000, Dave Chinner wrote:
> Version 2 of the patchset I described here:
> 
> http://oss.sgi.com/archives/xfs/2012-06/msg00064.html
> 
> This version has run through xfstests completely once, so it's
> less likely to let smoke out....

Is there a publicly accessible git tree available where one could pull
from? (This would be way easier than saving and hand-applying 13
patches.)

-- 
Markus

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH V2 00/13] xfs: remove the xfssyncd mess
  2012-08-30 12:15 ` [PATCH V2 00/13] xfs: remove the xfssyncd mess Markus Trippelsdorf
@ 2012-08-30 22:51   ` Dave Chinner
  2012-08-31  6:18     ` Markus Trippelsdorf
  0 siblings, 1 reply; 60+ messages in thread
From: Dave Chinner @ 2012-08-30 22:51 UTC (permalink / raw)
  To: Markus Trippelsdorf; +Cc: xfs

On Thu, Aug 30, 2012 at 02:15:16PM +0200, Markus Trippelsdorf wrote:
> On 2012.08.30 at 22:00 +1000, Dave Chinner wrote:
> > Version 2 of the patchset I described here:
> > 
> > http://oss.sgi.com/archives/xfs/2012-06/msg00064.html
> > 
> > This version has run through xfstests completely once, so it's
> > less likely to let smoke out....
> 
> Is there a publicly accessible git tree available where one could pull
> from? (This would be way easier than saving and hand-applying 13
> patches.)

No.

Instead, save all the patches to a single mbox format file, then run:

$ git checkout -b umount-fix-test
$ git am <mbox file>

And it will apply all the patches as separate commits to the
umount-fix-test branch. This is how I take patch sets from my inbox
to git. You can build and test them from there.
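
For example, to sanity check the result (a sketch; it assumes the new
branch was created off your current master):

$ git log --oneline master..umount-fix-test | wc -l
$ make fs/xfs/

The first should report 13, and the second rebuilds just the XFS code.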

I, personally, convert them to a guilt series first, usually
integrating them into my working/test branch to test them along with
everything else I'm working on at the same time. To do this, I
generally do:

$ git checkout -b umount-fix-test working
$ git am <mbox file>
$ git checkout working
$ git log --pretty=oneline working..umount-fix-test
<record first and last commit id in applied series>
$ guilt import-commits <first-commit..last-commit>

And that adds the commits from the umount-fix-test branch as
individual patches in the guilt series, preserving all the metadata
from the original sender. i.e. if I then patchbomb them back
to the mailing list, they will have the correct "From:" attribution
and dates as found in the original emails....
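
From there the usual guilt commands apply. A sketch from memory, so the
exact options may differ:

$ guilt series		# list the imported patches
$ guilt push -a		# apply them all on top of the working branch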

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH V2 00/13] xfs: remove the xfssyncd mess
  2012-08-30 22:51   ` Dave Chinner
@ 2012-08-31  6:18     ` Markus Trippelsdorf
  2012-08-31  8:42       ` Dave Chinner
  0 siblings, 1 reply; 60+ messages in thread
From: Markus Trippelsdorf @ 2012-08-31  6:18 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 2012.08.31 at 08:51 +1000, Dave Chinner wrote:
> On Thu, Aug 30, 2012 at 02:15:16PM +0200, Markus Trippelsdorf wrote:
> > On 2012.08.30 at 22:00 +1000, Dave Chinner wrote:
> > > Version 2 of the patchset I described here:
> > > 
> > > http://oss.sgi.com/archives/xfs/2012-06/msg00064.html
> > > 
> > > This version has run through xfstests completely once, so it's
> > > less likely to let smoke out....
> > 
> > Is there a publicly accessible git tree available where one could pull
> > from? (This would be way easier than saving and hand-applying 13
> > patches.)
> 
> No.
> 
> Instead, save all the patches to a single mbox format file, then run:
> 
> $ git checkout -b umount-fix-test
> $ git am <mbox file>
> 
> And it will apply all the patches as separate commits to the
> umount-fix-test branch. This is how I take patch sets from my inbox
> to git. You can build and test them from there.

Yeah. That works if you're using mbox format files. But if you're a
Maildir user like myself you're basically screwed, because 
 $ git am <Maildir> 
expects the Maildir to be *sorted* and because mails normally don't
arrive in the right order, git-am will therefore try to apply the
patches in the wrong order. (Maybe this should be reported to the git
mailing-list)
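
The only workaround I can think of (untested, and it assumes the series
sits in its own maildir and that the Subject: lines aren't wrapped or
encoded) is to copy the messages out in patch order first, keyed on the
zero-padded numbers in the subjects, something like:

$ mkdir /tmp/series
$ for f in ~/Maildir/xfs/cur/*; do	# maildir path made up
>     n=$(sed -n 's/^Subject:.*PATCH \([0-9][0-9]*\)\/.*/\1/p' "$f")
>     [ -n "$n" ] && cp "$f" /tmp/series/$n.patch
> done
$ git am /tmp/series/*.patch

but that also picks up any replies quoting the subject, so it's hardly
robust.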

So if there are mutt users out there who use "mbox_type=Maildir" and
know how to save a thread to a single mbox file, then please let me
know.
Thanks.

-- 
Markus

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH V2 00/13] xfs: remove the xfssyncd mess
  2012-08-31  6:18     ` Markus Trippelsdorf
@ 2012-08-31  8:42       ` Dave Chinner
  2012-08-31  9:30         ` Markus Trippelsdorf
  0 siblings, 1 reply; 60+ messages in thread
From: Dave Chinner @ 2012-08-31  8:42 UTC (permalink / raw)
  To: Markus Trippelsdorf; +Cc: xfs

On Fri, Aug 31, 2012 at 08:18:25AM +0200, Markus Trippelsdorf wrote:
> On 2012.08.31 at 08:51 +1000, Dave Chinner wrote:
> > On Thu, Aug 30, 2012 at 02:15:16PM +0200, Markus Trippelsdorf wrote:
> > > On 2012.08.30 at 22:00 +1000, Dave Chinner wrote:
> > > > Version 2 of the patchset I described here:
> > > > 
> > > > http://oss.sgi.com/archives/xfs/2012-06/msg00064.html
> > > > 
> > > > This version has run through xfstests completely once, so it's
> > > > less likely to let smoke out....
> > > 
> > > Is there a publicly accessible git tree available where one could pull
> > > from? (This would be way easier than saving and hand-applying 13
> > > patches.)
> > 
> > No.
> > 
> > Instead, save all the patches to a single mbox format file, then run:
> > 
> > $ git checkout -b umount-fix-test
> > $ git am <mbox file>
> > 
> > And it will apply all the patches as separate commits to the
> > umount-fix-test branch. This is how I take patch sets from my inbox
> > to git. You can build and test them from there.
> 
> Yeah. That works if you're using mbox format files. But if you're a
> Maildir user like myself you're basically screwed, because 
>  $ git am <Maildir> 
> expects the Maildir to be *sorted* and because mails normally don't
> arrive in the right order, git-am will therefore try to apply the
> patches in the wrong order. (Maybe this should be reported to the git
> mailing-list)
> 
> So if there are mutt users out there who use "mbox_type=Maildir" and
> know how to save a thread to a single mbox file, then please let me
> know.

I use mutt, and store all my mail folders in maildir format.
However, I don't use mbox_type=Maildir - I simply create new maildir
folders via the CLI when I need a new one.  Mutt automatically
recognises the directories as being in maildir format and uses it.
Hence when I tag a thread and ;s to save all the tagged mail, it
creates a mbox file as that is the default.

That probably doesn't help you, though. However, IIRC, you can get
the same result regardless of your mbox_type by piping all the
tagged mail through formail and redirecting that to a file like so:

	;| formail -ds > $file

and that will result in $file being a mbox format file with all the
tagged mail in it...
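
The same trick works straight from the shell, without mutt in the
picture at all, if the thread sits in its own maildir. A sketch (the
maildir name is made up):

$ cat ~/Maildir/xfs-patches/cur/* | formail -ds > ~/patches.mbox

though you may still need the messages in the right order before
feeding the result to git am.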

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH V2 00/13] xfs: remove the xfssyncd mess
  2012-08-31  8:42       ` Dave Chinner
@ 2012-08-31  9:30         ` Markus Trippelsdorf
  0 siblings, 0 replies; 60+ messages in thread
From: Markus Trippelsdorf @ 2012-08-31  9:30 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 2012.08.31 at 18:42 +1000, Dave Chinner wrote:
> On Fri, Aug 31, 2012 at 08:18:25AM +0200, Markus Trippelsdorf wrote:
> > On 2012.08.31 at 08:51 +1000, Dave Chinner wrote:
> > > On Thu, Aug 30, 2012 at 02:15:16PM +0200, Markus Trippelsdorf wrote:
> > > > On 2012.08.30 at 22:00 +1000, Dave Chinner wrote:
> > > > > Version 2 of the patchset I described here:
> > > > > 
> > > > > http://oss.sgi.com/archives/xfs/2012-06/msg00064.html
> > > > > 
> > > > > This version has run through xfstests completely once, so it's
> > > > > less likely to let smoke out....
> > > > 
> > > > Is there a publicly accessible git tree available where one could pull
> > > > from? (This would be way easier than saving and hand-applying 13
> > > > patches.)
> > > 
> > > No.
> > > 
> > > Instead, save all the patches to a single mbox format file, then run:
> > > 
> > > $ git checkout -b umount-fix-test
> > > $ git am <mbox file>
> > > 
> > > And it will apply all the patches as separate commits to the
> > > umount-fix-test branch. This is how I take patch sets from my inbox
> > > to git. You can build and test them from there.
> > 
> > Yeah. That works if you're using mbox format files. But if you're a
> > Maildir user like myself you're basically screwed, because 
> >  $ git am <Maildir> 
> > expects the Maildir to be *sorted* and because mails normally don't
> > arrive in the right order, git-am will therefore try to apply the
> > patches in the wrong order. (Maybe this should be reported to the git
> > mailing-list)
> > 
> > So if there are mutt users out there who use "mbox_type=Maildir" and
> > know how to save a thread to a single mbox file, then please let me
> > know.
> 
> I use mutt, and store all my mail folders in maildir format.
> However, I don't use mbox_type=Maildir - I simply create new maildir
> folders via the CLI when I need a new one.  Mutt automatically
> recognises the directories as being in maildir format and uses it.
> Hence when I tag a thread and ;s to save all the tagged mail, it
> creates a mbox file as that is the default.
> 
> That probably doesn't help you, though. However, IIRC, you can get
> the same result regardless of your mbox_type by piping all the
> tagged mail through formail and redirecting that to a file like so:
> 
> 	;| formail -ds > $file
> 
> and that will result in $file being a mbox format file with all the
> tagged mail in it...

After some experimentation I came up with the following:

macro index <F5> "<enter-command>set mbox_type=mbox;set confirmcreate=no;set sort=subject<enter>;sPatches<enter><enter-command>set mbox_type=Maildir; set sort=threads;set confirmcreate=yes<enter>U.<enter>"

Basically you just tag the thread (esc-t) and then hit F5 and it will save it
to a mbox file called "Patches" in your home dir.
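
From there it should just be a matter of:

$ git am ~/Patches

since the subject sort leaves the series in patch-number order.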

-- 
Markus

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH V2 00/13] xfs: remove the xfssyncd mess
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (13 preceding siblings ...)
  2012-08-30 12:15 ` [PATCH V2 00/13] xfs: remove the xfssyncd mess Markus Trippelsdorf
@ 2012-08-31 14:01 ` Mark Tinguely
  2012-09-03  4:05   ` Dave Chinner
  14 siblings, 1 reply; 60+ messages in thread
From: Mark Tinguely @ 2012-08-31 14:01 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> Version 2 of the patchset I described here:
>
> http://oss.sgi.com/archives/xfs/2012-06/msg00064.html
>
> This version has run through xfstests completely once, so it's
> less likely to let smoke out....
>
> Version 2:
> - fix writeback_inodes_sb_if_idle call in xfs_create()
> - refreshed patch 13 before sending.
>
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs

I wanted to get a quick look at your patch series. I am getting the 
following ASSERT on xfstest 179 when running the series with the latest 
OSS sources. The ASSERT appears to start at patch number 3. Sorry, these 
boxes won't kdump top-of-tree kernels:

[17474.545964] XFS: Assertion failed: atomic_read(&bp->b_hold) > 0, file: /root/xfs/fs/xfs/xfs_buf.c, line: 896
[17474.555828] ------------[ cut here ]------------
[17474.559784] kernel BUG at /root/xfs/fs/xfs/xfs_message.c:100!
[17474.559784] invalid opcode: 0000 [#1] SMP
[17474.559784] Modules linked in: xfs(O) autofs4 binfmt_misc mperf fuse loop dm_mod qla2xxx ib_mthca ipv6 i5k_amb scsi_transport_fc ib_mad e1000e i2c_i801 i5000_edac scsi_tgt i2c_core ioatdma shpchp edac_core pci_hotplug sg lpc_ich mfd_core ib_core pcspkr dca microcode rtc_cmos button uhci_hcd ehci_hcd sd_mod crc_t10dif usbcore usb_common scsi_dh_emc scsi_dh_rdac scsi_dh_alua scsi_dh_hp_sw scsi_dh thermal sata_nv processor mptsas mptscsih scsi_transport_sas mptbase megaraid_sas fan thermal_sys hwmon ext3 jbd mbcache edd ata_piix ahci libahci libata scsi_mod [last unloaded: xfs]
[17474.559784] CPU 0
[17474.559784] Pid: 26427, comm: umount Tainted: G           O 3.6.0-rc1+ #1 SGI.COM AltixXE210/S5000PAL0
[17474.559784] RIP: 0010:[<ffffffffa05c544d>]  [<ffffffffa05c544d>] assfail+0x1d/0x30 [xfs]
[17474.559784] RSP: 0018:ffff8808379238a8  EFLAGS: 00010296
[17474.559784] RAX: 0000000000000060 RBX: ffff8807f8c0a564 RCX: 0000000000000082
[17474.559784] RDX: 0000000000004a61 RSI: 0000000000000086 RDI: 0000000000000246
[17474.559784] RBP: ffff8808379238a8 R08: 0000000000000811 R09: ffffffff818ba780
[17474.559784] R10: 0000000000000811 R11: 0000000000000006 R12: ffff8807f8c0a540
[17474.559784] R13: ffffffffa05b5b86 R14: ffff88083a8810c0 R15: 0000000000000000
[17474.559784] FS:  00007fc12646d740(0000) GS:ffff88085fc00000(0000) knlGS:0000000000000000
[17474.559784] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[17474.559784] CR2: 00007fc125b29570 CR3: 000000085be2c000 CR4: 00000000000007f0
[17474.559784] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[17474.559784] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[17474.559784] Process umount (pid: 26427, threadinfo ffff880837922000, task ffff88083d278600)
[17474.559784] Stack:
[17474.559784]  ffff8808379238d8 ffffffffa05b4ed4 ffff8807f8c0a540 ffff8807f8c0a540
[17474.559784]  ffffffffa061e939 0000000000000000 ffff8808379238f8 ffffffffa05b5b86
[17474.559784]  ffff880837a436d0 ffff8807f8c0a540 ffff880837923928 ffffffffa05b5c26
[17474.559784] Call Trace:
[17474.559784]  [<ffffffffa05b4ed4>] xfs_buf_rele+0xa4/0x1b0 [xfs]
[17474.559784]  [<ffffffffa061e939>] ? xfs_buf_iodone_callbacks+0x59/0x230 [xfs]
[17474.559784]  [<ffffffffa05b5b86>] xfs_buf_iodone_work+0x46/0x50 [xfs]
[17474.559784]  [<ffffffffa05b5c26>] xfs_buf_ioend+0x96/0x120 [xfs]
[17474.559784]  [<ffffffffa05b5b61>] ? xfs_buf_iodone_work+0x21/0x50 [xfs]
[17474.559784]  [<ffffffffa061e939>] xfs_buf_iodone_callbacks+0x59/0x230 [xfs]
[17474.559784]  [<ffffffffa061f7e9>] ? xfs_buf_item_unpin+0x289/0x2d0 [xfs]
[17474.559784]  [<ffffffffa05b5b61>] xfs_buf_iodone_work+0x21/0x50 [xfs]
[17474.559784]  [<ffffffffa05b5c26>] xfs_buf_ioend+0x96/0x120 [xfs]
[17474.559784]  [<ffffffffa061f7e9>] xfs_buf_item_unpin+0x289/0x2d0 [xfs]
[17474.559784]  [<ffffffffa0617c33>] xfs_trans_committed_bulk+0x213/0x300 [xfs]
[17474.559784]  [<ffffffffa061bd33>] ? xlog_state_get_iclog_space+0x293/0x350 [xfs]
[17474.559784]  [<ffffffff8107f9e8>] ? idle_balance+0xe8/0x150
[17474.559784]  [<ffffffffa061c9d5>] ? xlog_write+0x165/0x5d0 [xfs]
[17474.559784]  [<ffffffffa061dde6>] xlog_cil_committed+0x36/0x130 [xfs]
[17474.559784]  [<ffffffffa061e1e8>] xlog_cil_push+0x308/0x430 [xfs]
[17474.559784]  [<ffffffff81110522>] ? pagevec_lookup_tag+0x22/0x30
[17474.559784]  [<ffffffff8105dbb9>] ? start_flush_work+0x29/0x100
[17474.559784]  [<ffffffffa061e466>] xlog_cil_force_lsn+0x146/0x1b0 [xfs]
[17474.559784]  [<ffffffff81433a82>] ? wait_for_common+0xd2/0x190
[17474.559784]  [<ffffffff81178285>] ? iput_final+0x145/0x1e0
[17474.559784]  [<ffffffffa061c1e4>] _xfs_log_force+0x64/0x280 [xfs]
[17474.559784]  [<ffffffff811863fe>] ? sync_inodes_sb+0x9e/0xd0
[17474.559784]  [<ffffffffa061c454>] xfs_log_force+0x54/0x80 [xfs]
[17474.559784]  [<ffffffffa05c65dd>] xfs_fs_sync_fs+0x2d/0x50 [xfs]
[17474.559784]  [<ffffffff8118c00b>] __sync_filesystem+0x2b/0x50
[17474.559784]  [<ffffffff8118c073>] sync_filesystem+0x43/0x60
[17474.559784]  [<ffffffff81160846>] generic_shutdown_super+0x36/0xe0
[17474.559784]  [<ffffffff8116091c>] kill_block_super+0x2c/0x80
[17474.559784]  [<ffffffff81160e78>] deactivate_locked_super+0x38/0x90
[17474.559784]  [<ffffffff81161951>] deactivate_super+0x61/0x70
[17474.559784]  [<ffffffff8117c659>] mntput_no_expire+0x149/0x1b0
[17474.559784]  [<ffffffff8117d10e>] sys_umount+0x6e/0xd0
[17474.559784]  [<ffffffff8143d479>] system_call_fastpath+0x16/0x1b
[17474.559784] Code: 00 00 00 48 89 45 c8 e8 72 fc ff ff c9 c3 55 41 89 d0 48 89 f1 48 89 fa 48 c7 c6 58 b0 63 a0 31 ff 48 89 e5 31 c0 e8 93 ff ff ff <0f> 0b eb fe 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 55 4c
[17474.559784] RIP  [<ffffffffa05c544d>] assfail+0x1d/0x30 [xfs]
[17474.559784]  RSP <ffff8808379238a8>

I got this ASSERT when I ran it on the 8/27 OSS sources:

[188646.952426] XFS: Assertion failed: atomic_read(&iclog->ic_refcnt) == 0, file: /root/xfs/fs/xfs/xfs_log.c, line: 2590
[188646.963050] ------------[ cut here ]------------
[188646.967020] kernel BUG at /root/xfs/fs/xfs/xfs_message.c:100!
[188646.967020] invalid opcode: 0000 [#1] SMP
[188646.967020] Modules linked in: xfs(O) autofs4 binfmt_misc mperf fuse loop dm_mod qla2xxx i5k_amb i5000_edac ipv6 scsi_transport_fc scsi_tgt e1000e edac_core ib_mthca shpchp sg ioatdma pci_hotplug lpc_ich mfd_core microcode ib_mad ib_core dca pcspkr i2c_i801 i2c_core button rtc_cmos uhci_hcd ehci_hcd sd_mod crc_t10dif usbcore usb_common scsi_dh_emc scsi_dh_rdac scsi_dh_alua scsi_dh_hp_sw scsi_dh thermal sata_nv processor mptsas mptscsih scsi_transport_sas mptbase megaraid_sas fan thermal_sys hwmon ext3 jbd mbcache edd ata_piix ahci libahci libata scsi_mod [last unloaded: xfs]
[188646.967020] CPU 2
[188646.967020] Pid: 356, comm: kworker/2:1H Tainted: G           O 3.6.0-rc1+ #1 SGI.COM AltixXE210/S5000PAL0
[188646.967020] RIP: 0010:[<ffffffffa0188e2d>]  [<ffffffffa0188e2d>] assfail+0x1d/0x30 [xfs]
[188646.967020] RSP: 0018:ffff8808396a5d90  EFLAGS: 00010286
[188646.967020] RAX: 0000000000000068 RBX: ffff88083b4efdc0 RCX: 0000000000000086
[188646.967020] RDX: 0000000000000b04 RSI: 0000000000000082 RDI: 0000000000000246
[188646.967020] RBP: ffff8808396a5d90 R08: 0000000000002291 R09: ffffffff818ba780
[188646.967020] R10: 0000000000002291 R11: 0000000000000006 R12: ffff88053ff99800
[188646.967020] R13: ffff88053ff99928 R14: 0000000000000002 R15: ffff88085fc8d730
[188646.967020] FS:  0000000000000000(0000) GS:ffff88085fc80000(0000) knlGS:0000000000000000
[188646.967020] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[188646.967020] CR2: ffffffffff600400 CR3: 0000000838497000 CR4: 00000000000007e0
[188646.967020] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[188646.967020] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[188646.967020] Process kworker/2:1H (pid: 356, threadinfo ffff8808396a4000, task ffff88083a9aa1c0)
[188646.967020] Stack:
[188646.967020]  ffff8808396a5dc0 ffffffffa01dd2bf 0000000000000002 ffff88083ae67980
[188646.967020]  ffff88083b4efdc0 ffff88085fc8d400 ffff8808396a5df0 ffffffffa01ddbde
[188646.967020]  ffff88085fc8d400 ffff88083ae67980 ffff88083ae67a18 ffff88083a5e3240
[188646.967020] Call Trace:
[188646.967020]  [<ffffffffa01dd2bf>] xlog_state_done_syncing+0x7f/0x110 [xfs]
[188646.967020]  [<ffffffffa01ddbde>] xlog_iodone+0x7e/0x100 [xfs]
[188646.967020]  [<ffffffffa0179b51>] xfs_buf_iodone_work+0x21/0x50 [xfs]
[188646.967020]  [<ffffffff8105d6b3>] process_one_work+0x1d3/0x370
[188646.967020]  [<ffffffffa0179b30>] ? xfs_bioerror_relse+0x80/0x80 [xfs]
[188646.967020]  [<ffffffff810603e3>] worker_thread+0x133/0x390
[188646.967020]  [<ffffffff810602b0>] ? manage_workers+0x70/0x70
[188646.967020]  [<ffffffff810651ce>] kthread+0x9e/0xb0
[188646.967020]  [<ffffffff8143e504>] kernel_thread_helper+0x4/0x10
[188646.967020]  [<ffffffff81065130>] ? kthread_freezable_should_stop+0x70/0x70
[188646.967020]  [<ffffffff8143e500>] ? gs_change+0x13/0x13
[188646.967020] Code: 00 00 00 48 89 45 c8 e8 72 fc ff ff c9 c3 55 41 89 d0 48 89 f1 48 89 fa 48 c7 c6 58 ef 1f a0 31 ff 48 89 e5 31 c0 e8 93 ff ff ff <0f> 0b eb fe 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 55 4c
[188646.967020] RIP  [<ffffffffa0188e2d>] assfail+0x1d/0x30 [xfs]
[188646.967020]  RSP <ffff8808396a5d90>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 01/13] xfs: xfs_syncd_stop must die
  2012-08-30 12:00 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
@ 2012-09-01 23:15   ` Christoph Hellwig
  2012-09-04 16:10   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:15 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Thu, Aug 30, 2012 at 10:00:05PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> xfs_syncd_start and xfs_syncd_stop tie a bunch of unrelated
> functionailty together that actually have different start and stop
> requirements. Kill these functions and open code the start/stop
> methods for each of the background functions.
> 
> Subsequent patches will move the start/stop functions around to the
> correct places to avoid races and shutdown issues.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 02/13] xfs: rename the xfs_syncd workqueue
  2012-08-30 12:00 ` [PATCH 02/13] xfs: rename the xfs_syncd workqueue Dave Chinner
@ 2012-09-01 23:17   ` Christoph Hellwig
  2012-09-03  3:09     ` Dave Chinner
  0 siblings, 1 reply; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:17 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Thu, Aug 30, 2012 at 10:00:06PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> There is nothing "sync" realted to this work queue any more. It is a general
> purpose per-filesystem work queue. Rename it appropriately, and remove the
> "syncd" naming from various functions.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

With the current lightweight workqueues I don't see why we'd even want
a shared one for the different callers.  Let's move the patch until
after killing xfs_flush_worker & co, and then add workqueues for the
background work and the log sync individually.
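
As a rough sketch of that direction (the field and queue names below are
illustrative assumptions, not code from this series), the mount path would
end up doing something like:

	/*
	 * Sketch only: one workqueue per background function instead of
	 * a single shared mount workqueue.
	 */
	mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
			WQ_MEM_RECLAIM, 0, mp->m_fsname);
	if (!mp->m_log_workqueue)
		return -ENOMEM;

	mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
			WQ_MEM_RECLAIM, 0, mp->m_fsname);
	if (!mp->m_reclaim_workqueue) {
		destroy_workqueue(mp->m_log_workqueue);
		return -ENOMEM;
	}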

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 05/13] xfs: sync work is now only periodic log work
  2012-08-30 12:00 ` [PATCH 05/13] xfs: sync work is now only periodic log work Dave Chinner
@ 2012-09-01 23:23   ` Christoph Hellwig
  2012-09-03  3:36     ` Dave Chinner
  2012-09-04 16:14   ` Mark Tinguely
  2012-09-04 18:57   ` Mark Tinguely
  2 siblings, 1 reply; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:23 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Thu, Aug 30, 2012 at 10:00:09PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> The only thing the periodic sync work does now is flush the AIL and
> idle the log. These are really functions of the log code, so move
> the work to xfs_log.c and rename it appropriately.
> 
> The only wart that this leaves behind is the xfssyncd_centisecs
> sysctl, otherwise the xfssyncd is dead. Clean up any comments that
> related to xfssyncd to reflect it's passing.

FYI: A while ago I looked into folding the work here into xfsaild.
While doing the periodic AIL push there (aka just waking up with a
timeout, which we did more often anyway at that point) was easy, getting
the log force / dummy log part never really worked out.

I think at least the former probably should be dropped here as well.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 07/13] xfs: xfs_sync_data is redundant.
  2012-08-30 12:00 ` [PATCH 07/13] xfs: xfs_sync_data is redundant Dave Chinner
@ 2012-09-01 23:24   ` Christoph Hellwig
  2012-09-03  6:08     ` Dave Chinner
  2012-09-04 20:48   ` Mark Tinguely
  1 sibling, 1 reply; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:24 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Thu, Aug 30, 2012 at 10:00:11PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> We don't do any data writeback from XFS any more - the VFS is
> completely responsible for that, including for freeze. We can
> replace the remaining caller with the VFS level function that
> achieves the same thing, but without conflicting with current
> writeback work - writeback_inodes_sb_if_idle().
> 
> This means we can remove the flush_work and xfs_flush_inodes() - the
> VFS functionality completely replaces the internal flush queue for
> doing this writeback work in a separate context to avoid stack
> overruns..

Are the lock ordering issues with writeback_inodes_sb_if_idle sorted out
by now?  IIRC it still needs to be switched to a trylock.

> -			xfs_flush_inodes(ip);
> +			writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb,
> +						    WB_REASON_FS_FREE_SPACE);

I'd prefer to still keep a wrapper for an ugly call like this if we can.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 08/13] xfs: xfs_sync_fsdata is redundant
  2012-08-30 12:00 ` [PATCH 08/13] xfs: xfs_sync_fsdata " Dave Chinner
@ 2012-09-01 23:27   ` Christoph Hellwig
  2012-09-04 20:59   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:27 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c
  2012-08-30 12:00 ` [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c Dave Chinner
@ 2012-09-01 23:27   ` Christoph Hellwig
  2012-09-04 21:03   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:27 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 06/13] xfs: Bring some sanity to log unmounting
  2012-08-30 12:00 ` [PATCH 06/13] xfs: Bring some sanity to log unmounting Dave Chinner
@ 2012-09-01 23:28   ` Christoph Hellwig
  2012-09-04 19:11   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:28 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount
  2012-08-30 12:00 ` [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount Dave Chinner
@ 2012-09-01 23:29   ` Christoph Hellwig
  2012-09-04 21:04   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:29 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch]
  2012-08-30 12:00 ` [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch] Dave Chinner
@ 2012-09-01 23:30   ` Christoph Hellwig
  2012-09-04 21:06   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:30 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c
  2012-08-30 12:00 ` [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c Dave Chinner
@ 2012-09-01 23:30   ` Christoph Hellwig
  2012-09-04 21:07   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:30 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Thu, Aug 30, 2012 at 10:00:16PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> xfs_ilock() and friends really aren't related to the inode cache in
> any way, so move them to xfs_inode.c with all the other inode
> related functionality.
> 
> While doing this move, move the xfs_ilock() tracepoints to *before*
> the lock is taken so that when a hang on a lock occurs we have
> events to indicate which process and what inode we were trying to
> lock when the hang occurred. This is much better than the current
> silence we get on a hang...

Looks good.

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 13/13] xfs: remove xfs_iget.c
  2012-08-30 12:00 ` [PATCH 13/13] xfs: remove xfs_iget.c Dave Chinner
@ 2012-09-01 23:31   ` Christoph Hellwig
  2012-09-04 21:11   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:31 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 02/13] xfs: rename the xfs_syncd workqueue
  2012-09-01 23:17   ` Christoph Hellwig
@ 2012-09-03  3:09     ` Dave Chinner
  0 siblings, 0 replies; 60+ messages in thread
From: Dave Chinner @ 2012-09-03  3:09 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Sat, Sep 01, 2012 at 07:17:22PM -0400, Christoph Hellwig wrote:
> On Thu, Aug 30, 2012 at 10:00:06PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > There is nothing "sync" realted to this work queue any more. It is a general
> > purpose per-filesystem work queue. Rename it appropriately, and remove the
> > "syncd" naming from various functions.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> 
> With the current leight-weight workqueues I don't see why we'd even want
> a shared one for the different callers.  Let's move the patch until
> after killing xfs_flush_worker & co, and then add a workqueue for
> background and the log sync individually.

OK. Workqueues are starting to look like slab caches - all the cool
kids have got one. :)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 05/13] xfs: sync work is now only periodic log work
  2012-09-01 23:23   ` Christoph Hellwig
@ 2012-09-03  3:36     ` Dave Chinner
  0 siblings, 0 replies; 60+ messages in thread
From: Dave Chinner @ 2012-09-03  3:36 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Sat, Sep 01, 2012 at 07:23:43PM -0400, Christoph Hellwig wrote:
> On Thu, Aug 30, 2012 at 10:00:09PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > The only thing the periodic sync work does now is flush the AIL and
> > idle the log. These are really functions of the log code, so move
> > the work to xfs_log.c and rename it appropriately.
> > 
> > The only wart that this leaves behind is the xfssyncd_centisecs
> > sysctl, otherwise the xfssyncd is dead. Clean up any comments that
> > related to xfssyncd to reflect it's passing.
> 
> FYI: A while ago I looked into folding the work here into xfsaild.
> While soing the periodic ail push there aka just waking up with a
> timeout which we did more often anyway at that point was easy, getting
> the log force / dummy log never really worked out.
> 
> I think at least the former probably should be dropped here as well.

Perhaps. We've always had a periodic log force as a get-out-of-gaol-free
card for issues with pinned objects, so I'm not really inclined
to remove it/change that behaviour in this series.

IOWs, while I do agree that the log covering and AIL pushing code
should be more tightly integrated so that we can get the filesystem
to an idle state much faster than we do now, I don't think this
patchset is the place to start doing bits of that work.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH V2 00/13] xfs: remove the xfssyncd mess
  2012-08-31 14:01 ` Mark Tinguely
@ 2012-09-03  4:05   ` Dave Chinner
  2012-09-04  0:13     ` Mark Tinguely
  2012-09-25  9:26     ` Christoph Hellwig
  0 siblings, 2 replies; 60+ messages in thread
From: Dave Chinner @ 2012-09-03  4:05 UTC (permalink / raw)
  To: Mark Tinguely; +Cc: xfs

On Fri, Aug 31, 2012 at 09:01:04AM -0500, Mark Tinguely wrote:
> On 08/30/12 07:00, Dave Chinner wrote:
> >Version 2 of the patchset I described here:
> >
> >http://oss.sgi.com/archives/xfs/2012-06/msg00064.html
> >
> >This version has run through xfstests completely once, so it's
> >less likely to let smoke out....
> >
> >Version 2:
> >- fix writeback_inodes_sb_if_idle call in xfs_create()
> >- refreshed patch 13 before sending.
> >
> >_______________________________________________
> >xfs mailing list
> >xfs@oss.sgi.com
> >http://oss.sgi.com/mailman/listinfo/xfs
> 
> I wanted to get a fast look at your patch series. I am getting the
> following ASSERT on xfstest 179 when running the series with the
> latest OSS soruces.The ASSERT appears to start at patch number 3.
> Sorry these boxes won't kdump the top of tree kernels:
> 
> [17474.545964] XFS: Assertion failed: atomic_read(&bp->b_hold) > 0,
> file: /root/xfs/fs/xfs/xfs_buf.c, line: 896

FWIW, when you paste stack traces, can you turn off line wrapping
when you paste them so the crash is simple to quote in reply? (use
:set paste in mutt, then :set nopaste when finished pasting.)

> [17474.559784] Process umount (pid: 26427, threadinfo
...
> [17474.559784] Call Trace:
> [17474.559784]  [<ffffffffa05b4ed4>] xfs_buf_rele+0xa4/0x1b0 [xfs]
> [17474.559784]  [<ffffffffa05b5b86>] xfs_buf_iodone_work+0x46/0x50 [xfs]
> [17474.559784]  [<ffffffffa05b5c26>] xfs_buf_ioend+0x96/0x120 [xfs]
> [17474.559784]  [<ffffffffa061e939>] xfs_buf_iodone_callbacks+0x59/0x230 [xfs]
> [17474.559784]  [<ffffffffa05b5b61>] xfs_buf_iodone_work+0x21/0x50 [xfs]
> [17474.559784]  [<ffffffffa05b5c26>] xfs_buf_ioend+0x96/0x120 [xfs]
> [17474.559784]  [<ffffffffa061f7e9>] xfs_buf_item_unpin+0x289/0x2d0 [xfs]
> [17474.559784]  [<ffffffffa0617c33>] xfs_trans_committed_bulk+0x213/0x300 [xfs]
> [17474.559784]  [<ffffffffa061dde6>] xlog_cil_committed+0x36/0x130 [xfs]
> [17474.559784]  [<ffffffffa061e1e8>] xlog_cil_push+0x308/0x430 [xfs]
> [17474.559784]  [<ffffffffa061e466>] xlog_cil_force_lsn+0x146/0x1b0 [xfs]
> [17474.559784]  [<ffffffffa061c1e4>] _xfs_log_force+0x64/0x280 [xfs]
> [17474.559784]  [<ffffffffa061c454>] xfs_log_force+0x54/0x80 [xfs]
> [17474.559784]  [<ffffffffa05c65dd>] xfs_fs_sync_fs+0x2d/0x50 [xfs]
> [17474.559784]  [<ffffffff8118c00b>] __sync_filesystem+0x2b/0x50
> [17474.559784]  [<ffffffff8118c073>] sync_filesystem+0x43/0x60
> [17474.559784]  [<ffffffff81160846>] generic_shutdown_super+0x36/0xe0
> [17474.559784]  [<ffffffff8116091c>] kill_block_super+0x2c/0x80
> [17474.559784]  [<ffffffff81160e78>] deactivate_locked_super+0x38/0x90
> [17474.559784]  [<ffffffff81161951>] deactivate_super+0x61/0x70
> [17474.559784]  [<ffffffff8117c659>] mntput_no_expire+0x149/0x1b0
> [17474.559784]  [<ffffffff8117d10e>] sys_umount+0x6e/0xd0

Nothing has been shut down in XFS at this point (i.e. .put_super()
has not yet been called) so none of the shutdown changes could have
caused this problem.

Indeed, it looks like this is during a forced shutdown here in
xfs_buf_item_unpin:

        } else if (freed && remove) {
                xfs_buf_lock(bp);
                xfs_buf_ioerror(bp, EIO);
                XFS_BUF_UNDONE(bp);
                xfs_buf_stale(bp);
>>>>>>          xfs_buf_ioend(bp, 0);
        }

Now, xfs_buf_stale() does this:

	ASSERT(atomic_read(&bp->b_hold) >= 1);

Which means that in calling xfs_buf_ioend(), at least two references
to the buffer are being dropped. Working out why that is occurring
will find the root cause of this problem. 

All that I can say at this point is that I find it highly unlikely
that it is caused by the changes in this patchset.

> I got this ASSERT when I ran it on the 8/27 OSS sources:
> 
> [188646.952426] XFS: Assertion failed:
> atomic_read(&iclog->ic_refcnt) == 0, file:
>  /root/xfs/fs/xfs/xfs_log.c, line: 2590

> [188646.967020] Process kworker/2:1H (pid: 356, threadinfo ffff8808396a4000, task ffff88083a9aa1c0)
> [188646.967020] Call Trace:
> [188646.967020]  [<ffffffffa01dd2bf>] xlog_state_done_syncing+0x7f/0x110 [xfs]
> [188646.967020]  [<ffffffffa01ddbde>] xlog_iodone+0x7e/0x100 [xfs]
> [188646.967020]  [<ffffffffa0179b51>] xfs_buf_iodone_work+0x21/0x50 [xfs]
> [188646.967020]  [<ffffffff8105d6b3>] process_one_work+0x1d3/0x370
> [188646.967020]  [<ffffffff810603e3>] worker_thread+0x133/0x390
> [188646.967020]  [<ffffffff810651ce>] kthread+0x9e/0xb0
> [188646.967020]  [<ffffffff8143e504>] kernel_thread_helper+0x4/0x10

I've never seen that ASSERT fire. That implies we've got a log
buffer that is being actively modified under IO, but I cannot see
how that would happen. Was this during an unmount? What test?

/me is starting to wonder about memory errors...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 07/13] xfs: xfs_sync_data is redundant.
  2012-09-01 23:24   ` Christoph Hellwig
@ 2012-09-03  6:08     ` Dave Chinner
  0 siblings, 0 replies; 60+ messages in thread
From: Dave Chinner @ 2012-09-03  6:08 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Sat, Sep 01, 2012 at 07:24:56PM -0400, Christoph Hellwig wrote:
> On Thu, Aug 30, 2012 at 10:00:11PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > We don't do any data writeback from XFS any more - the VFS is
> > completely responsible for that, including for freeze. We can
> > replace the remaining caller with the VFS level function that
> > achieves the same thing, but without conflicting with current
> > writeback work - writeback_inodes_sb_if_idle().
> > 
> > This means we can remove the flush_work and xfs_flush_inodes() - the
> > VFS functionality completely replaces the internal flush queue for
> > doing this writeback work in a separate context to avoid stack
> > overruns..
> 
> Are the lock ordering issues with writeback_inodes_sb_if_idle sorted out
> by now?  IIRC it still needs to be switch to a trylock.

It never gets called from unmount or freeze context, so I can't see
how it would deadlock in these use cases. It's only when we call
writeback_inodes_sb_if_idle() from a context that already holds
s_umount that the locking it has matters - that's where btrfs and
ext4 have been getting into trouble with this.

> > -			xfs_flush_inodes(ip);
> > +			writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb,
> > +						    WB_REASON_FS_FREE_SPACE);
> 
> I'd prefer to still keep a wrapper for an ugly call like this if we can.

OK, I'll add an inline function to do this.
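
Something along these lines, presumably - a minimal sketch only, assuming
the wrapper keeps the old xfs_flush_inodes() name (the real helper may
differ):

	static inline void
	xfs_flush_inodes(
		struct xfs_inode	*ip)
	{
		struct super_block	*sb = VFS_I(ip)->i_sb;

		/* sketch: hide the ugly VFS call behind an XFS-named helper */
		writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
	}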

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH V2 00/13] xfs: remove the xfssyncd mess
  2012-09-03  4:05   ` Dave Chinner
@ 2012-09-04  0:13     ` Mark Tinguely
  2012-09-25  9:26     ` Christoph Hellwig
  1 sibling, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04  0:13 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 09/02/12 23:05, Dave Chinner wrote:
> On Fri, Aug 31, 2012 at 09:01:04AM -0500, Mark Tinguely wrote:
>> On 08/30/12 07:00, Dave Chinner wrote:
>>> Version 2 of the patchset I described here:
>>>
>>> http://oss.sgi.com/archives/xfs/2012-06/msg00064.html
>>>
>>> This version has run through xfstests completely once, so it's
>>> less likely to let smoke out....
>>>
>>> Version 2:
>>> - fix writeback_inodes_sb_if_idle call in xfs_create()
>>> - refreshed patch 13 before sending.
>>>
>>> _______________________________________________
>>> xfs mailing list
>>> xfs@oss.sgi.com
>>> http://oss.sgi.com/mailman/listinfo/xfs
>>
>> I wanted to get a fast look at your patch series. I am getting the
>> following ASSERT on xfstest 179 when running the series with the
>> latest OSS soruces.The ASSERT appears to start at patch number 3.
>> Sorry these boxes won't kdump the top of tree kernels:
>>
>> [17474.545964] XFS: Assertion failed: atomic_read(&bp->b_hold)>  0,
>> file: /root/xfs/fs/xfs/xfs_buf.c, line: 896
>
> FWIW, when you paste stack traces, can you turn off line wrapping
> when you paste it so the crash is simple to quote in reply? (use
> :set paste in mutt, the :set nopaste when finished pasting it in).
>
>> [17474.559784] Process umount (pid: 26427, threadinfo
> ...
>> [17474.559784] Call Trace:
>> [17474.559784]  [<ffffffffa05b4ed4>] xfs_buf_rele+0xa4/0x1b0 [xfs]
>> [17474.559784]  [<ffffffffa05b5b86>] xfs_buf_iodone_work+0x46/0x50 [xfs]
>> [17474.559784]  [<ffffffffa05b5c26>] xfs_buf_ioend+0x96/0x120 [xfs]
>> [17474.559784]  [<ffffffffa061e939>] xfs_buf_iodone_callbacks+0x59/0x230 [xfs]
>> [17474.559784]  [<ffffffffa05b5b61>] xfs_buf_iodone_work+0x21/0x50 [xfs]
>> [17474.559784]  [<ffffffffa05b5c26>] xfs_buf_ioend+0x96/0x120 [xfs]
>> [17474.559784]  [<ffffffffa061f7e9>] xfs_buf_item_unpin+0x289/0x2d0 [xfs]
>> [17474.559784]  [<ffffffffa0617c33>] xfs_trans_committed_bulk+0x213/0x300 [xfs]
>> [17474.559784]  [<ffffffffa061dde6>] xlog_cil_committed+0x36/0x130 [xfs]
>> [17474.559784]  [<ffffffffa061e1e8>] xlog_cil_push+0x308/0x430 [xfs]
>> [17474.559784]  [<ffffffffa061e466>] xlog_cil_force_lsn+0x146/0x1b0 [xfs]
>> [17474.559784]  [<ffffffffa061c1e4>] _xfs_log_force+0x64/0x280 [xfs]
>> [17474.559784]  [<ffffffffa061c454>] xfs_log_force+0x54/0x80 [xfs]
>> [17474.559784]  [<ffffffffa05c65dd>] xfs_fs_sync_fs+0x2d/0x50 [xfs]
>> [17474.559784]  [<ffffffff8118c00b>] __sync_filesystem+0x2b/0x50
>> [17474.559784]  [<ffffffff8118c073>] sync_filesystem+0x43/0x60
>> [17474.559784]  [<ffffffff81160846>] generic_shutdown_super+0x36/0xe0
>> [17474.559784]  [<ffffffff8116091c>] kill_block_super+0x2c/0x80
>> [17474.559784]  [<ffffffff81160e78>] deactivate_locked_super+0x38/0x90
>> [17474.559784]  [<ffffffff81161951>] deactivate_super+0x61/0x70
>> [17474.559784]  [<ffffffff8117c659>] mntput_no_expire+0x149/0x1b0
>> [17474.559784]  [<ffffffff8117d10e>] sys_umount+0x6e/0xd0
>
> Nothing has been shut down in XFS at this point (i.e. .put_super()
> has not yet been called) so none of the shutdown changes could have
> caused this problem.
>
> Indeed, it looks like this is during a forced shutdown here in
> xfs_buf_item_unpin:
>
>          } else if (freed&&  remove) {
>                  xfs_buf_lock(bp);
>                  xfs_buf_ioerror(bp, EIO);
>                  XFS_BUF_UNDONE(bp);
>                  xfs_buf_stale(bp);
>>>>>>>           xfs_buf_ioend(bp, 0);
>          }
>
> Now, xfs_buf_stale() does this:
>
> 	ASSERT(atomic_read(&bp->b_hold)>= 1);
>
> Which means that in calling xfs_buf_ioend(), at least two references
> to the buffer are being dropped. Working out why that is occurring
> will find the root cause of this problem.
>
> All that I can say at this point is that I find it highly unlikely
> that it is caused by the changes in this patchset.
>
>> I got this ASSERT when I ran it on the 8/27 OSS sources:
>>
>> [188646.952426] XFS: Assertion failed:
>> atomic_read(&iclog->ic_refcnt) == 0, file:
>>   /root/xfs/fs/xfs/xfs_log.c, line: 2590
>
>> [188646.967020] Process kworker/2:1H (pid: 356, threadinfo ffff8808396a4000, task ffff88083a9aa1c0)
>> [188646.967020] Call Trace:
>> [188646.967020]  [<ffffffffa01dd2bf>] xlog_state_done_syncing+0x7f/0x110 [xfs]
>> [188646.967020]  [<ffffffffa01ddbde>] xlog_iodone+0x7e/0x100 [xfs]
>> [188646.967020]  [<ffffffffa0179b51>] xfs_buf_iodone_work+0x21/0x50 [xfs]
>> [188646.967020]  [<ffffffff8105d6b3>] process_one_work+0x1d3/0x370
>> [188646.967020]  [<ffffffff810603e3>] worker_thread+0x133/0x390
>> [188646.967020]  [<ffffffff810651ce>] kthread+0x9e/0xb0
>> [188646.967020]  [<ffffffff8143e504>] kernel_thread_helper+0x4/0x10
>
> I've never seen that ASSERT fire. That implies we've got a log
> buffer that is being actively modified under IO, but I cannot see
> how that would happen. Was this during an unmount? What test?
>
> /me is starting to wonder about memory errors...
>
> Cheers,
>
> Dave.



All three machines panic on xfstest 179 - 2 are x86_64 and one is 
x86_32. I believe all have XFS debug turned on.

I will see what else I can find out.

--Mark.


_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 03/13] xfs: rationalise xfs_mount_wq users
  2012-08-30 12:00 ` [PATCH 03/13] xfs: rationalise xfs_mount_wq users Dave Chinner
@ 2012-09-04 15:48   ` Mark Tinguely
  2012-09-05  4:30     ` Dave Chinner
  2012-09-11 21:25   ` Mark Tinguely
  1 sibling, 1 reply; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 15:48 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> -	/*
> -	 * We shouldn't write/force the log if we are in the mount/unmount
> -	 * process or on a read only filesystem. The workqueue still needs to be
> -	 * active in both cases, however, because it is used for inode reclaim
> -	 * during these times.  Use the MS_ACTIVE flag to avoid doing anything
> -	 * during mount.  Doing work during unmount is avoided by calling
> -	 * cancel_delayed_work_sync on this work queue before tearing down
> -	 * the ail and the log in xfs_log_unmount.
> -	 */
> -	if (!(mp->m_super->s_flags&  MS_ACTIVE)&&
> -	    !(mp->m_flags&  XFS_MOUNT_RDONLY)) {
> +	if (!(mp->m_flags&  XFS_MOUNT_RDONLY)) {
>   		/* dgc: errors ignored here */
>   		if (mp->m_super->s_writers.frozen == SB_UNFROZEN&&
>   		xfs_log_need_covered(mp))
> @@ -408,8 +398,7 @@ xfs_sync_worker(
>   		else
>   			xfs_log_force(mp, 0);
>
> -		/* start pushing all the metadata that is currently
> -		 * dirty */
> +		/* start pushing all the metadata that is currently dirty */
>   		xfs_ail_push_all(mp->m_ail);
>   	}
>

It appears that the removal of the MS_ACTIVE flag is causing the
"atomic_read(&bp->b_hold) > 0" ASSERT.

--Mark.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 01/13] xfs: xfs_syncd_stop must die
  2012-08-30 12:00 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
  2012-09-01 23:15   ` Christoph Hellwig
@ 2012-09-04 16:10   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 16:10 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> xfs_syncd_start and xfs_syncd_stop tie a bunch of unrelated
> functionailty together that actually have different start and stop
> requirements. Kill these functions and open code the start/stop
> methods for each of the background functions.
>
> Subsequent patches will move the start/stop functions around to the
> correct places to avoid races and shutdown issues.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>

Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 04/13] xfs: don't run the sync work if the filesyste is read-only
  2012-08-30 12:00 ` [PATCH 04/13] xfs: don't run the sync work if the filesyste is read-only Dave Chinner
@ 2012-09-04 16:13   ` Mark Tinguely
  0 siblings, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 16:13 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> If the filesytem is mounted or remounted read-only, stop the sync
> worker that tries to flush or cover the log if the filesystem is
> dirty. It's read-only, so it isn't dirty. Restart it on a remount,rw
> as necessary. This avoids the need for RO checks in the work.
>
> Similarly, stop the sync work when the filesystem is frozen, and
> start it again when the filesysetm is thawed. This avoids the need
> for special freeze checks in the work.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>

Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 05/13] xfs: sync work is now only periodic log work
  2012-08-30 12:00 ` [PATCH 05/13] xfs: sync work is now only periodic log work Dave Chinner
  2012-09-01 23:23   ` Christoph Hellwig
@ 2012-09-04 16:14   ` Mark Tinguely
  2012-09-04 18:57   ` Mark Tinguely
  2 siblings, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 16:14 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> + * Every sync period we need to unpin all items in the AIL and push them to
> + * disk. If there is nothing dirty, then we might need to cover the log to
> + * indicate that the filesystem is idle.
> + */
> +void
> +xfs_log_worker(
> +	struct work_struct	*work)
> +{
> +	struct xlog		*log = container_of(to_delayed_work(work),
> +						struct xlog, l_work);
> +	struct xfs_mount	*mp = log->l_mp;
> +
> +	/* dgc: errors ignored - not fatal and nowhere to report them */
> +	if (xfs_log_need_covered(mp))
> +		xfs_fs_log_dummy(mp);
> +	else
> +		xfs_log_force(mp, 0);
> +
> +	/* start pushing all the metadata that is currently dirty */
> +	xfs_ail_push_all(mp->m_ail);
> +
> +	/* queue us up again */
> +	xfs_log_work_queue(mp);
> +}

Having log space available for the xfs_fs_log_dummy() is the weak point in the 
remaining log hang problems.

I agree with Dave that the above should remain in this series and the log 
issue be handled in another series.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 05/13] xfs: sync work is now only periodic log work
  2012-08-30 12:00 ` [PATCH 05/13] xfs: sync work is now only periodic log work Dave Chinner
  2012-09-01 23:23   ` Christoph Hellwig
  2012-09-04 16:14   ` Mark Tinguely
@ 2012-09-04 18:57   ` Mark Tinguely
  2012-09-05  4:35     ` Dave Chinner
  2 siblings, 1 reply; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 18:57 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> The only thing the periodic sync work does now is flush the AIL and
> idle the log. These are really functions of the log code, so move
> the work to xfs_log.c and rename it appropriately.
>
> The only wart that this leaves behind is the xfssyncd_centisecs
> sysctl, otherwise the xfssyncd is dead. Clean up any comments that
> related to xfssyncd to reflect it's passing.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---

...

>
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index 7f4f937..598f279 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -34,6 +34,7 @@
>   #include "xfs_dinode.h"
>   #include "xfs_inode.h"
>   #include "xfs_trace.h"
> +#include "xfs_fsops.h"
>
>   kmem_zone_t	*xfs_log_ticket_zone;
>
> @@ -698,6 +699,8 @@ xfs_log_mount_finish(xfs_mount_t *mp)
>   		ASSERT(mp->m_flags&  XFS_MOUNT_RDONLY);
>   	}
>

Looking at this closer, shouldn't there be a check for recovery failure? 
If recovery did fail, the mount will stop:

	if (!error)

> +	xfs_log_work_queue(mp);
> +
>   	return error;
>   }

--Mark.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 06/13] xfs: Bring some sanity to log unmounting
  2012-08-30 12:00 ` [PATCH 06/13] xfs: Bring some sanity to log unmounting Dave Chinner
  2012-09-01 23:28   ` Christoph Hellwig
@ 2012-09-04 19:11   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 19:11 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> When unmounting the filesystem, there are lots of operations that
> need to be done in a specific order, and they are spread across
> across a couple of functions. We have to drain the AIL before we
> write the unmount record, and we have to shut down the background
> log work before we do either of them.
>
> But this is all split haphazardly across xfs_unmountfs() and
> xfs_log_unmount(). Move all the AIL flushing and log manipulations
> to xfs_log_unmount() so that the responisbilities of each function
> is clear and the operations they perform obvious.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---

Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>



_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 07/13] xfs: xfs_sync_data is redundant.
  2012-08-30 12:00 ` [PATCH 07/13] xfs: xfs_sync_data is redundant Dave Chinner
  2012-09-01 23:24   ` Christoph Hellwig
@ 2012-09-04 20:48   ` Mark Tinguely
  2012-09-06  0:53     ` Dave Chinner
  1 sibling, 1 reply; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 20:48 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> We don't do any data writeback from XFS any more - the VFS is
> completely responsible for that, including for freeze. We can
> replace the remaining caller with the VFS level function that
> achieves the same thing, but without conflicting with current
> writeback work - writeback_inodes_sb_if_idle().
>
> This means we can remove the flush_work and xfs_flush_inodes() - the
> VFS functionality completely replaces the internal flush queue for
> doing this writeback work in a separate context to avoid stack
> overruns..
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---

I get an XFS hang on xfstest 205 on a couple of different machines:

# cat /proc/413/stack
[<ffffffff810fa889>] sleep_on_page+0x9/0x10
[<ffffffff810fa874>] __lock_page+0x64/0x70
[<ffffffff81104f58>] write_cache_pages+0x368/0x510
[<ffffffff8110514c>] generic_writepages+0x4c/0x70
[<ffffffffa046d084>] xfs_vm_writepages+0x54/0x70 [xfs]
[<ffffffff8110518b>] do_writepages+0x1b/0x40
[<ffffffff8117ad85>] __writeback_single_inode+0x45/0x160
[<ffffffff8117c0c7>] writeback_sb_inodes+0x2a7/0x490
[<ffffffff8117c539>] wb_writeback+0x119/0x2b0
[<ffffffff8117c7a4>] wb_do_writeback+0xd4/0x230
[<ffffffff8117c9db>] bdi_writeback_thread+0xdb/0x230
[<ffffffff810650be>] kthread+0x9e/0xb0
[<ffffffff81432dc4>] kernel_thread_helper+0x4/0x10
[<ffffffffffffffff>] 0xffffffffffffffff

# cat /proc/12489/stack (dd command)
[<ffffffff8117b415>] writeback_inodes_sb_nr+0x85/0xb0
[<ffffffff8117b77c>] writeback_inodes_sb+0x5c/0x80
[<ffffffff8117b7e2>] writeback_inodes_sb_if_idle+0x42/0x60
[<ffffffffa047b54e>] xfs_iomap_write_delay+0x28e/0x320 [xfs]
[<ffffffffa046c738>] __xfs_get_blocks+0x2b8/0x500 [xfs]
[<ffffffffa046c9ac>] xfs_get_blocks+0xc/0x10 [xfs]
[<ffffffff811863df>] __block_write_begin+0x2af/0x5c0
[<ffffffffa046cfa1>] xfs_vm_write_begin+0x61/0xd0 [xfs]
[<ffffffff810f9c02>] generic_perform_write+0xc2/0x1e0
[<ffffffff810f9d80>] generic_file_buffered_write+0x60/0xa0
[<ffffffffa047454d>] xfs_file_buffered_aio_write+0x11d/0x1b0 [xfs]
[<ffffffffa04746f0>] xfs_file_aio_write+0x110/0x170 [xfs]
[<ffffffff811530e1>] do_sync_write+0xa1/0xf0
[<ffffffff811536eb>] vfs_write+0xcb/0x130
[<ffffffff81153840>] sys_write+0x50/0x90
[<ffffffff81431d39>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 08/13] xfs: xfs_sync_fsdata is redundant
  2012-08-30 12:00 ` [PATCH 08/13] xfs: xfs_sync_fsdata " Dave Chinner
  2012-09-01 23:27   ` Christoph Hellwig
@ 2012-09-04 20:59   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 20:59 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> Why do we need to write the superblock to disk once we've written
> all the data?  We don't actually - the reasons for doing this are
> lost in the mists of time, and go back to the way Irix used to drive
> VFS flushing.
>
> On linux, this code is only called from two contexts: remount and
> .sync_fs. In the remount case, the call is followed by a metadata
> sync, which unpins and writes the superblock.  In the sync_fs case,
> we only need to force the log to disk to ensure that the superblock
> is correctly on disk, so we don't actually need to write it. Hence
> the functionality is either redundant or superfluous and thus can be
> removed.
>
> Seeing as xfs_quiesce_data is essentially now just a log force,
> remove it as well and fold the code back into the two callers.
> Neither of them need the log covering check, either, as that is
> redundant for the remount case, and unnecessary for the .sync_fs
> case.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---

Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c
  2012-08-30 12:00 ` [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c Dave Chinner
  2012-09-01 23:27   ` Christoph Hellwig
@ 2012-09-04 21:03   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 21:03 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> Both callers of xfs_quiesce_attr() are in xfs_super.c, and there's
> nothing really sync-specific about this functionality so it doesn't
> really matter where it lives. Move it to benext to it's callers, so
> all the remount/sync_fs code is in the one place.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---

Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount
  2012-08-30 12:00 ` [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount Dave Chinner
  2012-09-01 23:29   ` Christoph Hellwig
@ 2012-09-04 21:04   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 21:04 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> xfs_quiesce_attr() is supposed to leave the log empty with an
> unmount record written. Right now it does not wait for the AIL to be
> emptied before writing the unmount record, not does it wait for
> metadata IO completion, either. Fix it to use the same method and
> code as xfs_log_unmount().
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---


Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>


_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch]
  2012-08-30 12:00 ` [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch] Dave Chinner
  2012-09-01 23:30   ` Christoph Hellwig
@ 2012-09-04 21:06   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 21:06 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> xfs_sync.c now only contains inode reclaim functions and inode cache
> iteration functions. It is not related to sync operations anymore.
> Rename to xfs_icache.c to reflect it's contents and prepare for
> consolidation with the other inode cache file that exists
> (xfs_iget.c).
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---

Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c
  2012-08-30 12:00 ` [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c Dave Chinner
  2012-09-01 23:30   ` Christoph Hellwig
@ 2012-09-04 21:07   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 21:07 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> xfs_ilock() and friends really aren't related to the inode cache in
> any way, so move them to xfs_inode.c with all the other inode
> related functionality.
>
> While doing this move, move the xfs_ilock() tracepoints to *before*
> the lock is taken so that when a hang on a lock occurs we have
> events to indicate which process and what inode we were trying to
> lock when the hang occurred. This is much better than the current
> silence we get on a hang...
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---

Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 13/13] xfs: remove xfs_iget.c
  2012-08-30 12:00 ` [PATCH 13/13] xfs: remove xfs_iget.c Dave Chinner
  2012-09-01 23:31   ` Christoph Hellwig
@ 2012-09-04 21:11   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-04 21:11 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> The inode cache functions remaining in xfs_iget.c can be moved to xfs_icache.c
> along with the other inode cache functions. This removes all functionality from
> xfs_iget.c, so the file can simply be removed.
>
> This move results in various functions now only having the scope of a single
> file (e.g. xfs_inode_free()), so clean up all the definitions and exported
> prototypes in xfs_icache.[ch] and xfs_inode.h appropriately.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---

Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 03/13] xfs: rationalise xfs_mount_wq users
  2012-09-04 15:48   ` Mark Tinguely
@ 2012-09-05  4:30     ` Dave Chinner
  2012-09-05 13:16       ` Mark Tinguely
  0 siblings, 1 reply; 60+ messages in thread
From: Dave Chinner @ 2012-09-05  4:30 UTC (permalink / raw)
  To: Mark Tinguely; +Cc: xfs

On Tue, Sep 04, 2012 at 10:48:17AM -0500, Mark Tinguely wrote:
> On 08/30/12 07:00, Dave Chinner wrote:
> >-	/*
> >-	 * We shouldn't write/force the log if we are in the mount/unmount
> >-	 * process or on a read only filesystem. The workqueue still needs to be
> >-	 * active in both cases, however, because it is used for inode reclaim
> >-	 * during these times.  Use the MS_ACTIVE flag to avoid doing anything
> >-	 * during mount.  Doing work during unmount is avoided by calling
> >-	 * cancel_delayed_work_sync on this work queue before tearing down
> >-	 * the ail and the log in xfs_log_unmount.
> >-	 */
> >-	if (!(mp->m_super->s_flags&  MS_ACTIVE)&&
> >-	    !(mp->m_flags&  XFS_MOUNT_RDONLY)) {
> >+	if (!(mp->m_flags&  XFS_MOUNT_RDONLY)) {
> >  		/* dgc: errors ignored here */
> >  		if (mp->m_super->s_writers.frozen == SB_UNFROZEN&&
> >  		xfs_log_need_covered(mp))
> >@@ -408,8 +398,7 @@ xfs_sync_worker(
> >  		else
> >  			xfs_log_force(mp, 0);
> >
> >-		/* start pushing all the metadata that is currently
> >-		 * dirty */
> >+		/* start pushing all the metadata that is currently dirty */
> >  		xfs_ail_push_all(mp->m_ail);
> >  	}
> >
> 
> It appears that the removal of the MS_ACTIVE flag is causing the
> "atomic_read(&bp->b_hold)>  0," ASSERT.

I must be being slow today - I don't see why that would cause any
problems. The worker is now started at the end of the mount process
after everything is set up (i.e. just before MS_ACTIVE is removed),
and the worker is stopped before anything is torn down. That should
effectively replicate what the MS_ACTIVE flag is providing in the
old code.
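
In outline, the ordering being described is roughly this (a sketch, not
the actual hunks; xfs_syncd_queue_sync() and m_sync_work are the names
from patch 1):

	/* mount side: the worker is only queued once setup has completed */
	error = xfs_mountfs(mp);
	if (!error)
		xfs_syncd_queue_sync(mp);

	/* unmount side: the worker is cancelled before any teardown */
	cancel_delayed_work_sync(&mp->m_sync_work);
	xfs_unmountfs(mp);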

Can you explain in more detail what led you to this conclusion?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH 05/13] xfs: sync work is now only periodic log work
  2012-09-04 18:57   ` Mark Tinguely
@ 2012-09-05  4:35     ` Dave Chinner
  0 siblings, 0 replies; 60+ messages in thread
From: Dave Chinner @ 2012-09-05  4:35 UTC (permalink / raw)
  To: Mark Tinguely; +Cc: xfs

On Tue, Sep 04, 2012 at 01:57:38PM -0500, Mark Tinguely wrote:
> On 08/30/12 07:00, Dave Chinner wrote:
> >From: Dave Chinner<dchinner@redhat.com>
> >
> >The only thing the periodic sync work does now is flush the AIL and
> >idle the log. These are really functions of the log code, so move
> >the work to xfs_log.c and rename it appropriately.
> >
> >The only wart that this leaves behind is the xfssyncd_centisecs
> >sysctl, otherwise the xfssyncd is dead. Clean up any comments that
> >related to xfssyncd to reflect it's passing.
> >
> >Signed-off-by: Dave Chinner<dchinner@redhat.com>
> >---
> 
> ...
> 
> >
> >diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> >index 7f4f937..598f279 100644
> >--- a/fs/xfs/xfs_log.c
> >+++ b/fs/xfs/xfs_log.c
> >@@ -34,6 +34,7 @@
> >  #include "xfs_dinode.h"
> >  #include "xfs_inode.h"
> >  #include "xfs_trace.h"
> >+#include "xfs_fsops.h"
> >
> >  kmem_zone_t	*xfs_log_ticket_zone;
> >
> >@@ -698,6 +699,8 @@ xfs_log_mount_finish(xfs_mount_t *mp)
> >  		ASSERT(mp->m_flags&  XFS_MOUNT_RDONLY);
> >  	}
> >
> 
> Looking at this closer, shouldn't there be a check for recovery
> failure? If recovery failed, the mount will stop:
> 
> 	if (!error)
> 
> >+	xfs_log_work_queue(mp);
> >+
> >  	return error;

Probably should, but the error handling for an xfs_log_mount_finish()
failure calls xfs_log_unmount(), and that shuts down the worker
correctly even if this failure occurs.
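
To spell that out, the shape is roughly this (a simplified sketch of
the code being discussed, not an exact quote):

int
xfs_log_mount_finish(xfs_mount_t *mp)
{
	int	error = 0;

	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
		error = xlog_recover_finish(mp->m_log);
	else
		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);

	xfs_log_work_queue(mp);		/* queued even if recovery failed */
	return error;
}

If error is non-zero the mount fails and unwinds through
xfs_log_unmount(), which cancels the delayed log work synchronously
before tearing the log down, so the worker never runs against a dead
log.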

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 03/13] xfs: rationalise xfs_mount_wq users
  2012-09-05  4:30     ` Dave Chinner
@ 2012-09-05 13:16       ` Mark Tinguely
  2012-09-05 14:34         ` Mark Tinguely
  2012-09-06  0:46         ` Dave Chinner
  0 siblings, 2 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-05 13:16 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 09/04/12 23:30, Dave Chinner wrote:
> On Tue, Sep 04, 2012 at 10:48:17AM -0500, Mark Tinguely wrote:
>> On 08/30/12 07:00, Dave Chinner wrote:
>>> -	/*
>>> -	 * We shouldn't write/force the log if we are in the mount/unmount
>>> -	 * process or on a read only filesystem. The workqueue still needs to be
>>> -	 * active in both cases, however, because it is used for inode reclaim
>>> -	 * during these times.  Use the MS_ACTIVE flag to avoid doing anything
>>> -	 * during mount.  Doing work during unmount is avoided by calling
>>> -	 * cancel_delayed_work_sync on this work queue before tearing down
>>> -	 * the ail and the log in xfs_log_unmount.
>>> -	 */
>>> -	if (!(mp->m_super->s_flags & MS_ACTIVE) &&
>>> -	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
>>> +	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
>>>   		/* dgc: errors ignored here */
>>>   		if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
>>>   		xfs_log_need_covered(mp))
>>> @@ -408,8 +398,7 @@ xfs_sync_worker(
>>>   		else
>>>   			xfs_log_force(mp, 0);
>>>
>>> -		/* start pushing all the metadata that is currently
>>> -		 * dirty */
>>> +		/* start pushing all the metadata that is currently dirty */
>>>   		xfs_ail_push_all(mp->m_ail);
>>>   	}
>>>
>>
>> It appears that the removal of the MS_ACTIVE flag is causing the
>> "atomic_read(&bp->b_hold) > 0," ASSERT.
>
> I must be being slow today - I don't see why that would cause any
> problems. The worker is now started at the end of the mount process
> after everything is set up (i.e. just before MS_ACTIVE is removed),
> and the worker is stopped before anything is torn down. That should
> effectively replicate what the MS_ACTIVE flag is providing in the
> old code.
>
> Can you explain in more detail what led you to this conclusion?
>
> Cheers,
>
> Dave.

You are correct, it does not make sense, but with the
  !(mp->m_super->s_flags &  MS_ACTIVE)
test removed, test 107 causes the above assert on
different machines/architectures. With the test back in place, the
assert does not happen.

I will see if I can get it to dump on the x86_32 machine.

--Mark.


* Re: [PATCH 03/13] xfs: rationalise xfs_mount_wq users
  2012-09-05 13:16       ` Mark Tinguely
@ 2012-09-05 14:34         ` Mark Tinguely
  2012-09-06  0:46         ` Dave Chinner
  1 sibling, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-05 14:34 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 09/05/12 08:16, Mark Tinguely wrote:
> On 09/04/12 23:30, Dave Chinner wrote:
>> On Tue, Sep 04, 2012 at 10:48:17AM -0500, Mark Tinguely wrote:
>>> On 08/30/12 07:00, Dave Chinner wrote:
>>>> - /*
>>>> - * We shouldn't write/force the log if we are in the mount/unmount
>>>> - * process or on a read only filesystem. The workqueue still needs
>>>> to be
>>>> - * active in both cases, however, because it is used for inode reclaim
>>>> - * during these times. Use the MS_ACTIVE flag to avoid doing anything
>>>> - * during mount. Doing work during unmount is avoided by calling
>>>> - * cancel_delayed_work_sync on this work queue before tearing down
>>>> - * the ail and the log in xfs_log_unmount.
>>>> - */
>>>> - if (!(mp->m_super->s_flags & MS_ACTIVE) &&
>>>> - !(mp->m_flags & XFS_MOUNT_RDONLY)) {
>>>> + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
>>>> /* dgc: errors ignored here */
>>>> if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
>>>> xfs_log_need_covered(mp))
>>>> @@ -408,8 +398,7 @@ xfs_sync_worker(
>>>> else
>>>> xfs_log_force(mp, 0);
>>>>
>>>> - /* start pushing all the metadata that is currently
>>>> - * dirty */
>>>> + /* start pushing all the metadata that is currently dirty */
>>>> xfs_ail_push_all(mp->m_ail);
>>>> }
>>>>
>>>
>>> It appears that the removal of the MS_ACTIVE flag is causing the
>>> "atomic_read(&bp->b_hold) > 0," ASSERT.
>>
>> I must be being slow today - I don't see why that would cause any
>> problems. The worker is now started at the end of the mount process
>> after everything is set up (i.e. just before MS_ACTIVE is removed),
>> and the worker is stopped before anything is torn down. That should
>> effectively replicate what the MS_ACTIVE flag is providing in the
>> old code.
>>
>> Can you explain in more detail what led you to this conclusion?
>>
>> Cheers,
>>
>> Dave.
>
> You are correct, it does not make sense, but with the
> !(mp->m_super->s_flags & MS_ACTIVE)
> test removed, test 107 causes the above assert on
> different machines/architectures. Place the test in, the
> assert does not happen.
>
> I will see if I can get it to dump on the x86_32 machine.
>
> --Mark.

Make that xfstest 179. The ASSERT happens right away. I have a dump from 
the x86_32 machine. I will take a quick look at it.

--Mark.


* Re: [PATCH 03/13] xfs: rationalise xfs_mount_wq users
  2012-09-05 13:16       ` Mark Tinguely
  2012-09-05 14:34         ` Mark Tinguely
@ 2012-09-06  0:46         ` Dave Chinner
  2012-09-06 15:08           ` Mark Tinguely
  1 sibling, 1 reply; 60+ messages in thread
From: Dave Chinner @ 2012-09-06  0:46 UTC (permalink / raw)
  To: Mark Tinguely; +Cc: xfs

On Wed, Sep 05, 2012 at 08:16:59AM -0500, Mark Tinguely wrote:
> On 09/04/12 23:30, Dave Chinner wrote:
> >On Tue, Sep 04, 2012 at 10:48:17AM -0500, Mark Tinguely wrote:
> >>On 08/30/12 07:00, Dave Chinner wrote:
> >>>-	/*
> >>>-	 * We shouldn't write/force the log if we are in the mount/unmount
> >>>-	 * process or on a read only filesystem. The workqueue still needs to be
> >>>-	 * active in both cases, however, because it is used for inode reclaim
> >>>-	 * during these times.  Use the MS_ACTIVE flag to avoid doing anything
> >>>-	 * during mount.  Doing work during unmount is avoided by calling
> >>>-	 * cancel_delayed_work_sync on this work queue before tearing down
> >>>-	 * the ail and the log in xfs_log_unmount.
> >>>-	 */
> >>>-	if (!(mp->m_super->s_flags & MS_ACTIVE) &&
> >>>-	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
> >>>+	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
> >>>  		/* dgc: errors ignored here */
> >>>  		if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
> >>>  		xfs_log_need_covered(mp))
> >>>@@ -408,8 +398,7 @@ xfs_sync_worker(
> >>>  		else
> >>>  			xfs_log_force(mp, 0);
> >>>
> >>>-		/* start pushing all the metadata that is currently
> >>>-		 * dirty */
> >>>+		/* start pushing all the metadata that is currently dirty */
> >>>  		xfs_ail_push_all(mp->m_ail);
> >>>  	}
> >>>
> >>
> >>It appears that the removal of the MS_ACTIVE flag is causing the
> >>"atomic_read(&bp->b_hold) > 0," ASSERT.
> >
> >I must be being slow today - I don't see why that would cause any
> >problems. The worker is now started at the end of the mount process
> >after everything is set up (i.e. just before MS_ACTIVE is removed),
> >and the worker is stopped before anything is torn down. That should
> >effectively replicate what the MS_ACTIVE flag is providing in the
> >old code.
> >
> >Can you explain in more detail what led you to this conclusion?
> >
> >Cheers,
> >
> >Dave.
> 
> You are correct, it does not make sense, but with the
>  !(mp->m_super->s_flags &  MS_ACTIVE)
> test removed, test 107 causes the above assert on
> different machines/architectures. Place the test in, the
> assert does not happen.

test 107 is not in the auto group. That means it is generally
unreliable as a regression test, so I don't run it. That said, I
don't see anything unusual in that test that would cause problems...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 07/13] xfs: xfs_sync_data is redundant.
  2012-09-04 20:48   ` Mark Tinguely
@ 2012-09-06  0:53     ` Dave Chinner
  0 siblings, 0 replies; 60+ messages in thread
From: Dave Chinner @ 2012-09-06  0:53 UTC (permalink / raw)
  To: Mark Tinguely; +Cc: xfs

On Tue, Sep 04, 2012 at 03:48:58PM -0500, Mark Tinguely wrote:
> On 08/30/12 07:00, Dave Chinner wrote:
> >From: Dave Chinner <dchinner@redhat.com>
> >
> >We don't do any data writeback from XFS any more - the VFS is
> >completely responsible for that, including for freeze. We can
> >replace the remaining caller with the VFS level function that
> >achieves the same thing, but without conflicting with current
> >writeback work - writeback_inodes_sb_if_idle().
> >
> >This means we can remove the flush_work and xfs_flush_inodes() - the
> >VFS functionality completely replaces the internal flush queue for
> >doing this writeback work in a separate context to avoid stack
> >overruns..
> >
> >Signed-off-by: Dave Chinner <dchinner@redhat.com>
> >---
> 
> I get a XFS hang on xfstest 205 - couple different machines:
> 
> # cat /proc/413/stack
> [<ffffffff810fa889>] sleep_on_page+0x9/0x10
> [<ffffffff810fa874>] __lock_page+0x64/0x70
> [<ffffffff81104f58>] write_cache_pages+0x368/0x510
> [<ffffffff8110514c>] generic_writepages+0x4c/0x70
> [<ffffffffa046d084>] xfs_vm_writepages+0x54/0x70 [xfs]
> [<ffffffff8110518b>] do_writepages+0x1b/0x40
> [<ffffffff8117ad85>] __writeback_single_inode+0x45/0x160
> [<ffffffff8117c0c7>] writeback_sb_inodes+0x2a7/0x490
> [<ffffffff8117c539>] wb_writeback+0x119/0x2b0
> [<ffffffff8117c7a4>] wb_do_writeback+0xd4/0x230
> [<ffffffff8117c9db>] bdi_writeback_thread+0xdb/0x230
> [<ffffffff810650be>] kthread+0x9e/0xb0
> [<ffffffff81432dc4>] kernel_thread_helper+0x4/0x10
> [<ffffffffffffffff>] 0xffffffffffffffff

Oh, curious. That implies that writeback has got stuck on the page
we currently hold locked in this thread:

> # cat /proc/12489/stack (dd command)
> [<ffffffff8117b415>] writeback_inodes_sb_nr+0x85/0xb0
> [<ffffffff8117b77c>] writeback_inodes_sb+0x5c/0x80
> [<ffffffff8117b7e2>] writeback_inodes_sb_if_idle+0x42/0x60
> [<ffffffffa047b54e>] xfs_iomap_write_delay+0x28e/0x320 [xfs]
> [<ffffffffa046c738>] __xfs_get_blocks+0x2b8/0x500 [xfs]
> [<ffffffffa046c9ac>] xfs_get_blocks+0xc/0x10 [xfs]
> [<ffffffff811863df>] __block_write_begin+0x2af/0x5c0
> [<ffffffffa046cfa1>] xfs_vm_write_begin+0x61/0xd0 [xfs]
> [<ffffffff810f9c02>] generic_perform_write+0xc2/0x1e0
> [<ffffffff810f9d80>] generic_file_buffered_write+0x60/0xa0
> [<ffffffffa047454d>] xfs_file_buffered_aio_write+0x11d/0x1b0 [xfs]
> [<ffffffffa04746f0>] xfs_file_aio_write+0x110/0x170 [xfs]
> [<ffffffff811530e1>] do_sync_write+0xa1/0xf0
> [<ffffffff811536eb>] vfs_write+0xcb/0x130
> [<ffffffff81153840>] sys_write+0x50/0x90
> [<ffffffff81431d39>] system_call_fastpath+0x16/0x1b
> [<ffffffffffffffff>] 0xffffffffffffffff

Why didn't the current writeback code have this problem? It blocked
waiting for writeback on dirty inodes.

Oh, it would have found the xfs_inode with the IOLOCK already held,
so it skipped writeback on the inode that triggered the flush.
Bugger. Let me have a bit of a think about this.
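
For reference, the old data flush skipped like this (an illustrative
sketch only - example_flush_one_inode is a made-up name standing in
for the old per-inode data sync helper):

STATIC int
example_flush_one_inode(
	struct xfs_inode	*ip)
{
	struct address_space	*mapping = VFS_I(ip)->i_mapping;
	int			error = 0;

	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
		return 0;	/* writer already holds the iolock: skip it */

	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		error = filemap_fdatawrite(mapping);

	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return error;
}

writeback_inodes_sb_if_idle() has no such out - it waits for the
flusher thread, and the flusher is stuck behind the page we hold
locked.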

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 03/13] xfs: rationalise xfs_mount_wq users
  2012-09-06  0:46         ` Dave Chinner
@ 2012-09-06 15:08           ` Mark Tinguely
  2012-09-07  0:41             ` Dave Chinner
  0 siblings, 1 reply; 60+ messages in thread
From: Mark Tinguely @ 2012-09-06 15:08 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 09/05/12 19:46, Dave Chinner wrote:
> On Wed, Sep 05, 2012 at 08:16:59AM -0500, Mark Tinguely wrote:
>> On 09/04/12 23:30, Dave Chinner wrote:
>>> On Tue, Sep 04, 2012 at 10:48:17AM -0500, Mark Tinguely wrote:
>>>> On 08/30/12 07:00, Dave Chinner wrote:
>>>>> -	/*
>>>>> -	 * We shouldn't write/force the log if we are in the mount/unmount
>>>>> -	 * process or on a read only filesystem. The workqueue still needs to be
>>>>> -	 * active in both cases, however, because it is used for inode reclaim
>>>>> -	 * during these times.  Use the MS_ACTIVE flag to avoid doing anything
>>>>> -	 * during mount.  Doing work during unmount is avoided by calling
>>>>> -	 * cancel_delayed_work_sync on this work queue before tearing down
>>>>> -	 * the ail and the log in xfs_log_unmount.
>>>>> -	 */
>>>>> -	if (!(mp->m_super->s_flags & MS_ACTIVE) &&
>>>>> -	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
>>>>> +	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
>>>>>   		/* dgc: errors ignored here */
>>>>>   		if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
>>>>>   		xfs_log_need_covered(mp))
>>>>> @@ -408,8 +398,7 @@ xfs_sync_worker(
>>>>>   		else
>>>>>   			xfs_log_force(mp, 0);
>>>>>
>>>>> -		/* start pushing all the metadata that is currently
>>>>> -		 * dirty */
>>>>> +		/* start pushing all the metadata that is currently dirty */
>>>>>   		xfs_ail_push_all(mp->m_ail);
>>>>>   	}
>>>>>
>>>>
>>>> It appears that the removal of the MS_ACTIVE flag is causing the
>>>> "atomic_read(&bp->b_hold) > 0," ASSERT.
>>>
>>> I must be being slow today - I don't see why that would cause any
>>> problems. The worker is now started at the end of the mount process
>>> after everything is set up (i.e. just before MS_ACTIVE is removed),
>>> and the worker is stopped before anything is torn down. That should
>>> effectively replicate what the MS_ACTIVE flag is providing in the
>>> old code.
>>>
>>> Can you explain in more detail what led you to this conclusion?
>>>
>>> Cheers,
>>>
>>> Dave.
>>
>> You are correct, it does not make sense, but with the
>>   !(mp->m_super->s_flags&   MS_ACTIVE)
>> test removed, test 107 causes the above assert on
>> different machines/architectures. Place the test in, the
>> assert does not happen.
>
> test 107 is not in the auto group. That means it is generally
> unreliable as a regression test, so I don't run it. That said, I
> don't see anything unusual in that test that would cause problems...
>
> Cheers,
>
> Dave.

I misspoke, it is xfs test 179. I hit it doing a "check -g auto".

My test boxes had CONFIG_XFS_DEBUG=y, which may be a factor. The
test ran fine on a box without the debug option enabled and asserted
as soon as I added it back.

The buffer with zero b_hold count is the freelist buffer (XAGF)
for AG0. The buffer is marked STALE, it has already gone through
the release code, so there is no transaction pointer nor log item
pointer. The xlog_cil_committed() is being called with the
XFS_LI_ABORTED flag.

The X86_32 machine is now asserting with:
   XFS: Assertion failed: fs_is_ok, file: /xfs/fs/xfs/xfs_alloc.c, line: 1503
The X86_64 machines are still asserting on the zero b_hold.

Adding back the MS_ACTIVE test or (it appears) not compiling with the
CONFIG_XFS_DEBUG option seems to make the problem go away too.
Timing? That would not explain the effect of removing XFS_DEBUG, though.

Sorry if this is a wild goose chase.

--Mark T.


* Re: [PATCH 03/13] xfs: rationalise xfs_mount_wq users
  2012-09-06 15:08           ` Mark Tinguely
@ 2012-09-07  0:41             ` Dave Chinner
  0 siblings, 0 replies; 60+ messages in thread
From: Dave Chinner @ 2012-09-07  0:41 UTC (permalink / raw)
  To: Mark Tinguely; +Cc: xfs

On Thu, Sep 06, 2012 at 10:08:47AM -0500, Mark Tinguely wrote:
> I misspoke, it is xfs test 179. I hit it doing a "check -g auto".
> 
> My test boxes had CONFIG_XFS_DEBUG=y which may be a factor. The
> test ran fine on a box without the debug enabled and assert as
> soon as I added it back.

It is - the assert doesn't exist on a non-DEBUG kernel, so it won't
fail. ;)
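
i.e. roughly (a simplified sketch of the DEBUG-only macro, not the
exact header):

#ifdef DEBUG
#define ASSERT(expr)	\
	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
#else
#define ASSERT(expr)	((void)0)
#endif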

> The buffer with zero b_hold count is the freelist buffer (XAGF)
> for AG0. The buffer is marked STALE, it has already gone through
> the release code, so there is no transaction pointer nor log item
> pointer. The xlog_cil_committed() is being called with the
> XFS_LI_ABORTED flag.

It looks like another case of this problem:

http://oss.sgi.com/archives/xfs/2012-09/msg00021.html

Which appears to be another case of the failure that Eric reported
he was seeing on test 137:

http://oss.sgi.com/archives/xfs/2012-08/msg00019.html
http://oss.sgi.com/archives/xfs/2012-09/msg00017.html

i.e. what appears to be a double free of a buffer during a forced
shutdown. The b_hold assertion that is being hit here is just prior
to doing the second free of a buffer.  Given that Eric's case was
bisected down to the delwri queuing changes, I'd say this has the
same cause, i.e. it is unrelated to the patch set that was
posted.

What I can't work out is what reference is going missing, and I
can't reproduce it to be able to debug it....
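
For context, the check that fires sits in the buffer release path,
just before the hold count is dropped. A simplified sketch
(example_buf_release is an illustrative name, not the real release
function):

void
example_buf_release(
	struct xfs_buf	*bp)
{
	/* a reference dropped twice shows up here as a zero hold count */
	ASSERT(atomic_read(&bp->b_hold) > 0);

	if (atomic_dec_and_test(&bp->b_hold))
		xfs_buf_free(bp);	/* a second drop would free the buffer twice */
}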

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 03/13] xfs: rationalise xfs_mount_wq users
  2012-08-30 12:00 ` [PATCH 03/13] xfs: rationalise xfs_mount_wq users Dave Chinner
  2012-09-04 15:48   ` Mark Tinguely
@ 2012-09-11 21:25   ` Mark Tinguely
  1 sibling, 0 replies; 60+ messages in thread
From: Mark Tinguely @ 2012-09-11 21:25 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> Instead of starting and stopping background work on the xfs_mount_wq
> all at the same time, separate them to where they really are needed
> to start and stop.
>
> The xfs_sync_worker, only needs to be started after all the mount
> processing has completed successfully, while it needs to be stopped
> before the log is unmounted.
>
> The xfs_reclaim_worker is started on demand, and can be
> stopped before the unmount process does it's own inode reclaim pass.
>
> The xfs_flush_inodes work is run on demand, and so we really only
> need to ensure that it has stopped running before we start
> processing an unmount, freeze or remount,ro.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---

The patch itself looks good, and the "atomic_read(&bp->b_hold) > 0"
assert appears specific to my machines, so forgive the noise.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>


* Re: [PATCH V2 00/13] xfs: remove the xfssyncd mess
  2012-09-03  4:05   ` Dave Chinner
  2012-09-04  0:13     ` Mark Tinguely
@ 2012-09-25  9:26     ` Christoph Hellwig
  2012-09-25  9:35       ` Dave Chinner
  1 sibling, 1 reply; 60+ messages in thread
From: Christoph Hellwig @ 2012-09-25  9:26 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Mark Tinguely, xfs

Any chance to get a repost of this so that we can get it into 3.7?


* Re: [PATCH V2 00/13] xfs: remove the xfssyncd mess
  2012-09-25  9:26     ` Christoph Hellwig
@ 2012-09-25  9:35       ` Dave Chinner
  0 siblings, 0 replies; 60+ messages in thread
From: Dave Chinner @ 2012-09-25  9:35 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Mark Tinguely, xfs

On Tue, Sep 25, 2012 at 05:26:55AM -0400, Christoph Hellwig wrote:
> Any chance to get a repost of this so that we can get it into 3.7?

That's tomorrow's job.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com



Thread overview: 60+ messages
2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
2012-08-30 12:00 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
2012-09-01 23:15   ` Christoph Hellwig
2012-09-04 16:10   ` Mark Tinguely
2012-08-30 12:00 ` [PATCH 02/13] xfs: rename the xfs_syncd workqueue Dave Chinner
2012-09-01 23:17   ` Christoph Hellwig
2012-09-03  3:09     ` Dave Chinner
2012-08-30 12:00 ` [PATCH 03/13] xfs: rationalise xfs_mount_wq users Dave Chinner
2012-09-04 15:48   ` Mark Tinguely
2012-09-05  4:30     ` Dave Chinner
2012-09-05 13:16       ` Mark Tinguely
2012-09-05 14:34         ` Mark Tinguely
2012-09-06  0:46         ` Dave Chinner
2012-09-06 15:08           ` Mark Tinguely
2012-09-07  0:41             ` Dave Chinner
2012-09-11 21:25   ` Mark Tinguely
2012-08-30 12:00 ` [PATCH 04/13] xfs: don't run the sync work if the filesystem is read-only Dave Chinner
2012-09-04 16:13   ` Mark Tinguely
2012-08-30 12:00 ` [PATCH 05/13] xfs: sync work is now only periodic log work Dave Chinner
2012-09-01 23:23   ` Christoph Hellwig
2012-09-03  3:36     ` Dave Chinner
2012-09-04 16:14   ` Mark Tinguely
2012-09-04 18:57   ` Mark Tinguely
2012-09-05  4:35     ` Dave Chinner
2012-08-30 12:00 ` [PATCH 06/13] xfs: Bring some sanity to log unmounting Dave Chinner
2012-09-01 23:28   ` Christoph Hellwig
2012-09-04 19:11   ` Mark Tinguely
2012-08-30 12:00 ` [PATCH 07/13] xfs: xfs_sync_data is redundant Dave Chinner
2012-09-01 23:24   ` Christoph Hellwig
2012-09-03  6:08     ` Dave Chinner
2012-09-04 20:48   ` Mark Tinguely
2012-09-06  0:53     ` Dave Chinner
2012-08-30 12:00 ` [PATCH 08/13] xfs: xfs_sync_fsdata " Dave Chinner
2012-09-01 23:27   ` Christoph Hellwig
2012-09-04 20:59   ` Mark Tinguely
2012-08-30 12:00 ` [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c Dave Chinner
2012-09-01 23:27   ` Christoph Hellwig
2012-09-04 21:03   ` Mark Tinguely
2012-08-30 12:00 ` [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount Dave Chinner
2012-09-01 23:29   ` Christoph Hellwig
2012-09-04 21:04   ` Mark Tinguely
2012-08-30 12:00 ` [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch] Dave Chinner
2012-09-01 23:30   ` Christoph Hellwig
2012-09-04 21:06   ` Mark Tinguely
2012-08-30 12:00 ` [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c Dave Chinner
2012-09-01 23:30   ` Christoph Hellwig
2012-09-04 21:07   ` Mark Tinguely
2012-08-30 12:00 ` [PATCH 13/13] xfs: remove xfs_iget.c Dave Chinner
2012-09-01 23:31   ` Christoph Hellwig
2012-09-04 21:11   ` Mark Tinguely
2012-08-30 12:15 ` [PATCH V2 00/13] xfs: remove the xfssyncd mess Markus Trippelsdorf
2012-08-30 22:51   ` Dave Chinner
2012-08-31  6:18     ` Markus Trippelsdorf
2012-08-31  8:42       ` Dave Chinner
2012-08-31  9:30         ` Markus Trippelsdorf
2012-08-31 14:01 ` Mark Tinguely
2012-09-03  4:05   ` Dave Chinner
2012-09-04  0:13     ` Mark Tinguely
2012-09-25  9:26     ` Christoph Hellwig
2012-09-25  9:35       ` Dave Chinner
