All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] [XFS] Spurious ENOSPC fixes
@ 2009-03-15 10:57 Dave Chinner
  2009-03-15 10:57 ` [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing Dave Chinner
                   ` (5 more replies)
  0 siblings, 6 replies; 15+ messages in thread
From: Dave Chinner @ 2009-03-15 10:57 UTC (permalink / raw)
  To: xfs; +Cc: mpatocka

This series fixes the spurious ENOSPC problems reported by and
partially solved by Mikulas Patocka. This series makes the recently
posted QA tests 203 and 204 pass.

Mikulas, if I've got any of the attributions wrong for the
bits of your code I used, let me know and I'll fix them.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing
  2009-03-15 10:57 [PATCH 0/5] [XFS] Spurious ENOSPC fixes Dave Chinner
@ 2009-03-15 10:57 ` Dave Chinner
  2009-03-17 13:08   ` Mikulas Patocka
  2009-03-15 10:57 ` [PATCH 2/5] [XFS] Make inode flush at ENOSPC synchronous Dave Chinner
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 15+ messages in thread
From: Dave Chinner @ 2009-03-15 10:57 UTC (permalink / raw)
  To: xfs; +Cc: mpatocka

From: Dave Chinner <dgc@evostor.com>

Currently xfs_device_flush calls sync_blockdev() which is
a no-op for XFS as all its metadata is held in a different
address space to the one sync_blockdev() works on.

Call xfs_sync_inodes() instead to flush all the delayed
allocation blocks out. To do this as efficiently as possible,
do it via two passes - one to do an async flush of all the
dirty blocks and a second to wait for all the IO to complete.
This requires some modification to the xfs_sync_inodes_ag()
flush code to do this efficiently.

Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/linux-2.6/xfs_fs_subr.c |   14 ++++++------
 fs/xfs/linux-2.6/xfs_sync.c    |   43 +++++++++++++++++++++++----------------
 fs/xfs/linux-2.6/xfs_sync.h    |    7 +++--
 fs/xfs/xfs_iomap.c             |    2 +-
 fs/xfs/xfs_mount.h             |    2 +-
 5 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 5aeb777..08be36d 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -74,14 +74,14 @@ xfs_flush_pages(
 
 	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = filemap_fdatawrite(mapping);
-		if (flags & XFS_B_ASYNC)
-			return -ret;
-		ret2 = filemap_fdatawait(mapping);
-		if (!ret)
-			ret = ret2;
+		ret = -filemap_fdatawrite(mapping);
 	}
-	return -ret;
+	if (flags & XFS_B_ASYNC)
+		return ret;
+	ret2 = xfs_wait_on_pages(ip, first, last);
+	if (!ret)
+		ret = ret2;
+	return ret;
 }
 
 int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a608e72..88caafc 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -62,12 +62,6 @@ xfs_sync_inodes_ag(
 	uint32_t	first_index = 0;
 	int		error = 0;
 	int		last_error = 0;
-	int		fflag = XFS_B_ASYNC;
-
-	if (flags & SYNC_DELWRI)
-		fflag = XFS_B_DELWRI;
-	if (flags & SYNC_WAIT)
-		fflag = 0;		/* synchronous overrides all */
 
 	do {
 		struct inode	*inode;
@@ -128,11 +122,23 @@ xfs_sync_inodes_ag(
 		 * If we have to flush data or wait for I/O completion
 		 * we need to hold the iolock.
 		 */
-		if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
-			xfs_ilock(ip, XFS_IOLOCK_SHARED);
-			lock_flags |= XFS_IOLOCK_SHARED;
-			error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
-			if (flags & SYNC_IOWAIT)
+		if (flags & SYNC_DELWRI) {
+			if (VN_DIRTY(inode)) {
+				if (flags & SYNC_TRYLOCK) {
+					if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+						lock_flags |= XFS_IOLOCK_SHARED;
+				} else {
+					xfs_ilock(ip, XFS_IOLOCK_SHARED);
+					lock_flags |= XFS_IOLOCK_SHARED;
+				}
+				if (lock_flags & XFS_IOLOCK_SHARED) {
+					error = xfs_flush_pages(ip, 0, -1,
+							(flags & SYNC_WAIT) ? 0
+								: XFS_B_ASYNC,
+							FI_NONE);
+				}
+			}
+			if (VN_CACHED(inode) && (flags & SYNC_IOWAIT))
 				xfs_ioend_wait(ip);
 		}
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -400,9 +406,9 @@ xfs_syncd_queue_work(
 	void		*data,
 	void		(*syncer)(struct xfs_mount *, void *))
 {
-	struct bhv_vfs_sync_work *work;
+	struct xfs_sync_work *work;
 
-	work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
+	work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
 	INIT_LIST_HEAD(&work->w_list);
 	work->w_syncer = syncer;
 	work->w_data = data;
@@ -445,23 +451,24 @@ xfs_flush_inode(
  * (IOW, "If at first you don't succeed, use a Bigger Hammer").
  */
 STATIC void
-xfs_flush_device_work(
+xfs_flush_inodes_work(
 	struct xfs_mount *mp,
 	void		*arg)
 {
 	struct inode	*inode = arg;
-	sync_blockdev(mp->m_super->s_bdev);
+	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK);
+	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT);
 	iput(inode);
 }
 
 void
-xfs_flush_device(
+xfs_flush_inodes(
 	xfs_inode_t	*ip)
 {
 	struct inode	*inode = VFS_I(ip);
 
 	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
+	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work);
 	delay(msecs_to_jiffies(500));
 	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
@@ -497,7 +504,7 @@ xfssyncd(
 {
 	struct xfs_mount	*mp = arg;
 	long			timeleft;
-	bhv_vfs_sync_work_t	*work, *n;
+	xfs_sync_work_t		*work, *n;
 	LIST_HEAD		(tmp);
 
 	set_freezable();
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 04f058c..ec95e26 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -21,18 +21,19 @@
 struct xfs_mount;
 struct xfs_perag;
 
-typedef struct bhv_vfs_sync_work {
+typedef struct xfs_sync_work {
 	struct list_head	w_list;
 	struct xfs_mount	*w_mount;
 	void			*w_data;	/* syncer routine argument */
 	void			(*w_syncer)(struct xfs_mount *, void *);
-} bhv_vfs_sync_work_t;
+} xfs_sync_work_t;
 
 #define SYNC_ATTR		0x0001	/* sync attributes */
 #define SYNC_DELWRI		0x0002	/* look at delayed writes */
 #define SYNC_WAIT		0x0004	/* wait for i/o to complete */
 #define SYNC_BDFLUSH		0x0008	/* BDFLUSH is calling -- don't block */
 #define SYNC_IOWAIT		0x0010  /* wait for all I/O to complete */
+#define SYNC_TRYLOCK		0x0020  /* only try to lock inodes */
 
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
@@ -44,7 +45,7 @@ int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
 void xfs_flush_inode(struct xfs_inode *ip);
-void xfs_flush_device(struct xfs_inode *ip);
+void xfs_flush_inodes(struct xfs_inode *ip);
 
 int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
 int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 08ce723..8b97d82 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -361,7 +361,7 @@ xfs_flush_space(
 		return 0;
 	case 2:
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_flush_device(ip);
+		xfs_flush_inodes(ip);
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		*fsynced = 3;
 		return 0;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1438dd4..6df3a31 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -318,7 +318,7 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
 	struct task_struct	*m_sync_task;	/* generalised sync thread */
-	bhv_vfs_sync_work_t	m_sync_work;	/* work item for VFS_SYNC */
+	xfs_sync_work_t		m_sync_work;	/* work item for VFS_SYNC */
 	struct list_head	m_sync_list;	/* sync thread work item list */
 	spinlock_t		m_sync_lock;	/* work item list lock */
 	int			m_sync_seq;	/* sync thread generation no. */
-- 
1.6.2

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/5] [XFS] Make inode flush at ENOSPC synchronous
  2009-03-15 10:57 [PATCH 0/5] [XFS] Spurious ENOSPC fixes Dave Chinner
  2009-03-15 10:57 ` [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing Dave Chinner
@ 2009-03-15 10:57 ` Dave Chinner
  2009-03-15 10:57 ` [PATCH 3/5] [XFS] Block callers of xfs_flush_inodes() correctly Dave Chinner
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 15+ messages in thread
From: Dave Chinner @ 2009-03-15 10:57 UTC (permalink / raw)
  To: xfs; +Cc: mpatocka

From: Dave Chinner <dgc@evostor.com>

When we are writing to a single file and hit ENOSPC, we trigger a background
flush of the inode and try again.  Because we hold page locks and the iolock,
the flush won't proceed until after we release these locks. This occurs once
we've given up and ENOSPC has been reported. Hence if this one is the only
dirty inode in the system, we'll get an ENOSPC prematurely.

To fix this, remove the async flush from the allocation routines and move
it to the top of the write path where we can do a synchronous flush
and retry the write again. Only retry once as a second ENOSPC indicates
that we really are ENOSPC.

This avoids a page cache deadlock when trying to do this flush synchronously
in the allocation layer that was identified by Mikulas Patocka.

Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c  |   18 +++++++++++++++++-
 fs/xfs/linux-2.6/xfs_sync.c |   25 -------------------------
 fs/xfs/linux-2.6/xfs_sync.h |    1 -
 fs/xfs/xfs_iomap.c          |    2 +-
 4 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7e90daa..9142192 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -751,10 +751,26 @@ start:
 			goto relock;
 		}
 	} else {
+		int enospc = 0;
+		ssize_t ret2 = 0;
+
+write_retry:
 		xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
 				*offset, ioflags);
-		ret = generic_file_buffered_write(iocb, iovp, segs,
+		ret2 = generic_file_buffered_write(iocb, iovp, segs,
 				pos, offset, count, ret);
+		/*
+		 * if we just got an ENOSPC, flush the inode now we
+		 * aren't holding any page locks and retry *once*
+		 */
+		if (ret2 == -ENOSPC && !enospc) {
+			error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
+			if (error)
+				goto out_unlock_internal;
+			enospc = 1;
+			goto write_retry;
+		}
+		ret = ret2;
 	}
 
 	current->backing_dev_info = NULL;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 88caafc..73cf8dc 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -426,31 +426,6 @@ xfs_syncd_queue_work(
  * heads, looking about for more room...
  */
 STATIC void
-xfs_flush_inode_work(
-	struct xfs_mount *mp,
-	void		*arg)
-{
-	struct inode	*inode = arg;
-	filemap_flush(inode->i_mapping);
-	iput(inode);
-}
-
-void
-xfs_flush_inode(
-	xfs_inode_t	*ip)
-{
-	struct inode	*inode = VFS_I(ip);
-
-	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
-	delay(msecs_to_jiffies(500));
-}
-
-/*
- * This is the "bigger hammer" version of xfs_flush_inode_work...
- * (IOW, "If at first you don't succeed, use a Bigger Hammer").
- */
-STATIC void
 xfs_flush_inodes_work(
 	struct xfs_mount *mp,
 	void		*arg)
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index ec95e26..6e83a35 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -44,7 +44,6 @@ int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
-void xfs_flush_inode(struct xfs_inode *ip);
 void xfs_flush_inodes(struct xfs_inode *ip);
 
 int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8b97d82..7b8b170 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -347,7 +347,7 @@ xfs_flush_space(
 	case 0:
 		if (ip->i_delayed_blks) {
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			xfs_flush_inode(ip);
+			delay(1);
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
 			*fsynced = 1;
 		} else {
-- 
1.6.2

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 3/5] [XFS] Block callers of xfs_flush_inodes() correctly.
  2009-03-15 10:57 [PATCH 0/5] [XFS] Spurious ENOSPC fixes Dave Chinner
  2009-03-15 10:57 ` [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing Dave Chinner
  2009-03-15 10:57 ` [PATCH 2/5] [XFS] Make inode flush at ENOSPC synchronous Dave Chinner
@ 2009-03-15 10:57 ` Dave Chinner
  2009-03-15 10:57 ` [PATCH 4/5] [XFS] Flush delayed allocation blocks on ENOSPC in create Dave Chinner
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 15+ messages in thread
From: Dave Chinner @ 2009-03-15 10:57 UTC (permalink / raw)
  To: xfs; +Cc: mpatocka

From: Dave Chinner <dgc@evostor.com>

xfs_flush_inodes() currently uses a magic timeout to wait for
some inodes to be flushed before returning. This isn't
really reliable but used to be the best that could be done
due to deadlock potential of waiting for the entire flush.

Now the inode flush is safe to execute while we hold page
and inode locks, we can wait for all the inodes to flush
synchronously. Convert the wait mechanism to a completion
to do this efficiently. This should remove all remaining
spurious ENOSPC errors from the delayed allocation reservation
path.

This is extracted almost line for line from a larger patch
from Mikulas Patocka.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/linux-2.6/xfs_sync.c |   12 +++++++++---
 fs/xfs/linux-2.6/xfs_sync.h |    1 +
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 73cf8dc..f7ba766 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -404,7 +404,8 @@ STATIC void
 xfs_syncd_queue_work(
 	struct xfs_mount *mp,
 	void		*data,
-	void		(*syncer)(struct xfs_mount *, void *))
+	void		(*syncer)(struct xfs_mount *, void *),
+	struct completion *completion)
 {
 	struct xfs_sync_work *work;
 
@@ -413,6 +414,7 @@ xfs_syncd_queue_work(
 	work->w_syncer = syncer;
 	work->w_data = data;
 	work->w_mount = mp;
+	work->w_completion = completion;
 	spin_lock(&mp->m_sync_lock);
 	list_add_tail(&work->w_list, &mp->m_sync_list);
 	spin_unlock(&mp->m_sync_lock);
@@ -441,10 +443,11 @@ xfs_flush_inodes(
 	xfs_inode_t	*ip)
 {
 	struct inode	*inode = VFS_I(ip);
+	DECLARE_COMPLETION_ONSTACK(completion);
 
 	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work);
-	delay(msecs_to_jiffies(500));
+	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
+	wait_for_completion(&completion);
 	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
 
@@ -514,6 +517,8 @@ xfssyncd(
 			list_del(&work->w_list);
 			if (work == &mp->m_sync_work)
 				continue;
+			if (work->w_completion)
+				complete(work->w_completion);
 			kmem_free(work);
 		}
 	}
@@ -527,6 +532,7 @@ xfs_syncd_init(
 {
 	mp->m_sync_work.w_syncer = xfs_sync_worker;
 	mp->m_sync_work.w_mount = mp;
+	mp->m_sync_work.w_completion = NULL;
 	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
 	if (IS_ERR(mp->m_sync_task))
 		return -PTR_ERR(mp->m_sync_task);
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 6e83a35..308d5bf 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -26,6 +26,7 @@ typedef struct xfs_sync_work {
 	struct xfs_mount	*w_mount;
 	void			*w_data;	/* syncer routine argument */
 	void			(*w_syncer)(struct xfs_mount *, void *);
+	struct completion	*w_completion;
 } xfs_sync_work_t;
 
 #define SYNC_ATTR		0x0001	/* sync attributes */
-- 
1.6.2

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 4/5] [XFS] Flush delayed allocation blocks on ENOSPC in create
  2009-03-15 10:57 [PATCH 0/5] [XFS] Spurious ENOSPC fixes Dave Chinner
                   ` (2 preceding siblings ...)
  2009-03-15 10:57 ` [PATCH 3/5] [XFS] Block callers of xfs_flush_inodes() correctly Dave Chinner
@ 2009-03-15 10:57 ` Dave Chinner
  2009-03-15 10:57 ` [PATCH 5/5] [XFS] Remove xfs_flush_space Dave Chinner
  2009-03-15 11:11 ` [PATCH 0/5] [XFS] Spurious ENOSPC fixes Dave Chinner
  5 siblings, 0 replies; 15+ messages in thread
From: Dave Chinner @ 2009-03-15 10:57 UTC (permalink / raw)
  To: xfs; +Cc: mpatocka

From: Dave Chinner <dgc@evostor.com>

If we are creating lots of small files, we can fail to get
a reservation for inode create earlier than we should due to
EOF preallocation done during delayed allocation reservation.
Hence on the first reservation ENOSPC failure flush all the
delayed allocation blocks out of the system and retry.

This fixes the last commonly triggered spurious ENOSPC issue
that has been reported.

Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_vnodeops.c |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 59de049..faf671b 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1457,6 +1457,13 @@ xfs_create(
 	error = xfs_trans_reserve(tp, resblks, log_res, 0,
 			XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
+		/* flush outstanding delalloc blocks and retry */
+		xfs_flush_inodes(dp);
+		error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+	}
+	if (error == ENOSPC) {
+		/* No space at all so try a "no-allocation" reservation */
 		resblks = 0;
 		error = xfs_trans_reserve(tp, 0, log_res, 0,
 				XFS_TRANS_PERM_LOG_RES, log_count);
-- 
1.6.2

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 5/5] [XFS] Remove xfs_flush_space
  2009-03-15 10:57 [PATCH 0/5] [XFS] Spurious ENOSPC fixes Dave Chinner
                   ` (3 preceding siblings ...)
  2009-03-15 10:57 ` [PATCH 4/5] [XFS] Flush delayed allocation blocks on ENOSPC in create Dave Chinner
@ 2009-03-15 10:57 ` Dave Chinner
  2009-03-15 11:11 ` [PATCH 0/5] [XFS] Spurious ENOSPC fixes Dave Chinner
  5 siblings, 0 replies; 15+ messages in thread
From: Dave Chinner @ 2009-03-15 10:57 UTC (permalink / raw)
  To: xfs; +Cc: mpatocka

From: Dave Chinner <dgc@evostor.com>

The only thing we need to do now when we get an ENOSPC condition during delayed
allocation reservation is flush all the other inodes with delalloc blocks on
them and retry without EOF preallocation. Remove the unneeded mess that is
xfs_flush_space() and just call xfs_flush_inodes() directly from
xfs_iomap_write_delay().

Also, change the location of the retry label to avoid trying to do EOF
preallocation because we don't want to do that at ENOSPC. This enables us to
remove the BMAPI_SYNC flag as it is no longer used.

Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_iomap.c |   61 ++++++++++++---------------------------------------
 fs/xfs/xfs_iomap.h |    3 +-
 2 files changed, 16 insertions(+), 48 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7b8b170..5aaa2d7 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -338,38 +338,6 @@ xfs_iomap_eof_align_last_fsb(
 }
 
 STATIC int
-xfs_flush_space(
-	xfs_inode_t	*ip,
-	int		*fsynced,
-	int		*ioflags)
-{
-	switch (*fsynced) {
-	case 0:
-		if (ip->i_delayed_blks) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			delay(1);
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			*fsynced = 1;
-		} else {
-			*ioflags |= BMAPI_SYNC;
-			*fsynced = 2;
-		}
-		return 0;
-	case 1:
-		*fsynced = 2;
-		*ioflags |= BMAPI_SYNC;
-		return 0;
-	case 2:
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_flush_inodes(ip);
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		*fsynced = 3;
-		return 0;
-	}
-	return 1;
-}
-
-STATIC int
 xfs_cmn_err_fsblock_zero(
 	xfs_inode_t	*ip,
 	xfs_bmbt_irec_t	*imap)
@@ -538,15 +506,9 @@ error_out:
 }
 
 /*
- * If the caller is doing a write at the end of the file,
- * then extend the allocation out to the file system's write
- * iosize.  We clean up any extra space left over when the
- * file is closed in xfs_inactive().
- *
- * For sync writes, we are flushing delayed allocate space to
- * try to make additional space available for allocation near
- * the filesystem full boundary - preallocation hurts in that
- * situation, of course.
+ * If the caller is doing a write at the end of the file, then extend the
+ * allocation out to the file system's write iosize.  We clean up any extra
+ * space left over when the file is closed in xfs_inactive().
  */
 STATIC int
 xfs_iomap_eof_want_preallocate(
@@ -565,7 +527,7 @@ xfs_iomap_eof_want_preallocate(
 	int		n, error, imaps;
 
 	*prealloc = 0;
-	if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size)
+	if ((offset + count) <= ip->i_size)
 		return 0;
 
 	/*
@@ -611,7 +573,7 @@ xfs_iomap_write_delay(
 	xfs_extlen_t	extsz;
 	int		nimaps;
 	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-	int		prealloc, fsynced = 0;
+	int		prealloc, flushed = 0;
 	int		error;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -627,12 +589,12 @@ xfs_iomap_write_delay(
 	extsz = xfs_get_extsz_hint(ip);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-retry:
 	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
 				ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
 	if (error)
 		return error;
 
+retry:
 	if (prealloc) {
 		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
 		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
@@ -659,15 +621,22 @@ retry:
 
 	/*
 	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
-	 * then we must have run out of space - flush delalloc, and retry..
+	 * then we must have run out of space - flush all other inodes with
+	 * delalloc blocks and retry without EOF preallocation.
 	 */
 	if (nimaps == 0) {
 		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
 					ip, offset, count);
-		if (xfs_flush_space(ip, &fsynced, &ioflag))
+		if (flushed)
 			return XFS_ERROR(ENOSPC);
 
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_flush_inodes(ip);
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		flushed = 1;
 		error = 0;
+		prealloc = 0;
 		goto retry;
 	}
 
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index ee1a0c1..87fb9d1 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -40,8 +40,7 @@ typedef enum {
 	BMAPI_IGNSTATE = (1 << 4),	/* ignore unwritten state on read */
 	BMAPI_DIRECT = (1 << 5),	/* direct instead of buffered write */
 	BMAPI_MMAP = (1 << 6),		/* allocate for mmap write */
-	BMAPI_SYNC = (1 << 7),		/* sync write to flush delalloc space */
-	BMAPI_TRYLOCK = (1 << 8),	/* non-blocking request */
+	BMAPI_TRYLOCK = (1 << 7),	/* non-blocking request */
 } bmapi_flags_t;
 
 
-- 
1.6.2

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/5] [XFS] Spurious ENOSPC fixes
  2009-03-15 10:57 [PATCH 0/5] [XFS] Spurious ENOSPC fixes Dave Chinner
                   ` (4 preceding siblings ...)
  2009-03-15 10:57 ` [PATCH 5/5] [XFS] Remove xfs_flush_space Dave Chinner
@ 2009-03-15 11:11 ` Dave Chinner
  5 siblings, 0 replies; 15+ messages in thread
From: Dave Chinner @ 2009-03-15 11:11 UTC (permalink / raw)
  To: xfs; +Cc: mpatocka

On Sun, Mar 15, 2009 at 09:57:54PM +1100, Dave Chinner wrote:
> This series fixes the spurious ENOSPC problems reported by and
> partially solved by Mikulas Patocka. This series makes the recently
> posted QA tests 203 and 204 pass.
> 
> Mikulas, if I've got any of the attributions wrong for the
> bits of your code I used, let me know and I'll fix them.

I'm about to resend these with the correct sign-off
(i.e. david@fromorbit.com). Please ignore all the patches
with a different signed-off by.

[ I screwed up the .gitconfig earlier today when setting up a
new build/test box and didn't notice that the local commits
in the kernel tree had the wrong address... ]

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing
  2009-03-15 10:57 ` [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing Dave Chinner
@ 2009-03-17 13:08   ` Mikulas Patocka
  2009-03-18  4:17     ` Dave Chinner
  0 siblings, 1 reply; 15+ messages in thread
From: Mikulas Patocka @ 2009-03-17 13:08 UTC (permalink / raw)
  To: Dave Chinner, Christoph Hellwig; +Cc: xfs

On Sun, 15 Mar 2009, Dave Chinner wrote:

> From: Dave Chinner <dgc@evostor.com>
> 
> Currently xfs_device_flush calls sync_blockdev() which is
> a no-op for XFS as all its metadata is held in a different
> address space to the one sync_blockdev() works on.
> 
> Call xfs_sync_inodes() instead to flush all the delayed
> allocation blocks out. To do this as efficiently as possible,
> do it via two passes - one to do an async flush of all the
> dirty blocks and a second to wait for all the IO to complete.
> This requires some modification to the xfs_sync_inodes_ag()
> flush code to do this efficiently.
> 
> Signed-off-by: Dave Chinner <david@fromorbit.com>
> ---
>  fs/xfs/linux-2.6/xfs_fs_subr.c |   14 ++++++------
>  fs/xfs/linux-2.6/xfs_sync.c    |   43 +++++++++++++++++++++++----------------
>  fs/xfs/linux-2.6/xfs_sync.h    |    7 +++--
>  fs/xfs/xfs_iomap.c             |    2 +-
>  fs/xfs/xfs_mount.h             |    2 +-
>  5 files changed, 38 insertions(+), 30 deletions(-)
> 
> diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
> index 5aeb777..08be36d 100644
> --- a/fs/xfs/linux-2.6/xfs_fs_subr.c
> +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
> @@ -74,14 +74,14 @@ xfs_flush_pages(
>  
>  	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
>  		xfs_iflags_clear(ip, XFS_ITRUNCATED);
> -		ret = filemap_fdatawrite(mapping);
> -		if (flags & XFS_B_ASYNC)
> -			return -ret;
> -		ret2 = filemap_fdatawait(mapping);
> -		if (!ret)
> -			ret = ret2;
> +		ret = -filemap_fdatawrite(mapping);
>  	}
> -	return -ret;
> +	if (flags & XFS_B_ASYNC)
> +		return ret;
> +	ret2 = xfs_wait_on_pages(ip, first, last);
> +	if (!ret)
> +		ret = ret2;
> +	return ret;
>  }
>  
>  int
> diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
> index a608e72..88caafc 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.c
> +++ b/fs/xfs/linux-2.6/xfs_sync.c
> @@ -62,12 +62,6 @@ xfs_sync_inodes_ag(
>  	uint32_t	first_index = 0;
>  	int		error = 0;
>  	int		last_error = 0;
> -	int		fflag = XFS_B_ASYNC;
> -
> -	if (flags & SYNC_DELWRI)
> -		fflag = XFS_B_DELWRI;
> -	if (flags & SYNC_WAIT)
> -		fflag = 0;		/* synchronous overrides all */
>  
>  	do {
>  		struct inode	*inode;
> @@ -128,11 +122,23 @@ xfs_sync_inodes_ag(
>  		 * If we have to flush data or wait for I/O completion
>  		 * we need to hold the iolock.
>  		 */
> -		if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
> -			xfs_ilock(ip, XFS_IOLOCK_SHARED);
> -			lock_flags |= XFS_IOLOCK_SHARED;
> -			error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
> -			if (flags & SYNC_IOWAIT)
> +		if (flags & SYNC_DELWRI) {
> +			if (VN_DIRTY(inode)) {
> +				if (flags & SYNC_TRYLOCK) {
> +					if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
> +						lock_flags |= XFS_IOLOCK_SHARED;
> +				} else {
> +					xfs_ilock(ip, XFS_IOLOCK_SHARED);
> +					lock_flags |= XFS_IOLOCK_SHARED;
> +				}
> +				if (lock_flags & XFS_IOLOCK_SHARED) {
> +					error = xfs_flush_pages(ip, 0, -1,
> +							(flags & SYNC_WAIT) ? 0
> +								: XFS_B_ASYNC,
> +							FI_NONE);
> +				}
> +			}
> +			if (VN_CACHED(inode) && (flags & SYNC_IOWAIT))
>  				xfs_ioend_wait(ip);
>  		}
>  		xfs_ilock(ip, XFS_ILOCK_SHARED);
> @@ -400,9 +406,9 @@ xfs_syncd_queue_work(
>  	void		*data,
>  	void		(*syncer)(struct xfs_mount *, void *))
>  {
> -	struct bhv_vfs_sync_work *work;
> +	struct xfs_sync_work *work;
>  
> -	work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
> +	work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
>  	INIT_LIST_HEAD(&work->w_list);
>  	work->w_syncer = syncer;
>  	work->w_data = data;
> @@ -445,23 +451,24 @@ xfs_flush_inode(
>   * (IOW, "If at first you don't succeed, use a Bigger Hammer").
>   */
>  STATIC void
> -xfs_flush_device_work(
> +xfs_flush_inodes_work(
>  	struct xfs_mount *mp,
>  	void		*arg)
>  {
>  	struct inode	*inode = arg;
> -	sync_blockdev(mp->m_super->s_bdev);
> +	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK);
> +	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT);
>  	iput(inode);
>  }

I tested the code and it works fine.

But I'm somehow not convinced that the TRYLOCK is right ... that it will
avoid the pagelock deadlock that I found before.

What is the ordering of pagelock, ILOCK and IOLOCK? If you keep the 
ordering right, you don't need trylock. And if the ordering is wrong, 
trylock will only lower the probability of a deadlock, not avoid it 
completely.

Mikulas

>  void
> -xfs_flush_device(
> +xfs_flush_inodes(
>  	xfs_inode_t	*ip)
>  {
>  	struct inode	*inode = VFS_I(ip);
>  
>  	igrab(inode);
> -	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
> +	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work);
>  	delay(msecs_to_jiffies(500));
>  	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
>  }
> @@ -497,7 +504,7 @@ xfssyncd(
>  {
>  	struct xfs_mount	*mp = arg;
>  	long			timeleft;
> -	bhv_vfs_sync_work_t	*work, *n;
> +	xfs_sync_work_t		*work, *n;
>  	LIST_HEAD		(tmp);
>  
>  	set_freezable();
> diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
> index 04f058c..ec95e26 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.h
> +++ b/fs/xfs/linux-2.6/xfs_sync.h
> @@ -21,18 +21,19 @@
>  struct xfs_mount;
>  struct xfs_perag;
>  
> -typedef struct bhv_vfs_sync_work {
> +typedef struct xfs_sync_work {
>  	struct list_head	w_list;
>  	struct xfs_mount	*w_mount;
>  	void			*w_data;	/* syncer routine argument */
>  	void			(*w_syncer)(struct xfs_mount *, void *);
> -} bhv_vfs_sync_work_t;
> +} xfs_sync_work_t;
>  
>  #define SYNC_ATTR		0x0001	/* sync attributes */
>  #define SYNC_DELWRI		0x0002	/* look at delayed writes */
>  #define SYNC_WAIT		0x0004	/* wait for i/o to complete */
>  #define SYNC_BDFLUSH		0x0008	/* BDFLUSH is calling -- don't block */
>  #define SYNC_IOWAIT		0x0010  /* wait for all I/O to complete */
> +#define SYNC_TRYLOCK		0x0020  /* only try to lock inodes */
>  
>  int xfs_syncd_init(struct xfs_mount *mp);
>  void xfs_syncd_stop(struct xfs_mount *mp);
> @@ -44,7 +45,7 @@ int xfs_quiesce_data(struct xfs_mount *mp);
>  void xfs_quiesce_attr(struct xfs_mount *mp);
>  
>  void xfs_flush_inode(struct xfs_inode *ip);
> -void xfs_flush_device(struct xfs_inode *ip);
> +void xfs_flush_inodes(struct xfs_inode *ip);
>  
>  int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
>  int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 08ce723..8b97d82 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -361,7 +361,7 @@ xfs_flush_space(
>  		return 0;
>  	case 2:
>  		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -		xfs_flush_device(ip);
> +		xfs_flush_inodes(ip);
>  		xfs_ilock(ip, XFS_ILOCK_EXCL);
>  		*fsynced = 3;
>  		return 0;
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 1438dd4..6df3a31 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -318,7 +318,7 @@ typedef struct xfs_mount {
>  #endif
>  	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
>  	struct task_struct	*m_sync_task;	/* generalised sync thread */
> -	bhv_vfs_sync_work_t	m_sync_work;	/* work item for VFS_SYNC */
> +	xfs_sync_work_t		m_sync_work;	/* work item for VFS_SYNC */
>  	struct list_head	m_sync_list;	/* sync thread work item list */
>  	spinlock_t		m_sync_lock;	/* work item list lock */
>  	int			m_sync_seq;	/* sync thread generation no. */
> -- 
> 1.6.2
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing
  2009-03-17 13:08   ` Mikulas Patocka
@ 2009-03-18  4:17     ` Dave Chinner
  2009-03-18 16:14       ` Mikulas Patocka
  0 siblings, 1 reply; 15+ messages in thread
From: Dave Chinner @ 2009-03-18  4:17 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Christoph Hellwig, xfs

On Tue, Mar 17, 2009 at 09:08:36AM -0400, Mikulas Patocka wrote:
> On Sun, 15 Mar 2009, Dave Chinner wrote:
> 
> > From: Dave Chinner <dgc@evostor.com>
> > 
> > Currently xfs_device_flush calls sync_blockdev() which is
> > a no-op for XFS as all its metadata is held in a different
> > address space to the one sync_blockdev() works on.
> > 
> > Call xfs_sync_inodes() instead to flush all the delayed
> > allocation blocks out. To do this as efficiently as possible,
> > do it via two passes - one to do an async flush of all the
> > dirty blocks and a second to wait for all the IO to complete.
> > This requires some modification to the xfs_sync_inodes_ag()
> > flush code to do this efficiently.

.....

> I tested the code and it works fine.
> 
> But I'm not somehow convinced that that TRYLOCK is right ... that it will 
> avoid that pagelock-deadlock that I found before.

Care to expand on why?

> What is the ordering of pagelock, ILOCK and IOLOCK?

iolock, pagelock, and ilock is always the innermost of the three.

> If you keep the 
> ordering right, you don't need trylock.

In this case, we are doing:

	iolock exclusive
	pagelock
	<flush>

Clearly we can't take the iolock during <flush> (nor any other
several other places in XFS that might try to take the iolock), so
trylock-and-fail is the usual method used in XFS for avoiding a
deadlock in this scenario.

> And if the ordering is wrong, 
> trylock will only lower the probability of a deadlock, not avoid it 
> completely.

Details of why you think this? XFS uses trylocks extensively to
avoid deadlocks in these sorts of situations, so I'm curious to
know why you think this technique cannot provide an absolute
guarantee in this case....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing
  2009-03-18  4:17     ` Dave Chinner
@ 2009-03-18 16:14       ` Mikulas Patocka
  2009-03-20  4:30         ` Dave Chinner
  0 siblings, 1 reply; 15+ messages in thread
From: Mikulas Patocka @ 2009-03-18 16:14 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, xfs



On Wed, 18 Mar 2009, Dave Chinner wrote:

> On Tue, Mar 17, 2009 at 09:08:36AM -0400, Mikulas Patocka wrote:
> > On Sun, 15 Mar 2009, Dave Chinner wrote:
> > 
> > > From: Dave Chinner <dgc@evostor.com>
> > > 
> > > Currently xfs_device_flush calls sync_blockdev() which is
> > > a no-op for XFS as all its metadata is held in a different
> > > address space to the one sync_blockdev() works on.
> > > 
> > > Call xfs_sync_inodes() instead to flush all the delayed
> > > allocation blocks out. To do this as efficiently as possible,
> > > do it via two passes - one to do an async flush of all the
> > > dirty blocks and a second to wait for all the IO to complete.
> > > This requires some modification to the xfs_sync_inodes_ag()
> > > flush code to do this efficiently.
> 
> .....
> 
> > I tested the code and it works fine.
> > 
> > But I'm not somehow convinced that that TRYLOCK is right ... that it will 
> > avoid that pagelock-deadlock that I found before.
> 
> Care to expand on why?
> 
> > What is the ordering of pagelock, ILOCK and IOLOCK?
> 
> iolock, pagelock, and ilock is always the innermost of the three.
> 
> > If you keep the 
> > ordering right, you don't need trylock.
> 
> In this case, we are doing:
> 
> 	iolock exclusive
> 	pagelock
> 	<flush>
> 
> Clearly we can't take the iolock during <flush> (nor any other
> several other places in XFS that might try to take the iolock), so
> trylock-and-fail is the usual method used in XFS for avoiding a
> deadlock in this scenario.
> 
> > And if the ordering is wrong, 
> > trylock will only lower the probability of a deadlock, not avoid it 
> > completely.
> 
> Details of why you think this? XFS uses trylocks extensively to
> avoid deadlocks in these sorts of situations, so I'm curious to
> know why you think this technique cannot provide an absolute
> guarantee in this case....
> 
> Cheers,
> 
> Dave.

What I find scary is those two recursive pagelocks being held without 
trylock. The sequence is like:

lock iolock 1
lock pagelock 1
--- submit request to xfssyncd, that does
trylock iolock 2
lock pagelock 2

... now suppose that this is racing with another process that takes 
pagelock without taking iolock first (memory manager trying to flush files 
mmaped for write or so). It can do

lock pagelock 2
... and submit flush request to a thread that is actually waiting to get 
pagelock 2.

--- I don't know --- is there a possibility to reserve filesystem space 
for write-mapped files at the time of the first page fault? (so that the 
space won't be allocated at the time of a flush?).

Mikulas

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing
  2009-03-18 16:14       ` Mikulas Patocka
@ 2009-03-20  4:30         ` Dave Chinner
  2009-03-25 15:19           ` Mikulas Patocka
  0 siblings, 1 reply; 15+ messages in thread
From: Dave Chinner @ 2009-03-20  4:30 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Christoph Hellwig, xfs

On Wed, Mar 18, 2009 at 12:14:51PM -0400, Mikulas Patocka wrote:
> On Wed, 18 Mar 2009, Dave Chinner wrote:
> > On Tue, Mar 17, 2009 at 09:08:36AM -0400, Mikulas Patocka wrote:
> > > On Sun, 15 Mar 2009, Dave Chinner wrote:
> What I find scary is those two recursive pagelocks being held without 
> trylock. The sequence is like:
> 
> lock iolock 1
> lock pagelock 1
> --- submit request to xfssyncd, that does
> trylock iolock 2
> lock pagelock 2

Those two pages will be on different inodes, so locking through all
paths to pagelock 2 except for page writeback will be protected by the iolocks...

> ... now suppose that this is racing with another process that takes 
> pagelock without taking iolock first (memory manager trying to flush files 
> mmaped for write or so). It can do
> 
> lock pagelock 2
> ... and submit flush request to a thread that is actually waiting to get 
> pagelock 2.

Which, AFAIK, is never done in XFS. Once we have a page locked in
the writeback path we either dispatch it in an IO or unlock it
directly before continuing. There should not be a case like you
describe occurring (it is a bug if that does happen).

> --- I don't know --- is there a possibility to reserve filesystem space 
> for write-mapped files at the time of the first page fault? (so that the 
> space won't be allocated at the time of a flush?).

->page_mkwrite

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing
  2009-03-20  4:30         ` Dave Chinner
@ 2009-03-25 15:19           ` Mikulas Patocka
  0 siblings, 0 replies; 15+ messages in thread
From: Mikulas Patocka @ 2009-03-25 15:19 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, xfs



On Fri, 20 Mar 2009, Dave Chinner wrote:

> On Wed, Mar 18, 2009 at 12:14:51PM -0400, Mikulas Patocka wrote:
> > On Wed, 18 Mar 2009, Dave Chinner wrote:
> > > On Tue, Mar 17, 2009 at 09:08:36AM -0400, Mikulas Patocka wrote:
> > > > On Sun, 15 Mar 2009, Dave Chinner wrote:
> > What I find scary is those two recursive pagelocks being held without 
> > trylock. The sequence is like:
> > 
> > lock iolock 1
> > lock pagelock 1
> > --- submit request to xfssyncd, that does
> > trylock iolock 2
> > lock pagelock 2
> 
> Those two pages will be on different inodes, so locking through all
> paths to pagelock 2 except for page writeback will be protected by the iolocks...
> 
> > ... now suppose that this is racing with another process that takes 
> > pagelock without taking iolock first (memory manager trying to flush files 
> > mmaped for write or so). It can do
> > 
> > lock pagelock 2
> > ... and submit flush request to a thread that is actually waiting to get 
> > pagelock 2.
> 
> Which, AFAIK, is never done in XFS. Once we have a page locked in
> the writeback path we either dispatch it in an IO or unlock it
> directly before continuing. There should not be a case like you
> describe occurring (it is a bug if that does happen).
> 
> > --- I don't know --- is there a possibility to reserve filesystem space 
> > for write-mapped files at the time of the first page fault? (so that the 
> > space won't be allocated at the time of a flush?).
> 
> ->page_mkwrite

This is called without page lock so it is ok.

So I think there is no deadlock possibility.

But I still say that it is scary to take two pagelocks recursively and 
there is risk that someone will forget about these specific requirements 
after few years and make a bug.

Mikulas

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing
  2009-03-16  9:08   ` Christoph Hellwig
@ 2009-03-16 10:45     ` Dave Chinner
  0 siblings, 0 replies; 15+ messages in thread
From: Dave Chinner @ 2009-03-16 10:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: mpatocka, xfs

On Mon, Mar 16, 2009 at 05:08:47AM -0400, Christoph Hellwig wrote:
> On Sun, Mar 15, 2009 at 10:31:43PM +1100, Dave Chinner wrote:
> > index 5aeb777..08be36d 100644
> > --- a/fs/xfs/linux-2.6/xfs_fs_subr.c
> > +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
> > @@ -74,14 +74,14 @@ xfs_flush_pages(
> >  
> >  	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> >  		xfs_iflags_clear(ip, XFS_ITRUNCATED);
> > -		ret = filemap_fdatawrite(mapping);
> > -		if (flags & XFS_B_ASYNC)
> > -			return -ret;
> > -		ret2 = filemap_fdatawait(mapping);
> > -		if (!ret)
> > -			ret = ret2;
> > +		ret = -filemap_fdatawrite(mapping);
> >  	}
> > -	return -ret;
> > +	if (flags & XFS_B_ASYNC)
> > +		return ret;
> > +	ret2 = xfs_wait_on_pages(ip, first, last);
> > +	if (!ret)
> > +		ret = ret2;
> > +	return ret;
> >  }
> 
> How does this belong into this patch series?

So xfs_flush_pages waits on writeback pages which aren't caught by
mapping_tagged(mapping, PAGECACHE_TAG_DIRTY). If we are waiting for
a flush to complete, we should be waiting for all pages under IO to
complete regardless of whether there are dirty pages or not. That
matters if we are doing 2 passes to flush everything and then wait;
the wait won't see dirty mappings, but there might still be pages
under IO....

> Also I think the sync code should just use filemap_fdatawrite and
> filemap_fdatawait directly.  It's at a high enough level that we don't
> need all these obfuscations.

OK. I'll rework it so that:

> > +		if (flags & SYNC_DELWRI) {
> > +			if (VN_DIRTY(inode)) {
> > +				if (flags & SYNC_TRYLOCK) {
> > +					if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
> > +						lock_flags |= XFS_IOLOCK_SHARED;
> > +				} else {
> > +					xfs_ilock(ip, XFS_IOLOCK_SHARED);
> > +					lock_flags |= XFS_IOLOCK_SHARED;
> > +				}
> 
> > +				if (lock_flags & XFS_IOLOCK_SHARED) {
> 
> Too long line and pretty ugly use of the lock_flags variable, but given
> that it gets sorted out in the sync series I think we can leave it that way.

This gets done early and the later patches become simpler.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing
  2009-03-15 11:31 ` [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing Dave Chinner
@ 2009-03-16  9:08   ` Christoph Hellwig
  2009-03-16 10:45     ` Dave Chinner
  0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2009-03-16  9:08 UTC (permalink / raw)
  To: Dave Chinner; +Cc: mpatocka, xfs

On Sun, Mar 15, 2009 at 10:31:43PM +1100, Dave Chinner wrote:
> index 5aeb777..08be36d 100644
> --- a/fs/xfs/linux-2.6/xfs_fs_subr.c
> +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
> @@ -74,14 +74,14 @@ xfs_flush_pages(
>  
>  	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
>  		xfs_iflags_clear(ip, XFS_ITRUNCATED);
> -		ret = filemap_fdatawrite(mapping);
> -		if (flags & XFS_B_ASYNC)
> -			return -ret;
> -		ret2 = filemap_fdatawait(mapping);
> -		if (!ret)
> -			ret = ret2;
> +		ret = -filemap_fdatawrite(mapping);
>  	}
> -	return -ret;
> +	if (flags & XFS_B_ASYNC)
> +		return ret;
> +	ret2 = xfs_wait_on_pages(ip, first, last);
> +	if (!ret)
> +		ret = ret2;
> +	return ret;
>  }

How does this belong into this patch series?

Also I think the sync code should just use filemap_fdatawrite and
filemap_fdatawait directly.  It's at a high enough level that we don't
need all these obfuscations.

> +		if (flags & SYNC_DELWRI) {
> +			if (VN_DIRTY(inode)) {
> +				if (flags & SYNC_TRYLOCK) {
> +					if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
> +						lock_flags |= XFS_IOLOCK_SHARED;
> +				} else {
> +					xfs_ilock(ip, XFS_IOLOCK_SHARED);
> +					lock_flags |= XFS_IOLOCK_SHARED;
> +				}

> +				if (lock_flags & XFS_IOLOCK_SHARED) {

Too long line and pretty ugly use of the lock_flags variable, but given
that it gets sorted out in the sync series I think we can leave it that way.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing
  2009-03-15 11:31 [PATCH 0/5, RESEND] " Dave Chinner
@ 2009-03-15 11:31 ` Dave Chinner
  2009-03-16  9:08   ` Christoph Hellwig
  0 siblings, 1 reply; 15+ messages in thread
From: Dave Chinner @ 2009-03-15 11:31 UTC (permalink / raw)
  To: xfs; +Cc: mpatocka

Currently xfs_device_flush calls sync_blockdev() which is
a no-op for XFS as all its metadata is held in a different
address space to the one sync_blockdev() works on.

Call xfs_sync_inodes() instead to flush all the delayed
allocation blocks out. To do this as efficiently as possible,
do it via two passes - one to do an async flush of all the
dirty blocks and a second to wait for all the IO to complete.
This requires some modification to the xfs_sync_inodes_ag()
flush code to do this efficiently.

Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/linux-2.6/xfs_fs_subr.c |   14 ++++++------
 fs/xfs/linux-2.6/xfs_sync.c    |   43 +++++++++++++++++++++++----------------
 fs/xfs/linux-2.6/xfs_sync.h    |    7 +++--
 fs/xfs/xfs_iomap.c             |    2 +-
 fs/xfs/xfs_mount.h             |    2 +-
 5 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 5aeb777..08be36d 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -74,14 +74,14 @@ xfs_flush_pages(
 
 	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = filemap_fdatawrite(mapping);
-		if (flags & XFS_B_ASYNC)
-			return -ret;
-		ret2 = filemap_fdatawait(mapping);
-		if (!ret)
-			ret = ret2;
+		ret = -filemap_fdatawrite(mapping);
 	}
-	return -ret;
+	if (flags & XFS_B_ASYNC)
+		return ret;
+	ret2 = xfs_wait_on_pages(ip, first, last);
+	if (!ret)
+		ret = ret2;
+	return ret;
 }
 
 int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a608e72..88caafc 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -62,12 +62,6 @@ xfs_sync_inodes_ag(
 	uint32_t	first_index = 0;
 	int		error = 0;
 	int		last_error = 0;
-	int		fflag = XFS_B_ASYNC;
-
-	if (flags & SYNC_DELWRI)
-		fflag = XFS_B_DELWRI;
-	if (flags & SYNC_WAIT)
-		fflag = 0;		/* synchronous overrides all */
 
 	do {
 		struct inode	*inode;
@@ -128,11 +122,23 @@ xfs_sync_inodes_ag(
 		 * If we have to flush data or wait for I/O completion
 		 * we need to hold the iolock.
 		 */
-		if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
-			xfs_ilock(ip, XFS_IOLOCK_SHARED);
-			lock_flags |= XFS_IOLOCK_SHARED;
-			error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
-			if (flags & SYNC_IOWAIT)
+		if (flags & SYNC_DELWRI) {
+			if (VN_DIRTY(inode)) {
+				if (flags & SYNC_TRYLOCK) {
+					if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+						lock_flags |= XFS_IOLOCK_SHARED;
+				} else {
+					xfs_ilock(ip, XFS_IOLOCK_SHARED);
+					lock_flags |= XFS_IOLOCK_SHARED;
+				}
+				if (lock_flags & XFS_IOLOCK_SHARED) {
+					error = xfs_flush_pages(ip, 0, -1,
+							(flags & SYNC_WAIT) ? 0
+								: XFS_B_ASYNC,
+							FI_NONE);
+				}
+			}
+			if (VN_CACHED(inode) && (flags & SYNC_IOWAIT))
 				xfs_ioend_wait(ip);
 		}
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -400,9 +406,9 @@ xfs_syncd_queue_work(
 	void		*data,
 	void		(*syncer)(struct xfs_mount *, void *))
 {
-	struct bhv_vfs_sync_work *work;
+	struct xfs_sync_work *work;
 
-	work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
+	work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
 	INIT_LIST_HEAD(&work->w_list);
 	work->w_syncer = syncer;
 	work->w_data = data;
@@ -445,23 +451,24 @@ xfs_flush_inode(
  * (IOW, "If at first you don't succeed, use a Bigger Hammer").
  */
 STATIC void
-xfs_flush_device_work(
+xfs_flush_inodes_work(
 	struct xfs_mount *mp,
 	void		*arg)
 {
 	struct inode	*inode = arg;
-	sync_blockdev(mp->m_super->s_bdev);
+	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK);
+	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT);
 	iput(inode);
 }
 
 void
-xfs_flush_device(
+xfs_flush_inodes(
 	xfs_inode_t	*ip)
 {
 	struct inode	*inode = VFS_I(ip);
 
 	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
+	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work);
 	delay(msecs_to_jiffies(500));
 	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
@@ -497,7 +504,7 @@ xfssyncd(
 {
 	struct xfs_mount	*mp = arg;
 	long			timeleft;
-	bhv_vfs_sync_work_t	*work, *n;
+	xfs_sync_work_t		*work, *n;
 	LIST_HEAD		(tmp);
 
 	set_freezable();
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 04f058c..ec95e26 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -21,18 +21,19 @@
 struct xfs_mount;
 struct xfs_perag;
 
-typedef struct bhv_vfs_sync_work {
+typedef struct xfs_sync_work {
 	struct list_head	w_list;
 	struct xfs_mount	*w_mount;
 	void			*w_data;	/* syncer routine argument */
 	void			(*w_syncer)(struct xfs_mount *, void *);
-} bhv_vfs_sync_work_t;
+} xfs_sync_work_t;
 
 #define SYNC_ATTR		0x0001	/* sync attributes */
 #define SYNC_DELWRI		0x0002	/* look at delayed writes */
 #define SYNC_WAIT		0x0004	/* wait for i/o to complete */
 #define SYNC_BDFLUSH		0x0008	/* BDFLUSH is calling -- don't block */
 #define SYNC_IOWAIT		0x0010  /* wait for all I/O to complete */
+#define SYNC_TRYLOCK		0x0020  /* only try to lock inodes */
 
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
@@ -44,7 +45,7 @@ int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
 void xfs_flush_inode(struct xfs_inode *ip);
-void xfs_flush_device(struct xfs_inode *ip);
+void xfs_flush_inodes(struct xfs_inode *ip);
 
 int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
 int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 08ce723..8b97d82 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -361,7 +361,7 @@ xfs_flush_space(
 		return 0;
 	case 2:
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_flush_device(ip);
+		xfs_flush_inodes(ip);
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		*fsynced = 3;
 		return 0;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1438dd4..6df3a31 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -318,7 +318,7 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
 	struct task_struct	*m_sync_task;	/* generalised sync thread */
-	bhv_vfs_sync_work_t	m_sync_work;	/* work item for VFS_SYNC */
+	xfs_sync_work_t		m_sync_work;	/* work item for VFS_SYNC */
 	struct list_head	m_sync_list;	/* sync thread work item list */
 	spinlock_t		m_sync_lock;	/* work item list lock */
 	int			m_sync_seq;	/* sync thread generation no. */
-- 
1.6.2

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2009-03-25 15:20 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-15 10:57 [PATCH 0/5] [XFS] Spurious ENOSPC fixes Dave Chinner
2009-03-15 10:57 ` [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing Dave Chinner
2009-03-17 13:08   ` Mikulas Patocka
2009-03-18  4:17     ` Dave Chinner
2009-03-18 16:14       ` Mikulas Patocka
2009-03-20  4:30         ` Dave Chinner
2009-03-25 15:19           ` Mikulas Patocka
2009-03-15 10:57 ` [PATCH 2/5] [XFS] Make inode flush at ENOSPC synchronous Dave Chinner
2009-03-15 10:57 ` [PATCH 3/5] [XFS] Block callers of xfs_flush_inodes() correctly Dave Chinner
2009-03-15 10:57 ` [PATCH 4/5] [XFS] Flush delayed allocation blocks on ENOSPC in create Dave Chinner
2009-03-15 10:57 ` [PATCH 5/5] [XFS] Remove xfs_flush_space Dave Chinner
2009-03-15 11:11 ` [PATCH 0/5] [XFS] Spurious ENOSPC fixes Dave Chinner
2009-03-15 11:31 [PATCH 0/5, RESEND] " Dave Chinner
2009-03-15 11:31 ` [PATCH 1/5] [XFS] Use xfs_sync_inodes() for device flushing Dave Chinner
2009-03-16  9:08   ` Christoph Hellwig
2009-03-16 10:45     ` Dave Chinner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.