All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] xfs: miscellaneous fixes
@ 2017-10-08 23:54 Dave Chinner
  2017-10-08 23:54 ` [PATCH 1/4] xfs: Don't log uninitialised fields in inode structures Dave Chinner
                   ` (4 more replies)
  0 siblings, 5 replies; 19+ messages in thread
From: Dave Chinner @ 2017-10-08 23:54 UTC (permalink / raw)
  To: linux-xfs

Darrick,

These are fixes that have been sitting in my test tree for a while,
so I thought I better get them cleaned out and up into the main
tree. They all address recent reported problems on the list - the RT
is not directly related to a user bug report but is something I
did while looking at the recent CVE issue.

Please consider for merge.

-Dave.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 1/4] xfs: Don't log uninitialised fields in inode structures
  2017-10-08 23:54 [PATCH 0/4] xfs: miscellaneous fixes Dave Chinner
@ 2017-10-08 23:54 ` Dave Chinner
  2017-10-09 14:24   ` Brian Foster
  2017-10-08 23:54 ` [PATCH 2/4] xfs: move more RT specific code under CONFIG_XFS_RT Dave Chinner
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 19+ messages in thread
From: Dave Chinner @ 2017-10-08 23:54 UTC (permalink / raw)
  To: linux-xfs

From: Dave Chinner <dchinner@redhat.com>

Prevent kmemcheck from throwing warnings about reading uninitialised
memory when formatting inodes into the incore log buffer. There are
several issues here - we don't always log all the fields in the
inode log format item, and we never log the inode the
di_next_unlinked field.

In the case of the inode log format item, this is exacerbated
by the old xfs_inode_log_format structure padding issue. Hence make
the padded, 64 bit aligned version of the structure the one we always
use for formatting the log and get rid of the 64 bit variant. This
means we'll always log the 64-bit version and so recovery only needs
to convert from the unpadded 32 bit version from older 32 bit
kernels.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 fs/xfs/libxfs/xfs_log_format.h | 27 +++++----------
 fs/xfs/xfs_inode_item.c        | 79 ++++++++++++++++++++++--------------------
 fs/xfs/xfs_ondisk.h            |  2 +-
 3 files changed, 50 insertions(+), 58 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 8372e9bcd7b6..71de185735e0 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -270,6 +270,7 @@ typedef struct xfs_inode_log_format {
 	uint32_t		ilf_fields;	/* flags for fields logged */
 	uint16_t		ilf_asize;	/* size of attr d/ext/root */
 	uint16_t		ilf_dsize;	/* size of data/ext/root */
+	uint32_t		ilf_pad;	/* pad for 64 bit boundary */
 	uint64_t		ilf_ino;	/* inode number */
 	union {
 		uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
@@ -280,29 +281,17 @@ typedef struct xfs_inode_log_format {
 	int32_t			ilf_boffset;	/* off of inode in buffer */
 } xfs_inode_log_format_t;
 
-typedef struct xfs_inode_log_format_32 {
-	uint16_t		ilf_type;	/* inode log item type */
-	uint16_t		ilf_size;	/* size of this item */
-	uint32_t		ilf_fields;	/* flags for fields logged */
-	uint16_t		ilf_asize;	/* size of attr d/ext/root */
-	uint16_t		ilf_dsize;	/* size of data/ext/root */
-	uint64_t		ilf_ino;	/* inode number */
-	union {
-		uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
-		uuid_t		ilfu_uuid;	/* mount point value */
-	} ilf_u;
-	int64_t			ilf_blkno;	/* blkno of inode buffer */
-	int32_t			ilf_len;	/* len of inode buffer */
-	int32_t			ilf_boffset;	/* off of inode in buffer */
-} __attribute__((packed)) xfs_inode_log_format_32_t;
-
-typedef struct xfs_inode_log_format_64 {
+/*
+ * Old 32 bit systems will log in this format without the 64 bit
+ * alignment padding. Recovery will detect this and convert it to the
+ * correct format.
+ */
+struct xfs_inode_log_format_32 {
 	uint16_t		ilf_type;	/* inode log item type */
 	uint16_t		ilf_size;	/* size of this item */
 	uint32_t		ilf_fields;	/* flags for fields logged */
 	uint16_t		ilf_asize;	/* size of attr d/ext/root */
 	uint16_t		ilf_dsize;	/* size of data/ext/root */
-	uint32_t		ilf_pad;	/* pad for 64 bit boundary */
 	uint64_t		ilf_ino;	/* inode number */
 	union {
 		uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
@@ -311,7 +300,7 @@ typedef struct xfs_inode_log_format_64 {
 	int64_t			ilf_blkno;	/* blkno of inode buffer */
 	int32_t			ilf_len;	/* len of inode buffer */
 	int32_t			ilf_boffset;	/* off of inode in buffer */
-} xfs_inode_log_format_64_t;
+} __attribute__((packed));
 
 
 /*
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index a705f34b58fa..9bbc2d7cc8cb 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -364,6 +364,9 @@ xfs_inode_to_log_dinode(
 	to->di_dmstate = from->di_dmstate;
 	to->di_flags = from->di_flags;
 
+	/* log a dummy value to ensure log structure is fully initialised */
+	to->di_next_unlinked = NULLAGINO;
+
 	if (from->di_version == 3) {
 		to->di_changecount = inode->i_version;
 		to->di_crtime.t_sec = from->di_crtime.t_sec;
@@ -404,6 +407,11 @@ xfs_inode_item_format_core(
  * the second with the on-disk inode structure, and a possible third and/or
  * fourth with the inode data/extents/b-tree root and inode attributes
  * data/extents/b-tree root.
+ *
+ * Note: Always use the 64 bit inode log format structure so we don't
+ * leave an uninitialised hole in the format item on 64 bit systems. Log
+ * recovery on 32 bit systems handles this just fine, so there's no reason
+ * for not using an initialising the properly padded structure all the time.
  */
 STATIC void
 xfs_inode_item_format(
@@ -412,8 +420,8 @@ xfs_inode_item_format(
 {
 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
 	struct xfs_inode	*ip = iip->ili_inode;
-	struct xfs_inode_log_format *ilf;
 	struct xfs_log_iovec	*vecp = NULL;
+	struct xfs_inode_log_format *ilf;
 
 	ASSERT(ip->i_d.di_version > 1);
 
@@ -425,7 +433,17 @@ xfs_inode_item_format(
 	ilf->ilf_boffset = ip->i_imap.im_boffset;
 	ilf->ilf_fields = XFS_ILOG_CORE;
 	ilf->ilf_size = 2; /* format + core */
-	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
+
+	/*
+	 * make sure we don't leak uninitialised data into the log in the case
+	 * when we don't log every field in the inode.
+	 */
+	ilf->ilf_dsize = 0;
+	ilf->ilf_asize = 0;
+	ilf->ilf_pad = 0;
+	uuid_copy(&ilf->ilf_u.ilfu_uuid, &uuid_null);
+
+	xlog_finish_iovec(lv, vecp, sizeof(*ilf));
 
 	xfs_inode_item_format_core(ip, lv, &vecp);
 	xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
@@ -855,44 +873,29 @@ xfs_istale_done(
 }
 
 /*
- * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
- * (which can have different field alignments) to the native version
+ * convert an xfs_inode_log_format struct from the old 32 bit version
+ * (which can have different field alignments) to the native 64 bit version
  */
 int
 xfs_inode_item_format_convert(
-	xfs_log_iovec_t		*buf,
-	xfs_inode_log_format_t	*in_f)
+	struct xfs_log_iovec		*buf,
+	struct xfs_inode_log_format	*in_f)
 {
-	if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
-		xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
-
-		in_f->ilf_type = in_f32->ilf_type;
-		in_f->ilf_size = in_f32->ilf_size;
-		in_f->ilf_fields = in_f32->ilf_fields;
-		in_f->ilf_asize = in_f32->ilf_asize;
-		in_f->ilf_dsize = in_f32->ilf_dsize;
-		in_f->ilf_ino = in_f32->ilf_ino;
-		/* copy biggest field of ilf_u */
-		uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid);
-		in_f->ilf_blkno = in_f32->ilf_blkno;
-		in_f->ilf_len = in_f32->ilf_len;
-		in_f->ilf_boffset = in_f32->ilf_boffset;
-		return 0;
-	} else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
-		xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
-
-		in_f->ilf_type = in_f64->ilf_type;
-		in_f->ilf_size = in_f64->ilf_size;
-		in_f->ilf_fields = in_f64->ilf_fields;
-		in_f->ilf_asize = in_f64->ilf_asize;
-		in_f->ilf_dsize = in_f64->ilf_dsize;
-		in_f->ilf_ino = in_f64->ilf_ino;
-		/* copy biggest field of ilf_u */
-		uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f64->ilf_u.ilfu_uuid);
-		in_f->ilf_blkno = in_f64->ilf_blkno;
-		in_f->ilf_len = in_f64->ilf_len;
-		in_f->ilf_boffset = in_f64->ilf_boffset;
-		return 0;
-	}
-	return -EFSCORRUPTED;
+	struct xfs_inode_log_format_32	*in_f32 = buf->i_addr;
+
+	if (buf->i_len != sizeof(*in_f32))
+		return -EFSCORRUPTED;
+
+	in_f->ilf_type = in_f32->ilf_type;
+	in_f->ilf_size = in_f32->ilf_size;
+	in_f->ilf_fields = in_f32->ilf_fields;
+	in_f->ilf_asize = in_f32->ilf_asize;
+	in_f->ilf_dsize = in_f32->ilf_dsize;
+	in_f->ilf_ino = in_f32->ilf_ino;
+	/* copy biggest field of ilf_u */
+	uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid);
+	in_f->ilf_blkno = in_f32->ilf_blkno;
+	in_f->ilf_len = in_f32->ilf_len;
+	in_f->ilf_boffset = in_f32->ilf_boffset;
+	return 0;
 }
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 0c381d71b242..0492436a053f 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -134,7 +134,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log,		28);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp,		8);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32,	52);
-	XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64,	56);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format,	56);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat,	20);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header,		16);
 }
-- 
2.14.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 2/4] xfs: move more RT specific code under CONFIG_XFS_RT
  2017-10-08 23:54 [PATCH 0/4] xfs: miscellaneous fixes Dave Chinner
  2017-10-08 23:54 ` [PATCH 1/4] xfs: Don't log uninitialised fields in inode structures Dave Chinner
@ 2017-10-08 23:54 ` Dave Chinner
  2017-10-09 14:24   ` Brian Foster
  2017-10-08 23:54 ` [PATCH 3/4] xfs: don't change inode mode if ACL update fails Dave Chinner
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 19+ messages in thread
From: Dave Chinner @ 2017-10-08 23:54 UTC (permalink / raw)
  To: linux-xfs

From: Dave Chinner <dchinner@redhat.com>

Various utility functions and interfaces that iterate internal
devices try to reference the realtime device even when RT support is
not compiled into the kernel.

Make sure this code is excluded from the CONFIG_XFS_RT=n build,
and where appropriate stub functions to return fatal errors if
they ever get called when RT support is not present.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_bmap_util.c |  2 ++
 fs/xfs/xfs_bmap_util.h | 13 +++++++++++++
 fs/xfs/xfs_fsmap.c     | 12 ++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index e9db7fc95b70..6503cfa44262 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -84,6 +84,7 @@ xfs_zero_extent(
 		GFP_NOFS, 0);
 }
 
+#ifdef CONFIG_XFS_RT
 int
 xfs_bmap_rtalloc(
 	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
@@ -190,6 +191,7 @@ xfs_bmap_rtalloc(
 	}
 	return 0;
 }
+#endif /* CONFIG_XFS_RT */
 
 /*
  * Check if the endoff is outside the last extent. If so the caller will grow
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 0eaa81dc49be..7d330b3c77c3 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -28,7 +28,20 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_bmalloca;
 
+#ifdef CONFIG_XFS_RT
 int	xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
+#else /* !CONFIG_XFS_RT */
+/*
+ * Attempts to allocate RT extents when RT is disable indicates corruption and
+ * should trigger a shutdown.
+ */
+static inline int
+xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
+{
+	return -EFSCORRUPTED;
+}
+#endif /* CONFIG_XFS_RT */
+
 int	xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
 		     int whichfork, int *eof);
 int	xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 814ed729881d..560e0b40ac1b 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -521,6 +521,7 @@ __xfs_getfsmap_rtdev(
 	return query_fn(tp, info);
 }
 
+#ifdef CONFIG_XFS_RT
 /* Actually query the realtime bitmap. */
 STATIC int
 xfs_getfsmap_rtdev_rtbitmap_query(
@@ -561,6 +562,7 @@ xfs_getfsmap_rtdev_rtbitmap(
 	return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query,
 			info);
 }
+#endif /* CONFIG_XFS_RT */
 
 /* Execute a getfsmap query against the regular data device. */
 STATIC int
@@ -795,7 +797,15 @@ xfs_getfsmap_check_keys(
 	return false;
 }
 
+/*
+ * There are only two devices if we didn't configure RT devices at build time.
+ */
+#ifdef CONFIG_XFS_RT
 #define XFS_GETFSMAP_DEVS	3
+#else
+#define XFS_GETFSMAP_DEVS	2
+#endif /* CONFIG_XFS_RT */
+
 /*
  * Get filesystem's extents as described in head, and format for
  * output.  Calls formatter to fill the user's buffer until all
@@ -853,10 +863,12 @@ xfs_getfsmap(
 		handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
 		handlers[1].fn = xfs_getfsmap_logdev;
 	}
+#ifdef CONFIG_XFS_RT
 	if (mp->m_rtdev_targp) {
 		handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
 		handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
 	}
+#endif /* CONFIG_XFS_RT */
 
 	xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev),
 			xfs_getfsmap_dev_compare);
-- 
2.14.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 3/4] xfs: don't change inode mode if ACL update fails
  2017-10-08 23:54 [PATCH 0/4] xfs: miscellaneous fixes Dave Chinner
  2017-10-08 23:54 ` [PATCH 1/4] xfs: Don't log uninitialised fields in inode structures Dave Chinner
  2017-10-08 23:54 ` [PATCH 2/4] xfs: move more RT specific code under CONFIG_XFS_RT Dave Chinner
@ 2017-10-08 23:54 ` Dave Chinner
  2017-10-09 14:24   ` Brian Foster
  2017-10-08 23:54 ` [PATCH 4/4] xfs: cancel dirty pages on invalidation Dave Chinner
  2017-10-09 18:47 ` [PATCH 0/4] xfs: miscellaneous fixes Darrick J. Wong
  4 siblings, 1 reply; 19+ messages in thread
From: Dave Chinner @ 2017-10-08 23:54 UTC (permalink / raw)
  To: linux-xfs

From: Dave Chinner <dchinner@redhat.com>

If we get ENOSPC half way through setting the ACL, the inode mode
can still be changed even though the ACL does not exist. Reorder the
operation to only change the mode of the inode if the ACL is set
correctly.

Whilst this does not fix the problem with crash consistency (that requires
attribute addition to be a deferred op) it does prevent ENOSPC and other
non-fatal errors setting an xattr to be handled sanely.

This fixes xfstests generic/449.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_acl.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 7034e17535de..3354140de07e 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -247,6 +247,8 @@ xfs_set_mode(struct inode *inode, umode_t mode)
 int
 xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
+	umode_t mode;
+	bool set_mode = false;
 	int error = 0;
 
 	if (!acl)
@@ -257,16 +259,24 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 		return error;
 
 	if (type == ACL_TYPE_ACCESS) {
-		umode_t mode;
-
 		error = posix_acl_update_mode(inode, &mode, &acl);
 		if (error)
 			return error;
-		error = xfs_set_mode(inode, mode);
-		if (error)
-			return error;
+		set_mode = true;
 	}
 
  set_acl:
-	return __xfs_set_acl(inode, acl, type);
+	error =  __xfs_set_acl(inode, acl, type);
+	if (error)
+		return error;
+
+	/*
+	 * We set the mode after successfully updating the ACL xattr because the
+	 * xattr update can fail at ENOSPC and we don't want to change the mode
+	 * if the ACL update hasn't been applied.
+	 */
+	if (set_mode)
+		error = xfs_set_mode(inode, mode);
+
+	return error;
 }
-- 
2.14.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 4/4] xfs: cancel dirty pages on invalidation
  2017-10-08 23:54 [PATCH 0/4] xfs: miscellaneous fixes Dave Chinner
                   ` (2 preceding siblings ...)
  2017-10-08 23:54 ` [PATCH 3/4] xfs: don't change inode mode if ACL update fails Dave Chinner
@ 2017-10-08 23:54 ` Dave Chinner
  2017-10-09 14:24   ` Brian Foster
  2017-10-16 19:39   ` Darrick J. Wong
  2017-10-09 18:47 ` [PATCH 0/4] xfs: miscellaneous fixes Darrick J. Wong
  4 siblings, 2 replies; 19+ messages in thread
From: Dave Chinner @ 2017-10-08 23:54 UTC (permalink / raw)
  To: linux-xfs

From: Dave Chinner <dchinner@redhat.com>

Recently we've had warnings arise from the vm handing us pages
without bufferheads attached to them. This should not ever occur
in XFS, but we don't defend against it properly if it does. The only
place where we remove bufferheads from a page is in
xfs_vm_releasepage(), but we can't tell the difference here between
"page is dirty so don't release" and "page is dirty but is being
invalidated so release it".

In some places that are invalidating pages ask for pages to be
released and follow up afterward calling ->releasepage by checking
whether the page was dirty and then aborting the invalidation. This
is a possible vector for releasing buffers from a page but then
leaving it in the mapping, so we really do need to avoid dirty pages
in xfs_vm_releasepage().

To differentiate between invalidated pages and normal pages, we need
to clear the page dirty flag when invalidating the pages. This can
be done through xfs_vm_invalidatepage(), and will result
xfs_vm_releasepage() seeing the page as clean which matches the
bufferhead state on the page after calling block_invalidatepage().

Hence we can re-add the page dirty check in xfs_vm_releasepage to
catch the case where we might be releasing a page that is actually
dirty and so should not have the bufferheads on it removed. This
will remove one possible vector of "dirty page with no bufferheads"
and so help narrow down the search for the root cause of that
problem.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_aops.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f18e5932aec4..067284d84d9e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -735,6 +735,14 @@ xfs_vm_invalidatepage(
 {
 	trace_xfs_invalidatepage(page->mapping->host, page, offset,
 				 length);
+
+	/*
+	 * If we are invalidating the entire page, clear the dirty state from it
+	 * so that we can check for attempts to release dirty cached pages in
+	 * xfs_vm_releasepage().
+	 */
+	if (offset == 0 && length >= PAGE_SIZE)
+		cancel_dirty_page(page);
 	block_invalidatepage(page, offset, length);
 }
 
@@ -1190,25 +1198,27 @@ xfs_vm_releasepage(
 	 * mm accommodates an old ext3 case where clean pages might not have had
 	 * the dirty bit cleared. Thus, it can send actual dirty pages to
 	 * ->releasepage() via shrink_active_list(). Conversely,
-	 * block_invalidatepage() can send pages that are still marked dirty
-	 * but otherwise have invalidated buffers.
+	 * block_invalidatepage() can send pages that are still marked dirty but
+	 * otherwise have invalidated buffers.
 	 *
 	 * We want to release the latter to avoid unnecessary buildup of the
-	 * LRU, skip the former and warn if we've left any lingering
-	 * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
-	 * or unwritten buffers and warn if the page is not dirty. Otherwise
-	 * try to release the buffers.
+	 * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
+	 * that are entirely invalidated and need to be released.  Hence the
+	 * only time we should get dirty pages here is through
+	 * shrink_active_list() and so we can simply skip those now.
+	 *
+	 * warn if we've left any lingering delalloc/unwritten buffers on clean
+	 * or invalidated pages we are about to release.
 	 */
+	if (PageDirty(page))
+		return 0;
+
 	xfs_count_page_state(page, &delalloc, &unwritten);
 
-	if (delalloc) {
-		WARN_ON_ONCE(!PageDirty(page));
+	if (WARN_ON_ONCE(delalloc))
 		return 0;
-	}
-	if (unwritten) {
-		WARN_ON_ONCE(!PageDirty(page));
+	if (WARN_ON_ONCE(unwritten))
 		return 0;
-	}
 
 	return try_to_free_buffers(page);
 }
-- 
2.14.2


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/4] xfs: Don't log uninitialised fields in inode structures
  2017-10-08 23:54 ` [PATCH 1/4] xfs: Don't log uninitialised fields in inode structures Dave Chinner
@ 2017-10-09 14:24   ` Brian Foster
  0 siblings, 0 replies; 19+ messages in thread
From: Brian Foster @ 2017-10-09 14:24 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Mon, Oct 09, 2017 at 10:54:11AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Prevent kmemcheck from throwing warnings about reading uninitialised
> memory when formatting inodes into the incore log buffer. There are
> several issues here - we don't always log all the fields in the
> inode log format item, and we never log the inode the
> di_next_unlinked field.
> 
> In the case of the inode log format item, this is exacerbated
> by the old xfs_inode_log_format structure padding issue. Hence make
> the padded, 64 bit aligned version of the structure the one we always
> use for formatting the log and get rid of the 64 bit variant. This
> means we'll always log the 64-bit version and so recovery only needs
> to convert from the unpadded 32 bit version from older 32 bit
> kernels.
> 
> Signed-Off-By: Dave Chinner <dchinner@redhat.com>
> Tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/libxfs/xfs_log_format.h | 27 +++++----------
>  fs/xfs/xfs_inode_item.c        | 79 ++++++++++++++++++++++--------------------
>  fs/xfs/xfs_ondisk.h            |  2 +-
>  3 files changed, 50 insertions(+), 58 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> index 8372e9bcd7b6..71de185735e0 100644
> --- a/fs/xfs/libxfs/xfs_log_format.h
> +++ b/fs/xfs/libxfs/xfs_log_format.h
> @@ -270,6 +270,7 @@ typedef struct xfs_inode_log_format {
>  	uint32_t		ilf_fields;	/* flags for fields logged */
>  	uint16_t		ilf_asize;	/* size of attr d/ext/root */
>  	uint16_t		ilf_dsize;	/* size of data/ext/root */
> +	uint32_t		ilf_pad;	/* pad for 64 bit boundary */
>  	uint64_t		ilf_ino;	/* inode number */
>  	union {
>  		uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
> @@ -280,29 +281,17 @@ typedef struct xfs_inode_log_format {
>  	int32_t			ilf_boffset;	/* off of inode in buffer */
>  } xfs_inode_log_format_t;
>  
> -typedef struct xfs_inode_log_format_32 {
> -	uint16_t		ilf_type;	/* inode log item type */
> -	uint16_t		ilf_size;	/* size of this item */
> -	uint32_t		ilf_fields;	/* flags for fields logged */
> -	uint16_t		ilf_asize;	/* size of attr d/ext/root */
> -	uint16_t		ilf_dsize;	/* size of data/ext/root */
> -	uint64_t		ilf_ino;	/* inode number */
> -	union {
> -		uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
> -		uuid_t		ilfu_uuid;	/* mount point value */
> -	} ilf_u;
> -	int64_t			ilf_blkno;	/* blkno of inode buffer */
> -	int32_t			ilf_len;	/* len of inode buffer */
> -	int32_t			ilf_boffset;	/* off of inode in buffer */
> -} __attribute__((packed)) xfs_inode_log_format_32_t;
> -
> -typedef struct xfs_inode_log_format_64 {
> +/*
> + * Old 32 bit systems will log in this format without the 64 bit
> + * alignment padding. Recovery will detect this and convert it to the
> + * correct format.
> + */
> +struct xfs_inode_log_format_32 {
>  	uint16_t		ilf_type;	/* inode log item type */
>  	uint16_t		ilf_size;	/* size of this item */
>  	uint32_t		ilf_fields;	/* flags for fields logged */
>  	uint16_t		ilf_asize;	/* size of attr d/ext/root */
>  	uint16_t		ilf_dsize;	/* size of data/ext/root */
> -	uint32_t		ilf_pad;	/* pad for 64 bit boundary */
>  	uint64_t		ilf_ino;	/* inode number */
>  	union {
>  		uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
> @@ -311,7 +300,7 @@ typedef struct xfs_inode_log_format_64 {
>  	int64_t			ilf_blkno;	/* blkno of inode buffer */
>  	int32_t			ilf_len;	/* len of inode buffer */
>  	int32_t			ilf_boffset;	/* off of inode in buffer */
> -} xfs_inode_log_format_64_t;
> +} __attribute__((packed));
>  
>  
>  /*
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index a705f34b58fa..9bbc2d7cc8cb 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -364,6 +364,9 @@ xfs_inode_to_log_dinode(
>  	to->di_dmstate = from->di_dmstate;
>  	to->di_flags = from->di_flags;
>  
> +	/* log a dummy value to ensure log structure is fully initialised */
> +	to->di_next_unlinked = NULLAGINO;
> +
>  	if (from->di_version == 3) {
>  		to->di_changecount = inode->i_version;
>  		to->di_crtime.t_sec = from->di_crtime.t_sec;
> @@ -404,6 +407,11 @@ xfs_inode_item_format_core(
>   * the second with the on-disk inode structure, and a possible third and/or
>   * fourth with the inode data/extents/b-tree root and inode attributes
>   * data/extents/b-tree root.
> + *
> + * Note: Always use the 64 bit inode log format structure so we don't
> + * leave an uninitialised hole in the format item on 64 bit systems. Log
> + * recovery on 32 bit systems handles this just fine, so there's no reason
> + * for not using an initialising the properly padded structure all the time.
>   */
>  STATIC void
>  xfs_inode_item_format(
> @@ -412,8 +420,8 @@ xfs_inode_item_format(
>  {
>  	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
>  	struct xfs_inode	*ip = iip->ili_inode;
> -	struct xfs_inode_log_format *ilf;
>  	struct xfs_log_iovec	*vecp = NULL;
> +	struct xfs_inode_log_format *ilf;
>  
>  	ASSERT(ip->i_d.di_version > 1);
>  
> @@ -425,7 +433,17 @@ xfs_inode_item_format(
>  	ilf->ilf_boffset = ip->i_imap.im_boffset;
>  	ilf->ilf_fields = XFS_ILOG_CORE;
>  	ilf->ilf_size = 2; /* format + core */
> -	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
> +
> +	/*
> +	 * make sure we don't leak uninitialised data into the log in the case
> +	 * when we don't log every field in the inode.
> +	 */
> +	ilf->ilf_dsize = 0;
> +	ilf->ilf_asize = 0;
> +	ilf->ilf_pad = 0;
> +	uuid_copy(&ilf->ilf_u.ilfu_uuid, &uuid_null);
> +
> +	xlog_finish_iovec(lv, vecp, sizeof(*ilf));
>  
>  	xfs_inode_item_format_core(ip, lv, &vecp);
>  	xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
> @@ -855,44 +873,29 @@ xfs_istale_done(
>  }
>  
>  /*
> - * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
> - * (which can have different field alignments) to the native version
> + * convert an xfs_inode_log_format struct from the old 32 bit version
> + * (which can have different field alignments) to the native 64 bit version
>   */
>  int
>  xfs_inode_item_format_convert(
> -	xfs_log_iovec_t		*buf,
> -	xfs_inode_log_format_t	*in_f)
> +	struct xfs_log_iovec		*buf,
> +	struct xfs_inode_log_format	*in_f)
>  {
> -	if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
> -		xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
> -
> -		in_f->ilf_type = in_f32->ilf_type;
> -		in_f->ilf_size = in_f32->ilf_size;
> -		in_f->ilf_fields = in_f32->ilf_fields;
> -		in_f->ilf_asize = in_f32->ilf_asize;
> -		in_f->ilf_dsize = in_f32->ilf_dsize;
> -		in_f->ilf_ino = in_f32->ilf_ino;
> -		/* copy biggest field of ilf_u */
> -		uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid);
> -		in_f->ilf_blkno = in_f32->ilf_blkno;
> -		in_f->ilf_len = in_f32->ilf_len;
> -		in_f->ilf_boffset = in_f32->ilf_boffset;
> -		return 0;
> -	} else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
> -		xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
> -
> -		in_f->ilf_type = in_f64->ilf_type;
> -		in_f->ilf_size = in_f64->ilf_size;
> -		in_f->ilf_fields = in_f64->ilf_fields;
> -		in_f->ilf_asize = in_f64->ilf_asize;
> -		in_f->ilf_dsize = in_f64->ilf_dsize;
> -		in_f->ilf_ino = in_f64->ilf_ino;
> -		/* copy biggest field of ilf_u */
> -		uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f64->ilf_u.ilfu_uuid);
> -		in_f->ilf_blkno = in_f64->ilf_blkno;
> -		in_f->ilf_len = in_f64->ilf_len;
> -		in_f->ilf_boffset = in_f64->ilf_boffset;
> -		return 0;
> -	}
> -	return -EFSCORRUPTED;
> +	struct xfs_inode_log_format_32	*in_f32 = buf->i_addr;
> +
> +	if (buf->i_len != sizeof(*in_f32))
> +		return -EFSCORRUPTED;
> +
> +	in_f->ilf_type = in_f32->ilf_type;
> +	in_f->ilf_size = in_f32->ilf_size;
> +	in_f->ilf_fields = in_f32->ilf_fields;
> +	in_f->ilf_asize = in_f32->ilf_asize;
> +	in_f->ilf_dsize = in_f32->ilf_dsize;
> +	in_f->ilf_ino = in_f32->ilf_ino;
> +	/* copy biggest field of ilf_u */
> +	uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid);
> +	in_f->ilf_blkno = in_f32->ilf_blkno;
> +	in_f->ilf_len = in_f32->ilf_len;
> +	in_f->ilf_boffset = in_f32->ilf_boffset;
> +	return 0;
>  }
> diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
> index 0c381d71b242..0492436a053f 100644
> --- a/fs/xfs/xfs_ondisk.h
> +++ b/fs/xfs/xfs_ondisk.h
> @@ -134,7 +134,7 @@ xfs_check_ondisk_structs(void)
>  	XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log,		28);
>  	XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp,		8);
>  	XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32,	52);
> -	XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64,	56);
> +	XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format,	56);
>  	XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat,	20);
>  	XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header,		16);
>  }
> -- 
> 2.14.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/4] xfs: move more RT specific code under CONFIG_XFS_RT
  2017-10-08 23:54 ` [PATCH 2/4] xfs: move more RT specific code under CONFIG_XFS_RT Dave Chinner
@ 2017-10-09 14:24   ` Brian Foster
  0 siblings, 0 replies; 19+ messages in thread
From: Brian Foster @ 2017-10-09 14:24 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Mon, Oct 09, 2017 at 10:54:12AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Various utility functions and interfaces that iterate internal
> devices try to reference the realtime device even when RT support is
> not compiled into the kernel.
> 
> Make sure this code is excluded from the CONFIG_XFS_RT=n build,
> and where appropriate stub functions to return fatal errors if
> they ever get called when RT support is not present.
> 
> Signed-Off-By: Dave Chinner <dchinner@redhat.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_bmap_util.c |  2 ++
>  fs/xfs/xfs_bmap_util.h | 13 +++++++++++++
>  fs/xfs/xfs_fsmap.c     | 12 ++++++++++++
>  3 files changed, 27 insertions(+)
> 
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index e9db7fc95b70..6503cfa44262 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -84,6 +84,7 @@ xfs_zero_extent(
>  		GFP_NOFS, 0);
>  }
>  
> +#ifdef CONFIG_XFS_RT
>  int
>  xfs_bmap_rtalloc(
>  	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
> @@ -190,6 +191,7 @@ xfs_bmap_rtalloc(
>  	}
>  	return 0;
>  }
> +#endif /* CONFIG_XFS_RT */
>  
>  /*
>   * Check if the endoff is outside the last extent. If so the caller will grow
> diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
> index 0eaa81dc49be..7d330b3c77c3 100644
> --- a/fs/xfs/xfs_bmap_util.h
> +++ b/fs/xfs/xfs_bmap_util.h
> @@ -28,7 +28,20 @@ struct xfs_mount;
>  struct xfs_trans;
>  struct xfs_bmalloca;
>  
> +#ifdef CONFIG_XFS_RT
>  int	xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
> +#else /* !CONFIG_XFS_RT */
> +/*
> + * Attempts to allocate RT extents when RT is disable indicates corruption and
> + * should trigger a shutdown.
> + */
> +static inline int
> +xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
> +{
> +	return -EFSCORRUPTED;
> +}
> +#endif /* CONFIG_XFS_RT */
> +
>  int	xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
>  		     int whichfork, int *eof);
>  int	xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
> diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
> index 814ed729881d..560e0b40ac1b 100644
> --- a/fs/xfs/xfs_fsmap.c
> +++ b/fs/xfs/xfs_fsmap.c
> @@ -521,6 +521,7 @@ __xfs_getfsmap_rtdev(
>  	return query_fn(tp, info);
>  }
>  
> +#ifdef CONFIG_XFS_RT
>  /* Actually query the realtime bitmap. */
>  STATIC int
>  xfs_getfsmap_rtdev_rtbitmap_query(
> @@ -561,6 +562,7 @@ xfs_getfsmap_rtdev_rtbitmap(
>  	return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query,
>  			info);
>  }
> +#endif /* CONFIG_XFS_RT */
>  
>  /* Execute a getfsmap query against the regular data device. */
>  STATIC int
> @@ -795,7 +797,15 @@ xfs_getfsmap_check_keys(
>  	return false;
>  }
>  
> +/*
> + * There are only two devices if we didn't configure RT devices at build time.
> + */
> +#ifdef CONFIG_XFS_RT
>  #define XFS_GETFSMAP_DEVS	3
> +#else
> +#define XFS_GETFSMAP_DEVS	2
> +#endif /* CONFIG_XFS_RT */
> +
>  /*
>   * Get filesystem's extents as described in head, and format for
>   * output.  Calls formatter to fill the user's buffer until all
> @@ -853,10 +863,12 @@ xfs_getfsmap(
>  		handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
>  		handlers[1].fn = xfs_getfsmap_logdev;
>  	}
> +#ifdef CONFIG_XFS_RT
>  	if (mp->m_rtdev_targp) {
>  		handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
>  		handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
>  	}
> +#endif /* CONFIG_XFS_RT */
>  
>  	xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev),
>  			xfs_getfsmap_dev_compare);
> -- 
> 2.14.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/4] xfs: don't change inode mode if ACL update fails
  2017-10-08 23:54 ` [PATCH 3/4] xfs: don't change inode mode if ACL update fails Dave Chinner
@ 2017-10-09 14:24   ` Brian Foster
  0 siblings, 0 replies; 19+ messages in thread
From: Brian Foster @ 2017-10-09 14:24 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Mon, Oct 09, 2017 at 10:54:13AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> If we get ENOSPC half way through setting the ACL, the inode mode
> can still be changed even though the ACL does not exist. Reorder the
> operation to only change the mode of the inode if the ACL is set
> correctly.
> 
> Whilst this does not fix the problem with crash consistency (that requires
> attribute addition to be a deferred op) it does prevent ENOSPC and other
> non-fatal errors setting an xattr to be handled sanely.
> 
> This fixes xfstests generic/449.
> 
> Signed-Off-By: Dave Chinner <dchinner@redhat.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_acl.c | 22 ++++++++++++++++------
>  1 file changed, 16 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
> index 7034e17535de..3354140de07e 100644
> --- a/fs/xfs/xfs_acl.c
> +++ b/fs/xfs/xfs_acl.c
> @@ -247,6 +247,8 @@ xfs_set_mode(struct inode *inode, umode_t mode)
>  int
>  xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>  {
> +	umode_t mode;
> +	bool set_mode = false;
>  	int error = 0;
>  
>  	if (!acl)
> @@ -257,16 +259,24 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>  		return error;
>  
>  	if (type == ACL_TYPE_ACCESS) {
> -		umode_t mode;
> -
>  		error = posix_acl_update_mode(inode, &mode, &acl);
>  		if (error)
>  			return error;
> -		error = xfs_set_mode(inode, mode);
> -		if (error)
> -			return error;
> +		set_mode = true;
>  	}
>  
>   set_acl:
> -	return __xfs_set_acl(inode, acl, type);
> +	error =  __xfs_set_acl(inode, acl, type);
> +	if (error)
> +		return error;
> +
> +	/*
> +	 * We set the mode after successfully updating the ACL xattr because the
> +	 * xattr update can fail at ENOSPC and we don't want to change the mode
> +	 * if the ACL update hasn't been applied.
> +	 */
> +	if (set_mode)
> +		error = xfs_set_mode(inode, mode);
> +
> +	return error;
>  }
> -- 
> 2.14.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] xfs: cancel dirty pages on invalidation
  2017-10-08 23:54 ` [PATCH 4/4] xfs: cancel dirty pages on invalidation Dave Chinner
@ 2017-10-09 14:24   ` Brian Foster
  2017-10-09 20:48     ` Dave Chinner
  2017-10-16 19:39   ` Darrick J. Wong
  1 sibling, 1 reply; 19+ messages in thread
From: Brian Foster @ 2017-10-09 14:24 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Mon, Oct 09, 2017 at 10:54:14AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Recently we've had warnings arise from the vm handing us pages
> without bufferheads attached to them. This should not ever occur
> in XFS, but we don't defend against it properly if it does. The only
> place where we remove bufferheads from a page is in
> xfs_vm_releasepage(), but we can't tell the difference here between
> "page is dirty so don't release" and "page is dirty but is being
> invalidated so release it".
> 

Ok, so we changed ->releasepage() in 99579ccec4 ("xfs: skip dirty pages
in ->releasepage()") to deal with the fact that the mm can send
legitimately dirty pages to ->releasepage(). That was apparently too
aggressive a change because truncated pages would also end up skipped
because the dirty bit is not cleared by the time the release occurs.
Commit 0a417b8dc1 ("xfs: Timely free truncated dirty pages") modified
->releasepage() again to not skip all dirty pages and only warn for
delalloc/unwritten blocks on clean pages.

That was apparently insufficient because we have some codepaths where
not skipping dirty pages can allow us to strip the buffers from a page
incorrectly...

> In some places that are invalidating pages ask for pages to be
> released and follow up afterward calling ->releasepage by checking
> whether the page was dirty and then aborting the invalidation. This
> is a possible vector for releasing buffers from a page but then
> leaving it in the mapping, so we really do need to avoid dirty pages
> in xfs_vm_releasepage().
> 

... but I'm having a hard time parsing that first sentence to understand
how that is. Can you elaborate on and/or give a specific reference to
this problematic invalidation abort sequence?

Also, it looks like truncate_complete_page() eventually and
unconditionally clears the page dirty bit, it just happens after the
invalidate -> release attempt sequence that occurs down through
do_invalidatepage(). IIUC, this was the problem fixed by Jan's patch
mentioned above. Is there any reason we can't do the dirty cancel a
little earlier there? Would that also address this problem?

Brian

> To differentiate between invalidated pages and normal pages, we need
> to clear the page dirty flag when invalidating the pages. This can
> be done through xfs_vm_invalidatepage(), and will result
> xfs_vm_releasepage() seeing the page as clean which matches the
> bufferhead state on the page after calling block_invalidatepage().
> 
> Hence we can re-add the page dirty check in xfs_vm_releasepage to
> catch the case where we might be releasing a page that is actually
> dirty and so should not have the bufferheads on it removed. This
> will remove one possible vector of "dirty page with no bufferheads"
> and so help narrow down the search for the root cause of that
> problem.
> 
> Signed-Off-By: Dave Chinner <dchinner@redhat.com>
> ---

>  fs/xfs/xfs_aops.c | 34 ++++++++++++++++++++++------------
>  1 file changed, 22 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index f18e5932aec4..067284d84d9e 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -735,6 +735,14 @@ xfs_vm_invalidatepage(
>  {
>  	trace_xfs_invalidatepage(page->mapping->host, page, offset,
>  				 length);
> +
> +	/*
> +	 * If we are invalidating the entire page, clear the dirty state from it
> +	 * so that we can check for attempts to release dirty cached pages in
> +	 * xfs_vm_releasepage().
> +	 */
> +	if (offset == 0 && length >= PAGE_SIZE)
> +		cancel_dirty_page(page);
>  	block_invalidatepage(page, offset, length);
>  }
>  
> @@ -1190,25 +1198,27 @@ xfs_vm_releasepage(
>  	 * mm accommodates an old ext3 case where clean pages might not have had
>  	 * the dirty bit cleared. Thus, it can send actual dirty pages to
>  	 * ->releasepage() via shrink_active_list(). Conversely,
> -	 * block_invalidatepage() can send pages that are still marked dirty
> -	 * but otherwise have invalidated buffers.
> +	 * block_invalidatepage() can send pages that are still marked dirty but
> +	 * otherwise have invalidated buffers.
>  	 *
>  	 * We want to release the latter to avoid unnecessary buildup of the
> -	 * LRU, skip the former and warn if we've left any lingering
> -	 * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
> -	 * or unwritten buffers and warn if the page is not dirty. Otherwise
> -	 * try to release the buffers.
> +	 * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
> +	 * that are entirely invalidated and need to be released.  Hence the
> +	 * only time we should get dirty pages here is through
> +	 * shrink_active_list() and so we can simply skip those now.
> +	 *
> +	 * warn if we've left any lingering delalloc/unwritten buffers on clean
> +	 * or invalidated pages we are about to release.
>  	 */
> +	if (PageDirty(page))
> +		return 0;
> +
>  	xfs_count_page_state(page, &delalloc, &unwritten);
>  
> -	if (delalloc) {
> -		WARN_ON_ONCE(!PageDirty(page));
> +	if (WARN_ON_ONCE(delalloc))
>  		return 0;
> -	}
> -	if (unwritten) {
> -		WARN_ON_ONCE(!PageDirty(page));
> +	if (WARN_ON_ONCE(unwritten))
>  		return 0;
> -	}
>  
>  	return try_to_free_buffers(page);
>  }
> -- 
> 2.14.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 0/4] xfs: miscellaneous fixes
  2017-10-08 23:54 [PATCH 0/4] xfs: miscellaneous fixes Dave Chinner
                   ` (3 preceding siblings ...)
  2017-10-08 23:54 ` [PATCH 4/4] xfs: cancel dirty pages on invalidation Dave Chinner
@ 2017-10-09 18:47 ` Darrick J. Wong
  4 siblings, 0 replies; 19+ messages in thread
From: Darrick J. Wong @ 2017-10-09 18:47 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Mon, Oct 09, 2017 at 10:54:10AM +1100, Dave Chinner wrote:
> Darrick,
> 
> These are fixes that have been sitting in my test tree for a while,
> so I thought I better get them cleaned out and up into the main
> tree. They all address recent reported problems on the list - the RT
> is not directly related to a user bug report but is something I
> did while looking at the recent CVE issue.
> 
> Please consider for merge.

For patches 1-3,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

Will run them through a test cycle and see what happens.

(Still trying to wrap my head around #4 having screwed up in that area
once already.)

--D

> 
> -Dave.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] xfs: cancel dirty pages on invalidation
  2017-10-09 14:24   ` Brian Foster
@ 2017-10-09 20:48     ` Dave Chinner
  2017-10-10 12:29       ` Brian Foster
  0 siblings, 1 reply; 19+ messages in thread
From: Dave Chinner @ 2017-10-09 20:48 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Mon, Oct 09, 2017 at 10:24:46AM -0400, Brian Foster wrote:
> On Mon, Oct 09, 2017 at 10:54:14AM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Recently we've had warnings arise from the vm handing us pages
> > without bufferheads attached to them. This should not ever occur
> > in XFS, but we don't defend against it properly if it does. The only
> > place where we remove bufferheads from a page is in
> > xfs_vm_releasepage(), but we can't tell the difference here between
> > "page is dirty so don't release" and "page is dirty but is being
> > invalidated so release it".
> > 
> 
> Ok, so we changed ->releasepage() in 99579ccec4 ("xfs: skip dirty pages
> in ->releasepage()") to deal with the fact that the mm can send
> legitimately dirty pages to ->releasepage(). That was apparently too
> aggressive a change because truncated pages would also end up skipped
> because the dirty bit is not cleared by the time the release occurs.
> Commit 0a417b8dc1 ("xfs: Timely free truncated dirty pages") modified
> ->releasepage() again to not skip all dirty pages and only warn for
> delalloc/unwritten blocks on clean pages.
> 
> That was apparently insufficient because we have some codepaths where
> not skipping dirty pages can allow us to strip the buffers from a page
> incorrectly...

Right, Jan's patch fixed the truncate problem but re-opened the
memory reclaim hole. It effectively reverted the original patch.

> > In some places that are invalidating pages ask for pages to be
> > released and follow up afterward calling ->releasepage by checking
> > whether the page was dirty and then aborting the invalidation. This
> > is a possible vector for releasing buffers from a page but then
> > leaving it in the mapping, so we really do need to avoid dirty pages
> > in xfs_vm_releasepage().
> > 
> 
> ... but I'm having a hard time parsing that first sentence to understand
> how that is. Can you elaborate on and/or give a specific reference to
> this problematic invalidation abort sequence?

e.g. invalidate_complete_page2():

/*                                                                               
 * This is for invalidate_mapping_pages().  That function can be called at       
 * any time, and is not supposed to throw away dirty pages.  But pages can       
.....
       if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
                return 0;

        spin_lock_irqsave(&mapping->tree_lock, flags);
        if (PageDirty(page))
                goto failed;
....

It's not supposed to throw away dirty pages. But
try_to_release_page() on a dirty page in XFS will currently do just
that.

> Also, it looks like truncate_complete_page() eventually and
> unconditionally clears the page dirty bit, it just happens after the
> invalidate -> release attempt sequence that occurs down through
> do_invalidatepage().

Right, we just need to do it earlier, but.....

> IIUC, this was the problem fixed by Jan's patch
> mentioned above.  Is there any reason we can't do the dirty cancel a
> little earlier there? Would that also address this problem?

.... we can't do that in generic code because ext3.

Essentially, there are ext3 specific behaviours encoded into the generic
code. We can't fix the generic code because then we break ext3,
and nobody is going to redesign the ext3 journalling code to fix
the bogosities it has that require the generic invalidation/release
paths to work the way they do.

That leaves us with having to work around the filesystem specific
code in the generic paths in the filesystem specific code...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] xfs: cancel dirty pages on invalidation
  2017-10-09 20:48     ` Dave Chinner
@ 2017-10-10 12:29       ` Brian Foster
  2017-10-11  0:04         ` Dave Chinner
  0 siblings, 1 reply; 19+ messages in thread
From: Brian Foster @ 2017-10-10 12:29 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Tue, Oct 10, 2017 at 07:48:16AM +1100, Dave Chinner wrote:
> On Mon, Oct 09, 2017 at 10:24:46AM -0400, Brian Foster wrote:
> > On Mon, Oct 09, 2017 at 10:54:14AM +1100, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Recently we've had warnings arise from the vm handing us pages
> > > without bufferheads attached to them. This should not ever occur
> > > in XFS, but we don't defend against it properly if it does. The only
> > > place where we remove bufferheads from a page is in
> > > xfs_vm_releasepage(), but we can't tell the difference here between
> > > "page is dirty so don't release" and "page is dirty but is being
> > > invalidated so release it".
> > > 
> > 
> > Ok, so we changed ->releasepage() in 99579ccec4 ("xfs: skip dirty pages
> > in ->releasepage()") to deal with the fact that the mm can send
> > legitimately dirty pages to ->releasepage(). That was apparently too
> > aggressive a change because truncated pages would also end up skipped
> > because the dirty bit is not cleared by the time the release occurs.
> > Commit 0a417b8dc1 ("xfs: Timely free truncated dirty pages") modified
> > ->releasepage() again to not skip all dirty pages and only warn for
> > delalloc/unwritten blocks on clean pages.
> > 
> > That was apparently insufficient because we have some codepaths where
> > not skipping dirty pages can allow us to strip the buffers from a page
> > incorrectly...
> 
> Right, Jan's patch fixed the truncate problem but re-opened the
> memory reclaim hole. It effectively reverted the original patch.
> 

(Bear with me, trying to work through my confusion...). So the original
patch documents XFS release page delalloc warnings and the conditions
that occur that make these warnings spurious. The objective of the patch
(as documented) was therefore to quiet the warnings. Jan's patch came
along to address the lru issue and quieted the warning in a different
way.

FWIW, my impression is that this patch is intending to fix some other
side effect of this ext/mm behavior also fixed by the original patch,
but at least wasn't documented by the commit log. What this additional
side effect is is the part I'm trying to understand...

> > > In some places that are invalidating pages ask for pages to be
> > > released and follow up afterward calling ->releasepage by checking
> > > whether the page was dirty and then aborting the invalidation. This
> > > is a possible vector for releasing buffers from a page but then
> > > leaving it in the mapping, so we really do need to avoid dirty pages
> > > in xfs_vm_releasepage().
> > > 
> > 
> > ... but I'm having a hard time parsing that first sentence to understand
> > how that is. Can you elaborate on and/or give a specific reference to
> > this problematic invalidation abort sequence?
> 
> e.g. invalidate_complete_page2():
> 
> /*                                                                               
>  * This is for invalidate_mapping_pages().  That function can be called at       
>  * any time, and is not supposed to throw away dirty pages.  But pages can       
> .....
>        if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
>                 return 0;
> 
>         spin_lock_irqsave(&mapping->tree_lock, flags);
>         if (PageDirty(page))
>                 goto failed;
> ....
> 
> It's not supposed to throw away dirty pages. But
> try_to_release_page() on a dirty page in XFS will currently do just
> that.
> 

If the page is legitimately dirty, shouldn't the underlying buffer be
dirty as well? ISTM that try_to_release_page() (which eventually gets to
try_to_free_buffers()) accommodates the case of a dirty page w/o dirty
buffers by checking the latter state and updating the former. So if the
page has dirty buffers, try_to_release_page() returns 0 and the
invalidate exits with -EBUSY. If the page does not have dirty buffers,
the buffers are dropped, the page dirty state is cancelled and the
invalidate proceeds.

So for XFS this (dirty page && !dirty bufs) state should not exist. A
dirty page should always have dirty buffers and vice versa. For XFS,
therefore, I don't see how this patch affects
invalidate_complete_page2() at all. What am I missing? Is there some
other problematic sequence?

AIUI, the historical warning problem in XFS was because we issued the
warning before/without any similar assessment of the validity of the
dirty state (i.e., we assumed the page should not be dirty in
->releasepage()).

> > Also, it looks like truncate_complete_page() eventually and
> > unconditionally clears the page dirty bit, it just happens after the
> > invalidate -> release attempt sequence that occurs down through
> > do_invalidatepage().
> 
> Right, we just need to do it earlier, but.....
> 
> > IIUC, this was the problem fixed by Jan's patch
> > mentioned above.  Is there any reason we can't do the dirty cancel a
> > little earlier there? Would that also address this problem?
> 
> .... we can't do that in generic code because ext3.
> 
> Essentially, there are ext3 specific behaviours encoded into the generic
> code. We can't fix the generic code because then we break ext3,
> and nobody is going to redesign the ext3 journalling code to fix
> the bogosities it has that require the generic invalidation/release
> paths to work the way they do.
> 

Hmm, I thought the ext3-specific wart was the fact that dirty pages w/o
dirty buffers could exist in the first place (why ->releasepage() must
handle dirty pages), not necessarily that a page being removed had to be
cleaned/invalidated in any particular order. No matter, I probably need
to understand the issue better first...

Brian

> That leaves us with having to work around the filesystem specific
> code in the generic paths in the filesystem specific code...
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] xfs: cancel dirty pages on invalidation
  2017-10-10 12:29       ` Brian Foster
@ 2017-10-11  0:04         ` Dave Chinner
  2017-10-11  9:02           ` Brian Foster
  0 siblings, 1 reply; 19+ messages in thread
From: Dave Chinner @ 2017-10-11  0:04 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Tue, Oct 10, 2017 at 08:29:45AM -0400, Brian Foster wrote:
> On Tue, Oct 10, 2017 at 07:48:16AM +1100, Dave Chinner wrote:
> > On Mon, Oct 09, 2017 at 10:24:46AM -0400, Brian Foster wrote:
> > > On Mon, Oct 09, 2017 at 10:54:14AM +1100, Dave Chinner wrote:
> > > > From: Dave Chinner <dchinner@redhat.com>
> > > > 
> > > > Recently we've had warnings arise from the vm handing us pages
> > > > without bufferheads attached to them. This should not ever occur
> > > > in XFS, but we don't defend against it properly if it does. The only
> > > > place where we remove bufferheads from a page is in
> > > > xfs_vm_releasepage(), but we can't tell the difference here between
> > > > "page is dirty so don't release" and "page is dirty but is being
> > > > invalidated so release it".
> > > > 
> > > 
> > > Ok, so we changed ->releasepage() in 99579ccec4 ("xfs: skip dirty pages
> > > in ->releasepage()") to deal with the fact that the mm can send
> > > legitimately dirty pages to ->releasepage(). That was apparently too
> > > aggressive a change because truncated pages would also end up skipped
> > > because the dirty bit is not cleared by the time the release occurs.
> > > Commit 0a417b8dc1 ("xfs: Timely free truncated dirty pages") modified
> > > ->releasepage() again to not skip all dirty pages and only warn for
> > > delalloc/unwritten blocks on clean pages.
> > > 
> > > That was apparently insufficient because we have some codepaths where
> > > not skipping dirty pages can allow us to strip the buffers from a page
> > > incorrectly...
> > 
> > Right, Jan's patch fixed the truncate problem but re-opened the
> > memory reclaim hole. It effectively reverted the original patch.
> > 
> 
> (Bear with me, trying to work through my confusion...). So the original
> patch documents XFS release page delalloc warnings and the conditions
> that occur that make these warnings spurious. The objective of the patch
> (as documented) was therefore to quiet the warnings. Jan's patch came
> along to address the lru issue and quieted the warning in a different
> way.
> 
> FWIW, my impression is that this patch is intending to fix some other
> side effect of this ext/mm behavior also fixed by the original patch,
> but at least wasn't documented by the commit log. What this additional
> side effect is is the part I'm trying to understand...
> 
> > > > In some places that are invalidating pages ask for pages to be
> > > > released and follow up afterward calling ->releasepage by checking
> > > > whether the page was dirty and then aborting the invalidation. This
> > > > is a possible vector for releasing buffers from a page but then
> > > > leaving it in the mapping, so we really do need to avoid dirty pages
> > > > in xfs_vm_releasepage().
> > > > 
> > > 
> > > ... but I'm having a hard time parsing that first sentence to understand
> > > how that is. Can you elaborate on and/or give a specific reference to
> > > this problematic invalidation abort sequence?
> > 
> > e.g. invalidate_complete_page2():
> > 
> > /*                                                                               
> >  * This is for invalidate_mapping_pages().  That function can be called at       
> >  * any time, and is not supposed to throw away dirty pages.  But pages can       
> > .....
> >        if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
> >                 return 0;
> > 
> >         spin_lock_irqsave(&mapping->tree_lock, flags);
> >         if (PageDirty(page))
> >                 goto failed;
> > ....
> > 
> > It's not supposed to throw away dirty pages. But
> > try_to_release_page() on a dirty page in XFS will currently do just
> > that.
> > 
> 
> If the page is legitimately dirty, shouldn't the underlying buffer be
> dirty as well?

The answer is "yes for all filesystems but ext3". In ext3, journal
checkpoints can write back the buffers directly and clean them
without updating the page state on IO completion. So we end up in
the state where the page state and bufferhead state is not coherent.

Hence the generic code can't check for the page being clean before
releasing the page/buffers, because then it would never end up
noticing and correcting ext3 pages in this state.

> ISTM that try_to_release_page() (which eventually gets to
> try_to_free_buffers()) accommodates the case of a dirty page w/o
> dirty buffers by checking the latter state and updating the
> former. So if the page has dirty buffers, try_to_release_page()
> returns 0 and the invalidate exits with -EBUSY.  If the page does
> not have dirty buffers, the buffers are dropped, the page dirty
> state is cancelled and the invalidate proceeds.
>
> So for XFS this (dirty page && !dirty bufs) state should not exist.

But it does!

Take a dirty page, and truncate it:

truncate_complete_page
  do_invalidatepage
    xfs_vm_invalidatepage
      block_invalidatepage
        discard_buffers		<<<<<<<<< clears all buffer state!
  try_to_release_page
    xfs_vm_releasepage
      xfs_count_page_state   <<< finds clean buffers, no warnings

That's why the truncate path doesn't issue warnings about
invalidating and releasing dirty pages.

> A
> dirty page should always have dirty buffers and vice versa. For XFS,
> therefore, I don't see how this patch affects
> invalidate_complete_page2() at all.

That's what I was trying to point out! i.e. that the patch fixes
spurious false positive warnigns but doesn't affect the high level
invalidate_complete_page2() or other invalidation code.

>
> What am I missing? Is there some
> other problematic sequence?

Memory reclaim, dirty page over delalloc/unwritten extent:

shrink_inactive_list
  shrink_list
    buffer_heads_over_limit
      try_to_release_page
        xfs_vm_releasepage()
	  xfs_count_page_state   <<< finds delalloc/unwritten buffer
	  WARN_ON(delalloc)

That warning is still emitted after Jan's patch. That warning is
meaningless - we've been called to release a valid page from code
that is only called on dirty pages to work around ext3's "journal
cleaned the buffers" dirty state incoherency problem.  We've been
told that this is not going to be fixed in the high level code, so
we have to handle it ourselves.

That is, trying to release a dirty page with valid state from this
path on XFS is simply wrong. We shouldn't pollute logs with warnings
about dirty pages with delalloc/unwritten state in this case because
/the page and buffers have a valid state/. Hence from this path we
need to avoid issuing a warning - it's a false positive.

Similarly for invalidate_complete_page2() calling
try_to_release_page with a dirty page - we're supposed to skip them
and not release them at all as documented by the
invalidate_complete_page2 API. However, we'll issue warnings:

invalidate_complete_page2
  try_to_release_page
    xfs_vm_releasepage
      xfs_count_page_state	<<< finds delalloc/unwritten buffer
      WARN_ON(unwritten)
  <skips page correctly>

Because, again, we've been handed a dirty page in a valid state that
we're not supposed to free. The warning is, again, a false positive.

If xfs_vm_releasepage gets handed a clean page with dirty buffer
state on it, then we most definitely need to warn. But we should not
warn when we are handed dirty pages with valid state from paths that
actually want us to leave dirty pages completely alone.

> AIUI, the historical warning problem in XFS was because we issued the
> warning before/without any similar assessment of the validity of the
> dirty state (i.e., we assumed the page should not be dirty in
> ->releasepage()).

And that's precisely what Jan's patch made the code do again. We
*know* we are going to get handed dirty pages with valid
delalloc/unwritten state by ->releasepage now, and issuing warnigns
in those cases is wrong.

> Hmm, I thought the ext3-specific wart was the fact that dirty pages w/o
> dirty buffers could exist in the first place (why ->releasepage() must
> handle dirty pages), not necessarily that a page being removed had to be
> cleaned/invalidated in any particular order. No matter, I probably need
> to understand the issue better first...

Yup, and that means ->releasepage needs to be called before doing
anything that assumes the page is clean and can be freed. i.e. we
have to get the ext3 buffer state propagated back to the the page
state before we take action on the page....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] xfs: cancel dirty pages on invalidation
  2017-10-11  0:04         ` Dave Chinner
@ 2017-10-11  9:02           ` Brian Foster
  2017-10-11 11:58             ` Dave Chinner
  0 siblings, 1 reply; 19+ messages in thread
From: Brian Foster @ 2017-10-11  9:02 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Wed, Oct 11, 2017 at 11:04:00AM +1100, Dave Chinner wrote:
> On Tue, Oct 10, 2017 at 08:29:45AM -0400, Brian Foster wrote:
> > On Tue, Oct 10, 2017 at 07:48:16AM +1100, Dave Chinner wrote:
> > > On Mon, Oct 09, 2017 at 10:24:46AM -0400, Brian Foster wrote:
> > > > On Mon, Oct 09, 2017 at 10:54:14AM +1100, Dave Chinner wrote:
> > > > > From: Dave Chinner <dchinner@redhat.com>
> > > > > 
> > > > > Recently we've had warnings arise from the vm handing us pages
> > > > > without bufferheads attached to them. This should not ever occur
> > > > > in XFS, but we don't defend against it properly if it does. The only
> > > > > place where we remove bufferheads from a page is in
> > > > > xfs_vm_releasepage(), but we can't tell the difference here between
> > > > > "page is dirty so don't release" and "page is dirty but is being
> > > > > invalidated so release it".
> > > > > 
> > > > 
> > > > Ok, so we changed ->releasepage() in 99579ccec4 ("xfs: skip dirty pages
> > > > in ->releasepage()") to deal with the fact that the mm can send
> > > > legitimately dirty pages to ->releasepage(). That was apparently too
> > > > aggressive a change because truncated pages would also end up skipped
> > > > because the dirty bit is not cleared by the time the release occurs.
> > > > Commit 0a417b8dc1 ("xfs: Timely free truncated dirty pages") modified
> > > > ->releasepage() again to not skip all dirty pages and only warn for
> > > > delalloc/unwritten blocks on clean pages.
> > > > 
> > > > That was apparently insufficient because we have some codepaths where
> > > > not skipping dirty pages can allow us to strip the buffers from a page
> > > > incorrectly...
> > > 
> > > Right, Jan's patch fixed the truncate problem but re-opened the
> > > memory reclaim hole. It effectively reverted the original patch.
> > > 
> > 
> > (Bear with me, trying to work through my confusion...). So the original
> > patch documents XFS release page delalloc warnings and the conditions
> > that occur that make these warnings spurious. The objective of the patch
> > (as documented) was therefore to quiet the warnings. Jan's patch came
> > along to address the lru issue and quieted the warning in a different
> > way.
> > 
> > FWIW, my impression is that this patch is intending to fix some other
> > side effect of this ext/mm behavior also fixed by the original patch,
> > but at least wasn't documented by the commit log. What this additional
> > side effect is is the part I'm trying to understand...
> > 
> > > > > In some places that are invalidating pages ask for pages to be
> > > > > released and follow up afterward calling ->releasepage by checking
> > > > > whether the page was dirty and then aborting the invalidation. This
> > > > > is a possible vector for releasing buffers from a page but then
> > > > > leaving it in the mapping, so we really do need to avoid dirty pages
> > > > > in xfs_vm_releasepage().
> > > > > 
> > > > 
> > > > ... but I'm having a hard time parsing that first sentence to understand
> > > > how that is. Can you elaborate on and/or give a specific reference to
> > > > this problematic invalidation abort sequence?
> > > 
> > > e.g. invalidate_complete_page2():
> > > 
> > > /*                                                                               
> > >  * This is for invalidate_mapping_pages().  That function can be called at       
> > >  * any time, and is not supposed to throw away dirty pages.  But pages can       
> > > .....
> > >        if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
> > >                 return 0;
> > > 
> > >         spin_lock_irqsave(&mapping->tree_lock, flags);
> > >         if (PageDirty(page))
> > >                 goto failed;
> > > ....
> > > 
> > > It's not supposed to throw away dirty pages. But
> > > try_to_release_page() on a dirty page in XFS will currently do just
> > > that.
> > > 
> > 
> > If the page is legitimately dirty, shouldn't the underlying buffer be
> > dirty as well?
> 
> The answer is "yes for all filesystems but ext3". In ext3, journal
> checkpoints can write back the buffers directly and clean them
> without updating the page state on IO completion. So we end up in
> the state where the page state and bufferhead state is not coherent.
> 
> Hence the generic code can't check for the page being clean before
> releasing the page/buffers, because then it would never end up
> noticing and correcting ext3 pages in this state.
> 

Ok. I may not grok the details, but I understand there is some
quirkiness to ext3 that causes the mm to not rely on the page dirty
state for using ->releasepage().

> > ISTM that try_to_release_page() (which eventually gets to
> > try_to_free_buffers()) accommodates the case of a dirty page w/o
> > dirty buffers by checking the latter state and updating the
> > former. So if the page has dirty buffers, try_to_release_page()
> > returns 0 and the invalidate exits with -EBUSY.  If the page does
> > not have dirty buffers, the buffers are dropped, the page dirty
> > state is cancelled and the invalidate proceeds.
> >
> > So for XFS this (dirty page && !dirty bufs) state should not exist.
> 
> But it does!
> 
> Take a dirty page, and truncate it:
> 
> truncate_complete_page
>   do_invalidatepage
>     xfs_vm_invalidatepage
>       block_invalidatepage
>         discard_buffers		<<<<<<<<< clears all buffer state!
>   try_to_release_page
>     xfs_vm_releasepage
>       xfs_count_page_state   <<< finds clean buffers, no warnings
> 
> That's why the truncate path doesn't issue warnings about
> invalidating and releasing dirty pages.
> 

So the above clears the buffer dirty state, then attempts to release the
page, then clears the page dirty state. I can see that we're (XFS)
exposed to the page in a transiently inconsistent state, but the state
itself does appear to be transient in this codepath. IOW, by the time
truncate_complete_page() completes and the page is unlocked, the page
and buffer state are in-sync (neither are dirty).

Note that I can also see how this would have prevented freeing buffers
with the original PageDirty() check (causing Jan's problem), and how
canceling the page dirty state in ->invalidatepage() allows us to
reinstate that PageDirty() check. Alas, that is not my question...

> > A
> > dirty page should always have dirty buffers and vice versa. For XFS,
> > therefore, I don't see how this patch affects
> > invalidate_complete_page2() at all.
> 
> That's what I was trying to point out! i.e. that the patch fixes
> spurious false positive warnigns but doesn't affect the high level
> invalidate_complete_page2() or other invalidation code.
> 

Um, Ok? Then what what is the purpose of the patch? That is what I'm
trying to understand. What problem with the current code does this fix?

> >
> > What am I missing? Is there some
> > other problematic sequence?
> 
> Memory reclaim, dirty page over delalloc/unwritten extent:
> 

Ok..

> shrink_inactive_list
>   shrink_list
>     buffer_heads_over_limit
>       try_to_release_page
>         xfs_vm_releasepage()
> 	  xfs_count_page_state   <<< finds delalloc/unwritten buffer
> 	  WARN_ON(delalloc)
> 

The xfs_vm_releasepage() code looks like this:

	...
        xfs_count_page_state(page, &delalloc, &unwritten);

        if (delalloc) {
                WARN_ON_ONCE(!PageDirty(page));
                return 0;
        }
	...

So if we get here with a dirty page over a delalloc extent, for example,
how exactly does that trigger a spurious warning? AFAICT the warning
will not trigger because the page is dirty. We decide not to release the
page precisely because it has delalloc buffers.

> That warning is still emitted after Jan's patch. That warning is
> meaningless - we've been called to release a valid page from code
> that is only called on dirty pages to work around ext3's "journal
> cleaned the buffers" dirty state incoherency problem.  We've been
> told that this is not going to be fixed in the high level code, so
> we have to handle it ourselves.
> 

Understood.

> That is, trying to release a dirty page with valid state from this
> path on XFS is simply wrong. We shouldn't pollute logs with warnings
> about dirty pages with delalloc/unwritten state in this case because
> /the page and buffers have a valid state/. Hence from this path we
> need to avoid issuing a warning - it's a false positive.
> 

See above. Also AFAICT, try_to_free_buffers() only attempts to release
the page if the buffers are not dirty. So ISTM that this also currently
works as expected.

> Similarly for invalidate_complete_page2() calling
> try_to_release_page with a dirty page - we're supposed to skip them
> and not release them at all as documented by the
> invalidate_complete_page2 API. However, we'll issue warnings:
> 
> invalidate_complete_page2
>   try_to_release_page
>     xfs_vm_releasepage
>       xfs_count_page_state	<<< finds delalloc/unwritten buffer
>       WARN_ON(unwritten)
>   <skips page correctly>
> 

This seems like the same situation from the perspective of
xfs_vm_releasepage(). It basically prioritizes checking the buffer state
over the page state when checking whether we can release the page, and
cancels the page dirty state if so. This is all implemented in
try_to_free_buffers(), we just happen to have an earlier
delalloc/unwritten shortcut check in xfs_vm_releasepage() that allows
checking for an inconsistent page state (i.e., delalloc blocks have been
left on a page that has already been cleaned).

> Because, again, we've been handed a dirty page in a valid state that
> we're not supposed to free. The warning is, again, a false positive.
> 
> If xfs_vm_releasepage gets handed a clean page with dirty buffer
> state on it, then we most definitely need to warn. But we should not
> warn when we are handed dirty pages with valid state from paths that
> actually want us to leave dirty pages completely alone.
> 

Agree, but I don't see how this occurs with the current code. We only
ever warn if the page is clean. As it is, I'm slightly confused as to
whether you are attempting to explain how/why this patch works or
whether there is an issue with the current code. I'm really after the
latter, so I can try and understand the problem before reviewing how
this patch potentially fixes it.

So if I'm still missing something here, can you please reduce the
explanation to cover specifically what is wrong with the current code,
using an example/reproducer description if possible? I think if you can
point me at the sequence of events that cause whatever problem we're
trying to fix, I can probably work out an understanding from the code...

Brian

> > AIUI, the historical warning problem in XFS was because we issued the
> > warning before/without any similar assessment of the validity of the
> > dirty state (i.e., we assumed the page should not be dirty in
> > ->releasepage()).
> 
> And that's precisely what Jan's patch made the code do again. We
> *know* we are going to get handed dirty pages with valid
> delalloc/unwritten state by ->releasepage now, and issuing warnigns
> in those cases is wrong.
> 
> > Hmm, I thought the ext3-specific wart was the fact that dirty pages w/o
> > dirty buffers could exist in the first place (why ->releasepage() must
> > handle dirty pages), not necessarily that a page being removed had to be
> > cleaned/invalidated in any particular order. No matter, I probably need
> > to understand the issue better first...
> 
> Yup, and that means ->releasepage needs to be called before doing
> anything that assumes the page is clean and can be freed. i.e. we
> have to get the ext3 buffer state propagated back to the the page
> state before we take action on the page....
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] xfs: cancel dirty pages on invalidation
  2017-10-11  9:02           ` Brian Foster
@ 2017-10-11 11:58             ` Dave Chinner
  2017-10-11 13:02               ` Brian Foster
  0 siblings, 1 reply; 19+ messages in thread
From: Dave Chinner @ 2017-10-11 11:58 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Wed, Oct 11, 2017 at 05:02:20AM -0400, Brian Foster wrote:
> On Wed, Oct 11, 2017 at 11:04:00AM +1100, Dave Chinner wrote:
> > On Tue, Oct 10, 2017 at 08:29:45AM -0400, Brian Foster wrote:
> > > On Tue, Oct 10, 2017 at 07:48:16AM +1100, Dave Chinner wrote:
> > > > On Mon, Oct 09, 2017 at 10:24:46AM -0400, Brian Foster wrote:
> > > > > On Mon, Oct 09, 2017 at 10:54:14AM +1100, Dave Chinner wrote:
> > shrink_inactive_list
> >   shrink_list
> >     buffer_heads_over_limit
> >       try_to_release_page
> >         xfs_vm_releasepage()
> > 	  xfs_count_page_state   <<< finds delalloc/unwritten buffer
> > 	  WARN_ON(delalloc)
> > 
> 
> The xfs_vm_releasepage() code looks like this:
> 
> 	...
>         xfs_count_page_state(page, &delalloc, &unwritten);
> 
>         if (delalloc) {
>                 WARN_ON_ONCE(!PageDirty(page));
>                 return 0;
>         }
> 	...
> 
> So if we get here with a dirty page over a delalloc extent, for example,
> how exactly does that trigger a spurious warning? AFAICT the warning
> will not trigger because the page is dirty. We decide not to release the
> page precisely because it has delalloc buffers.

And, you know, it's not until you pasted it here that I saw the
"!" in that WARN_ON_ONCE.

I've looked at repeatedly over many weeks, and not *once* have I
registered that it's a "NOT page dirty" warning.

So we can ignore the spurious warning issue. You're right, that
clearly doesn't happen.

But, realistically, we shouldn't be relying on bufferhead state to
determine what the correct action to take is. We can still have
dirty pages that do not have delalloc or unwritten extents on them
that we should not be attempting to free. The current code happily
hands them to try_to_free_buffers() rather than says "no, we don't
free dirty pages".

i.e. if xfs_invalidatepage() is trashing the dirty state on the
buffers, then we should also be trashing the dirty state on the page
so they are clean and coherent when passed to xfs_vm_releasepage.
That leaves us with the simple rule in xfs_vm_releasepage():

	Never release a dirty page because they always contain valid
	data that needs to be written back first.

That's what I'm trying to do - move the control decisions to page
level, rather than having them split and, at times, be incoherent
at bufferhead level. We're wanting to get rid of bufferheads so we
should be making decisions in this code based on page state, not
the bufferhead state...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] xfs: cancel dirty pages on invalidation
  2017-10-11 11:58             ` Dave Chinner
@ 2017-10-11 13:02               ` Brian Foster
  2017-10-12  0:56                 ` Dave Chinner
  0 siblings, 1 reply; 19+ messages in thread
From: Brian Foster @ 2017-10-11 13:02 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Wed, Oct 11, 2017 at 10:58:58PM +1100, Dave Chinner wrote:
> On Wed, Oct 11, 2017 at 05:02:20AM -0400, Brian Foster wrote:
> > On Wed, Oct 11, 2017 at 11:04:00AM +1100, Dave Chinner wrote:
> > > On Tue, Oct 10, 2017 at 08:29:45AM -0400, Brian Foster wrote:
> > > > On Tue, Oct 10, 2017 at 07:48:16AM +1100, Dave Chinner wrote:
> > > > > On Mon, Oct 09, 2017 at 10:24:46AM -0400, Brian Foster wrote:
> > > > > > On Mon, Oct 09, 2017 at 10:54:14AM +1100, Dave Chinner wrote:
> > > shrink_inactive_list
> > >   shrink_list
> > >     buffer_heads_over_limit
> > >       try_to_release_page
> > >         xfs_vm_releasepage()
> > > 	  xfs_count_page_state   <<< finds delalloc/unwritten buffer
> > > 	  WARN_ON(delalloc)
> > > 
> > 
> > The xfs_vm_releasepage() code looks like this:
> > 
> > 	...
> >         xfs_count_page_state(page, &delalloc, &unwritten);
> > 
> >         if (delalloc) {
> >                 WARN_ON_ONCE(!PageDirty(page));
> >                 return 0;
> >         }
> > 	...
> > 
> > So if we get here with a dirty page over a delalloc extent, for example,
> > how exactly does that trigger a spurious warning? AFAICT the warning
> > will not trigger because the page is dirty. We decide not to release the
> > page precisely because it has delalloc buffers.
> 
> And, you know, it's not until you pasted it here that I saw the
> "!" in that WARN_ON_ONCE.
> 
> I've looked at repeatedly over many weeks, and not *once* have I
> registered that it's a "NOT page dirty" warning.
> 

Heh, been there done that. ;)

> So we can ignore the spurious warning issue. You're right, that
> clearly doesn't happen.
> 

Ok.

> But, realistically, we shouldn't be relying on bufferhead state to
> determine what the correct action to take is. We can still have
> dirty pages that do not have delalloc or unwritten extents on them
> that we should not be attempting to free. The current code happily
> hands them to try_to_free_buffers() rather than says "no, we don't
> free dirty pages".
> 

Which is still fine because try_to_free_buffers() checks for dirty
buffers before attempting to clean the page.

> i.e. if xfs_invalidatepage() is trashing the dirty state on the
> buffers, then we should also be trashing the dirty state on the page
> so they are clean and coherent when passed to xfs_vm_releasepage.
> That leaves us with the simple rule in xfs_vm_releasepage():
> 
> 	Never release a dirty page because they always contain valid
> 	data that needs to be written back first.
> 
> That's what I'm trying to do - move the control decisions to page
> level, rather than having them split and, at times, be incoherent
> at bufferhead level. We're wanting to get rid of bufferheads so we
> should be making decisions in this code based on page state, not
> the bufferhead state...
> 

That seems reasonable to me. I'm not against this patch if it simplifies
our internal logic for dealing with pages, though I'm still kind of
wondering why to not do this by simply clearing the page earlier in
truncate_complete_page().

That said, I suppose there's an argument to be made to do that locally
and perhaps try to push it up the chain once the approach has some soak
time. Could you at least rewrite the commit log to reflect that this is
not a regression and is more of a refactoring/cleanup to effectively
elevate page state over bh state (a code comment to that effect probably
couldn't hurt either)? Thanks.

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] xfs: cancel dirty pages on invalidation
  2017-10-11 13:02               ` Brian Foster
@ 2017-10-12  0:56                 ` Dave Chinner
  2017-10-12 10:39                   ` Brian Foster
  0 siblings, 1 reply; 19+ messages in thread
From: Dave Chinner @ 2017-10-12  0:56 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Wed, Oct 11, 2017 at 09:02:38AM -0400, Brian Foster wrote:
> On Wed, Oct 11, 2017 at 10:58:58PM +1100, Dave Chinner wrote:
> > On Wed, Oct 11, 2017 at 05:02:20AM -0400, Brian Foster wrote:
> 
> > i.e. if xfs_invalidatepage() is trashing the dirty state on the
> > buffers, then we should also be trashing the dirty state on the page
> > so they are clean and coherent when passed to xfs_vm_releasepage.
> > That leaves us with the simple rule in xfs_vm_releasepage():
> > 
> > 	Never release a dirty page because they always contain valid
> > 	data that needs to be written back first.
> > 
> > That's what I'm trying to do - move the control decisions to page
> > level, rather than having them split and, at times, be incoherent
> > at bufferhead level. We're wanting to get rid of bufferheads so we
> > should be making decisions in this code based on page state, not
> > the bufferhead state...
> > 
> 
> That seems reasonable to me. I'm not against this patch if it simplifies
> our internal logic for dealing with pages, though I'm still kind of
> wondering why to not do this by simply clearing the page earlier in
> truncate_complete_page().

/broken record

We can't change the generic code because of ext3.

Yes, it makes no sense that we can't just change the order to skip
over dirty pages first when you look at the generic code. But the
reality is we will break ext3 if we do that.

> That said, I suppose there's an argument to be made to do that locally
> and perhaps try to push it up the chain once the approach has some soak
> time.

That was my argument for a long while, but it's just not feasible
until someone fixes ext3. We've been advised, instead, to just fix
the filesystem specific callouts to behave correctly.

> Could you at least rewrite the commit log to reflect that this is
> not a regression and is more of a refactoring/cleanup to effectively
> elevate page state over bh state (a code comment to that effect probably
> couldn't hurt either)? Thanks.

I just went back and re-read the commit message I originally wrote.
It talks about being "defensive" and "being able to catch the case
where the page is dirty and should not have bufferheads removed from
it". There's othing about spurious warnings or regressions in it at all.

IOWs, it's already saying "use the page state to determine actions
rather than bufferhead state." Not in those exact words, but it's
not far off it. I'll rewrite it, but on re-reading the commit
message I've now got no idea what the problem was in the first
place. :/

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] xfs: cancel dirty pages on invalidation
  2017-10-12  0:56                 ` Dave Chinner
@ 2017-10-12 10:39                   ` Brian Foster
  0 siblings, 0 replies; 19+ messages in thread
From: Brian Foster @ 2017-10-12 10:39 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Thu, Oct 12, 2017 at 11:56:08AM +1100, Dave Chinner wrote:
> On Wed, Oct 11, 2017 at 09:02:38AM -0400, Brian Foster wrote:
> > On Wed, Oct 11, 2017 at 10:58:58PM +1100, Dave Chinner wrote:
> > > On Wed, Oct 11, 2017 at 05:02:20AM -0400, Brian Foster wrote:
> > 
> > > i.e. if xfs_invalidatepage() is trashing the dirty state on the
> > > buffers, then we should also be trashing the dirty state on the page
> > > so they are clean and coherent when passed to xfs_vm_releasepage.
> > > That leaves us with the simple rule in xfs_vm_releasepage():
> > > 
> > > 	Never release a dirty page because they always contain valid
> > > 	data that needs to be written back first.
> > > 
> > > That's what I'm trying to do - move the control decisions to page
> > > level, rather than having them split and, at times, be incoherent
> > > at bufferhead level. We're wanting to get rid of bufferheads so we
> > > should be making decisions in this code based on page state, not
> > > the bufferhead state...
> > > 
> > 
> > That seems reasonable to me. I'm not against this patch if it simplifies
> > our internal logic for dealing with pages, though I'm still kind of
> > wondering why to not do this by simply clearing the page earlier in
> > truncate_complete_page().
> 
> /broken record
> 
> We can't change the generic code because of ext3.
> 
> Yes, it makes no sense that we can't just change the order to skip
> over dirty pages first when you look at the generic code. But the
> reality is we will break ext3 if we do that.
> 

As I mentioned earlier, my understanding of the ext problem is that it
leaves around essentially clean pages with the dirty bit set. I.e., this
is the reason the mm needs to attempt to release such pages on reclaim
and why we need to accommodate that case in ->releasepage(). I don't see
how this is related to this question, which is why the mm can't clear
the page dirty state before it asks a filesystem to invalidate buffers
and release the page.

I'm not sure where the idea that truncating page has to clear buffers
and page state in a particular order for ext comes from. Maybe I just
have too limited an understanding of the issue, but it looks to me that
ext3 basically calls block_invalidatepage() on ->invalidatepage() and
try_to_free_buffers() on ->releasepage() (or jbd2 variants of each that
also mostly seem to care about buffers and don't seem to even consider
page dirty state). IOW, it does pretty much the same thing that XFS does
in this codepath. So "because ext3" doesn't really answer the question.
But again, I don't know ext, so perhaps there's something else going on
there that I'm missing or there is some other general reason not to do
that.

> > That said, I suppose there's an argument to be made to do that locally
> > and perhaps try to push it up the chain once the approach has some soak
> > time.
> 
> That was my argument for a long while, but it's just not feasible
> until someone fixes ext3. We've been advised, instead, to just fix
> the filesystem specific callouts to behave correctly.
> 

If we can't change the mm, for whatever reason (i.e., too risky or just
don't want to deal with it), then fair enough.

Brian

> > Could you at least rewrite the commit log to reflect that this is
> > not a regression and is more of a refactoring/cleanup to effectively
> > elevate page state over bh state (a code comment to that effect probably
> > couldn't hurt either)? Thanks.
> 
> I just went back and re-read the commit message I originally wrote.
> It talks about being "defensive" and "being able to catch the case
> where the page is dirty and should not have bufferheads removed from
> it". There's othing about spurious warnings or regressions in it at all.
> 
> IOWs, it's already saying "use the page state to determine actions
> rather than bufferhead state." Not in those exact words, but it's
> not far off it. I'll rewrite it, but on re-reading the commit
> message I've now got no idea what the problem was in the first
> place. :/
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] xfs: cancel dirty pages on invalidation
  2017-10-08 23:54 ` [PATCH 4/4] xfs: cancel dirty pages on invalidation Dave Chinner
  2017-10-09 14:24   ` Brian Foster
@ 2017-10-16 19:39   ` Darrick J. Wong
  1 sibling, 0 replies; 19+ messages in thread
From: Darrick J. Wong @ 2017-10-16 19:39 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Mon, Oct 09, 2017 at 10:54:14AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Recently we've had warnings arise from the vm handing us pages
> without bufferheads attached to them. This should not ever occur
> in XFS, but we don't defend against it properly if it does. The only
> place where we remove bufferheads from a page is in
> xfs_vm_releasepage(), but we can't tell the difference here between
> "page is dirty so don't release" and "page is dirty but is being
> invalidated so release it".
> 
> In some places that are invalidating pages ask for pages to be
> released and follow up afterward calling ->releasepage by checking
> whether the page was dirty and then aborting the invalidation. This
> is a possible vector for releasing buffers from a page but then
> leaving it in the mapping, so we really do need to avoid dirty pages
> in xfs_vm_releasepage().
> 
> To differentiate between invalidated pages and normal pages, we need
> to clear the page dirty flag when invalidating the pages. This can
> be done through xfs_vm_invalidatepage(), and will result
> xfs_vm_releasepage() seeing the page as clean which matches the
> bufferhead state on the page after calling block_invalidatepage().
> 
> Hence we can re-add the page dirty check in xfs_vm_releasepage to
> catch the case where we might be releasing a page that is actually
> dirty and so should not have the bufferheads on it removed. This
> will remove one possible vector of "dirty page with no bufferheads"
> and so help narrow down the search for the root cause of that
> problem.
> 
> Signed-Off-By: Dave Chinner <dchinner@redhat.com>

Took forever to wrap my head around the bufferhead vs. page state mess,
but I think it looks ok.

Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

> ---
>  fs/xfs/xfs_aops.c | 34 ++++++++++++++++++++++------------
>  1 file changed, 22 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index f18e5932aec4..067284d84d9e 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -735,6 +735,14 @@ xfs_vm_invalidatepage(
>  {
>  	trace_xfs_invalidatepage(page->mapping->host, page, offset,
>  				 length);
> +
> +	/*
> +	 * If we are invalidating the entire page, clear the dirty state from it
> +	 * so that we can check for attempts to release dirty cached pages in
> +	 * xfs_vm_releasepage().
> +	 */
> +	if (offset == 0 && length >= PAGE_SIZE)
> +		cancel_dirty_page(page);
>  	block_invalidatepage(page, offset, length);
>  }
>  
> @@ -1190,25 +1198,27 @@ xfs_vm_releasepage(
>  	 * mm accommodates an old ext3 case where clean pages might not have had
>  	 * the dirty bit cleared. Thus, it can send actual dirty pages to
>  	 * ->releasepage() via shrink_active_list(). Conversely,
> -	 * block_invalidatepage() can send pages that are still marked dirty
> -	 * but otherwise have invalidated buffers.
> +	 * block_invalidatepage() can send pages that are still marked dirty but
> +	 * otherwise have invalidated buffers.
>  	 *
>  	 * We want to release the latter to avoid unnecessary buildup of the
> -	 * LRU, skip the former and warn if we've left any lingering
> -	 * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
> -	 * or unwritten buffers and warn if the page is not dirty. Otherwise
> -	 * try to release the buffers.
> +	 * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
> +	 * that are entirely invalidated and need to be released.  Hence the
> +	 * only time we should get dirty pages here is through
> +	 * shrink_active_list() and so we can simply skip those now.
> +	 *
> +	 * warn if we've left any lingering delalloc/unwritten buffers on clean
> +	 * or invalidated pages we are about to release.
>  	 */
> +	if (PageDirty(page))
> +		return 0;
> +
>  	xfs_count_page_state(page, &delalloc, &unwritten);
>  
> -	if (delalloc) {
> -		WARN_ON_ONCE(!PageDirty(page));
> +	if (WARN_ON_ONCE(delalloc))
>  		return 0;
> -	}
> -	if (unwritten) {
> -		WARN_ON_ONCE(!PageDirty(page));
> +	if (WARN_ON_ONCE(unwritten))
>  		return 0;
> -	}
>  
>  	return try_to_free_buffers(page);
>  }
> -- 
> 2.14.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2017-10-16 19:39 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-10-08 23:54 [PATCH 0/4] xfs: miscellaneous fixes Dave Chinner
2017-10-08 23:54 ` [PATCH 1/4] xfs: Don't log uninitialised fields in inode structures Dave Chinner
2017-10-09 14:24   ` Brian Foster
2017-10-08 23:54 ` [PATCH 2/4] xfs: move more RT specific code under CONFIG_XFS_RT Dave Chinner
2017-10-09 14:24   ` Brian Foster
2017-10-08 23:54 ` [PATCH 3/4] xfs: don't change inode mode if ACL update fails Dave Chinner
2017-10-09 14:24   ` Brian Foster
2017-10-08 23:54 ` [PATCH 4/4] xfs: cancel dirty pages on invalidation Dave Chinner
2017-10-09 14:24   ` Brian Foster
2017-10-09 20:48     ` Dave Chinner
2017-10-10 12:29       ` Brian Foster
2017-10-11  0:04         ` Dave Chinner
2017-10-11  9:02           ` Brian Foster
2017-10-11 11:58             ` Dave Chinner
2017-10-11 13:02               ` Brian Foster
2017-10-12  0:56                 ` Dave Chinner
2017-10-12 10:39                   ` Brian Foster
2017-10-16 19:39   ` Darrick J. Wong
2017-10-09 18:47 ` [PATCH 0/4] xfs: miscellaneous fixes Darrick J. Wong

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.