* COW improvements and always_cow support V5
@ 2019-02-18  9:18 Christoph Hellwig
  2019-02-18  9:18 ` [PATCH 1/8] xfs: make xfs_bmbt_to_iomap more useful Christoph Hellwig
                   ` (8 more replies)
  0 siblings, 9 replies; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-18  9:18 UTC (permalink / raw)
  To: linux-xfs

Hi all,

this series adds always_cow mode support, after first fixing various
little bugs and issues in the existing COW write support.

A git tree is available at:

    git://git.infradead.org/users/hch/xfs.git always_cow.7

Gitweb:

    http://git.infradead.org/users/hch/xfs.git/shortlog/refs/heads/always_cow.7


Changes since v4:
 - rebased on top of the mapping validation fixes from Brian and me
 - fix SEEK_DATA / SEEK_HOLE vs COW fork preallocations
 - sample the always_cow value in a per-mount variable
 - disable all preallocation, including in zero range and unshare

Changes since v3:
 - spelling fixes
 - improve argument naming and add a comment in xfs_reflink_allocate_cow
 - trace the COW imap if allocating COW blocks
 - collect a few reviewed-by tags from Darrick

Changes since v2:
 - add a patch to remove xfs_trim_extent_eof
 - add a patch to remove the separate io_type and rely on existing state
   in the writeback path
 - rework the truncate race handling in the writeback path a little more

Changes since v1:
 - make delalloc and unwritten extent conversions simpler and more robust
 - add a few additional cleanups
 - support all fallocate modes but actual preallocation
 - rebase on top of a fix from Brian (which is included as first patch
   to make the patch set more usable)


* [PATCH 1/8] xfs: make xfs_bmbt_to_iomap more useful
  2019-02-18  9:18 COW improvements and always_cow support V5 Christoph Hellwig
@ 2019-02-18  9:18 ` Christoph Hellwig
  2019-02-18  9:18 ` [PATCH 2/8] xfs: fix SEEK_DATA for speculative COW fork preallocation Christoph Hellwig
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-18  9:18 UTC (permalink / raw)
  To: linux-xfs; +Cc: Darrick J . Wong

Move checking for invalid zero blocks and setting of various iomap flags
into this helper.  Also make it deal with "raw" delalloc extents to
avoid clutter in the callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_iomap.c | 84 +++++++++++++++++++++-------------------------
 fs/xfs/xfs_iomap.h |  4 +--
 fs/xfs/xfs_pnfs.c  |  2 +-
 3 files changed, 41 insertions(+), 49 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 361dfe7af783..284c5e68f695 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -35,18 +35,40 @@
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
 						<< mp->m_writeio_log)
 
-void
+static int
+xfs_alert_fsblock_zero(
+	xfs_inode_t	*ip,
+	xfs_bmbt_irec_t	*imap)
+{
+	xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+			"Access to block zero in inode %llu "
+			"start_block: %llx start_off: %llx "
+			"blkcnt: %llx extent-state: %x",
+		(unsigned long long)ip->i_ino,
+		(unsigned long long)imap->br_startblock,
+		(unsigned long long)imap->br_startoff,
+		(unsigned long long)imap->br_blockcount,
+		imap->br_state);
+	return -EFSCORRUPTED;
+}
+
+int
 xfs_bmbt_to_iomap(
 	struct xfs_inode	*ip,
 	struct iomap		*iomap,
-	struct xfs_bmbt_irec	*imap)
+	struct xfs_bmbt_irec	*imap,
+	bool			shared)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 
+	if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+		return xfs_alert_fsblock_zero(ip, imap);
+
 	if (imap->br_startblock == HOLESTARTBLOCK) {
 		iomap->addr = IOMAP_NULL_ADDR;
 		iomap->type = IOMAP_HOLE;
-	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
+	} else if (imap->br_startblock == DELAYSTARTBLOCK ||
+		   isnullstartblock(imap->br_startblock)) {
 		iomap->addr = IOMAP_NULL_ADDR;
 		iomap->type = IOMAP_DELALLOC;
 	} else {
@@ -60,6 +82,13 @@ xfs_bmbt_to_iomap(
 	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
 	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
 	iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+
+	if (xfs_ipincount(ip) &&
+	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+		iomap->flags |= IOMAP_F_DIRTY;
+	if (shared)
+		iomap->flags |= IOMAP_F_SHARED;
+	return 0;
 }
 
 static void
@@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb(
 	return 0;
 }
 
-STATIC int
-xfs_alert_fsblock_zero(
-	xfs_inode_t	*ip,
-	xfs_bmbt_irec_t	*imap)
-{
-	xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
-			"Access to block zero in inode %llu "
-			"start_block: %llx start_off: %llx "
-			"blkcnt: %llx extent-state: %x",
-		(unsigned long long)ip->i_ino,
-		(unsigned long long)imap->br_startblock,
-		(unsigned long long)imap->br_startoff,
-		(unsigned long long)imap->br_blockcount,
-		imap->br_state);
-	return -EFSCORRUPTED;
-}
-
 int
 xfs_iomap_write_direct(
 	xfs_inode_t	*ip,
@@ -649,17 +661,7 @@ xfs_file_iomap_begin_delay(
 	iomap->flags |= IOMAP_F_NEW;
 	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
 done:
-	if (isnullstartblock(got.br_startblock))
-		got.br_startblock = DELAYSTARTBLOCK;
-
-	if (!got.br_startblock) {
-		error = xfs_alert_fsblock_zero(ip, &got);
-		if (error)
-			goto out_unlock;
-	}
-
-	xfs_bmbt_to_iomap(ip, iomap, &got);
-
+	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
@@ -976,15 +978,7 @@ xfs_file_iomap_begin(
 	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
 
 out_finish:
-	if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
-				& ~XFS_ILOG_TIMESTAMP))
-		iomap->flags |= IOMAP_F_DIRTY;
-
-	xfs_bmbt_to_iomap(ip, iomap, &imap);
-
-	if (shared)
-		iomap->flags |= IOMAP_F_SHARED;
-	return 0;
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
 
 out_found:
 	ASSERT(nimaps);
@@ -1107,12 +1101,10 @@ xfs_xattr_iomap_begin(
 out_unlock:
 	xfs_iunlock(ip, lockmode);
 
-	if (!error) {
-		ASSERT(nimaps);
-		xfs_bmbt_to_iomap(ip, iomap, &imap);
-	}
-
-	return error;
+	if (error)
+		return error;
+	ASSERT(nimaps);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
 }
 
 const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 6b16243db0b7..37b584c3069b 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -15,8 +15,8 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *, int);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
 
-void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
-		struct xfs_bmbt_irec *);
+int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+		struct xfs_bmbt_irec *, bool shared);
 xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
 
 static inline xfs_filblks_t
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index f44c3599527d..bde2c9f56a46 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -185,7 +185,7 @@ xfs_fs_map_blocks(
 	}
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
-	xfs_bmbt_to_iomap(ip, iomap, &imap);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
 	*device_generation = mp->m_generation;
 	return error;
 out_unlock:
-- 
2.20.1


* [PATCH 2/8] xfs: fix SEEK_DATA for speculative COW fork preallocation
  2019-02-18  9:18 COW improvements and always_cow support V5 Christoph Hellwig
  2019-02-18  9:18 ` [PATCH 1/8] xfs: make xfs_bmbt_to_iomap more useful Christoph Hellwig
@ 2019-02-18  9:18 ` Christoph Hellwig
  2019-02-19  5:13   ` Darrick J. Wong
  2019-02-18  9:18 ` [PATCH 3/8] xfs: don't use delalloc extents for COW on files with extsize hints Christoph Hellwig
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-18  9:18 UTC (permalink / raw)
  To: linux-xfs

We speculatively allocate extents in the COW fork to reduce
fragmentation.  But when we write data into such COW fork blocks that
do not shadow an allocation in the data fork, SEEK_DATA will not
correctly report it, as it only looks at the data fork extents.
The only reason this hasn't been an issue so far is that we never use
these speculative COW fork preallocations over holes in the data fork
for buffered writes, and blocks in the COW fork that are written by
direct writes are moved into the data fork immediately at I/O
completion time.

Add a new set of iomap_ops for SEEK_HOLE/SEEK_DATA which looks into
both the COW and data fork, and reports all COW extents as unwritten
to the iomap layer.  While this isn't strictly true for COW fork
extents that were already converted to real extents, the practical
semantics that you can't read data from them until they are moved
into the data fork are very similar, and this will force the iomap
layer into probing the extents for actually present data.
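
For reference, the user-visible behavior this fixes can be probed from
userspace with plain lseek(2).  A minimal sketch (file path and error
handling deliberately kept simple, not part of this patch):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* report where the first data and the first hole start */
	int fd = open(argc > 1 ? argv[1] : "testfile", O_RDONLY);
	off_t data, hole;

	if (fd < 0)
		return 1;
	data = lseek(fd, 0, SEEK_DATA);
	hole = lseek(fd, 0, SEEK_HOLE);
	printf("first data at %lld, first hole at %lld\n",
	       (long long)data, (long long)hole);
	close(fd);
	return 0;
}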

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_file.c  |  4 +--
 fs/xfs/xfs_iomap.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_iomap.h |  1 +
 3 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e47425071e65..1d07dcfbbff3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1068,10 +1068,10 @@ xfs_file_llseek(
 	default:
 		return generic_file_llseek(file, offset, whence);
 	case SEEK_HOLE:
-		offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
 		break;
 	case SEEK_DATA:
-		offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
 		break;
 	}
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 284c5e68f695..a7599b084571 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1068,6 +1068,91 @@ const struct iomap_ops xfs_iomap_ops = {
 	.iomap_end		= xfs_file_iomap_end,
 };
 
+static int
+xfs_seek_iomap_begin(
+	struct inode		*inode,
+	loff_t			offset,
+	loff_t			length,
+	unsigned		flags,
+	struct iomap		*iomap)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + length);
+	xfs_fileoff_t		cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
+	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	imap, cmap;
+	int			error = 0;
+	unsigned		lockmode;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	lockmode = xfs_ilock_data_map_shared(ip);
+	if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
+		/*
+		 * If we found a data extent we are done.
+		 */
+		if (imap.br_startoff <= offset_fsb)
+			goto done;
+		data_fsb = imap.br_startoff;
+	} else {
+		/*
+		 * Fake a hole until the end of the file.
+		 */
+		data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
+			       XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+	}
+
+	/*
+	 * If a COW fork extent covers the hole, report it - capped to the next
+	 * data fork extent:
+	 */
+	if (xfs_inode_has_cow_data(ip) &&
+	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+		cow_fsb = cmap.br_startoff;
+	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+		if (data_fsb < cow_fsb + cmap.br_blockcount)
+			end_fsb = min(end_fsb, data_fsb);
+		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+		/*
+		 * Force page cache probing to avoid reporting COW extents as
+		 * data even if they haven't been written to:
+		 */
+		iomap->type = IOMAP_UNWRITTEN;
+		goto out_unlock;
+	}
+
+	/*
+	 * Else report a hole, capped to the next found data or COW extent.
+	 */
+	if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
+		imap.br_blockcount = cow_fsb - offset_fsb;
+	else
+		imap.br_blockcount = data_fsb - offset_fsb;
+	imap.br_startoff = offset_fsb;
+	imap.br_startblock = HOLESTARTBLOCK;
+	imap.br_state = XFS_EXT_NORM;
+done:
+	xfs_trim_extent(&imap, offset_fsb, end_fsb);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+out_unlock:
+	xfs_iunlock(ip, lockmode);
+	return error;
+}
+
+const struct iomap_ops xfs_seek_iomap_ops = {
+	.iomap_begin		= xfs_seek_iomap_begin,
+};
+
 static int
 xfs_xattr_iomap_begin(
 	struct inode		*inode,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 37b584c3069b..5c2f6aa6d78f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -40,6 +40,7 @@ xfs_aligned_fsb_count(
 }
 
 extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_seek_iomap_ops;
 extern const struct iomap_ops xfs_xattr_iomap_ops;
 
 #endif /* __XFS_IOMAP_H__*/
-- 
2.20.1


* [PATCH 3/8] xfs: don't use delalloc extents for COW on files with extsize hints
  2019-02-18  9:18 COW improvements and always_cow support V5 Christoph Hellwig
  2019-02-18  9:18 ` [PATCH 1/8] xfs: make xfs_bmbt_to_iomap more useful Christoph Hellwig
  2019-02-18  9:18 ` [PATCH 2/8] xfs: fix SEEK_DATA for speculative COW fork preallocation Christoph Hellwig
@ 2019-02-18  9:18 ` Christoph Hellwig
  2019-02-19  5:17   ` Darrick J. Wong
  2019-02-21 17:58   ` Brian Foster
  2019-02-18  9:18 ` [PATCH 4/8] xfs: also truncate holes covered by COW blocks Christoph Hellwig
                   ` (5 subsequent siblings)
  8 siblings, 2 replies; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-18  9:18 UTC (permalink / raw)
  To: linux-xfs

While using delalloc for extsize hints is generally a good idea, the
current code that does so only for COW doesn't help us much and creates
a lot of special cases.  Switch it to use real allocations like we
do for direct I/O.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iomap.c   | 28 +++++++++++++++++-----------
 fs/xfs/xfs_reflink.c | 10 +++++++++-
 fs/xfs/xfs_reflink.h |  3 ++-
 3 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index a7599b084571..19a3331b4a56 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -918,22 +918,28 @@ xfs_file_iomap_begin(
 	 * been done up front, so we don't need to do them here.
 	 */
 	if (xfs_is_reflink_inode(ip)) {
+		struct xfs_bmbt_irec	orig = imap;
+
 		/* if zeroing doesn't need COW allocation, then we are done. */
 		if ((flags & IOMAP_ZERO) &&
 		    !needs_cow_for_zeroing(&imap, nimaps))
 			goto out_found;
 
-		if (flags & IOMAP_DIRECT) {
-			/* may drop and re-acquire the ilock */
-			error = xfs_reflink_allocate_cow(ip, &imap, &shared,
-					&lockmode);
-			if (error)
-				goto out_unlock;
-		} else {
-			error = xfs_reflink_reserve_cow(ip, &imap);
-			if (error)
-				goto out_unlock;
-		}
+		error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode,
+						 flags);
+		if (error)
+			goto out_unlock;
+
+		/*
+		 * For buffered writes we need to report the address of the
+		 * previous block (if there was any) so that the higher level
+		 * write code can perform read-modify-write operations.  For
+		 * direct I/O code, which must be block aligned we need to
+		 * report the newly allocated address.
+		 */
+		if (!(flags & IOMAP_DIRECT) &&
+		    orig.br_startblock != HOLESTARTBLOCK)
+			imap = orig;
 
 		end_fsb = imap.br_startoff + imap.br_blockcount;
 		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 2babc2cbe103..8a5353daf9ab 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -397,7 +397,8 @@ xfs_reflink_allocate_cow(
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*imap,
 	bool			*shared,
-	uint			*lockmode)
+	uint			*lockmode,
+	unsigned		iomap_flags)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fileoff_t		offset_fsb = imap->br_startoff;
@@ -471,6 +472,13 @@ xfs_reflink_allocate_cow(
 	if (nimaps == 0)
 		return -ENOSPC;
 convert:
+	/*
+	 * COW fork extents are supposed to remain unwritten until we're ready
+	 * to initiate a disk write.  For direct I/O we are going to write the
+	 * data and need the conversion, but for buffered writes we're done.
+	 */
+	if (!(iomap_flags & IOMAP_DIRECT))
+		return 0;
 	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
 
 out_unreserve:
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 6d73daef1f13..70d68a1a9b49 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -15,7 +15,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap);
 extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
-		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+		unsigned iomap_flags);
 extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count);
 
-- 
2.20.1


* [PATCH 4/8] xfs: also truncate holes covered by COW blocks
  2019-02-18  9:18 COW improvements and always_cow support V5 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2019-02-18  9:18 ` [PATCH 3/8] xfs: don't use delalloc extents for COW on files with extsize hints Christoph Hellwig
@ 2019-02-18  9:18 ` Christoph Hellwig
  2019-02-18  9:18 ` [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay Christoph Hellwig
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-18  9:18 UTC (permalink / raw)
  To: linux-xfs; +Cc: Darrick J . Wong

This only matters if we want to write data through the COW fork that is
not actually an overwrite of existing data.  Reasons for that are
speculative COW fork allocations using the cowextsize, or a mode where
we always write through the COW fork.  Currently neither can actually
happen, but I plan to enable them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_aops.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 2ed8733eca49..983d11c27d32 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -447,28 +447,29 @@ xfs_map_blocks(
 
 	wpc->fork = XFS_DATA_FORK;
 
+	/* landed in a hole or beyond EOF? */
 	if (imap.br_startoff > offset_fsb) {
-		/* landed in a hole or beyond EOF */
 		imap.br_blockcount = imap.br_startoff - offset_fsb;
 		imap.br_startoff = offset_fsb;
 		imap.br_startblock = HOLESTARTBLOCK;
 		imap.br_state = XFS_EXT_NORM;
-	} else {
-		/*
-		 * Truncate to the next COW extent if there is one.  This is the
-		 * only opportunity to do this because we can skip COW fork
-		 * lookups for the subsequent blocks in the mapping; however,
-		 * the requirement to treat the COW range separately remains.
-		 */
-		if (cow_fsb != NULLFILEOFF &&
-		    cow_fsb < imap.br_startoff + imap.br_blockcount)
-			imap.br_blockcount = cow_fsb - imap.br_startoff;
-
-		/* got a delalloc extent? */
-		if (isnullstartblock(imap.br_startblock))
-			goto allocate_blocks;
 	}
 
+	/*
+	 * Truncate to the next COW extent if there is one.  This is the only
+	 * opportunity to do this because we can skip COW fork lookups for the
+	 * subsequent blocks in the mapping; however, the requirement to treat
+	 * the COW range separately remains.
+	 */
+	if (cow_fsb != NULLFILEOFF &&
+	    cow_fsb < imap.br_startoff + imap.br_blockcount)
+		imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+	/* got a delalloc extent? */
+	if (imap.br_startblock != HOLESTARTBLOCK &&
+	    isnullstartblock(imap.br_startblock))
+		goto allocate_blocks;
+
 	wpc->imap = imap;
 	trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
 	return 0;
-- 
2.20.1


* [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay
  2019-02-18  9:18 COW improvements and always_cow support V5 Christoph Hellwig
                   ` (3 preceding siblings ...)
  2019-02-18  9:18 ` [PATCH 4/8] xfs: also truncate holes covered by COW blocks Christoph Hellwig
@ 2019-02-18  9:18 ` Christoph Hellwig
  2019-02-19 18:12   ` Darrick J. Wong
  2019-02-21 17:59   ` Brian Foster
  2019-02-18  9:18 ` [PATCH 6/8] xfs: make COW fork unwritten extent conversions more robust Christoph Hellwig
                   ` (3 subsequent siblings)
  8 siblings, 2 replies; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-18  9:18 UTC (permalink / raw)
  To: linux-xfs

Besides simplifying the code a bit, this allows us to actually implement
the behavior of using COW preallocation for non-COW data mentioned
in the current comments.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
 fs/xfs/xfs_reflink.c |  67 ----------------------
 fs/xfs/xfs_reflink.h |   2 -
 fs/xfs/xfs_trace.h   |   3 -
 4 files changed, 94 insertions(+), 111 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 19a3331b4a56..c9fd1e4a1f99 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -534,15 +534,16 @@ xfs_file_iomap_begin_delay(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	xfs_fileoff_t		maxbytes_fsb =
 		XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
 	xfs_fileoff_t		end_fsb;
-	int			error = 0, eof = 0;
-	struct xfs_bmbt_irec	got;
-	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	imap, cmap;
+	struct xfs_iext_cursor	icur, ccur;
 	xfs_fsblock_t		prealloc_blocks = 0;
+	bool			eof = false, cow_eof = false, shared;
+	int			whichfork = XFS_DATA_FORK;
+	int			error = 0;
 
 	ASSERT(!XFS_IS_REALTIME_INODE(ip));
 	ASSERT(!xfs_get_extsz_hint(ip));
@@ -560,7 +561,7 @@ xfs_file_iomap_begin_delay(
 
 	XFS_STATS_INC(mp, xs_blk_mapw);
 
-	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+	if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
 		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
 		if (error)
 			goto out_unlock;
@@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
 
 	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
 
-	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
+	/*
+	 * Search the data fork fork first to look up our source mapping.  We
+	 * always need the data fork map, as we have to return it to the
+	 * iomap code so that the higher level write code can read data in to
+	 * perform read-modify-write cycles for unaligned writes.
+	 */
+	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
 	if (eof)
-		got.br_startoff = end_fsb; /* fake hole until the end */
+		imap.br_startoff = end_fsb; /* fake hole until the end */
+
+	/* We never need to allocate blocks for zeroing a hole. */
+	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+		goto out_unlock;
+	}
+
+	/*
+	 * Search the COW fork extent list even if we did not find a data fork
+	 * extent.  This serves two purposes: first this implements the
+	 * speculative preallocation using cowextsize, so that we also unshare
+	 * block adjacent to shared blocks instead of just the shared blocks
+	 * themselves.  Second the lookup in the extent list is generally faster
+	 * than going out to the shared extent tree.
+	 */
+	if (xfs_is_reflink_inode(ip)) {
+		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+				&ccur, &cmap);
+		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+			trace_xfs_reflink_cow_found(ip, &cmap);
+			whichfork = XFS_COW_FORK;
+			goto done;
+		}
+	}
 
-	if (got.br_startoff <= offset_fsb) {
+	if (imap.br_startoff <= offset_fsb) {
 		/*
 		 * For reflink files we may need a delalloc reservation when
 		 * overwriting shared extents.   This includes zeroing of
 		 * existing extents that contain data.
 		 */
-		if (xfs_is_reflink_inode(ip) &&
-		    ((flags & IOMAP_WRITE) ||
-		     got.br_state != XFS_EXT_UNWRITTEN)) {
-			xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
-			error = xfs_reflink_reserve_cow(ip, &got);
-			if (error)
-				goto out_unlock;
+		if (!xfs_is_reflink_inode(ip) ||
+		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+					&imap);
+			goto done;
 		}
 
-		trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got);
-		goto done;
-	}
+		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
 
-	if (flags & IOMAP_ZERO) {
-		xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
-		goto out_unlock;
+		/* Trim the mapping to the nearest shared extent boundary. */
+		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
+		if (error)
+			goto out_unlock;
+
+		/* Not shared?  Just report the (potentially capped) extent. */
+		if (!shared) {
+			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+					&imap);
+			goto done;
+		}
+
+		/*
+		 * Fork all the shared blocks from our write offset until the
+		 * end of the extent.
+		 */
+		whichfork = XFS_COW_FORK;
+		end_fsb = imap.br_startoff + imap.br_blockcount;
+	} else {
+		/*
+		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+		 * pages to keep the chunks of work done where somewhat
+		 * symmetric with the work writeback does.  This is a completely
+		 * arbitrary number pulled out of thin air.
+		 *
+		 * Note that the values needs to be less than 32-bits wide until
+		 * the lower level functions are updated.
+		 */
+		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
 	}
 
 	error = xfs_qm_dqattach_locked(ip, false);
 	if (error)
 		goto out_unlock;
 
-	/*
-	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
-	 * to keep the chunks of work done where somewhat symmetric with the
-	 * work writeback does. This is a completely arbitrary number pulled
-	 * out of thin air as a best guess for initial testing.
-	 *
-	 * Note that the values needs to be less than 32-bits wide until
-	 * the lower level functions are updated.
-	 */
-	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
-	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
-	if (eof) {
+	if (eof && whichfork == XFS_DATA_FORK) {
 		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
 				&icur);
 		if (prealloc_blocks) {
@@ -635,9 +677,11 @@ xfs_file_iomap_begin_delay(
 	}
 
 retry:
-	error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
-			end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
-			eof);
+	error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
+			end_fsb - offset_fsb, prealloc_blocks,
+			whichfork == XFS_DATA_FORK ? &imap : &cmap,
+			whichfork == XFS_DATA_FORK ? &icur : &ccur,
+			whichfork == XFS_DATA_FORK ? eof : cow_eof);
 	switch (error) {
 	case 0:
 		break;
@@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
 	 * them out if the write happens to fail.
 	 */
 	iomap->flags |= IOMAP_F_NEW;
-	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
+	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
+			whichfork == XFS_DATA_FORK ? &imap : &cmap);
 done:
-	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
+	if (whichfork == XFS_COW_FORK) {
+		if (imap.br_startoff > offset_fsb) {
+			xfs_trim_extent(&cmap, offset_fsb,
+					imap.br_startoff - offset_fsb);
+			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
+			goto out_unlock;
+		}
+		/* ensure we only report blocks we have a reservation for */
+		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+	}
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 8a5353daf9ab..9ef1f79cb3ae 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -234,73 +234,6 @@ xfs_reflink_trim_around_shared(
 	}
 }
 
-/*
- * Trim the passed in imap to the next shared/unshared extent boundary, and
- * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork.
- *
- * Note that imap will always contain the block numbers for the existing blocks
- * in the data fork, as the upper layers need them for read-modify-write
- * operations.
- */
-int
-xfs_reflink_reserve_cow(
-	struct xfs_inode	*ip,
-	struct xfs_bmbt_irec	*imap)
-{
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	struct xfs_bmbt_irec	got;
-	int			error = 0;
-	bool			eof = false;
-	struct xfs_iext_cursor	icur;
-	bool			shared;
-
-	/*
-	 * Search the COW fork extent list first.  This serves two purposes:
-	 * first this implement the speculative preallocation using cowextisze,
-	 * so that we also unshared block adjacent to shared blocks instead
-	 * of just the shared blocks themselves.  Second the lookup in the
-	 * extent list is generally faster than going out to the shared extent
-	 * tree.
-	 */
-
-	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
-		eof = true;
-	if (!eof && got.br_startoff <= imap->br_startoff) {
-		trace_xfs_reflink_cow_found(ip, imap);
-		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-		return 0;
-	}
-
-	/* Trim the mapping to the nearest shared extent boundary. */
-	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
-	if (error)
-		return error;
-
-	/* Not shared?  Just report the (potentially capped) extent. */
-	if (!shared)
-		return 0;
-
-	/*
-	 * Fork all the shared blocks from our write offset until the end of
-	 * the extent.
-	 */
-	error = xfs_qm_dqattach_locked(ip, false);
-	if (error)
-		return error;
-
-	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
-			imap->br_blockcount, 0, &got, &icur, eof);
-	if (error == -ENOSPC || error == -EDQUOT)
-		trace_xfs_reflink_cow_enospc(ip, imap);
-	if (error)
-		return error;
-
-	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-	trace_xfs_reflink_cow_alloc(ip, &got);
-	return 0;
-}
-
 /* Convert part of an unwritten CoW extent to a real one. */
 STATIC int
 xfs_reflink_convert_cow_extent(
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 70d68a1a9b49..4a9e3cd4768a 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -12,8 +12,6 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
 extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *irec, bool *shared);
 
-extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
-		struct xfs_bmbt_irec *imap);
 extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
 		unsigned iomap_flags);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f1e18ae8a209..47fb07d86efd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3196,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
 
 /* copy on write */
 DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
 
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
 
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
-- 
2.20.1


* [PATCH 6/8] xfs: make COW fork unwritten extent conversions more robust
  2019-02-18  9:18 COW improvements and always_cow support V5 Christoph Hellwig
                   ` (4 preceding siblings ...)
  2019-02-18  9:18 ` [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay Christoph Hellwig
@ 2019-02-18  9:18 ` Christoph Hellwig
  2019-02-19 18:16   ` Darrick J. Wong
  2019-02-18  9:18 ` [PATCH 7/8] xfs: report IOMAP_F_SHARED from xfs_file_iomap_begin_delay Christoph Hellwig
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-18  9:18 UTC (permalink / raw)
  To: linux-xfs

If we have racing buffered and direct I/O, COW fork extents under
writeback can have been moved to the data fork by the time we call
xfs_reflink_convert_cow from xfs_submit_ioend.  This would be mostly
harmless as the block numbers don't change by this move, except for
the fact that xfs_bmapi_write will crash or trigger asserts when
not finding existing extents, despite trying to paper over this
with the XFS_BMAPI_CONVERT_ONLY flag.

Instead of special casing non-transactional conversions in the already
way too complicated xfs_bmapi_write, just add a new helper for the much
simpler non-transactional COW fork case, which simply ignores extents
that are not found.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap.c |  9 ++----
 fs/xfs/libxfs/xfs_bmap.h |  8 +++---
 fs/xfs/xfs_reflink.c     | 61 +++++++++++++++++++++++++---------------
 3 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 4cf83475f0d0..114b4da9add6 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -2031,7 +2031,7 @@ xfs_bmap_add_extent_delay_real(
 /*
  * Convert an unwritten allocation to a real allocation or vice versa.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_bmap_add_extent_unwritten_real(
 	struct xfs_trans	*tp,
 	xfs_inode_t		*ip,	/* incore inode pointer */
@@ -4276,9 +4276,7 @@ xfs_bmapi_write(
 
 	ASSERT(*nmap >= 1);
 	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
-	ASSERT(tp != NULL ||
-	       (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
-			(XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+	ASSERT(tp != NULL);
 	ASSERT(len > 0);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4352,8 +4350,7 @@ xfs_bmapi_write(
 		 * First, deal with the hole before the allocated space
 		 * that we found, if any.
 		 */
-		if ((need_alloc || wasdelay) &&
-		    !(flags & XFS_BMAPI_CONVERT_ONLY)) {
+		if (need_alloc || wasdelay) {
 			bma.eof = eof;
 			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
 			bma.wasdel = wasdelay;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 78b190b6e908..8f597f9abdbe 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -95,9 +95,6 @@ struct xfs_extent_free_item
 /* Map something in the CoW fork. */
 #define XFS_BMAPI_COWFORK	0x200
 
-/* Only convert unwritten extents, don't allocate new blocks */
-#define XFS_BMAPI_CONVERT_ONLY	0x800
-
 /* Skip online discard of freed extents */
 #define XFS_BMAPI_NODISCARD	0x1000
 
@@ -114,7 +111,6 @@ struct xfs_extent_free_item
 	{ XFS_BMAPI_ZERO,	"ZERO" }, \
 	{ XFS_BMAPI_REMAP,	"REMAP" }, \
 	{ XFS_BMAPI_COWFORK,	"COWFORK" }, \
-	{ XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
 	{ XFS_BMAPI_NODISCARD,	"NODISCARD" }, \
 	{ XFS_BMAPI_NORMAP,	"NORMAP" }
 
@@ -226,6 +222,10 @@ int	xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
 int	xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
 		xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
 		unsigned int *seq);
+int	xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+		struct xfs_inode *ip, int whichfork,
+		struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+		struct xfs_bmbt_irec *new, int *logflagsp);
 
 static inline void
 xfs_bmap_add_free(
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 9ef1f79cb3ae..f84b37fa4f17 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -234,26 +234,42 @@ xfs_reflink_trim_around_shared(
 	}
 }
 
-/* Convert part of an unwritten CoW extent to a real one. */
-STATIC int
-xfs_reflink_convert_cow_extent(
-	struct xfs_inode		*ip,
-	struct xfs_bmbt_irec		*imap,
-	xfs_fileoff_t			offset_fsb,
-	xfs_filblks_t			count_fsb)
+static int
+xfs_reflink_convert_cow_locked(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		offset_fsb,
+	xfs_filblks_t		count_fsb)
 {
-	int				nimaps = 1;
+	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	got;
+	struct xfs_btree_cur	*dummy_cur = NULL;
+	int			dummy_logflags;
+	int			error;
 
-	if (imap->br_state == XFS_EXT_NORM)
+	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
 		return 0;
 
-	xfs_trim_extent(imap, offset_fsb, count_fsb);
-	trace_xfs_reflink_convert_cow(ip, imap);
-	if (imap->br_blockcount == 0)
-		return 0;
-	return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
-			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
-			&nimaps);
+	do {
+		if (got.br_startoff >= offset_fsb + count_fsb)
+			break;
+		if (got.br_state == XFS_EXT_NORM)
+			continue;
+		if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
+			return -EIO;
+
+		xfs_trim_extent(&got, offset_fsb, count_fsb);
+		if (!got.br_blockcount)
+			continue;
+
+		got.br_state = XFS_EXT_NORM;
+		error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
+				XFS_COW_FORK, &icur, &dummy_cur, &got,
+				&dummy_logflags);
+		if (error)
+			return error;
+	} while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
+
+	return error;
 }
 
 /* Convert all of the unwritten CoW extents in a file's range to real ones. */
@@ -267,15 +283,12 @@ xfs_reflink_convert_cow(
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
 	xfs_filblks_t		count_fsb = end_fsb - offset_fsb;
-	struct xfs_bmbt_irec	imap;
-	int			nimaps = 1, error = 0;
+	int			error;
 
 	ASSERT(count != 0);
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
-			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
-			XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
+	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
@@ -405,14 +418,16 @@ xfs_reflink_allocate_cow(
 	if (nimaps == 0)
 		return -ENOSPC;
 convert:
+	xfs_trim_extent(imap, offset_fsb, count_fsb);
 	/*
 	 * COW fork extents are supposed to remain unwritten until we're ready
 	 * to initiate a disk write.  For direct I/O we are going to write the
 	 * data and need the conversion, but for buffered writes we're done.
 	 */
-	if (!(iomap_flags & IOMAP_DIRECT))
+	if (!(iomap_flags & IOMAP_DIRECT) || imap->br_state == XFS_EXT_NORM)
 		return 0;
-	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
+	trace_xfs_reflink_convert_cow(ip, imap);
+	return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
 
 out_unreserve:
 	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
-- 
2.20.1


* [PATCH 7/8] xfs: report IOMAP_F_SHARED from xfs_file_iomap_begin_delay
  2019-02-18  9:18 COW improvements and always_cow support V5 Christoph Hellwig
                   ` (5 preceding siblings ...)
  2019-02-18  9:18 ` [PATCH 6/8] xfs: make COW fork unwritten extent conversions more robust Christoph Hellwig
@ 2019-02-18  9:18 ` Christoph Hellwig
  2019-02-19  5:20   ` Darrick J. Wong
  2019-02-18  9:18 ` [PATCH 8/8] xfs: introduce an always_cow mode Christoph Hellwig
  2019-02-18  9:19 ` xfs/420 and xfs/421: don't disturb unwritten status with md5sum Christoph Hellwig
  8 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-18  9:18 UTC (permalink / raw)
  To: linux-xfs

No user of it in the iomap code at the moment, but we should not
actively report wrong information if we can trivially get it right.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iomap.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index c9fd1e4a1f99..7b71dcc97ef0 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -541,7 +541,7 @@ xfs_file_iomap_begin_delay(
 	struct xfs_bmbt_irec	imap, cmap;
 	struct xfs_iext_cursor	icur, ccur;
 	xfs_fsblock_t		prealloc_blocks = 0;
-	bool			eof = false, cow_eof = false, shared;
+	bool			eof = false, cow_eof = false, shared = false;
 	int			whichfork = XFS_DATA_FORK;
 	int			error = 0;
 
@@ -710,13 +710,14 @@ xfs_file_iomap_begin_delay(
 		if (imap.br_startoff > offset_fsb) {
 			xfs_trim_extent(&cmap, offset_fsb,
 					imap.br_startoff - offset_fsb);
-			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
+			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
 			goto out_unlock;
 		}
 		/* ensure we only report blocks we have a reservation for */
 		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+		shared = true;
 	}
-	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
-- 
2.20.1


* [PATCH 8/8] xfs: introduce an always_cow mode
  2019-02-18  9:18 COW improvements and always_cow support V5 Christoph Hellwig
                   ` (6 preceding siblings ...)
  2019-02-18  9:18 ` [PATCH 7/8] xfs: report IOMAP_F_SHARED from xfs_file_iomap_begin_delay Christoph Hellwig
@ 2019-02-18  9:18 ` Christoph Hellwig
  2019-02-19  5:25   ` Darrick J. Wong
  2019-02-19 18:31   ` Darrick J. Wong
  2019-02-18  9:19 ` xfs/420 and xfs/421: don't disturb unwritten status with md5sum Christoph Hellwig
  8 siblings, 2 replies; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-18  9:18 UTC (permalink / raw)
  To: linux-xfs

Add a mode where XFS never overwrites existing blocks in place.  This
is to aid debugging our COW code, and also puts infrastructure in place
for things like possible future support for zoned block devices, which
can't support overwrites.

This mode is enabled globally by doing a:

    echo 1 > /sys/fs/xfs/debug/always_cow

Note that the parameter is global to allow running all tests in xfstests
easily in this mode, which would not easily be possible with a per-fs
sysfs file.

In always_cow mode persistent preallocations are disabled, and fallocate
will fail when called with a 0 mode (with or without
FALLOC_FL_KEEP_SIZE), and will not create unwritten extents for zeroed
space when called with FALLOC_FL_ZERO_RANGE or FALLOC_FL_UNSHARE_RANGE.
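
For illustration, userspace that relies on plain preallocation needs to
cope with this failure; a minimal sketch of such a fallback (the file
name and size are just examples):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	/* plain preallocation (mode 0) is rejected in always_cow mode */
	if (fallocate(fd, 0, 0, 1 << 20) < 0 && errno == EOPNOTSUPP)
		fprintf(stderr, "no preallocation support, writing without it\n");
	close(fd);
	return 0;
}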

There are a few interesting xfstests failures when run in always_cow
mode:

 - generic/392 fails because the bytes used in the file used to test
   hole punch recovery are less after the log replay.  This is
   because the blocks written and then punched out are only freed
   with a delay due to the logging mechanism.
 - xfs/170 will fail as the already fragile file streams mechanism
   doesn't seem to interact well with the COW allocator
 - xfs/180 xfs/182 xfs/192 xfs/198 xfs/204 and xfs/208 will claim
   the file system is badly fragmented, but there is not much we
   can do to avoid that when always writing out of place
 - xfs/205 fails because overwriting a file in always_cow mode
   will require new space allocation and the assumptions in the
   test thus don't work anymore.
 - xfs/326 fails to modify the file at all in always_cow mode after
   injecting the refcount error, leading to an unexpected md5sum
   after the remount, but that again is expected

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c      |  2 +-
 fs/xfs/xfs_bmap_util.c |  9 +++------
 fs/xfs/xfs_file.c      | 27 ++++++++++++++++++++-------
 fs/xfs/xfs_iomap.c     | 28 ++++++++++++++++++----------
 fs/xfs/xfs_mount.h     |  1 +
 fs/xfs/xfs_reflink.c   | 28 ++++++++++++++++++++++++----
 fs/xfs/xfs_reflink.h   | 13 +++++++++++++
 fs/xfs/xfs_super.c     | 15 +++++++++++----
 fs/xfs/xfs_sysctl.h    |  1 +
 fs/xfs/xfs_sysfs.c     | 24 ++++++++++++++++++++++++
 10 files changed, 116 insertions(+), 32 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 983d11c27d32..7b8bb6bde981 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1023,7 +1023,7 @@ xfs_vm_bmap(
 	 * Since we don't pass back blockdev info, we can't return bmap
 	 * information for rt files either.
 	 */
-	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
 		return 0;
 	return iomap_bmap(mapping, block, &xfs_iomap_ops);
 }
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1ee8c5539fa4..2db43ff4f8b5 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1162,16 +1162,13 @@ xfs_zero_file_space(
 	 * by virtue of the hole punch.
 	 */
 	error = xfs_free_file_space(ip, offset, len);
-	if (error)
-		goto out;
+	if (error || xfs_is_always_cow_inode(ip))
+		return error;
 
-	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+	return xfs_alloc_file_space(ip, round_down(offset, blksize),
 				     round_up(offset + len, blksize) -
 				     round_down(offset, blksize),
 				     XFS_BMAPI_PREALLOC);
-out:
-	return error;
-
 }
 
 static int
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1d07dcfbbff3..770cc2edf777 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
 		 * We can't properly handle unaligned direct I/O to reflink
 		 * files yet, as we can't unshare a partial block.
 		 */
-		if (xfs_is_reflink_inode(ip)) {
+		if (xfs_is_cow_inode(ip)) {
 			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
 			return -EREMCHG;
 		}
@@ -872,14 +872,27 @@ xfs_file_fallocate(
 				goto out_unlock;
 		}
 
-		if (mode & FALLOC_FL_ZERO_RANGE)
+		if (mode & FALLOC_FL_ZERO_RANGE) {
 			error = xfs_zero_file_space(ip, offset, len);
-		else {
-			if (mode & FALLOC_FL_UNSHARE_RANGE) {
-				error = xfs_reflink_unshare(ip, offset, len);
-				if (error)
-					goto out_unlock;
+		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+			error = xfs_reflink_unshare(ip, offset, len);
+			if (error)
+				goto out_unlock;
+
+			if (!xfs_is_always_cow_inode(ip)) {
+				error = xfs_alloc_file_space(ip, offset, len,
+						XFS_BMAPI_PREALLOC);
 			}
+		} else {
+			/*
+			 * If always_cow mode we can't use preallocations and
+			 * thus should not create them.
+			 */
+			if (xfs_is_always_cow_inode(ip)) {
+				error = -EOPNOTSUPP;
+				goto out_unlock;
+			}
+
 			error = xfs_alloc_file_space(ip, offset, len,
 						     XFS_BMAPI_PREALLOC);
 		}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7b71dcc97ef0..72533e9dddc6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -395,12 +395,13 @@ xfs_quota_calc_throttle(
 STATIC xfs_fsblock_t
 xfs_iomap_prealloc_size(
 	struct xfs_inode	*ip,
+	int			whichfork,
 	loff_t			offset,
 	loff_t			count,
 	struct xfs_iext_cursor	*icur)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	struct xfs_bmbt_irec	prev;
 	int			shift = 0;
@@ -593,7 +594,11 @@ xfs_file_iomap_begin_delay(
 	 * themselves.  Second the lookup in the extent list is generally faster
 	 * than going out to the shared extent tree.
 	 */
-	if (xfs_is_reflink_inode(ip)) {
+	if (xfs_is_cow_inode(ip)) {
+		if (!ip->i_cowfp) {
+			ASSERT(!xfs_is_reflink_inode(ip));
+			xfs_ifork_init_cow(ip);
+		}
 		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
 				&ccur, &cmap);
 		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
@@ -609,7 +614,7 @@ xfs_file_iomap_begin_delay(
 		 * overwriting shared extents.   This includes zeroing of
 		 * existing extents that contain data.
 		 */
-		if (!xfs_is_reflink_inode(ip) ||
+		if (!xfs_is_cow_inode(ip) ||
 		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
 			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
 					&imap);
@@ -619,7 +624,7 @@ xfs_file_iomap_begin_delay(
 		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
 
 		/* Trim the mapping to the nearest shared extent boundary. */
-		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
+		error = xfs_inode_need_cow(ip, &imap, &shared);
 		if (error)
 			goto out_unlock;
 
@@ -648,15 +653,18 @@ xfs_file_iomap_begin_delay(
 		 */
 		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
 		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+		if (xfs_is_always_cow_inode(ip))
+			whichfork = XFS_COW_FORK;
 	}
 
 	error = xfs_qm_dqattach_locked(ip, false);
 	if (error)
 		goto out_unlock;
 
-	if (eof && whichfork == XFS_DATA_FORK) {
-		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
-				&icur);
+	if (eof) {
+		prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
+				count, &icur);
 		if (prealloc_blocks) {
 			xfs_extlen_t	align;
 			xfs_off_t	end_offset;
@@ -867,7 +875,7 @@ xfs_ilock_for_iomap(
 	 * COW writes may allocate delalloc space or convert unwritten COW
 	 * extents, so we need to make sure to take the lock exclusively here.
 	 */
-	if (xfs_is_reflink_inode(ip) && is_write) {
+	if (xfs_is_cow_inode(ip) && is_write) {
 		/*
 		 * FIXME: It could still overwrite on unshared extents and not
 		 * need allocation.
@@ -901,7 +909,7 @@ xfs_ilock_for_iomap(
 	 * check, so if we got ILOCK_SHARED for a write and but we're now a
 	 * reflink inode we have to switch to ILOCK_EXCL and relock.
 	 */
-	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
+	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
 		xfs_iunlock(ip, mode);
 		mode = XFS_ILOCK_EXCL;
 		goto relock;
@@ -973,7 +981,7 @@ xfs_file_iomap_begin(
 	 * Break shared extents if necessary. Checks for non-blocking IO have
 	 * been done up front, so we don't need to do them here.
 	 */
-	if (xfs_is_reflink_inode(ip)) {
+	if (xfs_is_cow_inode(ip)) {
 		struct xfs_bmbt_irec	orig = imap;
 
 		/* if zeroing doesn't need COW allocation, then we are done. */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a33f45077867..fa8b37d61838 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -194,6 +194,7 @@ typedef struct xfs_mount {
 	 */
 	uint32_t		m_generation;
 
+	bool			m_always_cow;
 	bool			m_fail_unmount;
 #ifdef DEBUG
 	/*
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f84b37fa4f17..e2d9179bd50d 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
 	int			error = 0;
 
 	/* Holes, unwritten, and delalloc extents cannot be shared */
-	if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+	if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
 		*shared = false;
 		return 0;
 	}
@@ -234,6 +234,23 @@ xfs_reflink_trim_around_shared(
 	}
 }
 
+bool
+xfs_inode_need_cow(
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*imap,
+	bool			*shared)
+{
+	/* We can't update any real extents in always COW mode. */
+	if (xfs_is_always_cow_inode(ip) &&
+	    !isnullstartblock(imap->br_startblock)) {
+		*shared = true;
+		return 0;
+	}
+
+	/* Trim the mapping to the nearest shared extent boundary. */
+	return xfs_reflink_trim_around_shared(ip, imap, shared);
+}
+
 static int
 xfs_reflink_convert_cow_locked(
 	struct xfs_inode	*ip,
@@ -321,7 +338,7 @@ xfs_find_trim_cow_extent(
 	if (got.br_startoff > offset_fsb) {
 		xfs_trim_extent(imap, imap->br_startoff,
 				got.br_startoff - imap->br_startoff);
-		return xfs_reflink_trim_around_shared(ip, imap, shared);
+		return xfs_inode_need_cow(ip, imap, shared);
 	}
 
 	*shared = true;
@@ -356,7 +373,10 @@ xfs_reflink_allocate_cow(
 	xfs_extlen_t		resblks = 0;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	ASSERT(xfs_is_reflink_inode(ip));
+	if (!ip->i_cowfp) {
+		ASSERT(!xfs_is_reflink_inode(ip));
+		xfs_ifork_init_cow(ip);
+	}
 
 	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
 	if (error || !*shared)
@@ -542,7 +562,7 @@ xfs_reflink_cancel_cow_range(
 	int			error;
 
 	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
-	ASSERT(xfs_is_reflink_inode(ip));
+	ASSERT(ip->i_cowfp);
 
 	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 	if (count == NULLFILEOFF)
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 4a9e3cd4768a..2a3052fbe23e 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,11 +6,24 @@
 #ifndef __XFS_REFLINK_H
 #define __XFS_REFLINK_H 1
 
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+	return ip->i_mount->m_always_cow &&
+		xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
 extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
 		xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
 		xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
 extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *irec, bool *shared);
+bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+		bool *shared);
 
 extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c9097cb0b955..6be183a391b6 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1729,11 +1729,18 @@ xfs_fs_fill_super(
 		}
 	}
 
-	if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
-		xfs_alert(mp,
+	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+		if (mp->m_sb.sb_rblocks) {
+			xfs_alert(mp,
 	"reflink not compatible with realtime device!");
-		error = -EINVAL;
-		goto out_filestream_unmount;
+			error = -EINVAL;
+			goto out_filestream_unmount;
+		}
+
+		if (xfs_globals.always_cow) {
+			xfs_info(mp, "using DEBUG-only always_cow mode.");
+			mp->m_always_cow = true;
+		}
 	}
 
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 168488130a19..ad7f9be13087 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -85,6 +85,7 @@ struct xfs_globals {
 	int	log_recovery_delay;	/* log recovery delay (secs) */
 	int	mount_delay;		/* mount setup delay (secs) */
 	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
+	bool	always_cow;		/* use COW fork for all overwrites */
 };
 extern struct xfs_globals	xfs_globals;
 
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index cd6a994a7250..cabda13f3c64 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -183,10 +183,34 @@ mount_delay_show(
 }
 XFS_SYSFS_ATTR_RW(mount_delay);
 
+static ssize_t
+always_cow_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	ssize_t		ret;
+
+	ret = kstrtobool(buf, &xfs_globals.always_cow);
+	if (ret < 0)
+		return ret;
+	return count;
+}
+
+static ssize_t
+always_cow_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+}
+XFS_SYSFS_ATTR_RW(always_cow);
+
 static struct attribute *xfs_dbg_attrs[] = {
 	ATTR_LIST(bug_on_assert),
 	ATTR_LIST(log_recovery_delay),
 	ATTR_LIST(mount_delay),
+	ATTR_LIST(always_cow),
 	NULL,
 };
 
-- 
2.20.1


* xfs/420 and xfs/421: don't disturb unwritten status with md5sum
  2019-02-18  9:18 COW improvements and always_cow support V5 Christoph Hellwig
                   ` (7 preceding siblings ...)
  2019-02-18  9:18 ` [PATCH 8/8] xfs: introduce an always_cow mode Christoph Hellwig
@ 2019-02-18  9:19 ` Christoph Hellwig
  2019-03-09 10:32   ` Eryu Guan
  8 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-18  9:19 UTC (permalink / raw)
  To: linux-xfs, fstests

The way we decide whether an unwritten extent is considered a hole or
data is by checking if the page and/or blocks are marked uptodate, that
is, contain valid data in the page cache.
xfs/420 and xfs/421 try to exercise SEEK_HOLE / SEEK_DATA in the
presence of cowextsize preallocations over holes in the data fork.  The
current XFS code never actually uses those for buffered writes, but a
pending patch changes that.  For SEEK_HOLE / SEEK_DATA to work properly
in that case we also need to look at the COW fork in their
implementations and thus have to rely on the unwritten extent page cache
probing.  But the tests ensure we do have valid data in the page cache
by calling md5sum on the test files, which reads their contents
(including the zero-filled holes) in and thus makes them all valid data.

Fix that by dropping the page cache content after the md5sum calls.
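
For context, the cowextsize hint the tests rely on can also be set from a
C program via the fsxattr ioctls; a minimal sketch, with the 1MB value
purely as an example:

#include <fcntl.h>
#include <linux/fs.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fsxattr fsx;
	int fd = open(argc > 1 ? argv[1] : "testfile", O_RDWR);

	if (fd < 0)
		return 1;
	if (ioctl(fd, FS_IOC_FSGETXATTR, &fsx) < 0)
		return 1;
	/* ask for 1MB speculative COW fork preallocations on this file */
	fsx.fsx_xflags |= FS_XFLAG_COWEXTSIZE;
	fsx.fsx_cowextsize = 1024 * 1024;
	if (ioctl(fd, FS_IOC_FSSETXATTR, &fsx) < 0)
		return 1;
	close(fd);
	return 0;
}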

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 tests/xfs/420 | 6 ++++++
 tests/xfs/421 | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/tests/xfs/420 b/tests/xfs/420
index a083a12b..34199637 100755
--- a/tests/xfs/420
+++ b/tests/xfs/420
@@ -83,6 +83,8 @@ echo "Compare files"
 md5sum $testdir/file1 | _filter_scratch
 md5sum $testdir/file2 | _filter_scratch
 md5sum $testdir/file3 | _filter_scratch
+# drop caches to make sure the page cache for the unwritten extents is clean
+echo 1 > /proc/sys/vm/drop_caches
 
 echo "CoW the shared part then write into the empty part" | tee -a $seqres.full
 $XFS_IO_PROG -c "cowextsize" $testdir/file1 >> $seqres.full
@@ -106,6 +108,8 @@ echo "Compare files"
 md5sum $testdir/file1 | _filter_scratch
 md5sum $testdir/file2 | _filter_scratch
 md5sum $testdir/file3 | _filter_scratch
+# drop caches to make sure the page cache for the unwritten extents is clean
+echo 1 > /proc/sys/vm/drop_caches
 
 echo "sync filesystem" | tee -a $seqres.full
 sync
@@ -123,6 +127,8 @@ echo "Compare files"
 md5sum $testdir/file1 | _filter_scratch
 md5sum $testdir/file2 | _filter_scratch
 md5sum $testdir/file3 | _filter_scratch
+# drop caches to make sure the page cache for the unwritten extents is clean
+echo 1 > /proc/sys/vm/drop_caches
 
 echo "Remount" | tee -a $seqres.full
 _scratch_cycle_mount
diff --git a/tests/xfs/421 b/tests/xfs/421
index a2734aba..374389bd 100755
--- a/tests/xfs/421
+++ b/tests/xfs/421
@@ -83,6 +83,8 @@ echo "Compare files"
 md5sum $testdir/file1 | _filter_scratch
 md5sum $testdir/file2 | _filter_scratch
 md5sum $testdir/file3 | _filter_scratch
+# drop caches to make sure the page cache for the unwritten extents is clean
+echo 1 > /proc/sys/vm/drop_caches
 
 echo "CoW the shared part then write into the empty part" | tee -a $seqres.full
 $XFS_IO_PROG -c "cowextsize" $testdir/file1 >> $seqres.full
@@ -106,6 +108,8 @@ echo "Compare files"
 md5sum $testdir/file1 | _filter_scratch
 md5sum $testdir/file2 | _filter_scratch
 md5sum $testdir/file3 | _filter_scratch
+# drop caches to make sure the page cache for the unwritten extents is clean
+echo 1 > /proc/sys/vm/drop_caches
 
 echo "sync filesystem" | tee -a $seqres.full
 sync
@@ -123,6 +127,8 @@ echo "Compare files"
 md5sum $testdir/file1 | _filter_scratch
 md5sum $testdir/file2 | _filter_scratch
 md5sum $testdir/file3 | _filter_scratch
+# drop caches to make sure the page cache for the unwritten extents is clean
+echo 1 > /proc/sys/vm/drop_caches
 
 echo "Remount" | tee -a $seqres.full
 _scratch_cycle_mount
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/8] xfs: fix SEEK_DATA for speculative COW fork preallocation
  2019-02-18  9:18 ` [PATCH 2/8] xfs: fix SEEK_DATA for speculative COW fork preallocation Christoph Hellwig
@ 2019-02-19  5:13   ` Darrick J. Wong
  0 siblings, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2019-02-19  5:13 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 18, 2019 at 10:18:21AM +0100, Christoph Hellwig wrote:
> We speculatively allocate extents in the COW fork to reduce
> fragmentation.  But when we write data into such COW fork blocks that
> do not shadow an allocation in the data fork SEEK_DATA will not
> correctly report it, as it only looks at the data fork extents.
> The only reason why that hasn't been an issue so far is because
> we never use these speculative COW fork preallocations over holes in
> the data fork at all for buffered writes, and blocks in the COW
> fork that are written by direct writes are moved into the data
> fork immediately at I/O completion time.
> 
> Add a new set of iomap_ops for SEEK_HOLE/SEEK_DATA which looks into
> both the COW and data fork, and reports all COW extents as unwritten
> to the iomap layer.  While this isn't strictly true for COW fork
> extents that were already converted to real extents, the practical
> semantics that you can't read data from them until they are moved
> into the data fork are very similar, and this will force the iomap
> layer into probing the extents for actually present data.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_file.c  |  4 +--
>  fs/xfs/xfs_iomap.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_iomap.h |  1 +
>  3 files changed, 88 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index e47425071e65..1d07dcfbbff3 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1068,10 +1068,10 @@ xfs_file_llseek(
>  	default:
>  		return generic_file_llseek(file, offset, whence);
>  	case SEEK_HOLE:
> -		offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
> +		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
>  		break;
>  	case SEEK_DATA:
> -		offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
> +		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
>  		break;
>  	}
>  
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 284c5e68f695..a7599b084571 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1068,6 +1068,91 @@ const struct iomap_ops xfs_iomap_ops = {
>  	.iomap_end		= xfs_file_iomap_end,
>  };
>  
> +static int
> +xfs_seek_iomap_begin(
> +	struct inode		*inode,
> +	loff_t			offset,
> +	loff_t			length,
> +	unsigned		flags,
> +	struct iomap		*iomap)
> +{
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
> +	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + length);
> +	xfs_fileoff_t		cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
> +	struct xfs_iext_cursor	icur;
> +	struct xfs_bmbt_irec	imap, cmap;
> +	int			error = 0;
> +	unsigned		lockmode;
> +
> +	if (XFS_FORCED_SHUTDOWN(mp))
> +		return -EIO;
> +
> +	lockmode = xfs_ilock_data_map_shared(ip);
> +	if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
> +		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
> +		if (error)
> +			goto out_unlock;
> +	}
> +
> +	if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
> +		/*
> +		 * If we found a data extent we are done.
> +		 */
> +		if (imap.br_startoff <= offset_fsb)
> +			goto done;
> +		data_fsb = imap.br_startoff;
> +	} else {
> +		/*
> +		 * Fake a hole until the end of the file.
> +		 */
> +		data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
> +			       XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
> +	}
> +
> +	/*
> +	 * If a COW fork extent covers the hole, report it - capped to the next
> +	 * data fork extent:
> +	 */
> +	if (xfs_inode_has_cow_data(ip) &&
> +	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
> +		cow_fsb = cmap.br_startoff;
> +	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
> +		if (data_fsb < cow_fsb + cmap.br_blockcount)
> +			end_fsb = min(end_fsb, data_fsb);
> +		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
> +		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
> +		/*
> +		 * Force page cache probing to avoid reporting COW extents as
> +		 * data even if they haven't been written to:

This comment is a little awkward, I'll change it to:

/*
 * This is a COW extent, so we must probe the page cache because there
 * could be dirty page cache being backed by this extent.
 */

Otherwise looks fine to me,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> +		 */
> +		iomap->type = IOMAP_UNWRITTEN;
> +		goto out_unlock;
> +	}
> +
> +	/*
> +	 * Else report a hole, capped to the next found data or COW extent.
> +	 */
> +	if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
> +		imap.br_blockcount = cow_fsb - offset_fsb;
> +	else
> +		imap.br_blockcount = data_fsb - offset_fsb;
> +	imap.br_startoff = offset_fsb;
> +	imap.br_startblock = HOLESTARTBLOCK;
> +	imap.br_state = XFS_EXT_NORM;
> +done:
> +	xfs_trim_extent(&imap, offset_fsb, end_fsb);
> +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
> +out_unlock:
> +	xfs_iunlock(ip, lockmode);
> +	return error;
> +}
> +
> +const struct iomap_ops xfs_seek_iomap_ops = {
> +	.iomap_begin		= xfs_seek_iomap_begin,
> +};
> +
>  static int
>  xfs_xattr_iomap_begin(
>  	struct inode		*inode,
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index 37b584c3069b..5c2f6aa6d78f 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -40,6 +40,7 @@ xfs_aligned_fsb_count(
>  }
>  
>  extern const struct iomap_ops xfs_iomap_ops;
> +extern const struct iomap_ops xfs_seek_iomap_ops;
>  extern const struct iomap_ops xfs_xattr_iomap_ops;
>  
>  #endif /* __XFS_IOMAP_H__*/
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/8] xfs: don't use delalloc extents for COW on files with extsize hints
  2019-02-18  9:18 ` [PATCH 3/8] xfs: don't use delalloc extents for COW on files with extsize hints Christoph Hellwig
@ 2019-02-19  5:17   ` Darrick J. Wong
  2019-02-21 17:58   ` Brian Foster
  1 sibling, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2019-02-19  5:17 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 18, 2019 at 10:18:22AM +0100, Christoph Hellwig wrote:
> While using delalloc for extsize hints is generally a good idea, the
> current code that does so only for COW doesn't help us much and creates
> a lot of special cases.  Switch it to use real allocations like we
> do for direct I/O.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_iomap.c   | 28 +++++++++++++++++-----------
>  fs/xfs/xfs_reflink.c | 10 +++++++++-
>  fs/xfs/xfs_reflink.h |  3 ++-
>  3 files changed, 28 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index a7599b084571..19a3331b4a56 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -918,22 +918,28 @@ xfs_file_iomap_begin(
>  	 * been done up front, so we don't need to do them here.
>  	 */
>  	if (xfs_is_reflink_inode(ip)) {
> +		struct xfs_bmbt_irec	orig = imap;
> +
>  		/* if zeroing doesn't need COW allocation, then we are done. */
>  		if ((flags & IOMAP_ZERO) &&
>  		    !needs_cow_for_zeroing(&imap, nimaps))
>  			goto out_found;
>  
> -		if (flags & IOMAP_DIRECT) {
> -			/* may drop and re-acquire the ilock */

AFAICT we can still cycle the ilock in _reflink_allocate_cow, so we
should retain the comment.  I'll fix that in my tree if I end up pulling
in this series...

Otherwise looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D


> -			error = xfs_reflink_allocate_cow(ip, &imap, &shared,
> -					&lockmode);
> -			if (error)
> -				goto out_unlock;
> -		} else {
> -			error = xfs_reflink_reserve_cow(ip, &imap);
> -			if (error)
> -				goto out_unlock;
> -		}
> +		error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode,
> +						 flags);
> +		if (error)
> +			goto out_unlock;
> +
> +		/*
> +		 * For buffered writes we need to report the address of the
> +		 * previous block (if there was any) so that the higher level
> +		 * write code can perform read-modify-write operations.  For
> +		 * direct I/O code, which must be block aligned we need to
> +		 * report the newly allocated address.
> +		 */
> +		if (!(flags & IOMAP_DIRECT) &&
> +		    orig.br_startblock != HOLESTARTBLOCK)
> +			imap = orig;
>  
>  		end_fsb = imap.br_startoff + imap.br_blockcount;
>  		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 2babc2cbe103..8a5353daf9ab 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -397,7 +397,8 @@ xfs_reflink_allocate_cow(
>  	struct xfs_inode	*ip,
>  	struct xfs_bmbt_irec	*imap,
>  	bool			*shared,
> -	uint			*lockmode)
> +	uint			*lockmode,
> +	unsigned		iomap_flags)
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> @@ -471,6 +472,13 @@ xfs_reflink_allocate_cow(
>  	if (nimaps == 0)
>  		return -ENOSPC;
>  convert:
> +	/*
> +	 * COW fork extents are supposed to remain unwritten until we're ready
> +	 * to initiate a disk write.  For direct I/O we are going to write the
> +	 * data and need the conversion, but for buffered writes we're done.
> +	 */
> +	if (!(iomap_flags & IOMAP_DIRECT))
> +		return 0;
>  	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
>  
>  out_unreserve:
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 6d73daef1f13..70d68a1a9b49 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -15,7 +15,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *imap);
>  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
> -		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
> +		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> +		unsigned iomap_flags);
>  extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
>  		xfs_off_t count);
>  
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 7/8] xfs: report IOMAP_F_SHARED from xfs_file_iomap_begin_delay
  2019-02-18  9:18 ` [PATCH 7/8] xfs: report IOMAP_F_SHARED from xfs_file_iomap_begin_delay Christoph Hellwig
@ 2019-02-19  5:20   ` Darrick J. Wong
  0 siblings, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2019-02-19  5:20 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 18, 2019 at 10:18:26AM +0100, Christoph Hellwig wrote:
> No user of it in the iomap code at the moment, but we should not
> actively report wrong information if we can trivially get it right.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> ---
>  fs/xfs/xfs_iomap.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index c9fd1e4a1f99..7b71dcc97ef0 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -541,7 +541,7 @@ xfs_file_iomap_begin_delay(
>  	struct xfs_bmbt_irec	imap, cmap;
>  	struct xfs_iext_cursor	icur, ccur;
>  	xfs_fsblock_t		prealloc_blocks = 0;
> -	bool			eof = false, cow_eof = false, shared;
> +	bool			eof = false, cow_eof = false, shared = false;
>  	int			whichfork = XFS_DATA_FORK;
>  	int			error = 0;
>  
> @@ -710,13 +710,14 @@ xfs_file_iomap_begin_delay(
>  		if (imap.br_startoff > offset_fsb) {
>  			xfs_trim_extent(&cmap, offset_fsb,
>  					imap.br_startoff - offset_fsb);
> -			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
>  			goto out_unlock;
>  		}
>  		/* ensure we only report blocks we have a reservation for */
>  		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
> +		shared = true;
>  	}
> -	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
> +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
>  out_unlock:
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	return error;
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 8/8] xfs: introduce an always_cow mode
  2019-02-18  9:18 ` [PATCH 8/8] xfs: introduce an always_cow mode Christoph Hellwig
@ 2019-02-19  5:25   ` Darrick J. Wong
  2019-02-19 17:53     ` Darrick J. Wong
  2019-02-20 15:00     ` Christoph Hellwig
  2019-02-19 18:31   ` Darrick J. Wong
  1 sibling, 2 replies; 33+ messages in thread
From: Darrick J. Wong @ 2019-02-19  5:25 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 18, 2019 at 10:18:27AM +0100, Christoph Hellwig wrote:
> Add a mode where XFS never overwrites existing blocks in place.  This
> is to aid debugging our COW code, and also put infrastructure in place
> for things like possible future support for zoned block devices, which
> can't support overwrites.
> 
> This mode is enabled globally by doing a:
> 
>     echo 1 > /sys/fs/xfs/debug/always_cow
> 
> Note that the parameter is global to allow running all tests in xfstests
> easily in this mode, which would not easily be possible with a per-fs
> sysfs file.
> 
> In always_cow mode persistent preallocations are disabled, and fallocate
> will fail when called with a 0 mode (with or without
> FALLOC_FL_KEEP_SIZE), and not create unwritten extents for zeroed space
> when called with FALLOC_FL_ZERO_RANGE or FALLOC_FL_UNSHARE_RANGE.
> 
> There are a few interesting xfstests failures when run in always_cow
> mode:
> 
>  - generic/392 fails because the bytes used in the file used to test
>    hole punch recovery are less after the log replay.  This is
>    because the blocks written and then punched out are only freed
>    with a delay due to the logging mechanism.
>  - xfs/170 will fail as the already fragile file streams mechanism
>    doesn't seem to interact well with the COW allocator
>  - xfs/180 xfs/182 xfs/192 xfs/198 xfs/204 and xfs/208 will claim
>    the file system is badly fragmented, but there is not much we
>    can do to avoid that when always writing out of place
>  - xfs/205 fails because overwriting a file in always_cow mode
>    will require new space allocation and the assumptions in the
>    test thus don't work anymore.
>  - xfs/326 fails to modify the file at all in always_cow mode after
>    injecting the refcount error, leading to an unexpected md5sum
>    after the remount, but that again is expected

Yeah, I saw failures in a bunch of tests when running always_cow against
a 4k blocksize rmap/reflink filesystem:

generic/476 xfs/066 xfs/1501 xfs/064 xfs/296 xfs/205 xfs/056 xfs/198
generic/392 xfs/170 xfs/450 xfs/060 xfs/068 xfs/192 generic/475 xfs/283

Will have a look at those tomorrow...

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_aops.c      |  2 +-
>  fs/xfs/xfs_bmap_util.c |  9 +++------
>  fs/xfs/xfs_file.c      | 27 ++++++++++++++++++++-------
>  fs/xfs/xfs_iomap.c     | 28 ++++++++++++++++++----------
>  fs/xfs/xfs_mount.h     |  1 +
>  fs/xfs/xfs_reflink.c   | 28 ++++++++++++++++++++++++----
>  fs/xfs/xfs_reflink.h   | 13 +++++++++++++
>  fs/xfs/xfs_super.c     | 15 +++++++++++----
>  fs/xfs/xfs_sysctl.h    |  1 +
>  fs/xfs/xfs_sysfs.c     | 24 ++++++++++++++++++++++++
>  10 files changed, 116 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 983d11c27d32..7b8bb6bde981 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -1023,7 +1023,7 @@ xfs_vm_bmap(
>  	 * Since we don't pass back blockdev info, we can't return bmap
>  	 * information for rt files either.
>  	 */
> -	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
> +	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
>  		return 0;
>  	return iomap_bmap(mapping, block, &xfs_iomap_ops);
>  }
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 1ee8c5539fa4..2db43ff4f8b5 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1162,16 +1162,13 @@ xfs_zero_file_space(
>  	 * by virtue of the hole punch.
>  	 */
>  	error = xfs_free_file_space(ip, offset, len);
> -	if (error)
> -		goto out;
> +	if (error || xfs_is_always_cow_inode(ip))
> +		return error;
>  
> -	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
> +	return xfs_alloc_file_space(ip, round_down(offset, blksize),
>  				     round_up(offset + len, blksize) -
>  				     round_down(offset, blksize),
>  				     XFS_BMAPI_PREALLOC);
> -out:
> -	return error;
> -
>  }
>  
>  static int
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 1d07dcfbbff3..770cc2edf777 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
>  		 * We can't properly handle unaligned direct I/O to reflink
>  		 * files yet, as we can't unshare a partial block.
>  		 */
> -		if (xfs_is_reflink_inode(ip)) {
> +		if (xfs_is_cow_inode(ip)) {
>  			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
>  			return -EREMCHG;
>  		}
> @@ -872,14 +872,27 @@ xfs_file_fallocate(
>  				goto out_unlock;
>  		}
>  
> -		if (mode & FALLOC_FL_ZERO_RANGE)
> +		if (mode & FALLOC_FL_ZERO_RANGE) {
>  			error = xfs_zero_file_space(ip, offset, len);
> -		else {
> -			if (mode & FALLOC_FL_UNSHARE_RANGE) {
> -				error = xfs_reflink_unshare(ip, offset, len);
> -				if (error)
> -					goto out_unlock;
> +		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
> +			error = xfs_reflink_unshare(ip, offset, len);
> +			if (error)
> +				goto out_unlock;
> +
> +			if (!xfs_is_always_cow_inode(ip)) {
> +				error = xfs_alloc_file_space(ip, offset, len,
> +						XFS_BMAPI_PREALLOC);
>  			}
> +		} else {
> +			/*
> +			 * If always_cow mode we can't use preallocations and
> +			 * thus should not create them.
> +			 */
> +			if (xfs_is_always_cow_inode(ip)) {
> +				error = -EOPNOTSUPP;
> +				goto out_unlock;
> +			}
> +
>  			error = xfs_alloc_file_space(ip, offset, len,
>  						     XFS_BMAPI_PREALLOC);
>  		}
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 7b71dcc97ef0..72533e9dddc6 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -395,12 +395,13 @@ xfs_quota_calc_throttle(
>  STATIC xfs_fsblock_t
>  xfs_iomap_prealloc_size(
>  	struct xfs_inode	*ip,
> +	int			whichfork,
>  	loff_t			offset,
>  	loff_t			count,
>  	struct xfs_iext_cursor	*icur)
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
> -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> +	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
>  	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
>  	struct xfs_bmbt_irec	prev;
>  	int			shift = 0;
> @@ -593,7 +594,11 @@ xfs_file_iomap_begin_delay(
>  	 * themselves.  Second the lookup in the extent list is generally faster
>  	 * than going out to the shared extent tree.
>  	 */
> -	if (xfs_is_reflink_inode(ip)) {
> +	if (xfs_is_cow_inode(ip)) {
> +		if (!ip->i_cowfp) {
> +			ASSERT(!xfs_is_reflink_inode(ip));
> +			xfs_ifork_init_cow(ip);
> +		}
>  		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
>  				&ccur, &cmap);
>  		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> @@ -609,7 +614,7 @@ xfs_file_iomap_begin_delay(
>  		 * overwriting shared extents.   This includes zeroing of
>  		 * existing extents that contain data.
>  		 */
> -		if (!xfs_is_reflink_inode(ip) ||
> +		if (!xfs_is_cow_inode(ip) ||
>  		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
>  			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
>  					&imap);
> @@ -619,7 +624,7 @@ xfs_file_iomap_begin_delay(
>  		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
>  
>  		/* Trim the mapping to the nearest shared extent boundary. */
> -		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> +		error = xfs_inode_need_cow(ip, &imap, &shared);
>  		if (error)
>  			goto out_unlock;
>  
> @@ -648,15 +653,18 @@ xfs_file_iomap_begin_delay(
>  		 */
>  		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
>  		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> +
> +		if (xfs_is_always_cow_inode(ip))
> +			whichfork = XFS_COW_FORK;
>  	}
>  
>  	error = xfs_qm_dqattach_locked(ip, false);
>  	if (error)
>  		goto out_unlock;
>  
> -	if (eof && whichfork == XFS_DATA_FORK) {
> -		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
> -				&icur);
> +	if (eof) {
> +		prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
> +				count, &icur);
>  		if (prealloc_blocks) {
>  			xfs_extlen_t	align;
>  			xfs_off_t	end_offset;
> @@ -867,7 +875,7 @@ xfs_ilock_for_iomap(
>  	 * COW writes may allocate delalloc space or convert unwritten COW
>  	 * extents, so we need to make sure to take the lock exclusively here.
>  	 */
> -	if (xfs_is_reflink_inode(ip) && is_write) {
> +	if (xfs_is_cow_inode(ip) && is_write) {
>  		/*
>  		 * FIXME: It could still overwrite on unshared extents and not
>  		 * need allocation.
> @@ -901,7 +909,7 @@ xfs_ilock_for_iomap(
>  	 * check, so if we got ILOCK_SHARED for a write and but we're now a
>  	 * reflink inode we have to switch to ILOCK_EXCL and relock.
>  	 */
> -	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
> +	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
>  		xfs_iunlock(ip, mode);
>  		mode = XFS_ILOCK_EXCL;
>  		goto relock;
> @@ -973,7 +981,7 @@ xfs_file_iomap_begin(
>  	 * Break shared extents if necessary. Checks for non-blocking IO have
>  	 * been done up front, so we don't need to do them here.
>  	 */
> -	if (xfs_is_reflink_inode(ip)) {
> +	if (xfs_is_cow_inode(ip)) {
>  		struct xfs_bmbt_irec	orig = imap;
>  
>  		/* if zeroing doesn't need COW allocation, then we are done. */
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index a33f45077867..fa8b37d61838 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -194,6 +194,7 @@ typedef struct xfs_mount {
>  	 */
>  	uint32_t		m_generation;
>  
> +	bool			m_always_cow;
>  	bool			m_fail_unmount;
>  #ifdef DEBUG
>  	/*
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index f84b37fa4f17..e2d9179bd50d 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
>  	int			error = 0;
>  
>  	/* Holes, unwritten, and delalloc extents cannot be shared */
> -	if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
> +	if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
>  		*shared = false;
>  		return 0;
>  	}
> @@ -234,6 +234,23 @@ xfs_reflink_trim_around_shared(
>  	}
>  }
>  
> +bool
> +xfs_inode_need_cow(

xfs_inode_trim_to_cow() ?

Otherwise generally looks ok to me, but it's late so I'll send this now
and get back to review in the morning.

--D

> +	struct xfs_inode	*ip,
> +	struct xfs_bmbt_irec	*imap,
> +	bool			*shared)
> +{
> +	/* We can't update any real extents in always COW mode. */
> +	if (xfs_is_always_cow_inode(ip) &&
> +	    !isnullstartblock(imap->br_startblock)) {
> +		*shared = true;
> +		return 0;
> +	}
> +
> +	/* Trim the mapping to the nearest shared extent boundary. */
> +	return xfs_reflink_trim_around_shared(ip, imap, shared);
> +}
> +
>  static int
>  xfs_reflink_convert_cow_locked(
>  	struct xfs_inode	*ip,
> @@ -321,7 +338,7 @@ xfs_find_trim_cow_extent(
>  	if (got.br_startoff > offset_fsb) {
>  		xfs_trim_extent(imap, imap->br_startoff,
>  				got.br_startoff - imap->br_startoff);
> -		return xfs_reflink_trim_around_shared(ip, imap, shared);
> +		return xfs_inode_need_cow(ip, imap, shared);
>  	}
>  
>  	*shared = true;
> @@ -356,7 +373,10 @@ xfs_reflink_allocate_cow(
>  	xfs_extlen_t		resblks = 0;
>  
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> -	ASSERT(xfs_is_reflink_inode(ip));
> +	if (!ip->i_cowfp) {
> +		ASSERT(!xfs_is_reflink_inode(ip));
> +		xfs_ifork_init_cow(ip);
> +	}
>  
>  	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
>  	if (error || !*shared)
> @@ -542,7 +562,7 @@ xfs_reflink_cancel_cow_range(
>  	int			error;
>  
>  	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
> -	ASSERT(xfs_is_reflink_inode(ip));
> +	ASSERT(ip->i_cowfp);
>  
>  	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
>  	if (count == NULLFILEOFF)
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 4a9e3cd4768a..2a3052fbe23e 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -6,11 +6,24 @@
>  #ifndef __XFS_REFLINK_H
>  #define __XFS_REFLINK_H 1
>  
> +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
> +{
> +	return ip->i_mount->m_always_cow &&
> +		xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
> +}
> +
> +static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
> +{
> +	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
> +}
> +
>  extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
>  		xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
>  		xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
>  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *irec, bool *shared);
> +bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
> +		bool *shared);
>  
>  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index c9097cb0b955..6be183a391b6 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1729,11 +1729,18 @@ xfs_fs_fill_super(
>  		}
>  	}
>  
> -	if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
> -		xfs_alert(mp,
> +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> +		if (mp->m_sb.sb_rblocks) {
> +			xfs_alert(mp,
>  	"reflink not compatible with realtime device!");
> -		error = -EINVAL;
> -		goto out_filestream_unmount;
> +			error = -EINVAL;
> +			goto out_filestream_unmount;
> +		}
> +
> +		if (xfs_globals.always_cow) {
> +			xfs_info(mp, "using DEBUG-only always_cow mode.");
> +			mp->m_always_cow = true;
> +		}
>  	}
>  
>  	if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
> diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
> index 168488130a19..ad7f9be13087 100644
> --- a/fs/xfs/xfs_sysctl.h
> +++ b/fs/xfs/xfs_sysctl.h
> @@ -85,6 +85,7 @@ struct xfs_globals {
>  	int	log_recovery_delay;	/* log recovery delay (secs) */
>  	int	mount_delay;		/* mount setup delay (secs) */
>  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> +	bool	always_cow;		/* use COW fork for all overwrites */
>  };
>  extern struct xfs_globals	xfs_globals;
>  
> diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> index cd6a994a7250..cabda13f3c64 100644
> --- a/fs/xfs/xfs_sysfs.c
> +++ b/fs/xfs/xfs_sysfs.c
> @@ -183,10 +183,34 @@ mount_delay_show(
>  }
>  XFS_SYSFS_ATTR_RW(mount_delay);
>  
> +static ssize_t
> +always_cow_store(
> +	struct kobject	*kobject,
> +	const char	*buf,
> +	size_t		count)
> +{
> +	ssize_t		ret;
> +
> +	ret = kstrtobool(buf, &xfs_globals.always_cow);
> +	if (ret < 0)
> +		return ret;
> +	return count;
> +}
> +
> +static ssize_t
> +always_cow_show(
> +	struct kobject	*kobject,
> +	char		*buf)
> +{
> +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
> +}
> +XFS_SYSFS_ATTR_RW(always_cow);
> +
>  static struct attribute *xfs_dbg_attrs[] = {
>  	ATTR_LIST(bug_on_assert),
>  	ATTR_LIST(log_recovery_delay),
>  	ATTR_LIST(mount_delay),
> +	ATTR_LIST(always_cow),
>  	NULL,
>  };
>  
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 8/8] xfs: introduce an always_cow mode
  2019-02-19  5:25   ` Darrick J. Wong
@ 2019-02-19 17:53     ` Darrick J. Wong
  2019-02-20 14:58       ` Christoph Hellwig
  2019-02-20 15:00     ` Christoph Hellwig
  1 sibling, 1 reply; 33+ messages in thread
From: Darrick J. Wong @ 2019-02-19 17:53 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 18, 2019 at 09:25:45PM -0800, Darrick J. Wong wrote:
> On Mon, Feb 18, 2019 at 10:18:27AM +0100, Christoph Hellwig wrote:
> > Add a mode where XFS never overwrites existing blocks in place.  This
> > is to aid debugging our COW code, and also put infrastructure in place
> > for things like possible future support for zoned block devices, which
> > can't support overwrites.
> > 
> > This mode is enabled globally by doing a:
> > 
> >     echo 1 > /sys/fs/xfs/debug/always_cow
> > 
> > Note that the parameter is global to allow running all tests in xfstests
> > easily in this mode, which would not easily be possible with a per-fs
> > sysfs file.
> > 
> > In always_cow mode persistent preallocations are disabled, and fallocate
> > will fail when called with a 0 mode (with or without
> > FALLOC_FL_KEEP_SIZE), and not create unwritten extents for zeroed space
> > when called with FALLOC_FL_ZERO_RANGE or FALLOC_FL_UNSHARE_RANGE.
> > 
> > There are a few interesting xfstests failures when run in always_cow
> > mode:
> > 
> >  - generic/392 fails because the bytes used in the file used to test
> >    hole punch recovery are less after the log replay.  This is
> >    because the blocks written and then punched out are only freed
> >    with a delay due to the logging mechanism.
> >  - xfs/170 will fail as the already fragile file streams mechanism
> >    doesn't seem to interact well with the COW allocator
> >  - xfs/180 xfs/182 xfs/192 xfs/198 xfs/204 and xfs/208 will claim
> >    the file system is badly fragmented, but there is not much we
> >    can do to avoid that when always writing out of place
> >  - xfs/205 fails because overwriting a file in always_cow mode
> >    will require new space allocation and the assumptions in the
> >    test thus don't work anymore.
> >  - xfs/326 fails to modify the file at all in always_cow mode after
> >    injecting the refcount error, leading to an unexpected md5sum
> >    after the remount, but that again is expected
> 
> Yeah, I saw failures in a bunch of tests when running always_cow against
> a 4k blocksize rmap/reflink filesystem:

/me wakes up and looks...

> xfs/056
> xfs/064
> xfs/060
> xfs/066
> xfs/068
> xfs/283
> xfs/296

xfsrestore tries to RESVSP space for its index file and fails,
presumably because the vfs intercepts the RESVSP ioctl and calls
vfs_fallocate.  But then xfsrestore seems to run ALLOCSP, which afaict
succeeds... ?
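
This can be poked at by hand roughly like so (sketch only, hypothetical
file name, xfs_io prealloc-command syntax from memory):

    # RESVSP gets intercepted by the VFS and routed to vfs_fallocate,
    # which always_cow mode now rejects for plain preallocation
    xfs_io -f -c "resvsp 0 1m" $SCRATCH_MNT/restore-index
    # ALLOCSP still goes through xfs_ioc_space and seems to succeed
    xfs_io -c "allocsp 1m 0" $SCRATCH_MNT/restore-index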

Two questions:

- Are you going to mask off xfs_ioc_space from alwayscow inodes like you
  did for xfs_file_fallocate?

- What are we going to do for xfsrestore if it encounters a filesystem
  where everything is alwayscow?

> xfs/205

"disk full, expected" ?

/me wonders, since this is an ENOSPC test, why would it fail if we run
out of space.

> xfs/192
> xfs/198

whines about fragmentation as noted; when we start working on a more
user-visible alwayscow mode we're going to have to figure out a way to
notrun these or something...

> generic/392

There's a little more going on here than what you said above -- the
failing test is a 1k punch, which just writes zeroes to the page cache,
thereby invoking the COW mechanism.  We create a 256k speculative
preallocation in the COW fork, which is reported through i_delayed_blks.
After shutting down and remounting that speculative preallocation is no
longer attached to the inode, so the block count is lower.

Anyway, the test needs some kind of adjustment to work around this,
though for some reason setting cowextsize=1fsb to disable the
speculative cow preallocations still results in a discrepancy of 1 fsb.
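
Setting that hint looks roughly like this (sketch only, hypothetical
file name; the 4096 assumes a 4k block size):

    # set the COW extent size hint to a single block before writing, so
    # zeroing the 1k range doesn't leave a large speculative COW fork
    # preallocation counted in i_delayed_blks across the shutdown
    $XFS_IO_PROG -c "cowextsize 4096" $SCRATCH_MNT/testfile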

> generic/476

lockdep warning about reclaim during fs freeze, not related...

> xfs/1501

unreleased test, ignore this too...

> xfs/170

filestreams, who knows...

> xfs/450

Missing _require_xfs_io_command "falloc", I'll send a patch.

> generic/475

unrelated quota leak problems, yay...

> 
> Will have a look at those tomorrow...

And now back to reviewing patches.

--D

> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  fs/xfs/xfs_aops.c      |  2 +-
> >  fs/xfs/xfs_bmap_util.c |  9 +++------
> >  fs/xfs/xfs_file.c      | 27 ++++++++++++++++++++-------
> >  fs/xfs/xfs_iomap.c     | 28 ++++++++++++++++++----------
> >  fs/xfs/xfs_mount.h     |  1 +
> >  fs/xfs/xfs_reflink.c   | 28 ++++++++++++++++++++++++----
> >  fs/xfs/xfs_reflink.h   | 13 +++++++++++++
> >  fs/xfs/xfs_super.c     | 15 +++++++++++----
> >  fs/xfs/xfs_sysctl.h    |  1 +
> >  fs/xfs/xfs_sysfs.c     | 24 ++++++++++++++++++++++++
> >  10 files changed, 116 insertions(+), 32 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> > index 983d11c27d32..7b8bb6bde981 100644
> > --- a/fs/xfs/xfs_aops.c
> > +++ b/fs/xfs/xfs_aops.c
> > @@ -1023,7 +1023,7 @@ xfs_vm_bmap(
> >  	 * Since we don't pass back blockdev info, we can't return bmap
> >  	 * information for rt files either.
> >  	 */
> > -	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
> > +	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
> >  		return 0;
> >  	return iomap_bmap(mapping, block, &xfs_iomap_ops);
> >  }
> > diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> > index 1ee8c5539fa4..2db43ff4f8b5 100644
> > --- a/fs/xfs/xfs_bmap_util.c
> > +++ b/fs/xfs/xfs_bmap_util.c
> > @@ -1162,16 +1162,13 @@ xfs_zero_file_space(
> >  	 * by virtue of the hole punch.
> >  	 */
> >  	error = xfs_free_file_space(ip, offset, len);
> > -	if (error)
> > -		goto out;
> > +	if (error || xfs_is_always_cow_inode(ip))
> > +		return error;
> >  
> > -	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
> > +	return xfs_alloc_file_space(ip, round_down(offset, blksize),
> >  				     round_up(offset + len, blksize) -
> >  				     round_down(offset, blksize),
> >  				     XFS_BMAPI_PREALLOC);
> > -out:
> > -	return error;
> > -
> >  }
> >  
> >  static int
> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > index 1d07dcfbbff3..770cc2edf777 100644
> > --- a/fs/xfs/xfs_file.c
> > +++ b/fs/xfs/xfs_file.c
> > @@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
> >  		 * We can't properly handle unaligned direct I/O to reflink
> >  		 * files yet, as we can't unshare a partial block.
> >  		 */
> > -		if (xfs_is_reflink_inode(ip)) {
> > +		if (xfs_is_cow_inode(ip)) {
> >  			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
> >  			return -EREMCHG;
> >  		}
> > @@ -872,14 +872,27 @@ xfs_file_fallocate(
> >  				goto out_unlock;
> >  		}
> >  
> > -		if (mode & FALLOC_FL_ZERO_RANGE)
> > +		if (mode & FALLOC_FL_ZERO_RANGE) {
> >  			error = xfs_zero_file_space(ip, offset, len);
> > -		else {
> > -			if (mode & FALLOC_FL_UNSHARE_RANGE) {
> > -				error = xfs_reflink_unshare(ip, offset, len);
> > -				if (error)
> > -					goto out_unlock;
> > +		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
> > +			error = xfs_reflink_unshare(ip, offset, len);
> > +			if (error)
> > +				goto out_unlock;
> > +
> > +			if (!xfs_is_always_cow_inode(ip)) {
> > +				error = xfs_alloc_file_space(ip, offset, len,
> > +						XFS_BMAPI_PREALLOC);
> >  			}
> > +		} else {
> > +			/*
> > +			 * If always_cow mode we can't use preallocations and
> > +			 * thus should not create them.
> > +			 */
> > +			if (xfs_is_always_cow_inode(ip)) {
> > +				error = -EOPNOTSUPP;
> > +				goto out_unlock;
> > +			}
> > +
> >  			error = xfs_alloc_file_space(ip, offset, len,
> >  						     XFS_BMAPI_PREALLOC);
> >  		}
> > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > index 7b71dcc97ef0..72533e9dddc6 100644
> > --- a/fs/xfs/xfs_iomap.c
> > +++ b/fs/xfs/xfs_iomap.c
> > @@ -395,12 +395,13 @@ xfs_quota_calc_throttle(
> >  STATIC xfs_fsblock_t
> >  xfs_iomap_prealloc_size(
> >  	struct xfs_inode	*ip,
> > +	int			whichfork,
> >  	loff_t			offset,
> >  	loff_t			count,
> >  	struct xfs_iext_cursor	*icur)
> >  {
> >  	struct xfs_mount	*mp = ip->i_mount;
> > -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> > +	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
> >  	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
> >  	struct xfs_bmbt_irec	prev;
> >  	int			shift = 0;
> > @@ -593,7 +594,11 @@ xfs_file_iomap_begin_delay(
> >  	 * themselves.  Second the lookup in the extent list is generally faster
> >  	 * than going out to the shared extent tree.
> >  	 */
> > -	if (xfs_is_reflink_inode(ip)) {
> > +	if (xfs_is_cow_inode(ip)) {
> > +		if (!ip->i_cowfp) {
> > +			ASSERT(!xfs_is_reflink_inode(ip));
> > +			xfs_ifork_init_cow(ip);
> > +		}
> >  		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
> >  				&ccur, &cmap);
> >  		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> > @@ -609,7 +614,7 @@ xfs_file_iomap_begin_delay(
> >  		 * overwriting shared extents.   This includes zeroing of
> >  		 * existing extents that contain data.
> >  		 */
> > -		if (!xfs_is_reflink_inode(ip) ||
> > +		if (!xfs_is_cow_inode(ip) ||
> >  		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
> >  			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> >  					&imap);
> > @@ -619,7 +624,7 @@ xfs_file_iomap_begin_delay(
> >  		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
> >  
> >  		/* Trim the mapping to the nearest shared extent boundary. */
> > -		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> > +		error = xfs_inode_need_cow(ip, &imap, &shared);
> >  		if (error)
> >  			goto out_unlock;
> >  
> > @@ -648,15 +653,18 @@ xfs_file_iomap_begin_delay(
> >  		 */
> >  		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> >  		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > +
> > +		if (xfs_is_always_cow_inode(ip))
> > +			whichfork = XFS_COW_FORK;
> >  	}
> >  
> >  	error = xfs_qm_dqattach_locked(ip, false);
> >  	if (error)
> >  		goto out_unlock;
> >  
> > -	if (eof && whichfork == XFS_DATA_FORK) {
> > -		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
> > -				&icur);
> > +	if (eof) {
> > +		prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
> > +				count, &icur);
> >  		if (prealloc_blocks) {
> >  			xfs_extlen_t	align;
> >  			xfs_off_t	end_offset;
> > @@ -867,7 +875,7 @@ xfs_ilock_for_iomap(
> >  	 * COW writes may allocate delalloc space or convert unwritten COW
> >  	 * extents, so we need to make sure to take the lock exclusively here.
> >  	 */
> > -	if (xfs_is_reflink_inode(ip) && is_write) {
> > +	if (xfs_is_cow_inode(ip) && is_write) {
> >  		/*
> >  		 * FIXME: It could still overwrite on unshared extents and not
> >  		 * need allocation.
> > @@ -901,7 +909,7 @@ xfs_ilock_for_iomap(
> >  	 * check, so if we got ILOCK_SHARED for a write and but we're now a
> >  	 * reflink inode we have to switch to ILOCK_EXCL and relock.
> >  	 */
> > -	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
> > +	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
> >  		xfs_iunlock(ip, mode);
> >  		mode = XFS_ILOCK_EXCL;
> >  		goto relock;
> > @@ -973,7 +981,7 @@ xfs_file_iomap_begin(
> >  	 * Break shared extents if necessary. Checks for non-blocking IO have
> >  	 * been done up front, so we don't need to do them here.
> >  	 */
> > -	if (xfs_is_reflink_inode(ip)) {
> > +	if (xfs_is_cow_inode(ip)) {
> >  		struct xfs_bmbt_irec	orig = imap;
> >  
> >  		/* if zeroing doesn't need COW allocation, then we are done. */
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index a33f45077867..fa8b37d61838 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -194,6 +194,7 @@ typedef struct xfs_mount {
> >  	 */
> >  	uint32_t		m_generation;
> >  
> > +	bool			m_always_cow;
> >  	bool			m_fail_unmount;
> >  #ifdef DEBUG
> >  	/*
> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > index f84b37fa4f17..e2d9179bd50d 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
> >  	int			error = 0;
> >  
> >  	/* Holes, unwritten, and delalloc extents cannot be shared */
> > -	if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
> > +	if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
> >  		*shared = false;
> >  		return 0;
> >  	}
> > @@ -234,6 +234,23 @@ xfs_reflink_trim_around_shared(
> >  	}
> >  }
> >  
> > +bool
> > +xfs_inode_need_cow(
> 
> xfs_inode_trim_to_cow() ?
> 
> Otherwise generally looks ok to me, but it's late so I'll send this now
> and get back to review in the morning.
> 
> --D
> 
> > +	struct xfs_inode	*ip,
> > +	struct xfs_bmbt_irec	*imap,
> > +	bool			*shared)
> > +{
> > +	/* We can't update any real extents in always COW mode. */
> > +	if (xfs_is_always_cow_inode(ip) &&
> > +	    !isnullstartblock(imap->br_startblock)) {
> > +		*shared = true;
> > +		return 0;
> > +	}
> > +
> > +	/* Trim the mapping to the nearest shared extent boundary. */
> > +	return xfs_reflink_trim_around_shared(ip, imap, shared);
> > +}
> > +
> >  static int
> >  xfs_reflink_convert_cow_locked(
> >  	struct xfs_inode	*ip,
> > @@ -321,7 +338,7 @@ xfs_find_trim_cow_extent(
> >  	if (got.br_startoff > offset_fsb) {
> >  		xfs_trim_extent(imap, imap->br_startoff,
> >  				got.br_startoff - imap->br_startoff);
> > -		return xfs_reflink_trim_around_shared(ip, imap, shared);
> > +		return xfs_inode_need_cow(ip, imap, shared);
> >  	}
> >  
> >  	*shared = true;
> > @@ -356,7 +373,10 @@ xfs_reflink_allocate_cow(
> >  	xfs_extlen_t		resblks = 0;
> >  
> >  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> > -	ASSERT(xfs_is_reflink_inode(ip));
> > +	if (!ip->i_cowfp) {
> > +		ASSERT(!xfs_is_reflink_inode(ip));
> > +		xfs_ifork_init_cow(ip);
> > +	}
> >  
> >  	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
> >  	if (error || !*shared)
> > @@ -542,7 +562,7 @@ xfs_reflink_cancel_cow_range(
> >  	int			error;
> >  
> >  	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
> > -	ASSERT(xfs_is_reflink_inode(ip));
> > +	ASSERT(ip->i_cowfp);
> >  
> >  	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
> >  	if (count == NULLFILEOFF)
> > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> > index 4a9e3cd4768a..2a3052fbe23e 100644
> > --- a/fs/xfs/xfs_reflink.h
> > +++ b/fs/xfs/xfs_reflink.h
> > @@ -6,11 +6,24 @@
> >  #ifndef __XFS_REFLINK_H
> >  #define __XFS_REFLINK_H 1
> >  
> > +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
> > +{
> > +	return ip->i_mount->m_always_cow &&
> > +		xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
> > +}
> > +
> > +static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
> > +{
> > +	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
> > +}
> > +
> >  extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
> >  		xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
> >  		xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
> >  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
> >  		struct xfs_bmbt_irec *irec, bool *shared);
> > +bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
> > +		bool *shared);
> >  
> >  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
> >  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index c9097cb0b955..6be183a391b6 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -1729,11 +1729,18 @@ xfs_fs_fill_super(
> >  		}
> >  	}
> >  
> > -	if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
> > -		xfs_alert(mp,
> > +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> > +		if (mp->m_sb.sb_rblocks) {
> > +			xfs_alert(mp,
> >  	"reflink not compatible with realtime device!");
> > -		error = -EINVAL;
> > -		goto out_filestream_unmount;
> > +			error = -EINVAL;
> > +			goto out_filestream_unmount;
> > +		}
> > +
> > +		if (xfs_globals.always_cow) {
> > +			xfs_info(mp, "using DEBUG-only always_cow mode.");
> > +			mp->m_always_cow = true;
> > +		}
> >  	}
> >  
> >  	if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
> > diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
> > index 168488130a19..ad7f9be13087 100644
> > --- a/fs/xfs/xfs_sysctl.h
> > +++ b/fs/xfs/xfs_sysctl.h
> > @@ -85,6 +85,7 @@ struct xfs_globals {
> >  	int	log_recovery_delay;	/* log recovery delay (secs) */
> >  	int	mount_delay;		/* mount setup delay (secs) */
> >  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> > +	bool	always_cow;		/* use COW fork for all overwrites */
> >  };
> >  extern struct xfs_globals	xfs_globals;
> >  
> > diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> > index cd6a994a7250..cabda13f3c64 100644
> > --- a/fs/xfs/xfs_sysfs.c
> > +++ b/fs/xfs/xfs_sysfs.c
> > @@ -183,10 +183,34 @@ mount_delay_show(
> >  }
> >  XFS_SYSFS_ATTR_RW(mount_delay);
> >  
> > +static ssize_t
> > +always_cow_store(
> > +	struct kobject	*kobject,
> > +	const char	*buf,
> > +	size_t		count)
> > +{
> > +	ssize_t		ret;
> > +
> > +	ret = kstrtobool(buf, &xfs_globals.always_cow);
> > +	if (ret < 0)
> > +		return ret;
> > +	return count;
> > +}
> > +
> > +static ssize_t
> > +always_cow_show(
> > +	struct kobject	*kobject,
> > +	char		*buf)
> > +{
> > +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
> > +}
> > +XFS_SYSFS_ATTR_RW(always_cow);
> > +
> >  static struct attribute *xfs_dbg_attrs[] = {
> >  	ATTR_LIST(bug_on_assert),
> >  	ATTR_LIST(log_recovery_delay),
> >  	ATTR_LIST(mount_delay),
> > +	ATTR_LIST(always_cow),
> >  	NULL,
> >  };
> >  
> > -- 
> > 2.20.1
> > 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay
  2019-02-18  9:18 ` [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay Christoph Hellwig
@ 2019-02-19 18:12   ` Darrick J. Wong
  2019-02-21 17:59   ` Brian Foster
  1 sibling, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2019-02-19 18:12 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> Besides simplifying the code a bit this allows to actually implement
> the behavior of using COW preallocation for non-COW data mentioned
> in the current comments.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> ---
>  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
>  fs/xfs/xfs_reflink.c |  67 ----------------------
>  fs/xfs/xfs_reflink.h |   2 -
>  fs/xfs/xfs_trace.h   |   3 -
>  4 files changed, 94 insertions(+), 111 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 19a3331b4a56..c9fd1e4a1f99 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -534,15 +534,16 @@ xfs_file_iomap_begin_delay(
>  {
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	struct xfs_mount	*mp = ip->i_mount;
> -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
>  	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
>  	xfs_fileoff_t		maxbytes_fsb =
>  		XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
>  	xfs_fileoff_t		end_fsb;
> -	int			error = 0, eof = 0;
> -	struct xfs_bmbt_irec	got;
> -	struct xfs_iext_cursor	icur;
> +	struct xfs_bmbt_irec	imap, cmap;
> +	struct xfs_iext_cursor	icur, ccur;
>  	xfs_fsblock_t		prealloc_blocks = 0;
> +	bool			eof = false, cow_eof = false, shared;
> +	int			whichfork = XFS_DATA_FORK;
> +	int			error = 0;
>  
>  	ASSERT(!XFS_IS_REALTIME_INODE(ip));
>  	ASSERT(!xfs_get_extsz_hint(ip));
> @@ -560,7 +561,7 @@ xfs_file_iomap_begin_delay(
>  
>  	XFS_STATS_INC(mp, xs_blk_mapw);
>  
> -	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
> +	if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
>  		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
>  		if (error)
>  			goto out_unlock;
> @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
>  
>  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
>  
> -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> +	/*
> +	 * Search the data fork first to look up our source mapping.  We
> +	 * always need the data fork map, as we have to return it to the
> +	 * iomap code so that the higher level write code can read data in to
> +	 * perform read-modify-write cycles for unaligned writes.
> +	 */
> +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
>  	if (eof)
> -		got.br_startoff = end_fsb; /* fake hole until the end */
> +		imap.br_startoff = end_fsb; /* fake hole until the end */
> +
> +	/* We never need to allocate blocks for zeroing a hole. */
> +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> +		goto out_unlock;
> +	}
> +
> +	/*
> +	 * Search the COW fork extent list even if we did not find a data fork
> +	 * extent.  This serves two purposes: first this implements the
> +	 * speculative preallocation using cowextsize, so that we also unshare
> +	 * blocks adjacent to shared blocks instead of just the shared blocks
> +	 * themselves.  Second the lookup in the extent list is generally faster
> +	 * than going out to the shared extent tree.
> +	 */
> +	if (xfs_is_reflink_inode(ip)) {
> +		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
> +				&ccur, &cmap);
> +		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> +			trace_xfs_reflink_cow_found(ip, &cmap);
> +			whichfork = XFS_COW_FORK;
> +			goto done;
> +		}
> +	}
>  
> -	if (got.br_startoff <= offset_fsb) {
> +	if (imap.br_startoff <= offset_fsb) {
>  		/*
>  		 * For reflink files we may need a delalloc reservation when
>  		 * overwriting shared extents.   This includes zeroing of
>  		 * existing extents that contain data.
>  		 */
> -		if (xfs_is_reflink_inode(ip) &&
> -		    ((flags & IOMAP_WRITE) ||
> -		     got.br_state != XFS_EXT_UNWRITTEN)) {
> -			xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
> -			error = xfs_reflink_reserve_cow(ip, &got);
> -			if (error)
> -				goto out_unlock;
> +		if (!xfs_is_reflink_inode(ip) ||
> +		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
> +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> +					&imap);
> +			goto done;
>  		}
>  
> -		trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got);
> -		goto done;
> -	}
> +		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
>  
> -	if (flags & IOMAP_ZERO) {
> -		xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
> -		goto out_unlock;
> +		/* Trim the mapping to the nearest shared extent boundary. */
> +		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> +		if (error)
> +			goto out_unlock;
> +
> +		/* Not shared?  Just report the (potentially capped) extent. */
> +		if (!shared) {
> +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> +					&imap);
> +			goto done;
> +		}
> +
> +		/*
> +		 * Fork all the shared blocks from our write offset until the
> +		 * end of the extent.
> +		 */
> +		whichfork = XFS_COW_FORK;
> +		end_fsb = imap.br_startoff + imap.br_blockcount;
> +	} else {
> +		/*
> +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> +		 * pages to keep the chunks of work done where somewhat
> +		 * symmetric with the work writeback does.  This is a completely
> +		 * arbitrary number pulled out of thin air.
> +		 *
> +		 * Note that the values needs to be less than 32-bits wide until
> +		 * the lower level functions are updated.
> +		 */
> +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
>  	}
>  
>  	error = xfs_qm_dqattach_locked(ip, false);
>  	if (error)
>  		goto out_unlock;
>  
> -	/*
> -	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
> -	 * to keep the chunks of work done where somewhat symmetric with the
> -	 * work writeback does. This is a completely arbitrary number pulled
> -	 * out of thin air as a best guess for initial testing.
> -	 *
> -	 * Note that the values needs to be less than 32-bits wide until
> -	 * the lower level functions are updated.
> -	 */
> -	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> -	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> -
> -	if (eof) {
> +	if (eof && whichfork == XFS_DATA_FORK) {
>  		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
>  				&icur);
>  		if (prealloc_blocks) {
> @@ -635,9 +677,11 @@ xfs_file_iomap_begin_delay(
>  	}
>  
>  retry:
> -	error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
> -			end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
> -			eof);
> +	error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
> +			end_fsb - offset_fsb, prealloc_blocks,
> +			whichfork == XFS_DATA_FORK ? &imap : &cmap,
> +			whichfork == XFS_DATA_FORK ? &icur : &ccur,
> +			whichfork == XFS_DATA_FORK ? eof : cow_eof);
>  	switch (error) {
>  	case 0:
>  		break;
> @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
>  	 * them out if the write happens to fail.
>  	 */
>  	iomap->flags |= IOMAP_F_NEW;
> -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
>  done:
> -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> +	if (whichfork == XFS_COW_FORK) {
> +		if (imap.br_startoff > offset_fsb) {
> +			xfs_trim_extent(&cmap, offset_fsb,
> +					imap.br_startoff - offset_fsb);
> +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> +			goto out_unlock;
> +		}
> +		/* ensure we only report blocks we have a reservation for */
> +		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
> +	}
> +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
>  out_unlock:
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	return error;
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 8a5353daf9ab..9ef1f79cb3ae 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -234,73 +234,6 @@ xfs_reflink_trim_around_shared(
>  	}
>  }
>  
> -/*
> - * Trim the passed in imap to the next shared/unshared extent boundary, and
> - * if imap->br_startoff points to a shared extent reserve space for it in the
> - * COW fork.
> - *
> - * Note that imap will always contain the block numbers for the existing blocks
> - * in the data fork, as the upper layers need them for read-modify-write
> - * operations.
> - */
> -int
> -xfs_reflink_reserve_cow(
> -	struct xfs_inode	*ip,
> -	struct xfs_bmbt_irec	*imap)
> -{
> -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
> -	struct xfs_bmbt_irec	got;
> -	int			error = 0;
> -	bool			eof = false;
> -	struct xfs_iext_cursor	icur;
> -	bool			shared;
> -
> -	/*
> -	 * Search the COW fork extent list first.  This serves two purposes:
> -	 * first this implement the speculative preallocation using cowextisze,
> -	 * so that we also unshared block adjacent to shared blocks instead
> -	 * of just the shared blocks themselves.  Second the lookup in the
> -	 * extent list is generally faster than going out to the shared extent
> -	 * tree.
> -	 */
> -
> -	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
> -		eof = true;
> -	if (!eof && got.br_startoff <= imap->br_startoff) {
> -		trace_xfs_reflink_cow_found(ip, imap);
> -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> -		return 0;
> -	}
> -
> -	/* Trim the mapping to the nearest shared extent boundary. */
> -	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
> -	if (error)
> -		return error;
> -
> -	/* Not shared?  Just report the (potentially capped) extent. */
> -	if (!shared)
> -		return 0;
> -
> -	/*
> -	 * Fork all the shared blocks from our write offset until the end of
> -	 * the extent.
> -	 */
> -	error = xfs_qm_dqattach_locked(ip, false);
> -	if (error)
> -		return error;
> -
> -	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
> -			imap->br_blockcount, 0, &got, &icur, eof);
> -	if (error == -ENOSPC || error == -EDQUOT)
> -		trace_xfs_reflink_cow_enospc(ip, imap);
> -	if (error)
> -		return error;
> -
> -	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> -	trace_xfs_reflink_cow_alloc(ip, &got);
> -	return 0;
> -}
> -
>  /* Convert part of an unwritten CoW extent to a real one. */
>  STATIC int
>  xfs_reflink_convert_cow_extent(
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 70d68a1a9b49..4a9e3cd4768a 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -12,8 +12,6 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
>  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *irec, bool *shared);
>  
> -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
> -		struct xfs_bmbt_irec *imap);
>  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
>  		unsigned iomap_flags);
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index f1e18ae8a209..47fb07d86efd 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3196,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
>  
>  /* copy on write */
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
> -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
>  
> -DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
> -
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
>  
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 6/8] xfs: make COW fork unwritten extent conversions more robust
  2019-02-18  9:18 ` [PATCH 6/8] xfs: make COW fork unwritten extent conversions more robust Christoph Hellwig
@ 2019-02-19 18:16   ` Darrick J. Wong
  0 siblings, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2019-02-19 18:16 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 18, 2019 at 10:18:25AM +0100, Christoph Hellwig wrote:
> If we have racing buffered and direct I/O, COW fork extents under
> writeback can have been moved to the data fork by the time we call
> xfs_reflink_convert_cow from xfs_submit_ioend.  This would be mostly
> harmless as the block numbers don't change by this move, except for
> the fact that xfs_bmapi_write will crash or trigger asserts when
> not finding existing extents, even despite trying to paper over this
> with the XFS_BMAPI_CONVERT_ONLY flag.
> 
> Instead of special casing non-transaction conversions in the already
> way too complicated xfs_bmapi_write, just add a new helper for the much
> simpler non-transactional COW fork case, which simply ignores extents
> that are not found.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> ---
>  fs/xfs/libxfs/xfs_bmap.c |  9 ++----
>  fs/xfs/libxfs/xfs_bmap.h |  8 +++---
>  fs/xfs/xfs_reflink.c     | 61 +++++++++++++++++++++++++---------------
>  3 files changed, 45 insertions(+), 33 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 4cf83475f0d0..114b4da9add6 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -2031,7 +2031,7 @@ xfs_bmap_add_extent_delay_real(
>  /*
>   * Convert an unwritten allocation to a real allocation or vice versa.
>   */
> -STATIC int				/* error */
> +int					/* error */
>  xfs_bmap_add_extent_unwritten_real(
>  	struct xfs_trans	*tp,
>  	xfs_inode_t		*ip,	/* incore inode pointer */
> @@ -4276,9 +4276,7 @@ xfs_bmapi_write(
>  
>  	ASSERT(*nmap >= 1);
>  	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
> -	ASSERT(tp != NULL ||
> -	       (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
> -			(XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
> +	ASSERT(tp != NULL);
>  	ASSERT(len > 0);
>  	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> @@ -4352,8 +4350,7 @@ xfs_bmapi_write(
>  		 * First, deal with the hole before the allocated space
>  		 * that we found, if any.
>  		 */
> -		if ((need_alloc || wasdelay) &&
> -		    !(flags & XFS_BMAPI_CONVERT_ONLY)) {
> +		if (need_alloc || wasdelay) {
>  			bma.eof = eof;
>  			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
>  			bma.wasdel = wasdelay;
> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> index 78b190b6e908..8f597f9abdbe 100644
> --- a/fs/xfs/libxfs/xfs_bmap.h
> +++ b/fs/xfs/libxfs/xfs_bmap.h
> @@ -95,9 +95,6 @@ struct xfs_extent_free_item
>  /* Map something in the CoW fork. */
>  #define XFS_BMAPI_COWFORK	0x200
>  
> -/* Only convert unwritten extents, don't allocate new blocks */
> -#define XFS_BMAPI_CONVERT_ONLY	0x800
> -
>  /* Skip online discard of freed extents */
>  #define XFS_BMAPI_NODISCARD	0x1000
>  
> @@ -114,7 +111,6 @@ struct xfs_extent_free_item
>  	{ XFS_BMAPI_ZERO,	"ZERO" }, \
>  	{ XFS_BMAPI_REMAP,	"REMAP" }, \
>  	{ XFS_BMAPI_COWFORK,	"COWFORK" }, \
> -	{ XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
>  	{ XFS_BMAPI_NODISCARD,	"NODISCARD" }, \
>  	{ XFS_BMAPI_NORMAP,	"NORMAP" }
>  
> @@ -226,6 +222,10 @@ int	xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
>  int	xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
>  		xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
>  		unsigned int *seq);
> +int	xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
> +		struct xfs_inode *ip, int whichfork,
> +		struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
> +		struct xfs_bmbt_irec *new, int *logflagsp);
>  
>  static inline void
>  xfs_bmap_add_free(
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 9ef1f79cb3ae..f84b37fa4f17 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -234,26 +234,42 @@ xfs_reflink_trim_around_shared(
>  	}
>  }
>  
> -/* Convert part of an unwritten CoW extent to a real one. */
> -STATIC int
> -xfs_reflink_convert_cow_extent(
> -	struct xfs_inode		*ip,
> -	struct xfs_bmbt_irec		*imap,
> -	xfs_fileoff_t			offset_fsb,
> -	xfs_filblks_t			count_fsb)
> +static int
> +xfs_reflink_convert_cow_locked(
> +	struct xfs_inode	*ip,
> +	xfs_fileoff_t		offset_fsb,
> +	xfs_filblks_t		count_fsb)
>  {
> -	int				nimaps = 1;
> +	struct xfs_iext_cursor	icur;
> +	struct xfs_bmbt_irec	got;
> +	struct xfs_btree_cur	*dummy_cur = NULL;
> +	int			dummy_logflags;
> +	int			error;
>  
> -	if (imap->br_state == XFS_EXT_NORM)
> +	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
>  		return 0;
>  
> -	xfs_trim_extent(imap, offset_fsb, count_fsb);
> -	trace_xfs_reflink_convert_cow(ip, imap);
> -	if (imap->br_blockcount == 0)
> -		return 0;
> -	return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
> -			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
> -			&nimaps);
> +	do {
> +		if (got.br_startoff >= offset_fsb + count_fsb)
> +			break;
> +		if (got.br_state == XFS_EXT_NORM)
> +			continue;
> +		if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
> +			return -EIO;
> +
> +		xfs_trim_extent(&got, offset_fsb, count_fsb);
> +		if (!got.br_blockcount)
> +			continue;
> +
> +		got.br_state = XFS_EXT_NORM;
> +		error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
> +				XFS_COW_FORK, &icur, &dummy_cur, &got,
> +				&dummy_logflags);
> +		if (error)
> +			return error;
> +	} while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
> +
> +	return error;
>  }
>  
>  /* Convert all of the unwritten CoW extents in a file's range to real ones. */
> @@ -267,15 +283,12 @@ xfs_reflink_convert_cow(
>  	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
>  	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
>  	xfs_filblks_t		count_fsb = end_fsb - offset_fsb;
> -	struct xfs_bmbt_irec	imap;
> -	int			nimaps = 1, error = 0;
> +	int			error;
>  
>  	ASSERT(count != 0);
>  
>  	xfs_ilock(ip, XFS_ILOCK_EXCL);
> -	error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
> -			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
> -			XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
> +	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	return error;
>  }
> @@ -405,14 +418,16 @@ xfs_reflink_allocate_cow(
>  	if (nimaps == 0)
>  		return -ENOSPC;
>  convert:
> +	xfs_trim_extent(imap, offset_fsb, count_fsb);
>  	/*
>  	 * COW fork extents are supposed to remain unwritten until we're ready
>  	 * to initiate a disk write.  For direct I/O we are going to write the
>  	 * data and need the conversion, but for buffered writes we're done.
>  	 */
> -	if (!(iomap_flags & IOMAP_DIRECT))
> +	if (!(iomap_flags & IOMAP_DIRECT) || imap->br_state == XFS_EXT_NORM)
>  		return 0;
> -	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
> +	trace_xfs_reflink_convert_cow(ip, imap);
> +	return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
>  
>  out_unreserve:
>  	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 8/8] xfs: introduce an always_cow mode
  2019-02-18  9:18 ` [PATCH 8/8] xfs: introduce an always_cow mode Christoph Hellwig
  2019-02-19  5:25   ` Darrick J. Wong
@ 2019-02-19 18:31   ` Darrick J. Wong
  2019-02-20 15:08     ` Christoph Hellwig
  1 sibling, 1 reply; 33+ messages in thread
From: Darrick J. Wong @ 2019-02-19 18:31 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

Ok, formal review of patch 8 follows.

On Mon, Feb 18, 2019 at 10:18:27AM +0100, Christoph Hellwig wrote:
> Add a mode where XFS never overwrites existing blocks in place.  This
> is to aid debugging our COW code, and also put infrastructure in place
> for things like possible future support for zoned block devices, which
> can't support overwrites.
> 
> This mode is enabled globally by doing a:
> 
>     echo 1 > /sys/fs/xfs/debug/always_cow
> 
> Note that the parameter is global to allow running all tests in xfstests
> easily in this mode, which would not easily be possible with a per-fs
> sysfs file.
> 
> In always_cow mode persistent preallocations are disabled, and fallocate
> will fail when called with a 0 mode (with or without
> FALLOC_FL_KEEP_SIZE), and will not create unwritten extents for zeroed space
> when called with FALLOC_FL_ZERO_RANGE or FALLOC_FL_UNSHARE_RANGE.
> 
> There are a few interesting xfstests failures when run in always_cow
> mode:
> 
>  - generic/392 fails because the bytes used in the file used to test
>    hole punch recovery are less after the log replay.  This is
>    because the blocks written and then punched out are only freed
>    with a delay due to the logging mechanism.
>  - xfs/170 will fail as the already fragile file streams mechanism
>    doesn't seem to interact well with the COW allocator
>  - xfs/180 xfs/182 xfs/192 xfs/198 xfs/204 and xfs/208 will claim
>    the file system is badly fragmented, but there is not much we
>    can do to avoid that when always writing out of place
>  - xfs/205 fails because overwriting a file in always_cow mode
>    will require new space allocation and the assumptions in the
>    test thus don't work anymore.
>  - xfs/326 fails to modify the file at all in always_cow mode after
>    injecting the refcount error, leading to an unexpected md5sum
>    after the remount, but that again is expected

I'm assuming the purpose of the debug knob is so that we can get all the
fstests regressions fixed before enabling always cow modes for users...

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_aops.c      |  2 +-
>  fs/xfs/xfs_bmap_util.c |  9 +++------
>  fs/xfs/xfs_file.c      | 27 ++++++++++++++++++++-------
>  fs/xfs/xfs_iomap.c     | 28 ++++++++++++++++++----------
>  fs/xfs/xfs_mount.h     |  1 +
>  fs/xfs/xfs_reflink.c   | 28 ++++++++++++++++++++++++----
>  fs/xfs/xfs_reflink.h   | 13 +++++++++++++
>  fs/xfs/xfs_super.c     | 15 +++++++++++----
>  fs/xfs/xfs_sysctl.h    |  1 +
>  fs/xfs/xfs_sysfs.c     | 24 ++++++++++++++++++++++++
>  10 files changed, 116 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 983d11c27d32..7b8bb6bde981 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -1023,7 +1023,7 @@ xfs_vm_bmap(
>  	 * Since we don't pass back blockdev info, we can't return bmap
>  	 * information for rt files either.
>  	 */
> -	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
> +	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
>  		return 0;
>  	return iomap_bmap(mapping, block, &xfs_iomap_ops);
>  }
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 1ee8c5539fa4..2db43ff4f8b5 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1162,16 +1162,13 @@ xfs_zero_file_space(
>  	 * by virtue of the hole punch.
>  	 */
>  	error = xfs_free_file_space(ip, offset, len);
> -	if (error)
> -		goto out;
> +	if (error || xfs_is_always_cow_inode(ip))
> +		return error;
>  
> -	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
> +	return xfs_alloc_file_space(ip, round_down(offset, blksize),
>  				     round_up(offset + len, blksize) -
>  				     round_down(offset, blksize),
>  				     XFS_BMAPI_PREALLOC);
> -out:
> -	return error;
> -
>  }
>  
>  static int
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 1d07dcfbbff3..770cc2edf777 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
>  		 * We can't properly handle unaligned direct I/O to reflink
>  		 * files yet, as we can't unshare a partial block.
>  		 */
> -		if (xfs_is_reflink_inode(ip)) {
> +		if (xfs_is_cow_inode(ip)) {
>  			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
>  			return -EREMCHG;
>  		}
> @@ -872,14 +872,27 @@ xfs_file_fallocate(
>  				goto out_unlock;
>  		}
>  
> -		if (mode & FALLOC_FL_ZERO_RANGE)
> +		if (mode & FALLOC_FL_ZERO_RANGE) {
>  			error = xfs_zero_file_space(ip, offset, len);
> -		else {
> -			if (mode & FALLOC_FL_UNSHARE_RANGE) {
> -				error = xfs_reflink_unshare(ip, offset, len);
> -				if (error)
> -					goto out_unlock;
> +		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
> +			error = xfs_reflink_unshare(ip, offset, len);
> +			if (error)
> +				goto out_unlock;
> +
> +			if (!xfs_is_always_cow_inode(ip)) {
> +				error = xfs_alloc_file_space(ip, offset, len,
> +						XFS_BMAPI_PREALLOC);
>  			}
> +		} else {
> +			/*
> +			 * If always_cow mode we can't use preallocations and
> +			 * thus should not create them.
> +			 */
> +			if (xfs_is_always_cow_inode(ip)) {
> +				error = -EOPNOTSUPP;
> +				goto out_unlock;
> +			}
> +

Ok, so the basic premise here is that we never call xfs_alloc_file_space
for alwayscow inodes.  As I pointed out in my reply that focused on
fstests errors, xfs_file_fallocate is not the only caller of
xfs_alloc_file_space, so I think xfs_ioc_space and friends are going to
need a similar treatment.

>  			error = xfs_alloc_file_space(ip, offset, len,
>  						     XFS_BMAPI_PREALLOC);
>  		}
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 7b71dcc97ef0..72533e9dddc6 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -395,12 +395,13 @@ xfs_quota_calc_throttle(
>  STATIC xfs_fsblock_t
>  xfs_iomap_prealloc_size(
>  	struct xfs_inode	*ip,
> +	int			whichfork,
>  	loff_t			offset,
>  	loff_t			count,
>  	struct xfs_iext_cursor	*icur)
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
> -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> +	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);

Hmm, so are we able to do preallocations in the cow fork now?

I think that's a separate change from adding the alwayscow knob.

>  	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
>  	struct xfs_bmbt_irec	prev;
>  	int			shift = 0;
> @@ -593,7 +594,11 @@ xfs_file_iomap_begin_delay(
>  	 * themselves.  Second the lookup in the extent list is generally faster
>  	 * than going out to the shared extent tree.
>  	 */
> -	if (xfs_is_reflink_inode(ip)) {
> +	if (xfs_is_cow_inode(ip)) {
> +		if (!ip->i_cowfp) {
> +			ASSERT(!xfs_is_reflink_inode(ip));
> +			xfs_ifork_init_cow(ip);
> +		}
>  		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
>  				&ccur, &cmap);
>  		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> @@ -609,7 +614,7 @@ xfs_file_iomap_begin_delay(
>  		 * overwriting shared extents.   This includes zeroing of
>  		 * existing extents that contain data.
>  		 */
> -		if (!xfs_is_reflink_inode(ip) ||
> +		if (!xfs_is_cow_inode(ip) ||
>  		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
>  			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
>  					&imap);
> @@ -619,7 +624,7 @@ xfs_file_iomap_begin_delay(
>  		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
>  
>  		/* Trim the mapping to the nearest shared extent boundary. */
> -		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> +		error = xfs_inode_need_cow(ip, &imap, &shared);
>  		if (error)
>  			goto out_unlock;
>  
> @@ -648,15 +653,18 @@ xfs_file_iomap_begin_delay(
>  		 */
>  		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
>  		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> +
> +		if (xfs_is_always_cow_inode(ip))
> +			whichfork = XFS_COW_FORK;
>  	}
>  
>  	error = xfs_qm_dqattach_locked(ip, false);
>  	if (error)
>  		goto out_unlock;
>  
> -	if (eof && whichfork == XFS_DATA_FORK) {
> -		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
> -				&icur);
> +	if (eof) {
> +		prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
> +				count, &icur);
>  		if (prealloc_blocks) {
>  			xfs_extlen_t	align;
>  			xfs_off_t	end_offset;
> @@ -867,7 +875,7 @@ xfs_ilock_for_iomap(
>  	 * COW writes may allocate delalloc space or convert unwritten COW
>  	 * extents, so we need to make sure to take the lock exclusively here.
>  	 */
> -	if (xfs_is_reflink_inode(ip) && is_write) {
> +	if (xfs_is_cow_inode(ip) && is_write) {
>  		/*
>  		 * FIXME: It could still overwrite on unshared extents and not
>  		 * need allocation.
> @@ -901,7 +909,7 @@ xfs_ilock_for_iomap(
>  	 * check, so if we got ILOCK_SHARED for a write and but we're now a
>  	 * reflink inode we have to switch to ILOCK_EXCL and relock.
>  	 */
> -	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
> +	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
>  		xfs_iunlock(ip, mode);
>  		mode = XFS_ILOCK_EXCL;
>  		goto relock;
> @@ -973,7 +981,7 @@ xfs_file_iomap_begin(
>  	 * Break shared extents if necessary. Checks for non-blocking IO have
>  	 * been done up front, so we don't need to do them here.
>  	 */
> -	if (xfs_is_reflink_inode(ip)) {
> +	if (xfs_is_cow_inode(ip)) {
>  		struct xfs_bmbt_irec	orig = imap;
>  
>  		/* if zeroing doesn't need COW allocation, then we are done. */
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index a33f45077867..fa8b37d61838 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -194,6 +194,7 @@ typedef struct xfs_mount {
>  	 */
>  	uint32_t		m_generation;
>  
> +	bool			m_always_cow;
>  	bool			m_fail_unmount;
>  #ifdef DEBUG
>  	/*
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index f84b37fa4f17..e2d9179bd50d 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
>  	int			error = 0;
>  
>  	/* Holes, unwritten, and delalloc extents cannot be shared */
> -	if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
> +	if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
>  		*shared = false;
>  		return 0;
>  	}
> @@ -234,6 +234,23 @@ xfs_reflink_trim_around_shared(
>  	}
>  }
>  
> +bool
> +xfs_inode_need_cow(

I think I mentioned this last night, but this name doesn't describe all
of what it does.  Yes, it figures out if we need to COW to satisfy a
file write, but it also trim @imap so it doesn't cross a cow/nocow
boundary.

xfs_inode_trim_to_cow() ?

> +	struct xfs_inode	*ip,
> +	struct xfs_bmbt_irec	*imap,
> +	bool			*shared)
> +{
> +	/* We can't update any real extents in always COW mode. */
> +	if (xfs_is_always_cow_inode(ip) &&
> +	    !isnullstartblock(imap->br_startblock)) {
> +		*shared = true;
> +		return 0;
> +	}
> +
> +	/* Trim the mapping to the nearest shared extent boundary. */
> +	return xfs_reflink_trim_around_shared(ip, imap, shared);
> +}
> +
>  static int
>  xfs_reflink_convert_cow_locked(
>  	struct xfs_inode	*ip,
> @@ -321,7 +338,7 @@ xfs_find_trim_cow_extent(
>  	if (got.br_startoff > offset_fsb) {
>  		xfs_trim_extent(imap, imap->br_startoff,
>  				got.br_startoff - imap->br_startoff);
> -		return xfs_reflink_trim_around_shared(ip, imap, shared);
> +		return xfs_inode_need_cow(ip, imap, shared);
>  	}
>  
>  	*shared = true;
> @@ -356,7 +373,10 @@ xfs_reflink_allocate_cow(
>  	xfs_extlen_t		resblks = 0;
>  
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> -	ASSERT(xfs_is_reflink_inode(ip));
> +	if (!ip->i_cowfp) {
> +		ASSERT(!xfs_is_reflink_inode(ip));
> +		xfs_ifork_init_cow(ip);
> +	}
>  
>  	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
>  	if (error || !*shared)
> @@ -542,7 +562,7 @@ xfs_reflink_cancel_cow_range(
>  	int			error;
>  
>  	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
> -	ASSERT(xfs_is_reflink_inode(ip));
> +	ASSERT(ip->i_cowfp);
>  
>  	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
>  	if (count == NULLFILEOFF)
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 4a9e3cd4768a..2a3052fbe23e 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -6,11 +6,24 @@
>  #ifndef __XFS_REFLINK_H
>  #define __XFS_REFLINK_H 1
>  
> +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
> +{
> +	return ip->i_mount->m_always_cow &&
> +		xfs_sb_version_hasreflink(&ip->i_mount->m_sb);

Assuming the next step is to rewrite this function to autodetect a
device that cannot easily handle rewrites (aka SMR)...

> +}
> +
> +static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
> +{
> +	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
> +}
> +
>  extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
>  		xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
>  		xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
>  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *irec, bool *shared);
> +bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
> +		bool *shared);
>  
>  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index c9097cb0b955..6be183a391b6 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1729,11 +1729,18 @@ xfs_fs_fill_super(
>  		}
>  	}
>  
> -	if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
> -		xfs_alert(mp,
> +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> +		if (mp->m_sb.sb_rblocks) {
> +			xfs_alert(mp,
>  	"reflink not compatible with realtime device!");
> -		error = -EINVAL;
> -		goto out_filestream_unmount;
> +			error = -EINVAL;
> +			goto out_filestream_unmount;
> +		}
> +
> +		if (xfs_globals.always_cow) {
> +			xfs_info(mp, "using DEBUG-only always_cow mode.");
> +			mp->m_always_cow = true;
> +		}
>  	}
>  
>  	if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
> diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
> index 168488130a19..ad7f9be13087 100644
> --- a/fs/xfs/xfs_sysctl.h
> +++ b/fs/xfs/xfs_sysctl.h
> @@ -85,6 +85,7 @@ struct xfs_globals {
>  	int	log_recovery_delay;	/* log recovery delay (secs) */
>  	int	mount_delay;		/* mount setup delay (secs) */
>  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> +	bool	always_cow;		/* use COW fork for all overwrites */

...I'm also wondering if it makes sense to separate the creation of the
sysctl knob into a separate patch so that we can cleanly revert just
this part whenever we land a more permanent interface for enabling
alwayscow paths?

--D

>  };
>  extern struct xfs_globals	xfs_globals;
>  
> diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> index cd6a994a7250..cabda13f3c64 100644
> --- a/fs/xfs/xfs_sysfs.c
> +++ b/fs/xfs/xfs_sysfs.c
> @@ -183,10 +183,34 @@ mount_delay_show(
>  }
>  XFS_SYSFS_ATTR_RW(mount_delay);
>  
> +static ssize_t
> +always_cow_store(
> +	struct kobject	*kobject,
> +	const char	*buf,
> +	size_t		count)
> +{
> +	ssize_t		ret;
> +
> +	ret = kstrtobool(buf, &xfs_globals.always_cow);
> +	if (ret < 0)
> +		return ret;
> +	return count;
> +}
> +
> +static ssize_t
> +always_cow_show(
> +	struct kobject	*kobject,
> +	char		*buf)
> +{
> +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
> +}
> +XFS_SYSFS_ATTR_RW(always_cow);
> +
>  static struct attribute *xfs_dbg_attrs[] = {
>  	ATTR_LIST(bug_on_assert),
>  	ATTR_LIST(log_recovery_delay),
>  	ATTR_LIST(mount_delay),
> +	ATTR_LIST(always_cow),
>  	NULL,
>  };
>  
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 8/8] xfs: introduce an always_cow mode
  2019-02-19 17:53     ` Darrick J. Wong
@ 2019-02-20 14:58       ` Christoph Hellwig
  0 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-20 14:58 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, linux-xfs

On Tue, Feb 19, 2019 at 09:53:58AM -0800, Darrick J. Wong wrote:
> xfsrestore tries to RESVSP space for its index file and fails,
> presumably because the vfs intercepts the RESVSP ioctl and calls
> vfs_fallocate.  But then xfsrestore seems to run ALLOCSP, which afaict
> succeeds... ?

s/ALLOCSP/fallocate/ I think.

But no, fallocate on an always_cow inode does not work, because
it can't (modulo the non-persistent delalloc only idea from Dave).

> 
> Two questions:
> 
> - Are you going to mask off xfs_ioc_space from alwayscow inodes like you
>   did for xfs_file_fallocate?

xfs_ioc_space does a few different things:

 - it contains a RESVSP implementation that actually is dead code,
   as the VFS takes care of it (sketched below).
 - it contains a UNRESVSP implementation, this is supported by
   always_cow
 - it contains a ZERO_RANGE implementation, also supported by
   always_cow

So nothing to do there.
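
To make the first point concrete - this is just a userspace sketch, not
code from this series, and the ioctl/struct names are from the public
xfsprogs headers as far as I recall:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs.h>	/* XFS_IOC_RESVSP64, struct xfs_flock64 */

	/* Reserve len bytes at the start of fd, the way xfsrestore does. */
	static int reserve_space(int fd, off_t len)
	{
		struct xfs_flock64	fl = {
			.l_whence	= SEEK_SET,
			.l_start	= 0,
			.l_len		= len,
		};

		/*
		 * The VFS maps this onto vfs_fallocate() before it would
		 * ever reach xfs_ioc_space()...
		 */
		if (ioctl(fd, XFS_IOC_RESVSP64, &fl) == 0)
			return 0;
		/* ...so it is equivalent to a plain preallocating fallocate. */
		return fallocate(fd, 0, 0, len);
	}

On an always_cow inode both variants fail the same way, which is what
xfsrestore trips over.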

> - What are we going to do for xfsrestore if it encounters a filesystem
>   where everything is alwayscow?
> 
> > xfs/205
> 
> "disk full, expected" ?
> 
> /me wonders, since this is an ENOSPC test, why would it fail if we run
> out of space.

As far as I can tell it's because we run out of space too early, due to
the transient double block usage caused by the COW writes.
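
Back-of-the-envelope: rewriting an N-block file in place under
always_cow transiently needs up to 2N blocks - the old data fork
extents plus the new COW fork allocations - until writeback remaps the
file and the old blocks are freed.  A test that fills the filesystem
and then overwrites in place therefore hits ENOSPC earlier than it
expects.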

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 8/8] xfs: introduce an always_cow mode
  2019-02-19  5:25   ` Darrick J. Wong
  2019-02-19 17:53     ` Darrick J. Wong
@ 2019-02-20 15:00     ` Christoph Hellwig
  1 sibling, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-20 15:00 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, linux-xfs

[fullquote deleted, please follow email etiquette]

On Mon, Feb 18, 2019 at 09:25:45PM -0800, Darrick J. Wong wrote:
> >  
> > +bool
> > +xfs_inode_need_cow(
> 
> xfs_inode_trim_to_cow() ?
> 
> Otherwise generally looks ok to me, but it's late so I'll send this now
> and get back to review in the morning.

Hmm, that name seems even more confusing than the original one.  But
I don't have any better suggestion.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 8/8] xfs: introduce an always_cow mode
  2019-02-19 18:31   ` Darrick J. Wong
@ 2019-02-20 15:08     ` Christoph Hellwig
  2019-02-21 17:31       ` Darrick J. Wong
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-20 15:08 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, linux-xfs

On Tue, Feb 19, 2019 at 10:31:14AM -0800, Darrick J. Wong wrote:
> >  - xfs/326 fails to modify the file at all in always_cow mode after
> >    injecting the refcount error, leading to an unexpected md5sum
> >    after the remount, but that again is expected
> 
> I'm assuming the purpose of the debug knob is so that we can get all the
> fstests regressions fixed before enabling always cow modes for users...

The original purpose is to allow exercising the always COW functionality,
so that it can be tested without SMR support or atomic writes, which are
still in flux.

While developing it I noticed that it also helps to uncover general
COW functionality bugs, so it is a pretty good debug aid for that.

> > +		} else {
> > +			/*
> > +			 * If always_cow mode we can't use preallocations and
> > +			 * thus should not create them.
> > +			 */
> > +			if (xfs_is_always_cow_inode(ip)) {
> > +				error = -EOPNOTSUPP;
> > +				goto out_unlock;
> > +			}
> > +
> 
> Ok, so the basic premise here is that we never call xfs_alloc_file_space
> for alwayscow inodes.  As I pointed out in my reply that focused on
> fstests errors, xfs_file_fallocate is not the only caller of
> xfs_alloc_file_space, so I think xfs_ioc_space and friends are going to
> need a similar treatment.

As mentioned, the XFS_IOC_RESVSP / XFS_IOC_RESVSP64 cases in
xfs_ioc_space are dead code.  The allocsp/freesp ones aren't, and
while those aren't super useful on always_cow inodes they do work
just fine.

> > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > index 7b71dcc97ef0..72533e9dddc6 100644
> > --- a/fs/xfs/xfs_iomap.c
> > +++ b/fs/xfs/xfs_iomap.c
> > @@ -395,12 +395,13 @@ xfs_quota_calc_throttle(
> >  STATIC xfs_fsblock_t
> >  xfs_iomap_prealloc_size(
> >  	struct xfs_inode	*ip,
> > +	int			whichfork,
> >  	loff_t			offset,
> >  	loff_t			count,
> >  	struct xfs_iext_cursor	*icur)
> >  {
> >  	struct xfs_mount	*mp = ip->i_mount;
> > -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> > +	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
> 
> Hmm, so are we able to do preallocations in the cow fork now?
> 
> I think that's a separate change from adding the alwayscow knob.

I can split a prep patch out, but by always writing through the COW fork
in always_cow mode we also want the speculative EOF preallocations there.
Without always_cow we don't COW at EOF per defintion so it is rather
pointless.

> > +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
> > +{
> > +	return ip->i_mount->m_always_cow &&
> > +		xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
> 
> Assuming the next step is to rewrite this function to autodetect a
> device that cannot easily handle rewrites (aka SMR)...

For SMR we'll have a per-sb flag, for atomic writes a per-inode one.
But right now I intend these to exist in addition to, not instead of, the
debug knob.
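
A rough sketch of how the predicate could end up looking once those
land (none of these helpers exist yet; the two extra names below are
made up purely for illustration):

	static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
	{
		if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb))
			return false;

		return ip->i_mount->m_always_cow ||		/* debug knob */
		       xfs_fs_requires_cow(ip->i_mount) ||	/* per-sb, e.g. SMR */
		       xfs_inode_atomic_writes(ip);		/* per-inode flag */
	}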

> > --- a/fs/xfs/xfs_sysctl.h
> > +++ b/fs/xfs/xfs_sysctl.h
> > @@ -85,6 +85,7 @@ struct xfs_globals {
> >  	int	log_recovery_delay;	/* log recovery delay (secs) */
> >  	int	mount_delay;		/* mount setup delay (secs) */
> >  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> > +	bool	always_cow;		/* use COW fork for all overwrites */
> 
> ...I'm also wondering if it makes sense to separate the creation of the
> sysctl knob into a separate patch so that we can cleanly revert just
> this part whenever we land a more permanent interface for enabling
> alwayscow paths?

I could split it - but my intention was to keep the debug knob around
in the long term, as it is really helpful for exercising the COW code.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 8/8] xfs: introduce an always_cow mode
  2019-02-20 15:08     ` Christoph Hellwig
@ 2019-02-21 17:31       ` Darrick J. Wong
  0 siblings, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2019-02-21 17:31 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Wed, Feb 20, 2019 at 04:08:18PM +0100, Christoph Hellwig wrote:
> On Tue, Feb 19, 2019 at 10:31:14AM -0800, Darrick J. Wong wrote:
> > >  - xfs/326 fails to modify the file at all in always_cow mode after
> > >    injecting the refcount error, leading to an unexpected md5sum
> > >    after the remount, but that again is expected
> > 
> > I'm assuming the purpose of the debug knob is so that we can get all the
> > fstests regressions fixed before enabling always cow modes for users...
> 
> The original purpose is to allow exercising the always COW functionality,
> so that it can be tested without SMR support or atomic writes, which are
> still in flux.
> 
> While developing it I noticed that it also helps to uncover general
> COW functionality bugs, so it is a pretty good debug aid for that.
> 
> > > +		} else {
> > > +			/*
> > > +			 * If always_cow mode we can't use preallocations and
> > > +			 * thus should not create them.
> > > +			 */
> > > +			if (xfs_is_always_cow_inode(ip)) {
> > > +				error = -EOPNOTSUPP;
> > > +				goto out_unlock;
> > > +			}
> > > +
> > 
> > Ok, so the basic premise here is that we never call xfs_alloc_file_space
> > for alwayscow inodes.  As I pointed out in my reply that focused on
> > fstests errors, xfs_file_fallocate is not the only caller of
> > xfs_alloc_file_space, so I think xfs_ioc_space and friends are going to
> > need a similar treatment.
> 
> As mentioned, the XFS_IOC_RESVSP / XFS_IOC_RESVSP64 cases in
> xfs_ioc_space are dead code.  The allocsp/freesp ones aren't, and
> while those aren't super useful on always_cow inodes they do work
> just fine.
> 
> > > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > > index 7b71dcc97ef0..72533e9dddc6 100644
> > > --- a/fs/xfs/xfs_iomap.c
> > > +++ b/fs/xfs/xfs_iomap.c
> > > @@ -395,12 +395,13 @@ xfs_quota_calc_throttle(
> > >  STATIC xfs_fsblock_t
> > >  xfs_iomap_prealloc_size(
> > >  	struct xfs_inode	*ip,
> > > +	int			whichfork,
> > >  	loff_t			offset,
> > >  	loff_t			count,
> > >  	struct xfs_iext_cursor	*icur)
> > >  {
> > >  	struct xfs_mount	*mp = ip->i_mount;
> > > -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> > > +	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
> > 
> > Hmm, so are we able to do preallocations in the cow fork now?
> > 
> > I think that's a separate change from adding the alwayscow knob.
> 
> I can split a prep patch out, but by always writing through the COW fork
> in always_cow mode we also want the speculative EOF preallocations there.
> Without always_cow we don't COW at EOF by definition, so it is rather
> pointless.
> 
> > > +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
> > > +{
> > > +	return ip->i_mount->m_always_cow &&
> > > +		xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
> > 
> > Assuming the next step is to rewrite this function to autodetect a
> > device that cannot easily handle rewrites (aka SMR)...
> 
> For SMR we'll have a per-sb flag, for atomic writes a per-inode one.
> But right now I intend these to exist in addition to, not instead of, the
> debug knob.
> 
> > > --- a/fs/xfs/xfs_sysctl.h
> > > +++ b/fs/xfs/xfs_sysctl.h
> > > @@ -85,6 +85,7 @@ struct xfs_globals {
> > >  	int	log_recovery_delay;	/* log recovery delay (secs) */
> > >  	int	mount_delay;		/* mount setup delay (secs) */
> > >  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> > > +	bool	always_cow;		/* use COW fork for all overwrites */
> > 
> > ...I'm also wondering if it makes sense to separate the creation of the
> > sysctl knob into a separate patch so that we can cleanly revert just
> > this part whenever we land a more permanent interface for enabling
> > alwayscow paths?
> 
> I could split it - but my intention was to keep the debug knob around
> in the long term, as it is really helpful for exercising the COW code.

/me decides he'll put this one in since it's a debug knob...

Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/8] xfs: don't use delalloc extents for COW on files with extsize hints
  2019-02-18  9:18 ` [PATCH 3/8] xfs: don't use delalloc extents for COW on files with extsize hints Christoph Hellwig
  2019-02-19  5:17   ` Darrick J. Wong
@ 2019-02-21 17:58   ` Brian Foster
  2019-02-21 22:56     ` Darrick J. Wong
  2019-02-22 14:16     ` Christoph Hellwig
  1 sibling, 2 replies; 33+ messages in thread
From: Brian Foster @ 2019-02-21 17:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 18, 2019 at 10:18:22AM +0100, Christoph Hellwig wrote:
> While using delalloc for extsize hints is generally a good idea, the
> current code that does so only for COW doesn't help us much and creates
> a lot of special cases.  Switch it to use real allocations like we
> do for direct I/O.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_iomap.c   | 28 +++++++++++++++++-----------
>  fs/xfs/xfs_reflink.c | 10 +++++++++-
>  fs/xfs/xfs_reflink.h |  3 ++-
>  3 files changed, 28 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index a7599b084571..19a3331b4a56 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -918,22 +918,28 @@ xfs_file_iomap_begin(
>  	 * been done up front, so we don't need to do them here.
>  	 */
>  	if (xfs_is_reflink_inode(ip)) {
> +		struct xfs_bmbt_irec	orig = imap;
> +
...
> +		error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode,
> +						 flags);
> +		if (error)
> +			goto out_unlock;
> +
> +		/*
> +		 * For buffered writes we need to report the address of the
> +		 * previous block (if there was any) so that the higher level
> +		 * write code can perform read-modify-write operations.  For
> +		 * direct I/O code, which must be block aligned we need to
> +		 * report the newly allocated address.
> +		 */
> +		if (!(flags & IOMAP_DIRECT) &&
> +		    orig.br_startblock != HOLESTARTBLOCK)
> +			imap = orig;

I find the logic here kind of confusing. The buffered write (reflink)
path basically expects to allocate COW blocks over an existing shared
extent. It thus makes no modification to the caller's imap because it
(read-modify-)writes into cache and writeback determines where to send
the I/O. Why not follow the same flow here? For example:

	cmap = orig;
	error = xfs_reflink_allocate_cow(ip, &cmap, ...);
	/* ... */
	if ((flags & IOMAP_DIRECT) || (imap.br_startblock == HOLESTARTBLOCK))
		imap = cmap;

And also, what's the point of the hole check -- to avoid an unnecessary
data fork allocation below? That should be explained in the comment.

>  
>  		end_fsb = imap.br_startoff + imap.br_blockcount;
>  		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 2babc2cbe103..8a5353daf9ab 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -397,7 +397,8 @@ xfs_reflink_allocate_cow(
>  	struct xfs_inode	*ip,
>  	struct xfs_bmbt_irec	*imap,
>  	bool			*shared,
> -	uint			*lockmode)
> +	uint			*lockmode,
> +	unsigned		iomap_flags)

I don't see why a lower level reflink mechanism needs to care about
direct I/O or not. IMO this should just be a 'bool convert' param.

Brian

>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> @@ -471,6 +472,13 @@ xfs_reflink_allocate_cow(
>  	if (nimaps == 0)
>  		return -ENOSPC;
>  convert:
> +	/*
> +	 * COW fork extents are supposed to remain unwritten until we're ready
> +	 * to initiate a disk write.  For direct I/O we are going to write the
> +	 * data and need the conversion, but for buffered writes we're done.
> +	 */
> +	if (!(iomap_flags & IOMAP_DIRECT))
> +		return 0;
>  	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
>  
>  out_unreserve:
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 6d73daef1f13..70d68a1a9b49 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -15,7 +15,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *imap);
>  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
> -		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
> +		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> +		unsigned iomap_flags);
>  extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
>  		xfs_off_t count);
>  
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay
  2019-02-18  9:18 ` [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay Christoph Hellwig
  2019-02-19 18:12   ` Darrick J. Wong
@ 2019-02-21 17:59   ` Brian Foster
  2019-02-21 21:30     ` Darrick J. Wong
  2019-02-22 14:20     ` Christoph Hellwig
  1 sibling, 2 replies; 33+ messages in thread
From: Brian Foster @ 2019-02-21 17:59 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> Besides simplifying the code a bit this allows to actually implement
> the behavior of using COW preallocation for non-COW data mentioned
> in the current comments.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Heh, I'm somewhat amused by the fact that I sent a variant of this patch
two years ago (for a different purpose) and you explicitly complained
about the factoring. I'm glad you've finally come around. ;)

https://marc.info/?l=linux-xfs&m=149498124730442&w=2

>  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
>  fs/xfs/xfs_reflink.c |  67 ----------------------
>  fs/xfs/xfs_reflink.h |   2 -
>  fs/xfs/xfs_trace.h   |   3 -
>  4 files changed, 94 insertions(+), 111 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 19a3331b4a56..c9fd1e4a1f99 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
...
> @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
>  
>  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
>  
> -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> +	/*
> +	 * Search the data fork fork first to look up our source mapping.  We
> +	 * always need the data fork map, as we have to return it to the
> +	 * iomap code so that the higher level write code can read data in to
> +	 * perform read-modify-write cycles for unaligned writes.
> +	 */
> +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
>  	if (eof)
> -		got.br_startoff = end_fsb; /* fake hole until the end */
> +		imap.br_startoff = end_fsb; /* fake hole until the end */
> +
> +	/* We never need to allocate blocks for zeroing a hole. */
> +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> +		goto out_unlock;
> +	}
> +

So does this need to account for the case of an overlapping cow block
over a hole in the data fork (with cached data, if that is possible)?
IIUC we introduce that possibility just below.

> +	/*
> +	 * Search the COW fork extent list even if we did not find a data fork
> +	 * extent.  This serves two purposes: first this implements the
> +	 * speculative preallocation using cowextsize, so that we also unshare
> +	 * block adjacent to shared blocks instead of just the shared blocks
> +	 * themselves.  Second the lookup in the extent list is generally faster
> +	 * than going out to the shared extent tree.
> +	 */
> +	if (xfs_is_reflink_inode(ip)) {
> +		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
> +				&ccur, &cmap);
> +		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> +			trace_xfs_reflink_cow_found(ip, &cmap);
> +			whichfork = XFS_COW_FORK;
> +			goto done;
> +		}
> +	}
>  
> -	if (got.br_startoff <= offset_fsb) {
> +	if (imap.br_startoff <= offset_fsb) {
>  		/*
>  		 * For reflink files we may need a delalloc reservation when
>  		 * overwriting shared extents.   This includes zeroing of
>  		 * existing extents that contain data.
>  		 */
> -		if (xfs_is_reflink_inode(ip) &&
> -		    ((flags & IOMAP_WRITE) ||
> -		     got.br_state != XFS_EXT_UNWRITTEN)) {
> -			xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
> -			error = xfs_reflink_reserve_cow(ip, &got);
> -			if (error)
> -				goto out_unlock;
> +		if (!xfs_is_reflink_inode(ip) ||
> +		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
> +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> +					&imap);
> +			goto done;
>  		}
>  
> -		trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got);
> -		goto done;
> -	}
> +		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
>  
> -	if (flags & IOMAP_ZERO) {
> -		xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
> -		goto out_unlock;
> +		/* Trim the mapping to the nearest shared extent boundary. */
> +		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> +		if (error)
> +			goto out_unlock;
> +
> +		/* Not shared?  Just report the (potentially capped) extent. */
> +		if (!shared) {
> +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> +					&imap);
> +			goto done;
> +		}
> +
> +		/*
> +		 * Fork all the shared blocks from our write offset until the
> +		 * end of the extent.
> +		 */
> +		whichfork = XFS_COW_FORK;
> +		end_fsb = imap.br_startoff + imap.br_blockcount;
> +	} else {
> +		/*
> +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> +		 * pages to keep the chunks of work done where somewhat
> +		 * symmetric with the work writeback does.  This is a completely
> +		 * arbitrary number pulled out of thin air.
> +		 *
> +		 * Note that the values needs to be less than 32-bits wide until
> +		 * the lower level functions are updated.
> +		 */
> +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);

The existing code doesn't currently do this but ISTM this should apply
to either allocation case, not just data fork delalloc. That could be
something for a separate patch though.

>  	}
>  
>  	error = xfs_qm_dqattach_locked(ip, false);
>  	if (error)
>  		goto out_unlock;
>  
> -	/*
> -	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
> -	 * to keep the chunks of work done where somewhat symmetric with the
> -	 * work writeback does. This is a completely arbitrary number pulled
> -	 * out of thin air as a best guess for initial testing.
> -	 *
> -	 * Note that the values needs to be less than 32-bits wide until
> -	 * the lower level functions are updated.
> -	 */
> -	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> -	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> -
> -	if (eof) {
> +	if (eof && whichfork == XFS_DATA_FORK) {
>  		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
>  				&icur);
>  		if (prealloc_blocks) {
...
> @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
>  	 * them out if the write happens to fail.
>  	 */
>  	iomap->flags |= IOMAP_F_NEW;

This looks like it flags the mapping as new if we reserve cow blocks, which
I don't think is quite right.

> -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
>  done:
> -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> +	if (whichfork == XFS_COW_FORK) {
> +		if (imap.br_startoff > offset_fsb) {
> +			xfs_trim_extent(&cmap, offset_fsb,
> +					imap.br_startoff - offset_fsb);
> +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> +			goto out_unlock;
> +		}

Hmm, so this looks like it is in fact handling a COW-blocks-over-a-hole
case, pushing the COW mapping into the iomap. We never accounted for
that case before, where we'd always just allocate to the data fork. The
change in behavior probably makes sense, but this really should be
separate from the refactoring bits to reuse the data fork delalloc code.
Beyond making this a bit easier to follow, it warrants its own commit
log description and this one makes no mention of it at all.
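
(If I'm reading the new code right, the concrete effect is that a
buffered write landing entirely in a data fork hole that is covered by
an existing COW fork reservation now gets the COW mapping reported to
iomap and writes into that reservation, instead of allocating fresh
delalloc blocks in the data fork.)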

Brian

> +		/* ensure we only report blocks we have a reservation for */
> +		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
> +	}
> +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
>  out_unlock:
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  	return error;
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 8a5353daf9ab..9ef1f79cb3ae 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -234,73 +234,6 @@ xfs_reflink_trim_around_shared(
>  	}
>  }
>  
> -/*
> - * Trim the passed in imap to the next shared/unshared extent boundary, and
> - * if imap->br_startoff points to a shared extent reserve space for it in the
> - * COW fork.
> - *
> - * Note that imap will always contain the block numbers for the existing blocks
> - * in the data fork, as the upper layers need them for read-modify-write
> - * operations.
> - */
> -int
> -xfs_reflink_reserve_cow(
> -	struct xfs_inode	*ip,
> -	struct xfs_bmbt_irec	*imap)
> -{
> -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
> -	struct xfs_bmbt_irec	got;
> -	int			error = 0;
> -	bool			eof = false;
> -	struct xfs_iext_cursor	icur;
> -	bool			shared;
> -
> -	/*
> -	 * Search the COW fork extent list first.  This serves two purposes:
> -	 * first this implement the speculative preallocation using cowextisze,
> -	 * so that we also unshared block adjacent to shared blocks instead
> -	 * of just the shared blocks themselves.  Second the lookup in the
> -	 * extent list is generally faster than going out to the shared extent
> -	 * tree.
> -	 */
> -
> -	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
> -		eof = true;
> -	if (!eof && got.br_startoff <= imap->br_startoff) {
> -		trace_xfs_reflink_cow_found(ip, imap);
> -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> -		return 0;
> -	}
> -
> -	/* Trim the mapping to the nearest shared extent boundary. */
> -	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
> -	if (error)
> -		return error;
> -
> -	/* Not shared?  Just report the (potentially capped) extent. */
> -	if (!shared)
> -		return 0;
> -
> -	/*
> -	 * Fork all the shared blocks from our write offset until the end of
> -	 * the extent.
> -	 */
> -	error = xfs_qm_dqattach_locked(ip, false);
> -	if (error)
> -		return error;
> -
> -	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
> -			imap->br_blockcount, 0, &got, &icur, eof);
> -	if (error == -ENOSPC || error == -EDQUOT)
> -		trace_xfs_reflink_cow_enospc(ip, imap);
> -	if (error)
> -		return error;
> -
> -	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> -	trace_xfs_reflink_cow_alloc(ip, &got);
> -	return 0;
> -}
> -
>  /* Convert part of an unwritten CoW extent to a real one. */
>  STATIC int
>  xfs_reflink_convert_cow_extent(
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 70d68a1a9b49..4a9e3cd4768a 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -12,8 +12,6 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
>  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *irec, bool *shared);
>  
> -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
> -		struct xfs_bmbt_irec *imap);
>  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
>  		unsigned iomap_flags);
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index f1e18ae8a209..47fb07d86efd 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3196,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
>  
>  /* copy on write */
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
> -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
>  
> -DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
> -
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
>  
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay
  2019-02-21 17:59   ` Brian Foster
@ 2019-02-21 21:30     ` Darrick J. Wong
  2019-02-22 12:31       ` Brian Foster
  2019-02-22 14:22       ` Christoph Hellwig
  2019-02-22 14:20     ` Christoph Hellwig
  1 sibling, 2 replies; 33+ messages in thread
From: Darrick J. Wong @ 2019-02-21 21:30 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs

On Thu, Feb 21, 2019 at 12:59:03PM -0500, Brian Foster wrote:
> On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> > Besides simplifying the code a bit this allows to actually implement
> > the behavior of using COW preallocation for non-COW data mentioned
> > in the current comments.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> 
> Heh, I'm somewhat amused by the fact that I sent a variant of this patch
> two years ago (for a different purpose) and you explicitly complained
> about the factoring. I'm glad you've finally come around. ;)
> 
> https://marc.info/?l=linux-xfs&m=149498124730442&w=2
> 
> >  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
> >  fs/xfs/xfs_reflink.c |  67 ----------------------
> >  fs/xfs/xfs_reflink.h |   2 -
> >  fs/xfs/xfs_trace.h   |   3 -
> >  4 files changed, 94 insertions(+), 111 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > index 19a3331b4a56..c9fd1e4a1f99 100644
> > --- a/fs/xfs/xfs_iomap.c
> > +++ b/fs/xfs/xfs_iomap.c
> ...
> > @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
> >  
> >  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> >  
> > -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> > +	/*
> > +	 * Search the data fork fork first to look up our source mapping.  We
> > +	 * always need the data fork map, as we have to return it to the
> > +	 * iomap code so that the higher level write code can read data in to
> > +	 * perform read-modify-write cycles for unaligned writes.
> > +	 */
> > +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
> >  	if (eof)
> > -		got.br_startoff = end_fsb; /* fake hole until the end */
> > +		imap.br_startoff = end_fsb; /* fake hole until the end */
> > +
> > +	/* We never need to allocate blocks for zeroing a hole. */
> > +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> > +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> > +		goto out_unlock;
> > +	}
> > +
> 
> So does this need to account for the case of an overlapping cow block
> over a hole in the data fork (with cached data, if that is possible)?
> IIUC we introduce that possibility just below.

I think it makes sense to ignore overlapping cow blocks for zeroing a
hole in the data fork -- the user told us to zero part of a file that
didn't have nonzero contents in it, so we just leave the speculative cow
allocation for that block alone.

> > +	/*
> > +	 * Search the COW fork extent list even if we did not find a data fork
> > +	 * extent.  This serves two purposes: first this implements the
> > +	 * speculative preallocation using cowextsize, so that we also unshare
> > +	 * block adjacent to shared blocks instead of just the shared blocks
> > +	 * themselves.  Second the lookup in the extent list is generally faster
> > +	 * than going out to the shared extent tree.
> > +	 */
> > +	if (xfs_is_reflink_inode(ip)) {
> > +		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
> > +				&ccur, &cmap);
> > +		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> > +			trace_xfs_reflink_cow_found(ip, &cmap);
> > +			whichfork = XFS_COW_FORK;
> > +			goto done;
> > +		}
> > +	}
> >  
> > -	if (got.br_startoff <= offset_fsb) {
> > +	if (imap.br_startoff <= offset_fsb) {
> >  		/*
> >  		 * For reflink files we may need a delalloc reservation when
> >  		 * overwriting shared extents.   This includes zeroing of
> >  		 * existing extents that contain data.
> >  		 */
> > -		if (xfs_is_reflink_inode(ip) &&
> > -		    ((flags & IOMAP_WRITE) ||
> > -		     got.br_state != XFS_EXT_UNWRITTEN)) {
> > -			xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
> > -			error = xfs_reflink_reserve_cow(ip, &got);
> > -			if (error)
> > -				goto out_unlock;
> > +		if (!xfs_is_reflink_inode(ip) ||
> > +		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
> > +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> > +					&imap);
> > +			goto done;
> >  		}
> >  
> > -		trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got);
> > -		goto done;
> > -	}
> > +		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
> >  
> > -	if (flags & IOMAP_ZERO) {
> > -		xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
> > -		goto out_unlock;
> > +		/* Trim the mapping to the nearest shared extent boundary. */
> > +		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> > +		if (error)
> > +			goto out_unlock;
> > +
> > +		/* Not shared?  Just report the (potentially capped) extent. */
> > +		if (!shared) {
> > +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> > +					&imap);
> > +			goto done;
> > +		}
> > +
> > +		/*
> > +		 * Fork all the shared blocks from our write offset until the
> > +		 * end of the extent.
> > +		 */
> > +		whichfork = XFS_COW_FORK;
> > +		end_fsb = imap.br_startoff + imap.br_blockcount;
> > +	} else {
> > +		/*
> > +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> > +		 * pages to keep the chunks of work done where somewhat
> > +		 * symmetric with the work writeback does.  This is a completely
> > +		 * arbitrary number pulled out of thin air.
> > +		 *
> > +		 * Note that the values needs to be less than 32-bits wide until
> > +		 * the lower level functions are updated.
> > +		 */
> > +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> 
> The existing code doesn't currently do this but ISTM this should apply
> to either allocation case, not just data fork delalloc. That could be
> something for a separate patch though.
> 
> >  	}
> >  
> >  	error = xfs_qm_dqattach_locked(ip, false);
> >  	if (error)
> >  		goto out_unlock;
> >  
> > -	/*
> > -	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
> > -	 * to keep the chunks of work done where somewhat symmetric with the
> > -	 * work writeback does. This is a completely arbitrary number pulled
> > -	 * out of thin air as a best guess for initial testing.
> > -	 *
> > -	 * Note that the values needs to be less than 32-bits wide until
> > -	 * the lower level functions are updated.
> > -	 */
> > -	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > -	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > -
> > -	if (eof) {
> > +	if (eof && whichfork == XFS_DATA_FORK) {
> >  		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
> >  				&icur);
> >  		if (prealloc_blocks) {
> ...
> > @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
> >  	 * them out if the write happens to fail.
> >  	 */
> >  	iomap->flags |= IOMAP_F_NEW;
> 
> This looks like it flags the mapping new if we reserve cow blocks, which
> I don't think is quite right.

Hmmm.  I thought it was correct -- if the write fails, we punch out the
pagecache and trim any delalloc blocks in the data fork.  If the user
tries to reread the area of the failed write we'll just read them back
in from disk...?

--D

> > -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> > +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> > +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
> >  done:
> > -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> > +	if (whichfork == XFS_COW_FORK) {
> > +		if (imap.br_startoff > offset_fsb) {
> > +			xfs_trim_extent(&cmap, offset_fsb,
> > +					imap.br_startoff - offset_fsb);
> > +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> > +			goto out_unlock;
> > +		}
> 
> Hmm, so this looks like it is in fact handling a COW blocks over a hole
> case, pushing the COW mapping into the iomap. We never accounted for
> that case before where we'd always just allocate to the data fork. The
> change in behavior probably makes sense, but this really should be
> separate from the refactoring bits to reuse the data fork delalloc code.
> Beyond making this a bit easier to follow, it warrants its own commit
> log description and this one makes no mention of it at all.
> 
> Brian
> 
> > +		/* ensure we only report blocks we have a reservation for */
> > +		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
> > +	}
> > +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
> >  out_unlock:
> >  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> >  	return error;
> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > index 8a5353daf9ab..9ef1f79cb3ae 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -234,73 +234,6 @@ xfs_reflink_trim_around_shared(
> >  	}
> >  }
> >  
> > -/*
> > - * Trim the passed in imap to the next shared/unshared extent boundary, and
> > - * if imap->br_startoff points to a shared extent reserve space for it in the
> > - * COW fork.
> > - *
> > - * Note that imap will always contain the block numbers for the existing blocks
> > - * in the data fork, as the upper layers need them for read-modify-write
> > - * operations.
> > - */
> > -int
> > -xfs_reflink_reserve_cow(
> > -	struct xfs_inode	*ip,
> > -	struct xfs_bmbt_irec	*imap)
> > -{
> > -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
> > -	struct xfs_bmbt_irec	got;
> > -	int			error = 0;
> > -	bool			eof = false;
> > -	struct xfs_iext_cursor	icur;
> > -	bool			shared;
> > -
> > -	/*
> > -	 * Search the COW fork extent list first.  This serves two purposes:
> > -	 * first this implement the speculative preallocation using cowextisze,
> > -	 * so that we also unshared block adjacent to shared blocks instead
> > -	 * of just the shared blocks themselves.  Second the lookup in the
> > -	 * extent list is generally faster than going out to the shared extent
> > -	 * tree.
> > -	 */
> > -
> > -	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
> > -		eof = true;
> > -	if (!eof && got.br_startoff <= imap->br_startoff) {
> > -		trace_xfs_reflink_cow_found(ip, imap);
> > -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> > -		return 0;
> > -	}
> > -
> > -	/* Trim the mapping to the nearest shared extent boundary. */
> > -	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
> > -	if (error)
> > -		return error;
> > -
> > -	/* Not shared?  Just report the (potentially capped) extent. */
> > -	if (!shared)
> > -		return 0;
> > -
> > -	/*
> > -	 * Fork all the shared blocks from our write offset until the end of
> > -	 * the extent.
> > -	 */
> > -	error = xfs_qm_dqattach_locked(ip, false);
> > -	if (error)
> > -		return error;
> > -
> > -	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
> > -			imap->br_blockcount, 0, &got, &icur, eof);
> > -	if (error == -ENOSPC || error == -EDQUOT)
> > -		trace_xfs_reflink_cow_enospc(ip, imap);
> > -	if (error)
> > -		return error;
> > -
> > -	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> > -	trace_xfs_reflink_cow_alloc(ip, &got);
> > -	return 0;
> > -}
> > -
> >  /* Convert part of an unwritten CoW extent to a real one. */
> >  STATIC int
> >  xfs_reflink_convert_cow_extent(
> > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> > index 70d68a1a9b49..4a9e3cd4768a 100644
> > --- a/fs/xfs/xfs_reflink.h
> > +++ b/fs/xfs/xfs_reflink.h
> > @@ -12,8 +12,6 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
> >  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
> >  		struct xfs_bmbt_irec *irec, bool *shared);
> >  
> > -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
> > -		struct xfs_bmbt_irec *imap);
> >  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
> >  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> >  		unsigned iomap_flags);
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index f1e18ae8a209..47fb07d86efd 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3196,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
> >  
> >  /* copy on write */
> >  DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
> > -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
> >  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
> >  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
> >  DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
> >  
> > -DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
> > -
> >  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
> >  
> >  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
> > -- 
> > 2.20.1
> > 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/8] xfs: don't use delalloc extents for COW on files with extsize hints
  2019-02-21 17:58   ` Brian Foster
@ 2019-02-21 22:56     ` Darrick J. Wong
  2019-02-22 14:16     ` Christoph Hellwig
  1 sibling, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2019-02-21 22:56 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs

On Thu, Feb 21, 2019 at 12:58:17PM -0500, Brian Foster wrote:
> On Mon, Feb 18, 2019 at 10:18:22AM +0100, Christoph Hellwig wrote:
> > While using delalloc for extsize hints is generally a good idea, the
> > current code that does so only for COW doesn't help us much and creates
> > a lot of special cases.  Switch it to use real allocations like we
> > do for direct I/O.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  fs/xfs/xfs_iomap.c   | 28 +++++++++++++++++-----------
> >  fs/xfs/xfs_reflink.c | 10 +++++++++-
> >  fs/xfs/xfs_reflink.h |  3 ++-
> >  3 files changed, 28 insertions(+), 13 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > index a7599b084571..19a3331b4a56 100644
> > --- a/fs/xfs/xfs_iomap.c
> > +++ b/fs/xfs/xfs_iomap.c
> > @@ -918,22 +918,28 @@ xfs_file_iomap_begin(
> >  	 * been done up front, so we don't need to do them here.
> >  	 */
> >  	if (xfs_is_reflink_inode(ip)) {
> > +		struct xfs_bmbt_irec	orig = imap;
> > +
> ...
> > +		error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode,
> > +						 flags);
> > +		if (error)
> > +			goto out_unlock;
> > +
> > +		/*
> > +		 * For buffered writes we need to report the address of the
> > +		 * previous block (if there was any) so that the higher level
> > +		 * write code can perform read-modify-write operations.  For
> > +		 * direct I/O code, which must be block aligned we need to
> > +		 * report the newly allocated address.
> > +		 */
> > +		if (!(flags & IOMAP_DIRECT) &&
> > +		    orig.br_startblock != HOLESTARTBLOCK)
> > +			imap = orig;
> 
> I find the logic here kind of confusing. The buffered write (reflink)
> path basically expects to allocated COW blocks over an existing shared
> extent. It thus makes no modification to the caller's imap because it
> (read-modify-)writes into cache and writeback determines where to send
> the I/O. Why not follow the same flow here? For example:
> 
> 	cmap = orig;
> 	error = xfs_reflink_allocate_cow(ip, &cmap, ...);
> 	/* ... */
> 	if ((flags & IOMAP_DIRECT) || (imap.br_startblock == HOLESTARTBLOCK))
> 		imap = cmap;

Admittedly, I think that flows better.
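
For illustration only, here is a minimal sketch of what that suggested
flow might look like (hypothetical code using the names from Brian's
pseudocode above, not the version that was merged):

	struct xfs_bmbt_irec	cmap = imap;

	error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
					 flags);
	if (error)
		goto out_unlock;

	/*
	 * Direct I/O writes go to the newly allocated COW blocks, and so
	 * do buffered writes landing over a hole; otherwise keep the
	 * original data fork mapping so the caller can read-modify-write.
	 */
	if ((flags & IOMAP_DIRECT) ||
	    imap.br_startblock == HOLESTARTBLOCK)
		imap = cmap;
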

> And also, what's the point of the hole check.. to avoid an unnecessary
> data fork allocation below? That should be explained in the comment.
>
> >  
> >  		end_fsb = imap.br_startoff + imap.br_blockcount;
> >  		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > index 2babc2cbe103..8a5353daf9ab 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -397,7 +397,8 @@ xfs_reflink_allocate_cow(
> >  	struct xfs_inode	*ip,
> >  	struct xfs_bmbt_irec	*imap,
> >  	bool			*shared,
> > -	uint			*lockmode)
> > +	uint			*lockmode,
> > +	unsigned		iomap_flags)
> 
> I don't see why a lower level reflink mechanism needs to care about
> direct I/O or not. IMO this should just be a 'bool convert' param.

I forgot I'd complained about this the last time I read this patch.

Well maybe I'll just fix it, seeing as I already pushed the whole lot to
for-next. :/

--D

> Brian
> 
> >  {
> >  	struct xfs_mount	*mp = ip->i_mount;
> >  	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> > @@ -471,6 +472,13 @@ xfs_reflink_allocate_cow(
> >  	if (nimaps == 0)
> >  		return -ENOSPC;
> >  convert:
> > +	/*
> > +	 * COW fork extents are supposed to remain unwritten until we're ready
> > +	 * to initiate a disk write.  For direct I/O we are going to write the
> > +	 * data and need the conversion, but for buffered writes we're done.
> > +	 */
> > +	if (!(iomap_flags & IOMAP_DIRECT))
> > +		return 0;
> >  	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
> >  
> >  out_unreserve:
> > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> > index 6d73daef1f13..70d68a1a9b49 100644
> > --- a/fs/xfs/xfs_reflink.h
> > +++ b/fs/xfs/xfs_reflink.h
> > @@ -15,7 +15,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
> >  extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
> >  		struct xfs_bmbt_irec *imap);
> >  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
> > -		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
> > +		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> > +		unsigned iomap_flags);
> >  extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
> >  		xfs_off_t count);
> >  
> > -- 
> > 2.20.1
> > 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay
  2019-02-21 21:30     ` Darrick J. Wong
@ 2019-02-22 12:31       ` Brian Foster
  2019-02-22 14:22       ` Christoph Hellwig
  1 sibling, 0 replies; 33+ messages in thread
From: Brian Foster @ 2019-02-22 12:31 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, linux-xfs

On Thu, Feb 21, 2019 at 01:30:52PM -0800, Darrick J. Wong wrote:
> On Thu, Feb 21, 2019 at 12:59:03PM -0500, Brian Foster wrote:
> > On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> > > Besides simplifying the code a bit, this allows us to actually implement
> > > the behavior of using COW preallocation for non-COW data mentioned
> > > in the current comments.
> > > 
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > ---
> > 
> > Heh, I'm somewhat amused by the fact that I sent a variant of this patch
> > two years ago (for a different purpose) and you explicitly complained
> > about the factoring. I'm glad you've finally come around. ;)
> > 
> > https://marc.info/?l=linux-xfs&m=149498124730442&w=2
> > 
> > >  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
> > >  fs/xfs/xfs_reflink.c |  67 ----------------------
> > >  fs/xfs/xfs_reflink.h |   2 -
> > >  fs/xfs/xfs_trace.h   |   3 -
> > >  4 files changed, 94 insertions(+), 111 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > > index 19a3331b4a56..c9fd1e4a1f99 100644
> > > --- a/fs/xfs/xfs_iomap.c
> > > +++ b/fs/xfs/xfs_iomap.c
> > ...
> > > @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
> > >  
> > >  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > >  
> > > -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> > > +	/*
> > > +	 * Search the data fork fork first to look up our source mapping.  We
> > > +	 * always need the data fork map, as we have to return it to the
> > > +	 * iomap code so that the higher level write code can read data in to
> > > +	 * perform read-modify-write cycles for unaligned writes.
> > > +	 */
> > > +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
> > >  	if (eof)
> > > -		got.br_startoff = end_fsb; /* fake hole until the end */
> > > +		imap.br_startoff = end_fsb; /* fake hole until the end */
> > > +
> > > +	/* We never need to allocate blocks for zeroing a hole. */
> > > +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> > > +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> > > +		goto out_unlock;
> > > +	}
> > > +
> > 
> > So does this need to account for the case of an overlapping cow block
> > over a hole in the data fork (with cached data, if that is possible)?
> > IIUC we introduce that possibility just below.
> 
> I think it makes sense to ignore overlapping cow blocks for zeroing a
> hole in the data fork -- the user told us to zero part of a file that
> didn't have nonzero contents in it, so we just leave the speculative cow
> allocation for that block alone.
> 

That makes sense, but with this patch does a hole in the data fork still
always imply a hole in the file content?

We know the current code allows for COW fork overlap of non-shared data
fork blocks due to the cowextsz hint stuff. I'm assuming that means said
COW blocks could just as well overlap a hole in the data fork, but I
could be mistaken on that. If we can and do overlap a hole and then see
a buffered write at the associated file offset, the
(trace_xfs_reflink_cow_found()) case below looks like it pushes the cow
mapping into the iomap and skips the data fork (del)allocation case.
Thus it seems to me that we're now able to have a dirty pagecache page
over a COW delalloc block over a hole in the data fork. If so and we
receive a zero range over this offset, this hunk sees a hole in the data
fork, reports it as such to iomap and thus skips zeroing the page.

Note that I wasn't necessarily sure this could happen. It seems not
given most direct usages of iomap_zero_range() in the current code, at
least. The iomap_truncate_page() -> iomap_zero_range() case looked
suspicious, so I just tried to exploit that, but then ran into the fact
that truncate_setsize() also does sub-page zeroing after we call
iomap_truncate_page(). Out of curiosity, I open-coded the
truncate_setsize() call with an i_size_write() and page size aligned
pagecache truncate to bypass truncate_setsize() subpage zeroing and
tried again:

xfs_io -fc "truncate 128k" -c "pwrite 64k" <file>
cp --reflink <file> <file.2>
xfs_io -c "mmap 0 128k" -c "pwrite 64k 4k" -c "pwrite 76k 4k" \
	-c "truncate 78k" -c "mread -v 76k 4k" /mnt/file

... and with that I do see different behavior with and without this
patchset.

The intent here is to speculatively preallocate in the COW fork,
buffered write to a preallocated COW block over a data fork hole,
truncate partially over that block and then inspect whether
iomap_zero_range() locates the block and zeroes the post-eof portion of
the page. Without this patch series, iomap_zero_range() zeroes that
portion of the page; with this patch series it does not.
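
For reference, a stand-alone version of those steps might look roughly
like the following; the device and mount point are assumptions, the
length of the first pwrite is not visible above and is assumed to be a
single 4k block here, and observing the difference still requires the
open-coded truncate_setsize() change described earlier:

	# hypothetical reproducer sketch, not a script posted in this thread
	mkfs.xfs -f -m reflink=1 /dev/sdb >/dev/null
	mount /dev/sdb /mnt
	xfs_io -f -c "truncate 128k" -c "pwrite 64k 4k" /mnt/file
	cp --reflink=always /mnt/file /mnt/file.2
	xfs_io -c "mmap 0 128k" -c "pwrite 64k 4k" -c "pwrite 76k 4k" \
		-c "truncate 78k" -c "mread -v 76k 4k" /mnt/file
	umount /mnt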

> > > +	/*
> > > +	 * Search the COW fork extent list even if we did not find a data fork
> > > +	 * extent.  This serves two purposes: first this implements the
> > > +	 * speculative preallocation using cowextsize, so that we also unshare
> > > +	 * block adjacent to shared blocks instead of just the shared blocks
> > > +	 * themselves.  Second the lookup in the extent list is generally faster
> > > +	 * than going out to the shared extent tree.
> > > +	 */
> > > +	if (xfs_is_reflink_inode(ip)) {
> > > +		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
> > > +				&ccur, &cmap);
> > > +		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> > > +			trace_xfs_reflink_cow_found(ip, &cmap);
> > > +			whichfork = XFS_COW_FORK;
> > > +			goto done;
> > > +		}
> > > +	}
> > >  
> > > -	if (got.br_startoff <= offset_fsb) {
> > > +	if (imap.br_startoff <= offset_fsb) {
> > >  		/*
> > >  		 * For reflink files we may need a delalloc reservation when
> > >  		 * overwriting shared extents.   This includes zeroing of
> > >  		 * existing extents that contain data.
> > >  		 */
> > > -		if (xfs_is_reflink_inode(ip) &&
> > > -		    ((flags & IOMAP_WRITE) ||
> > > -		     got.br_state != XFS_EXT_UNWRITTEN)) {
> > > -			xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
> > > -			error = xfs_reflink_reserve_cow(ip, &got);
> > > -			if (error)
> > > -				goto out_unlock;
> > > +		if (!xfs_is_reflink_inode(ip) ||
> > > +		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
> > > +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> > > +					&imap);
> > > +			goto done;
> > >  		}
> > >  
> > > -		trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got);
> > > -		goto done;
> > > -	}
> > > +		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
> > >  
> > > -	if (flags & IOMAP_ZERO) {
> > > -		xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
> > > -		goto out_unlock;
> > > +		/* Trim the mapping to the nearest shared extent boundary. */
> > > +		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> > > +		if (error)
> > > +			goto out_unlock;
> > > +
> > > +		/* Not shared?  Just report the (potentially capped) extent. */
> > > +		if (!shared) {
> > > +			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
> > > +					&imap);
> > > +			goto done;
> > > +		}
> > > +
> > > +		/*
> > > +		 * Fork all the shared blocks from our write offset until the
> > > +		 * end of the extent.
> > > +		 */
> > > +		whichfork = XFS_COW_FORK;
> > > +		end_fsb = imap.br_startoff + imap.br_blockcount;
> > > +	} else {
> > > +		/*
> > > +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> > > +		 * pages to keep the chunks of work done where somewhat
> > > +		 * symmetric with the work writeback does.  This is a completely
> > > +		 * arbitrary number pulled out of thin air.
> > > +		 *
> > > +		 * Note that the values needs to be less than 32-bits wide until
> > > +		 * the lower level functions are updated.
> > > +		 */
> > > +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > > +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > 
> > The existing code doesn't currently do this but ISTM this should apply
> > to either allocation case, not just data fork delalloc. That could be
> > something for a separate patch though.
> > 
> > >  	}
> > >  
> > >  	error = xfs_qm_dqattach_locked(ip, false);
> > >  	if (error)
> > >  		goto out_unlock;
> > >  
> > > -	/*
> > > -	 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
> > > -	 * to keep the chunks of work done where somewhat symmetric with the
> > > -	 * work writeback does. This is a completely arbitrary number pulled
> > > -	 * out of thin air as a best guess for initial testing.
> > > -	 *
> > > -	 * Note that the values needs to be less than 32-bits wide until
> > > -	 * the lower level functions are updated.
> > > -	 */
> > > -	count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > > -	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > > -
> > > -	if (eof) {
> > > +	if (eof && whichfork == XFS_DATA_FORK) {
> > >  		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
> > >  				&icur);
> > >  		if (prealloc_blocks) {
> > ...
> > > @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
> > >  	 * them out if the write happens to fail.
> > >  	 */
> > >  	iomap->flags |= IOMAP_F_NEW;
> > 
> > This looks like it flags the mapping new if we reserve cow blocks, which
> > I don't think is quite right.
> 
> Hmmm.  I thought it was correct -- if the write fails, we punch out the
> pagecache and trim any delalloc blocks in the data fork.  If the user
> tries to reread the area of the failed write we'll just read them back
> in from disk...?
> 

For one, I'm not totally clear what the expected behavior is supposed to
be here. The pagecache truncate probably makes sense regardless, but as
you said, the error case punches delalloc blocks out of the data fork.
What if the blocks were allocated in the COW fork?

Note again that I'm not sure there's reproducible bad behavior here. We
probably shouldn't expect delalloc COW reservation over data fork
delalloc (unless always_cow I suppose), but shared data fork blocks
certainly aren't "new" just because we allocated COW reservation (and
don't happen to currently care about non-delalloc blocks in the error
handling implementation).

Brian

> --D
> 
> > > -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> > > +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> > > +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
> > >  done:
> > > -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> > > +	if (whichfork == XFS_COW_FORK) {
> > > +		if (imap.br_startoff > offset_fsb) {
> > > +			xfs_trim_extent(&cmap, offset_fsb,
> > > +					imap.br_startoff - offset_fsb);
> > > +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> > > +			goto out_unlock;
> > > +		}
> > 
> > Hmm, so this looks like it is in fact handling a COW blocks over a hole
> > case, pushing the COW mapping into the iomap. We never accounted for
> > that case before where we'd always just allocate to the data fork. The
> > change in behavior probably makes sense, but this really should be
> > separate from the refactoring bits to reuse the data fork delalloc code.
> > Beyond making this a bit easier to follow, it warrants its own commit
> > log description and this one makes no mention of it at all.
> > 
> > Brian
> > 
> > > +		/* ensure we only report blocks we have a reservation for */
> > > +		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
> > > +	}
> > > +	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
> > >  out_unlock:
> > >  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > >  	return error;
> > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > > index 8a5353daf9ab..9ef1f79cb3ae 100644
> > > --- a/fs/xfs/xfs_reflink.c
> > > +++ b/fs/xfs/xfs_reflink.c
> > > @@ -234,73 +234,6 @@ xfs_reflink_trim_around_shared(
> > >  	}
> > >  }
> > >  
> > > -/*
> > > - * Trim the passed in imap to the next shared/unshared extent boundary, and
> > > - * if imap->br_startoff points to a shared extent reserve space for it in the
> > > - * COW fork.
> > > - *
> > > - * Note that imap will always contain the block numbers for the existing blocks
> > > - * in the data fork, as the upper layers need them for read-modify-write
> > > - * operations.
> > > - */
> > > -int
> > > -xfs_reflink_reserve_cow(
> > > -	struct xfs_inode	*ip,
> > > -	struct xfs_bmbt_irec	*imap)
> > > -{
> > > -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
> > > -	struct xfs_bmbt_irec	got;
> > > -	int			error = 0;
> > > -	bool			eof = false;
> > > -	struct xfs_iext_cursor	icur;
> > > -	bool			shared;
> > > -
> > > -	/*
> > > -	 * Search the COW fork extent list first.  This serves two purposes:
> > > -	 * first this implement the speculative preallocation using cowextisze,
> > > -	 * so that we also unshared block adjacent to shared blocks instead
> > > -	 * of just the shared blocks themselves.  Second the lookup in the
> > > -	 * extent list is generally faster than going out to the shared extent
> > > -	 * tree.
> > > -	 */
> > > -
> > > -	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
> > > -		eof = true;
> > > -	if (!eof && got.br_startoff <= imap->br_startoff) {
> > > -		trace_xfs_reflink_cow_found(ip, imap);
> > > -		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> > > -		return 0;
> > > -	}
> > > -
> > > -	/* Trim the mapping to the nearest shared extent boundary. */
> > > -	error = xfs_reflink_trim_around_shared(ip, imap, &shared);
> > > -	if (error)
> > > -		return error;
> > > -
> > > -	/* Not shared?  Just report the (potentially capped) extent. */
> > > -	if (!shared)
> > > -		return 0;
> > > -
> > > -	/*
> > > -	 * Fork all the shared blocks from our write offset until the end of
> > > -	 * the extent.
> > > -	 */
> > > -	error = xfs_qm_dqattach_locked(ip, false);
> > > -	if (error)
> > > -		return error;
> > > -
> > > -	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
> > > -			imap->br_blockcount, 0, &got, &icur, eof);
> > > -	if (error == -ENOSPC || error == -EDQUOT)
> > > -		trace_xfs_reflink_cow_enospc(ip, imap);
> > > -	if (error)
> > > -		return error;
> > > -
> > > -	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
> > > -	trace_xfs_reflink_cow_alloc(ip, &got);
> > > -	return 0;
> > > -}
> > > -
> > >  /* Convert part of an unwritten CoW extent to a real one. */
> > >  STATIC int
> > >  xfs_reflink_convert_cow_extent(
> > > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> > > index 70d68a1a9b49..4a9e3cd4768a 100644
> > > --- a/fs/xfs/xfs_reflink.h
> > > +++ b/fs/xfs/xfs_reflink.h
> > > @@ -12,8 +12,6 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
> > >  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
> > >  		struct xfs_bmbt_irec *irec, bool *shared);
> > >  
> > > -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
> > > -		struct xfs_bmbt_irec *imap);
> > >  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
> > >  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> > >  		unsigned iomap_flags);
> > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > index f1e18ae8a209..47fb07d86efd 100644
> > > --- a/fs/xfs/xfs_trace.h
> > > +++ b/fs/xfs/xfs_trace.h
> > > @@ -3196,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
> > >  
> > >  /* copy on write */
> > >  DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
> > > -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
> > >  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
> > >  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
> > >  DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
> > >  
> > > -DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
> > > -
> > >  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
> > >  
> > >  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
> > > -- 
> > > 2.20.1
> > > 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/8] xfs: don't use delalloc extents for COW on files with extsize hints
  2019-02-21 17:58   ` Brian Foster
  2019-02-21 22:56     ` Darrick J. Wong
@ 2019-02-22 14:16     ` Christoph Hellwig
  1 sibling, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-22 14:16 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs

On Thu, Feb 21, 2019 at 12:58:17PM -0500, Brian Foster wrote:
> > +		/*
> > +		 * For buffered writes we need to report the address of the
> > +		 * previous block (if there was any) so that the higher level
> > +		 * write code can perform read-modify-write operations.  For
> > +		 * direct I/O code, which must be block aligned we need to
> > +		 * report the newly allocated address.
> > +		 */
> > +		if (!(flags & IOMAP_DIRECT) &&
> > +		    orig.br_startblock != HOLESTARTBLOCK)
> > +			imap = orig;
> 
> I find the logic here kind of confusing. The buffered write (reflink)
> path basically expects to allocate COW blocks over an existing shared
> extent. It thus makes no modification to the caller's imap because it
> (read-modify-)writes into cache and writeback determines where to send
> the I/O. Why not follow the same flow here? For example:

This is to optimize for the common case.  Direct I/O actually is the
common case for files with extent size hints, and this also is the
sensible behavior, while the buffered I/O behavior of returning the
old map is somewhat odd.

I have outstanding todo items to switch extent size hint based buffered
I/O to use delalloc reservations, and to clean up how the iomap code
currently hacks around the lack of a clear interface for the
read-modify-write cycles in buffered I/O, both of which would remove
this hack above without touching the surrounding code.

> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > index 2babc2cbe103..8a5353daf9ab 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -397,7 +397,8 @@ xfs_reflink_allocate_cow(
> >  	struct xfs_inode	*ip,
> >  	struct xfs_bmbt_irec	*imap,
> >  	bool			*shared,
> > -	uint			*lockmode)
> > +	uint			*lockmode,
> > +	unsigned		iomap_flags)
> 
> I don't see why a lower level reflink mechanism needs to care about
> direct I/O or not. IMO this should just be a 'bool convert' param.

My memory is a little vague, but I think Darrick preferred it this way.
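
For what it's worth, the 'bool convert' variant Brian suggests would
presumably look something like the sketch below (hypothetical, not a
patch posted in this thread; the series as posted keeps the iomap_flags
parameter and checks IOMAP_DIRECT internally):

	extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
			struct xfs_bmbt_irec *imap, bool *shared,
			uint *lockmode, bool convert);	/* hypothetical */

	/*
	 * Inside xfs_reflink_allocate_cow(): only convert the COW extent
	 * to written when the caller is doing direct I/O; buffered writes
	 * leave it unwritten until writeback.
	 */
	if (!convert)
		return 0;
	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);

with the xfs_file_iomap_begin() caller passing (flags & IOMAP_DIRECT).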

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay
  2019-02-21 17:59   ` Brian Foster
  2019-02-21 21:30     ` Darrick J. Wong
@ 2019-02-22 14:20     ` Christoph Hellwig
  2019-02-22 15:20       ` Brian Foster
  1 sibling, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-22 14:20 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs

On Thu, Feb 21, 2019 at 12:59:03PM -0500, Brian Foster wrote:
> On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> > Besides simplifying the code a bit, this allows us to actually implement
> > the behavior of using COW preallocation for non-COW data mentioned
> > in the current comments.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> 
> Heh, I'm somewhat amused by the fact that I sent a variant of this patch
> two years ago (for a different purpose) and you explicitly complained
> about the factoring. I'm glad you've finally come around. ;)
> 
> https://marc.info/?l=linux-xfs&m=149498124730442&w=2



> 
> >  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
> >  fs/xfs/xfs_reflink.c |  67 ----------------------
> >  fs/xfs/xfs_reflink.h |   2 -
> >  fs/xfs/xfs_trace.h   |   3 -
> >  4 files changed, 94 insertions(+), 111 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > index 19a3331b4a56..c9fd1e4a1f99 100644
> > --- a/fs/xfs/xfs_iomap.c
> > +++ b/fs/xfs/xfs_iomap.c
> ...
> > @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
> >  
> >  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> >  
> > -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> > +	/*
> > +	 * Search the data fork fork first to look up our source mapping.  We
> > +	 * always need the data fork map, as we have to return it to the
> > +	 * iomap code so that the higher level write code can read data in to
> > +	 * perform read-modify-write cycles for unaligned writes.
> > +	 */
> > +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
> >  	if (eof)
> > -		got.br_startoff = end_fsb; /* fake hole until the end */
> > +		imap.br_startoff = end_fsb; /* fake hole until the end */
> > +
> > +	/* We never need to allocate blocks for zeroing a hole. */
> > +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> > +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> > +		goto out_unlock;
> > +	}
> > +
> 
> So does this need to account for the case of an overlapping cow block
> over a hole in the data fork (with cached data, if that is possible)?
> IIUC we introduce that possibility just below.

Yes, it probably should, although I need to find a reproducer for that
first. 

> > +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> > +		 * pages to keep the chunks of work done where somewhat
> > +		 * symmetric with the work writeback does.  This is a completely
> > +		 * arbitrary number pulled out of thin air.
> > +		 *
> > +		 * Note that the values needs to be less than 32-bits wide until
> > +		 * the lower level functions are updated.
> > +		 */
> > +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> 
> The existing code doesn't currently do this but ISTM this should apply
> to either allocation case, not just data fork delalloc. That could be
> something for a separate patch though.

I wonder if we need to keep this cap at all, we apply it very
inconsistently through the writeback path.

> > @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
> >  	 * them out if the write happens to fail.
> >  	 */
> >  	iomap->flags |= IOMAP_F_NEW;
> 
> This looks like it flags the mapping new if we reserve cow blocks, which
> I don't think is quite right.

To some extent marking it as new makes a lot of sense, especially if
allocating to a hole.  But we probably only want it for that latter
case.

> 
> > -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> > +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> > +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
> >  done:
> > -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> > +	if (whichfork == XFS_COW_FORK) {
> > +		if (imap.br_startoff > offset_fsb) {
> > +			xfs_trim_extent(&cmap, offset_fsb,
> > +					imap.br_startoff - offset_fsb);
> > +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> > +			goto out_unlock;
> > +		}
> 
> Hmm, so this looks like it is in fact handling a COW blocks over a hole
> case, pushing the COW mapping into the iomap. We never accounted for
> that case before where we'd always just allocate to the data fork. The
> change in behavior probably makes sense, but this really should be
> separate from the refactoring bits to reuse the data fork delalloc code.
> Beyond making this a bit easier to follow, it warrants its own commit
> log description and this one makes no mention of it at all.

Look at the last sentence of the commit log..

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay
  2019-02-21 21:30     ` Darrick J. Wong
  2019-02-22 12:31       ` Brian Foster
@ 2019-02-22 14:22       ` Christoph Hellwig
  1 sibling, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2019-02-22 14:22 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Brian Foster, Christoph Hellwig, linux-xfs

On Thu, Feb 21, 2019 at 01:30:52PM -0800, Darrick J. Wong wrote:
> > So does this need to account for the case of an overlapping cow block
> > over a hole in the data fork (with cached data, if that is possible)?
> > IIUC we introduce that possibility just below.
> 
> I think it makes sense to ignore overlapping cow blocks for zeroing a
> hole in the data fork -- the user told us to zero part of a file that
> didn't have nonzero contents in it, so we just leave the speculative cow
> allocation for that block alone.

For a speculative preallocation I agree.  But if we have valid data
in there due to a cowextsize preallocation being used for data we
should handle it properly.

> > >  	iomap->flags |= IOMAP_F_NEW;
> > 
> > This looks like it flags the mapping new if we reserve cow blocks, which
> > I don't think is quite right.
> 
> Hmmm.  I thought it was correct -- if the write fails, we punch out the
> pagecache and trim any delalloc blocks in the data fork.  If the user
> tries to reread the area of the failed write we'll just read them back
> in from disk...?

Yes, I don't think it is actively harmful, but we don't really need
it either.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay
  2019-02-22 14:20     ` Christoph Hellwig
@ 2019-02-22 15:20       ` Brian Foster
  0 siblings, 0 replies; 33+ messages in thread
From: Brian Foster @ 2019-02-22 15:20 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Fri, Feb 22, 2019 at 03:20:58PM +0100, Christoph Hellwig wrote:
> On Thu, Feb 21, 2019 at 12:59:03PM -0500, Brian Foster wrote:
> > On Mon, Feb 18, 2019 at 10:18:24AM +0100, Christoph Hellwig wrote:
> > > Besides simplifying the code a bit, this allows us to actually implement
> > > the behavior of using COW preallocation for non-COW data mentioned
> > > in the current comments.
> > > 
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > ---
> > 
> > Heh, I'm somewhat amused by the fact that I sent a variant of this patch
> > two years ago (for a different purpose) and you explicitly complained
> > about the factoring. I'm glad you've finally come around. ;)
> > 
> > https://marc.info/?l=linux-xfs&m=149498124730442&w=2
> 
> 
> 
> > 
> > >  fs/xfs/xfs_iomap.c   | 133 ++++++++++++++++++++++++++++++-------------
> > >  fs/xfs/xfs_reflink.c |  67 ----------------------
> > >  fs/xfs/xfs_reflink.h |   2 -
> > >  fs/xfs/xfs_trace.h   |   3 -
> > >  4 files changed, 94 insertions(+), 111 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > > index 19a3331b4a56..c9fd1e4a1f99 100644
> > > --- a/fs/xfs/xfs_iomap.c
> > > +++ b/fs/xfs/xfs_iomap.c
> > ...
> > > @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay(
> > >  
> > >  	end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > >  
> > > -	eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
> > > +	/*
> > > +	 * Search the data fork fork first to look up our source mapping.  We
> > > +	 * always need the data fork map, as we have to return it to the
> > > +	 * iomap code so that the higher level write code can read data in to
> > > +	 * perform read-modify-write cycles for unaligned writes.
> > > +	 */
> > > +	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
> > >  	if (eof)
> > > -		got.br_startoff = end_fsb; /* fake hole until the end */
> > > +		imap.br_startoff = end_fsb; /* fake hole until the end */
> > > +
> > > +	/* We never need to allocate blocks for zeroing a hole. */
> > > +	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
> > > +		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
> > > +		goto out_unlock;
> > > +	}
> > > +
> > 
> > So does this need to account for the case of an overlapping cow block
> > over a hole in the data fork (with cached data, if that is possible)?
> > IIUC we introduce that possibility just below.
> 
> Yes, it probably should, although I need to find a reproducer for that
> first. 
> 

See my other reply. I don't think it's currently reproducible, but it
does technically break the iomap_zero_range() mechanism.

> > > +		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
> > > +		 * pages to keep the chunks of work done where somewhat
> > > +		 * symmetric with the work writeback does.  This is a completely
> > > +		 * arbitrary number pulled out of thin air.
> > > +		 *
> > > +		 * Note that the values needs to be less than 32-bits wide until
> > > +		 * the lower level functions are updated.
> > > +		 */
> > > +		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
> > > +		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> > 
> > The existing code doesn't currently do this but ISTM this should apply
> > to either allocation case, not just data fork delalloc. That could be
> > something for a separate patch though.
> 
> I wonder if we need to keep this cap at all, we apply it very
> inconsistently through the writeback path.
> 

Perhaps we can just kill it off then.

> > > @@ -659,9 +703,20 @@ xfs_file_iomap_begin_delay(
> > >  	 * them out if the write happens to fail.
> > >  	 */
> > >  	iomap->flags |= IOMAP_F_NEW;
> > 
> > This looks like it flags the mapping new if we reserve cow blocks, which
> > I don't think is quite right.
> 
> To some extent marking it as new makes a lot of sense, especially if
> allocating to a hole.  But we probably only want it for that latter
> case.
> 

The allocation over a hole case makes more sense to me, but there's also
the case of cow fork delalloc over data fork delalloc. I think we need
some explicit definition of expected behavior here rather than just
setting the flag based on what the current error handler happens to do.
Perhaps that
might involve fixing up the error handling context to deal with the cow
fork as well.
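
(Purely as an illustration of that idea, and not anything posted in this
thread: the write-failure path could conceivably cancel COW fork
reservations next to the existing data fork cleanup, roughly like the
hypothetical sketch below, with start_fsb/end_fsb as computed by the
existing error path.)

	if (iomap->flags & IOMAP_F_NEW) {
		truncate_pagecache_range(VFS_I(ip), offset,
				offset + length - 1);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
				end_fsb - start_fsb);
		/* only drop delalloc COW reservations, keep real extents */
		if (!error && xfs_is_reflink_inode(ip))
			error = xfs_reflink_cancel_cow_range(ip, offset,
					length, false);
	}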

> > 
> > > -	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got);
> > > +	trace_xfs_iomap_alloc(ip, offset, count, whichfork,
> > > +			whichfork == XFS_DATA_FORK ? &imap : &cmap);
> > >  done:
> > > -	error = xfs_bmbt_to_iomap(ip, iomap, &got, false);
> > > +	if (whichfork == XFS_COW_FORK) {
> > > +		if (imap.br_startoff > offset_fsb) {
> > > +			xfs_trim_extent(&cmap, offset_fsb,
> > > +					imap.br_startoff - offset_fsb);
> > > +			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false);
> > > +			goto out_unlock;
> > > +		}
> > 
> > Hmm, so this looks like it is in fact handling a COW blocks over a hole
> > case, pushing the COW mapping into the iomap. We never accounted for
> > that case before where we'd always just allocate to the data fork. The
> > change in behavior probably makes sense, but this really should be
> > separate from the refactoring bits to reuse the data fork delalloc code.
> > Beyond making this a bit easier to follow, it warrants its own commit
> > log description and this one makes no mention of it at all.
> 
> Look at the last sentence of the commit log..

Ok. I didn't follow that the first time I read it because I thought it
referred to handling COW overlap in writeback, which we already do. It
wasn't until seeing the code (and the in-line comment) and
distinguishing that from the refactoring bits that I realized this
allows for use of existing cow blocks over data fork holes.

So it's not fair to say the commit log doesn't mention it at all, but I
still think that this should have been separate from code reuse
refactoring and warrants more explanation and description than a single
sentence. At minimum, the commit log should describe the current
behavior, the change in behavior and the reason for changing it.

Of course, I guess this is merged now so it doesn't really matter..

Brian

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: xfs/420 and xfs/421: don't disturb unwritten status with md5sum
  2019-02-18  9:19 ` xfs/420 and xfs/421: don't disturb unwritten status with md5sum Christoph Hellwig
@ 2019-03-09 10:32   ` Eryu Guan
  2019-03-09 17:34     ` Darrick J. Wong
  0 siblings, 1 reply; 33+ messages in thread
From: Eryu Guan @ 2019-03-09 10:32 UTC (permalink / raw)
  To: Christoph Hellwig, Darrick J. Wong; +Cc: linux-xfs, fstests

On Mon, Feb 18, 2019 at 10:19:51AM +0100, Christoph Hellwig wrote:
> The way we decide whether an unwritten extent is considered a hole or data
> is by checking if the page and/or blocks are marked uptodate, that is,
> contain valid data in the page cache.
> 
> xfs/420 and xfs/421 try to exercise SEEK_HOLE / SEEK_DATA in the
> presence of cowextsize preallocations over holes in the data fork.  The
> current XFS code never actually uses those for buffer writes, but a
> pending patch changes that.  For SEEK_HOLE / SEEK_DATA to work properly
> in that case we also need to look at the COW fork in their
> implementations and thus have to rely on the unwritten extent page cache
> probing.  But the tests for it ensure we do have valid data in the
> pagecache by calling md5sum on the test files, and thus reading their
> contents (including the zero-filled holes) in, and thus making them
> all valid data.
> 
> Fix that by dropping the page cache content after the md5sum calls.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Hi Darrick, would you please help review this version of the fix as
well? I basically have no idea about the implementation of the "pending
patch" and what it changes.. Thanks a lot!

Eryu

> ---
>  tests/xfs/420 | 6 ++++++
>  tests/xfs/421 | 6 ++++++
>  2 files changed, 12 insertions(+)
> 
> diff --git a/tests/xfs/420 b/tests/xfs/420
> index a083a12b..34199637 100755
> --- a/tests/xfs/420
> +++ b/tests/xfs/420
> @@ -83,6 +83,8 @@ echo "Compare files"
>  md5sum $testdir/file1 | _filter_scratch
>  md5sum $testdir/file2 | _filter_scratch
>  md5sum $testdir/file3 | _filter_scratch
> +# drop caches to make sure the page cache for the unwritten extents is clean
> +echo 1 > /proc/sys/vm/drop_caches
>  
>  echo "CoW the shared part then write into the empty part" | tee -a $seqres.full
>  $XFS_IO_PROG -c "cowextsize" $testdir/file1 >> $seqres.full
> @@ -106,6 +108,8 @@ echo "Compare files"
>  md5sum $testdir/file1 | _filter_scratch
>  md5sum $testdir/file2 | _filter_scratch
>  md5sum $testdir/file3 | _filter_scratch
> +# drop caches to make sure the page cache for the unwritten extents is clean
> +echo 1 > /proc/sys/vm/drop_caches
>  
>  echo "sync filesystem" | tee -a $seqres.full
>  sync
> @@ -123,6 +127,8 @@ echo "Compare files"
>  md5sum $testdir/file1 | _filter_scratch
>  md5sum $testdir/file2 | _filter_scratch
>  md5sum $testdir/file3 | _filter_scratch
> +# drop caches to make sure the page cache for the unwritten extents is clean
> +echo 1 > /proc/sys/vm/drop_caches
>  
>  echo "Remount" | tee -a $seqres.full
>  _scratch_cycle_mount
> diff --git a/tests/xfs/421 b/tests/xfs/421
> index a2734aba..374389bd 100755
> --- a/tests/xfs/421
> +++ b/tests/xfs/421
> @@ -83,6 +83,8 @@ echo "Compare files"
>  md5sum $testdir/file1 | _filter_scratch
>  md5sum $testdir/file2 | _filter_scratch
>  md5sum $testdir/file3 | _filter_scratch
> +# drop caches to make sure the page cache for the unwritten extents is clean
> +echo 1 > /proc/sys/vm/drop_caches
>  
>  echo "CoW the shared part then write into the empty part" | tee -a $seqres.full
>  $XFS_IO_PROG -c "cowextsize" $testdir/file1 >> $seqres.full
> @@ -106,6 +108,8 @@ echo "Compare files"
>  md5sum $testdir/file1 | _filter_scratch
>  md5sum $testdir/file2 | _filter_scratch
>  md5sum $testdir/file3 | _filter_scratch
> +# drop caches to make sure the page cache for the unwritten extents is clean
> +echo 1 > /proc/sys/vm/drop_caches
>  
>  echo "sync filesystem" | tee -a $seqres.full
>  sync
> @@ -123,6 +127,8 @@ echo "Compare files"
>  md5sum $testdir/file1 | _filter_scratch
>  md5sum $testdir/file2 | _filter_scratch
>  md5sum $testdir/file3 | _filter_scratch
> +# drop caches to make sure the page cache for the unwritten extents is clean
> +echo 1 > /proc/sys/vm/drop_caches
>  
>  echo "Remount" | tee -a $seqres.full
>  _scratch_cycle_mount
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: xfs/420 and xfs/421: don't disturb unwritten status with md5sum
  2019-03-09 10:32   ` Eryu Guan
@ 2019-03-09 17:34     ` Darrick J. Wong
  0 siblings, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2019-03-09 17:34 UTC (permalink / raw)
  To: Eryu Guan; +Cc: Christoph Hellwig, linux-xfs, fstests

On Sat, Mar 09, 2019 at 06:32:11PM +0800, Eryu Guan wrote:
> On Mon, Feb 18, 2019 at 10:19:51AM +0100, Christoph Hellwig wrote:
> > The way we decide whether an unwritten extent is considered a hole or data
> > is by checking if the page and/or blocks are marked uptodate, that is,
> > contain valid data in the page cache.
> > 
> > xfs/420 and xfs/421 try to exercise SEEK_HOLE / SEEK_DATA in the
> > presence of cowextsize preallocations over holes in the data fork.  The
> > current XFS code never actually uses those for buffer writes, but a
> > pending patch changes that.  For SEEK_HOLE / SEEK_DATA to work properly
> > in that case we also need to look at the COW fork in their
> > implementations and thus have to rely on the unwritten extent page cache
> > probing.  But the tests for it ensure we do have valid data in the
> > pagecache by calling md5sum on the test files, and thus reading their
> > contents (including the zero-filled holes) in, and thus making them
> > all valid data.
> > 
> > Fix that by dropping the page cache content after the md5sum calls.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Hi Darrick, would you please help review this version of the fix as
> well? I basically have no idea about the implementation of the "pending
> patch" and what it changes.. Thanks a lot!

/me wonders if the "md5sum and drop caches" could be refactored a bit,
but as a strict bugfix for always_cow mode this looks ok to me:

Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D
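
(One possible shape for that refactoring -- a hypothetical helper, not
something that exists in fstests today:)

	# checksum the given files, then drop caches so the zero-filled
	# holes read in by md5sum don't leave the unwritten extents looking
	# like data to SEEK_HOLE/SEEK_DATA
	_md5_compare_and_drop_caches()
	{
		local f
		for f in "$@"; do
			md5sum $f | _filter_scratch
		done
		echo 1 > /proc/sys/vm/drop_caches
	}

	# which the tests could then call as:
	echo "Compare files"
	_md5_compare_and_drop_caches $testdir/file1 $testdir/file2 $testdir/file3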


> Eryu
> 
> > ---
> >  tests/xfs/420 | 6 ++++++
> >  tests/xfs/421 | 6 ++++++
> >  2 files changed, 12 insertions(+)
> > 
> > diff --git a/tests/xfs/420 b/tests/xfs/420
> > index a083a12b..34199637 100755
> > --- a/tests/xfs/420
> > +++ b/tests/xfs/420
> > @@ -83,6 +83,8 @@ echo "Compare files"
> >  md5sum $testdir/file1 | _filter_scratch
> >  md5sum $testdir/file2 | _filter_scratch
> >  md5sum $testdir/file3 | _filter_scratch
> > +# drop caches to make sure the page cache for the unwritten extents is clean
> > +echo 1 > /proc/sys/vm/drop_caches
> >  
> >  echo "CoW the shared part then write into the empty part" | tee -a $seqres.full
> >  $XFS_IO_PROG -c "cowextsize" $testdir/file1 >> $seqres.full
> > @@ -106,6 +108,8 @@ echo "Compare files"
> >  md5sum $testdir/file1 | _filter_scratch
> >  md5sum $testdir/file2 | _filter_scratch
> >  md5sum $testdir/file3 | _filter_scratch
> > +# drop caches to make sure the page cache for the unwritten extents is clean
> > +echo 1 > /proc/sys/vm/drop_caches
> >  
> >  echo "sync filesystem" | tee -a $seqres.full
> >  sync
> > @@ -123,6 +127,8 @@ echo "Compare files"
> >  md5sum $testdir/file1 | _filter_scratch
> >  md5sum $testdir/file2 | _filter_scratch
> >  md5sum $testdir/file3 | _filter_scratch
> > +# drop caches to make sure the page cache for the unwritten extents is clean
> > +echo 1 > /proc/sys/vm/drop_caches
> >  
> >  echo "Remount" | tee -a $seqres.full
> >  _scratch_cycle_mount
> > diff --git a/tests/xfs/421 b/tests/xfs/421
> > index a2734aba..374389bd 100755
> > --- a/tests/xfs/421
> > +++ b/tests/xfs/421
> > @@ -83,6 +83,8 @@ echo "Compare files"
> >  md5sum $testdir/file1 | _filter_scratch
> >  md5sum $testdir/file2 | _filter_scratch
> >  md5sum $testdir/file3 | _filter_scratch
> > +# drop caches to make sure the page cache for the unwritten extents is clean
> > +echo 1 > /proc/sys/vm/drop_caches
> >  
> >  echo "CoW the shared part then write into the empty part" | tee -a $seqres.full
> >  $XFS_IO_PROG -c "cowextsize" $testdir/file1 >> $seqres.full
> > @@ -106,6 +108,8 @@ echo "Compare files"
> >  md5sum $testdir/file1 | _filter_scratch
> >  md5sum $testdir/file2 | _filter_scratch
> >  md5sum $testdir/file3 | _filter_scratch
> > +# drop caches to make sure the page cache for the unwritten extents is clean
> > +echo 1 > /proc/sys/vm/drop_caches
> >  
> >  echo "sync filesystem" | tee -a $seqres.full
> >  sync
> > @@ -123,6 +127,8 @@ echo "Compare files"
> >  md5sum $testdir/file1 | _filter_scratch
> >  md5sum $testdir/file2 | _filter_scratch
> >  md5sum $testdir/file3 | _filter_scratch
> > +# drop caches to make sure the page cache for the unwritten extents is clean
> > +echo 1 > /proc/sys/vm/drop_caches
> >  
> >  echo "Remount" | tee -a $seqres.full
> >  _scratch_cycle_mount
> > -- 
> > 2.20.1
> > 

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2019-03-09 17:34 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-18  9:18 COW improvements and always_cow support V5 Christoph Hellwig
2019-02-18  9:18 ` [PATCH 1/8] xfs: make xfs_bmbt_to_iomap more useful Christoph Hellwig
2019-02-18  9:18 ` [PATCH 2/8] xfs: fix SEEK_DATA for speculative COW fork preallocation Christoph Hellwig
2019-02-19  5:13   ` Darrick J. Wong
2019-02-18  9:18 ` [PATCH 3/8] xfs: don't use delalloc extents for COW on files with extsize hints Christoph Hellwig
2019-02-19  5:17   ` Darrick J. Wong
2019-02-21 17:58   ` Brian Foster
2019-02-21 22:56     ` Darrick J. Wong
2019-02-22 14:16     ` Christoph Hellwig
2019-02-18  9:18 ` [PATCH 4/8] xfs: also truncate holes covered by COW blocks Christoph Hellwig
2019-02-18  9:18 ` [PATCH 5/8] xfs: merge COW handling into xfs_file_iomap_begin_delay Christoph Hellwig
2019-02-19 18:12   ` Darrick J. Wong
2019-02-21 17:59   ` Brian Foster
2019-02-21 21:30     ` Darrick J. Wong
2019-02-22 12:31       ` Brian Foster
2019-02-22 14:22       ` Christoph Hellwig
2019-02-22 14:20     ` Christoph Hellwig
2019-02-22 15:20       ` Brian Foster
2019-02-18  9:18 ` [PATCH 6/8] xfs: make COW fork unwritten extent conversions more robust Christoph Hellwig
2019-02-19 18:16   ` Darrick J. Wong
2019-02-18  9:18 ` [PATCH 7/8] xfs: report IOMAP_F_SHARED from xfs_file_iomap_begin_delay Christoph Hellwig
2019-02-19  5:20   ` Darrick J. Wong
2019-02-18  9:18 ` [PATCH 8/8] xfs: introduce an always_cow mode Christoph Hellwig
2019-02-19  5:25   ` Darrick J. Wong
2019-02-19 17:53     ` Darrick J. Wong
2019-02-20 14:58       ` Christoph Hellwig
2019-02-20 15:00     ` Christoph Hellwig
2019-02-19 18:31   ` Darrick J. Wong
2019-02-20 15:08     ` Christoph Hellwig
2019-02-21 17:31       ` Darrick J. Wong
2019-02-18  9:19 ` xfs/420 and xfs/421: don't disturb unwritten status with md5sum Christoph Hellwig
2019-03-09 10:32   ` Eryu Guan
2019-03-09 17:34     ` Darrick J. Wong
