All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] xfs: fix xfs_reflink_allocate_cow_range
@ 2016-09-25 19:30 Christoph Hellwig
  2016-09-27  4:37 ` Darrick J. Wong
  2016-09-27 19:25 ` Darrick J. Wong
  0 siblings, 2 replies; 3+ messages in thread
From: Christoph Hellwig @ 2016-09-25 19:30 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

The current version of xfs_reflink_allocate_cow_range may stumble over
invalid entries in the extent array given that it drops the ilock but
still expects the index to be stable.  Simple fixing it to a new lookup
for every iteration still isn't correct given that xfs_bmapi_allocate
will trigger a BUG_ON() if hitting a hole, and there is nothing
preventing a xfs_bunmapi_cow call removing extents once we dropped the
ilock either.

The right long term implementation would be to not do a detour through
a delayed allocation for direct I/O and just got straight to and on-disk
allocation.  Given how late it is in the merge window this patch instead
duplicates the inner loop of xfs_bmapi_allocate into a helper for
xfs_reflink_allocate_cow_range so that it can be done under the same
ilock criticical section as our COW fork delayed allocation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_file.c    |   4 --
 fs/xfs/xfs_reflink.c | 134 +++++++++++++++++++++++++++++++++------------------
 fs/xfs/xfs_reflink.h |   4 +-
 fs/xfs/xfs_trace.h   |   1 -
 4 files changed, 90 insertions(+), 53 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index aee0c4c..349f328 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -675,10 +675,6 @@ xfs_file_dio_aio_write(
 
 	/* If this is a block-aligned directio CoW, remap immediately. */
 	if (xfs_is_reflink_inode(ip) && !unaligned_io) {
-		ret = xfs_reflink_reserve_cow_range(ip, iocb->ki_pos, count);
-		if (ret)
-			goto out;
-
 		ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
 		if (ret)
 			goto out;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 09e0e27..c18692c 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -253,7 +253,8 @@ static int
 __xfs_reflink_reserve_cow(
 	struct xfs_inode	*ip,
 	xfs_fileoff_t		*offset_fsb,
-	xfs_fileoff_t		end_fsb)
+	xfs_fileoff_t		end_fsb,
+	bool			*skipped)
 {
 	struct xfs_bmbt_irec	got, prev, imap;
 	xfs_fileoff_t		orig_end_fsb;
@@ -287,8 +288,10 @@ __xfs_reflink_reserve_cow(
 	end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount;
 
 	/* Not shared?  Just report the (potentially capped) extent. */
-	if (!shared)
+	if (!shared) {
+		*skipped = true;
 		goto done;
+	}
 
 	/*
 	 * Fork all the shared blocks from our write offset until the end of
@@ -337,6 +340,7 @@ xfs_reflink_reserve_cow_range(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fileoff_t		offset_fsb, end_fsb;
+	bool			skipped = false;
 	int			error;
 
 	trace_xfs_reflink_reserve_cow_range(ip, offset, count);
@@ -346,7 +350,8 @@ xfs_reflink_reserve_cow_range(
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	while (offset_fsb < end_fsb) {
-		error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb);
+		error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb,
+				&skipped);
 		if (error) {
 			trace_xfs_reflink_reserve_cow_range_error(ip, error,
 				_RET_IP_);
@@ -358,63 +363,100 @@ xfs_reflink_reserve_cow_range(
 	return error;
 }
 
-/*
- * Allocate blocks to all CoW reservations within a byte range of a file.
- */
-int
-xfs_reflink_allocate_cow_range(
+static int
+__xfs_reflink_allocate_cow(
 	struct xfs_inode	*ip,
-	xfs_off_t		pos,
-	xfs_off_t		len)
+	xfs_fileoff_t		*offset_fsb,
+	xfs_fileoff_t		end_fsb)
 {
-	struct xfs_ifork	*ifp;
-	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_bmbt_irec	imap;
-	int			error = 0;
-	xfs_fileoff_t		start_lblk;
-	xfs_fileoff_t		end_lblk;
-	xfs_extnum_t		idx;
+	struct xfs_defer_ops	dfops;
+	struct xfs_trans	*tp;
+	xfs_fsblock_t		first_block;
+	xfs_fileoff_t		next_fsb;
+	int			nimaps = 1, error;
+	bool			skipped = false;
 
-	if (!xfs_is_reflink_inode(ip))
-		return 0;
+	xfs_defer_init(&dfops, &first_block);
 
-	trace_xfs_reflink_allocate_cow_range(ip, len, pos);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+			XFS_TRANS_RESERVE, &tp);
+	if (error)
+		return error;
 
-	start_lblk = XFS_B_TO_FSBT(ip->i_mount, pos);
-	end_lblk = XFS_B_TO_FSB(ip->i_mount, pos + len);
-	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-	gotp = xfs_iext_bno_to_ext(ifp, start_lblk, &idx);
-	while (gotp) {
-		xfs_bmbt_get_all(gotp, &imap);
+	next_fsb = *offset_fsb;
+	error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped);
+	if (error)
+		goto out_trans_cancel;
 
-		if (imap.br_startoff >= end_lblk)
-			break;
-		if (!isnullstartblock(imap.br_startblock))
-			goto advloop;
-		xfs_trim_extent(&imap, start_lblk, end_lblk - start_lblk);
-		trace_xfs_reflink_allocate_cow_extent(ip, &imap);
-
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		error = xfs_iomap_write_allocate(ip, XFS_COW_FORK,
-				XFS_FSB_TO_B(ip->i_mount, imap.br_startoff +
-						imap.br_blockcount - 1), &imap);
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if (error)
-			break;
-advloop:
-		/* Roll on... */
-		idx++;
-		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
-			break;
-		gotp = xfs_iext_get_ext(ifp, idx);
+	if (skipped) {
+		*offset_fsb = next_fsb;
+		goto out_trans_cancel;
 	}
 
+	xfs_trans_ijoin(tp, ip, 0);
+	error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb,
+			XFS_BMAPI_COWFORK, &first_block,
+			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
+			&imap, &nimaps, &dfops);
+	if (error)
+		goto out_trans_cancel;
+
+	/* We might not have been able to map the whole delalloc extent */
+	*offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb);
+
+	error = xfs_defer_finish(&tp, &dfops, NULL);
+	if (error)
+		goto out_trans_cancel;
+
+	error = xfs_trans_commit(tp);
+
+out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+out_trans_cancel:
+	xfs_defer_cancel(&dfops);
+	xfs_trans_cancel(tp);
+	goto out_unlock;
+}
 
+/*
+ * Allocate blocks to all CoW reservations within a byte range of a file.
+ */
+int
+xfs_reflink_allocate_cow_range(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		count)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
+	int			error;
+
+	ASSERT(xfs_is_reflink_inode(ip));
+
+	trace_xfs_reflink_allocate_cow_range(ip, offset, count);
+
+	/*
+	 * Make sure that the dquots are there.
+	 */
+	error = xfs_qm_dqattach(ip, 0);
 	if (error)
-		trace_xfs_reflink_allocate_cow_range_error(ip, error, _RET_IP_);
+		return error;
+
+	while (offset_fsb < end_fsb) {
+		error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb);
+		if (error) {
+			trace_xfs_reflink_allocate_cow_range_error(ip, error,
+					_RET_IP_);
+			break;
+		}
+	}
+
 	return error;
 }
 
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 0e19ec6..859ca50 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -28,8 +28,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 
 extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip,
 		xfs_off_t offset, xfs_off_t count);
-extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t pos,
-		xfs_off_t len);
+extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
+		xfs_off_t offset, xfs_off_t count);
 extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
 		struct xfs_bmbt_irec *imap, bool *need_alloc);
 extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c8fb91c..26113b9 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3320,7 +3320,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
 
 DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range);
 DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_allocate_cow_extent);
 
 DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write);
 DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] xfs: fix xfs_reflink_allocate_cow_range
  2016-09-25 19:30 [PATCH] xfs: fix xfs_reflink_allocate_cow_range Christoph Hellwig
@ 2016-09-27  4:37 ` Darrick J. Wong
  2016-09-27 19:25 ` Darrick J. Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Darrick J. Wong @ 2016-09-27  4:37 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Sun, Sep 25, 2016 at 12:30:02PM -0700, Christoph Hellwig wrote:
> The current version of xfs_reflink_allocate_cow_range may stumble over
> invalid entries in the extent array given that it drops the ilock but
> still expects the index to be stable.  Simple fixing it to a new lookup
> for every iteration still isn't correct given that xfs_bmapi_allocate
> will trigger a BUG_ON() if hitting a hole, and there is nothing
> preventing a xfs_bunmapi_cow call removing extents once we dropped the
> ilock either.
> 
> The right long term implementation would be to not do a detour through
> a delayed allocation for direct I/O and just got straight to and on-disk
> allocation.  Given how late it is in the merge window this patch instead
> duplicates the inner loop of xfs_bmapi_allocate into a helper for
> xfs_reflink_allocate_cow_range so that it can be done under the same
> ilock criticical section as our COW fork delayed allocation.

Looks ok too, so
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_file.c    |   4 --
>  fs/xfs/xfs_reflink.c | 134 +++++++++++++++++++++++++++++++++------------------
>  fs/xfs/xfs_reflink.h |   4 +-
>  fs/xfs/xfs_trace.h   |   1 -
>  4 files changed, 90 insertions(+), 53 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index aee0c4c..349f328 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -675,10 +675,6 @@ xfs_file_dio_aio_write(
>  
>  	/* If this is a block-aligned directio CoW, remap immediately. */
>  	if (xfs_is_reflink_inode(ip) && !unaligned_io) {
> -		ret = xfs_reflink_reserve_cow_range(ip, iocb->ki_pos, count);
> -		if (ret)
> -			goto out;
> -
>  		ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
>  		if (ret)
>  			goto out;
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 09e0e27..c18692c 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -253,7 +253,8 @@ static int
>  __xfs_reflink_reserve_cow(
>  	struct xfs_inode	*ip,
>  	xfs_fileoff_t		*offset_fsb,
> -	xfs_fileoff_t		end_fsb)
> +	xfs_fileoff_t		end_fsb,
> +	bool			*skipped)
>  {
>  	struct xfs_bmbt_irec	got, prev, imap;
>  	xfs_fileoff_t		orig_end_fsb;
> @@ -287,8 +288,10 @@ __xfs_reflink_reserve_cow(
>  	end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount;
>  
>  	/* Not shared?  Just report the (potentially capped) extent. */
> -	if (!shared)
> +	if (!shared) {
> +		*skipped = true;
>  		goto done;
> +	}
>  
>  	/*
>  	 * Fork all the shared blocks from our write offset until the end of
> @@ -337,6 +340,7 @@ xfs_reflink_reserve_cow_range(
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	xfs_fileoff_t		offset_fsb, end_fsb;
> +	bool			skipped = false;
>  	int			error;
>  
>  	trace_xfs_reflink_reserve_cow_range(ip, offset, count);
> @@ -346,7 +350,8 @@ xfs_reflink_reserve_cow_range(
>  
>  	xfs_ilock(ip, XFS_ILOCK_EXCL);
>  	while (offset_fsb < end_fsb) {
> -		error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb);
> +		error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb,
> +				&skipped);
>  		if (error) {
>  			trace_xfs_reflink_reserve_cow_range_error(ip, error,
>  				_RET_IP_);
> @@ -358,63 +363,100 @@ xfs_reflink_reserve_cow_range(
>  	return error;
>  }
>  
> -/*
> - * Allocate blocks to all CoW reservations within a byte range of a file.
> - */
> -int
> -xfs_reflink_allocate_cow_range(
> +static int
> +__xfs_reflink_allocate_cow(
>  	struct xfs_inode	*ip,
> -	xfs_off_t		pos,
> -	xfs_off_t		len)
> +	xfs_fileoff_t		*offset_fsb,
> +	xfs_fileoff_t		end_fsb)
>  {
> -	struct xfs_ifork	*ifp;
> -	struct xfs_bmbt_rec_host	*gotp;
> +	struct xfs_mount	*mp = ip->i_mount;
>  	struct xfs_bmbt_irec	imap;
> -	int			error = 0;
> -	xfs_fileoff_t		start_lblk;
> -	xfs_fileoff_t		end_lblk;
> -	xfs_extnum_t		idx;
> +	struct xfs_defer_ops	dfops;
> +	struct xfs_trans	*tp;
> +	xfs_fsblock_t		first_block;
> +	xfs_fileoff_t		next_fsb;
> +	int			nimaps = 1, error;
> +	bool			skipped = false;
>  
> -	if (!xfs_is_reflink_inode(ip))
> -		return 0;
> +	xfs_defer_init(&dfops, &first_block);
>  
> -	trace_xfs_reflink_allocate_cow_range(ip, len, pos);
> +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
> +			XFS_TRANS_RESERVE, &tp);
> +	if (error)
> +		return error;
>  
> -	start_lblk = XFS_B_TO_FSBT(ip->i_mount, pos);
> -	end_lblk = XFS_B_TO_FSB(ip->i_mount, pos + len);
> -	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
>  	xfs_ilock(ip, XFS_ILOCK_EXCL);
>  
> -	gotp = xfs_iext_bno_to_ext(ifp, start_lblk, &idx);
> -	while (gotp) {
> -		xfs_bmbt_get_all(gotp, &imap);
> +	next_fsb = *offset_fsb;
> +	error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped);
> +	if (error)
> +		goto out_trans_cancel;
>  
> -		if (imap.br_startoff >= end_lblk)
> -			break;
> -		if (!isnullstartblock(imap.br_startblock))
> -			goto advloop;
> -		xfs_trim_extent(&imap, start_lblk, end_lblk - start_lblk);
> -		trace_xfs_reflink_allocate_cow_extent(ip, &imap);
> -
> -		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -		error = xfs_iomap_write_allocate(ip, XFS_COW_FORK,
> -				XFS_FSB_TO_B(ip->i_mount, imap.br_startoff +
> -						imap.br_blockcount - 1), &imap);
> -		xfs_ilock(ip, XFS_ILOCK_EXCL);
> -		if (error)
> -			break;
> -advloop:
> -		/* Roll on... */
> -		idx++;
> -		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
> -			break;
> -		gotp = xfs_iext_get_ext(ifp, idx);
> +	if (skipped) {
> +		*offset_fsb = next_fsb;
> +		goto out_trans_cancel;
>  	}
>  
> +	xfs_trans_ijoin(tp, ip, 0);
> +	error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb,
> +			XFS_BMAPI_COWFORK, &first_block,
> +			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
> +			&imap, &nimaps, &dfops);
> +	if (error)
> +		goto out_trans_cancel;
> +
> +	/* We might not have been able to map the whole delalloc extent */
> +	*offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb);
> +
> +	error = xfs_defer_finish(&tp, &dfops, NULL);
> +	if (error)
> +		goto out_trans_cancel;
> +
> +	error = xfs_trans_commit(tp);
> +
> +out_unlock:
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	return error;
> +out_trans_cancel:
> +	xfs_defer_cancel(&dfops);
> +	xfs_trans_cancel(tp);
> +	goto out_unlock;
> +}
>  
> +/*
> + * Allocate blocks to all CoW reservations within a byte range of a file.
> + */
> +int
> +xfs_reflink_allocate_cow_range(
> +	struct xfs_inode	*ip,
> +	xfs_off_t		offset,
> +	xfs_off_t		count)
> +{
> +	struct xfs_mount	*mp = ip->i_mount;
> +	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
> +	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
> +	int			error;
> +
> +	ASSERT(xfs_is_reflink_inode(ip));
> +
> +	trace_xfs_reflink_allocate_cow_range(ip, offset, count);
> +
> +	/*
> +	 * Make sure that the dquots are there.
> +	 */
> +	error = xfs_qm_dqattach(ip, 0);
>  	if (error)
> -		trace_xfs_reflink_allocate_cow_range_error(ip, error, _RET_IP_);
> +		return error;
> +
> +	while (offset_fsb < end_fsb) {
> +		error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb);
> +		if (error) {
> +			trace_xfs_reflink_allocate_cow_range_error(ip, error,
> +					_RET_IP_);
> +			break;
> +		}
> +	}
> +
>  	return error;
>  }
>  
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 0e19ec6..859ca50 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -28,8 +28,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  
>  extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip,
>  		xfs_off_t offset, xfs_off_t count);
> -extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t pos,
> -		xfs_off_t len);
> +extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
> +		xfs_off_t offset, xfs_off_t count);
>  extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
>  		struct xfs_bmbt_irec *imap, bool *need_alloc);
>  extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index c8fb91c..26113b9 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3320,7 +3320,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
>  
>  DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range);
>  DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
> -DEFINE_INODE_IREC_EVENT(xfs_reflink_allocate_cow_extent);
>  
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write);
>  DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
> -- 
> 2.1.4
> 

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] xfs: fix xfs_reflink_allocate_cow_range
  2016-09-25 19:30 [PATCH] xfs: fix xfs_reflink_allocate_cow_range Christoph Hellwig
  2016-09-27  4:37 ` Darrick J. Wong
@ 2016-09-27 19:25 ` Darrick J. Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Darrick J. Wong @ 2016-09-27 19:25 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Sun, Sep 25, 2016 at 12:30:02PM -0700, Christoph Hellwig wrote:
> The current version of xfs_reflink_allocate_cow_range may stumble over
> invalid entries in the extent array given that it drops the ilock but
> still expects the index to be stable.  Simple fixing it to a new lookup
> for every iteration still isn't correct given that xfs_bmapi_allocate
> will trigger a BUG_ON() if hitting a hole, and there is nothing
> preventing a xfs_bunmapi_cow call removing extents once we dropped the
> ilock either.

Patch still ok, but I have a few comments about the following:

> The right long term implementation would be to not do a detour through
> a delayed allocation for direct I/O and just got straight to and on-disk
> allocation.  Given how late it is in the merge window this patch instead
> duplicates the inner loop of xfs_bmapi_allocate into a helper for
> xfs_reflink_allocate_cow_range so that it can be done under the same
> ilock criticical section as our COW fork delayed allocation.

The reason for making dio writes to shared extents go through the
delayed allocation system is to honor the CoW extent size hint and to
combat fragmentation.  Say we want to dio write an extent that looks
like this in the refcount btree:

D: .................s.ss...s.s.s.s.ss..sss.s..ss..s.s.s..s.s.s.s

(s = shared, dot = not shared; lets say cowextsize = 8)

If we overwrite the non-shared blocks and directly allocate CoW
replacements for only the shared blocks, we'll turn this single bmap
extent into 33 bmaps.  If instead we go to the trouble of creating
delalloc reservations in the CoW fork, we end up with this:

D: .................s.ss...s.s.s.s.ss..sss.s..ss..s.s.s..s.s.s.s
C:                 dddddddddddddddddddddddddddddddddddddddddddddddd

If there's a big enough extent in the free space, we can satisfy the
entire allocation with one extent, which means two bmaps.  Furthermore, 
a subsequent write to the next offset creates its own delalloc CoW
reservations.  If xfs_bmap_adjacent picks up the adjacent allocated
extent in the CoW fork, then that second COW write can be placed right
next to the first write.

This is the strategy I use to reduce fragmentation and metadata overhead
to avoid the "how did this file explode into 50 million extents"
situation we've encountered on a different filesystem.

It occurs to me that perhaps you meant that we could simply detect if
any part of the dio write was to a shared extent and CoW the entire
write?  We could do that too, though I don't think it makes sense to do
that for a large write to a region with a single shared block.

IOWs: I'm using the cowextsize hint as a balancing point between the two
extremes of only CoWing the bare minimum and CoWing everything. :)

--D

> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_file.c    |   4 --
>  fs/xfs/xfs_reflink.c | 134 +++++++++++++++++++++++++++++++++------------------
>  fs/xfs/xfs_reflink.h |   4 +-
>  fs/xfs/xfs_trace.h   |   1 -
>  4 files changed, 90 insertions(+), 53 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index aee0c4c..349f328 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -675,10 +675,6 @@ xfs_file_dio_aio_write(
>  
>  	/* If this is a block-aligned directio CoW, remap immediately. */
>  	if (xfs_is_reflink_inode(ip) && !unaligned_io) {
> -		ret = xfs_reflink_reserve_cow_range(ip, iocb->ki_pos, count);
> -		if (ret)
> -			goto out;
> -
>  		ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
>  		if (ret)
>  			goto out;
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 09e0e27..c18692c 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -253,7 +253,8 @@ static int
>  __xfs_reflink_reserve_cow(
>  	struct xfs_inode	*ip,
>  	xfs_fileoff_t		*offset_fsb,
> -	xfs_fileoff_t		end_fsb)
> +	xfs_fileoff_t		end_fsb,
> +	bool			*skipped)
>  {
>  	struct xfs_bmbt_irec	got, prev, imap;
>  	xfs_fileoff_t		orig_end_fsb;
> @@ -287,8 +288,10 @@ __xfs_reflink_reserve_cow(
>  	end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount;
>  
>  	/* Not shared?  Just report the (potentially capped) extent. */
> -	if (!shared)
> +	if (!shared) {
> +		*skipped = true;
>  		goto done;
> +	}
>  
>  	/*
>  	 * Fork all the shared blocks from our write offset until the end of
> @@ -337,6 +340,7 @@ xfs_reflink_reserve_cow_range(
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	xfs_fileoff_t		offset_fsb, end_fsb;
> +	bool			skipped = false;
>  	int			error;
>  
>  	trace_xfs_reflink_reserve_cow_range(ip, offset, count);
> @@ -346,7 +350,8 @@ xfs_reflink_reserve_cow_range(
>  
>  	xfs_ilock(ip, XFS_ILOCK_EXCL);
>  	while (offset_fsb < end_fsb) {
> -		error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb);
> +		error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb,
> +				&skipped);
>  		if (error) {
>  			trace_xfs_reflink_reserve_cow_range_error(ip, error,
>  				_RET_IP_);
> @@ -358,63 +363,100 @@ xfs_reflink_reserve_cow_range(
>  	return error;
>  }
>  
> -/*
> - * Allocate blocks to all CoW reservations within a byte range of a file.
> - */
> -int
> -xfs_reflink_allocate_cow_range(
> +static int
> +__xfs_reflink_allocate_cow(
>  	struct xfs_inode	*ip,
> -	xfs_off_t		pos,
> -	xfs_off_t		len)
> +	xfs_fileoff_t		*offset_fsb,
> +	xfs_fileoff_t		end_fsb)
>  {
> -	struct xfs_ifork	*ifp;
> -	struct xfs_bmbt_rec_host	*gotp;
> +	struct xfs_mount	*mp = ip->i_mount;
>  	struct xfs_bmbt_irec	imap;
> -	int			error = 0;
> -	xfs_fileoff_t		start_lblk;
> -	xfs_fileoff_t		end_lblk;
> -	xfs_extnum_t		idx;
> +	struct xfs_defer_ops	dfops;
> +	struct xfs_trans	*tp;
> +	xfs_fsblock_t		first_block;
> +	xfs_fileoff_t		next_fsb;
> +	int			nimaps = 1, error;
> +	bool			skipped = false;
>  
> -	if (!xfs_is_reflink_inode(ip))
> -		return 0;
> +	xfs_defer_init(&dfops, &first_block);
>  
> -	trace_xfs_reflink_allocate_cow_range(ip, len, pos);
> +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
> +			XFS_TRANS_RESERVE, &tp);
> +	if (error)
> +		return error;
>  
> -	start_lblk = XFS_B_TO_FSBT(ip->i_mount, pos);
> -	end_lblk = XFS_B_TO_FSB(ip->i_mount, pos + len);
> -	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
>  	xfs_ilock(ip, XFS_ILOCK_EXCL);
>  
> -	gotp = xfs_iext_bno_to_ext(ifp, start_lblk, &idx);
> -	while (gotp) {
> -		xfs_bmbt_get_all(gotp, &imap);
> +	next_fsb = *offset_fsb;
> +	error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped);
> +	if (error)
> +		goto out_trans_cancel;
>  
> -		if (imap.br_startoff >= end_lblk)
> -			break;
> -		if (!isnullstartblock(imap.br_startblock))
> -			goto advloop;
> -		xfs_trim_extent(&imap, start_lblk, end_lblk - start_lblk);
> -		trace_xfs_reflink_allocate_cow_extent(ip, &imap);
> -
> -		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -		error = xfs_iomap_write_allocate(ip, XFS_COW_FORK,
> -				XFS_FSB_TO_B(ip->i_mount, imap.br_startoff +
> -						imap.br_blockcount - 1), &imap);
> -		xfs_ilock(ip, XFS_ILOCK_EXCL);
> -		if (error)
> -			break;
> -advloop:
> -		/* Roll on... */
> -		idx++;
> -		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
> -			break;
> -		gotp = xfs_iext_get_ext(ifp, idx);
> +	if (skipped) {
> +		*offset_fsb = next_fsb;
> +		goto out_trans_cancel;
>  	}
>  
> +	xfs_trans_ijoin(tp, ip, 0);
> +	error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb,
> +			XFS_BMAPI_COWFORK, &first_block,
> +			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
> +			&imap, &nimaps, &dfops);
> +	if (error)
> +		goto out_trans_cancel;
> +
> +	/* We might not have been able to map the whole delalloc extent */
> +	*offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb);
> +
> +	error = xfs_defer_finish(&tp, &dfops, NULL);
> +	if (error)
> +		goto out_trans_cancel;
> +
> +	error = xfs_trans_commit(tp);
> +
> +out_unlock:
>  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	return error;
> +out_trans_cancel:
> +	xfs_defer_cancel(&dfops);
> +	xfs_trans_cancel(tp);
> +	goto out_unlock;
> +}
>  
> +/*
> + * Allocate blocks to all CoW reservations within a byte range of a file.
> + */
> +int
> +xfs_reflink_allocate_cow_range(
> +	struct xfs_inode	*ip,
> +	xfs_off_t		offset,
> +	xfs_off_t		count)
> +{
> +	struct xfs_mount	*mp = ip->i_mount;
> +	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
> +	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
> +	int			error;
> +
> +	ASSERT(xfs_is_reflink_inode(ip));
> +
> +	trace_xfs_reflink_allocate_cow_range(ip, offset, count);
> +
> +	/*
> +	 * Make sure that the dquots are there.
> +	 */
> +	error = xfs_qm_dqattach(ip, 0);
>  	if (error)
> -		trace_xfs_reflink_allocate_cow_range_error(ip, error, _RET_IP_);
> +		return error;
> +
> +	while (offset_fsb < end_fsb) {
> +		error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb);
> +		if (error) {
> +			trace_xfs_reflink_allocate_cow_range_error(ip, error,
> +					_RET_IP_);
> +			break;
> +		}
> +	}
> +
>  	return error;
>  }
>  
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 0e19ec6..859ca50 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -28,8 +28,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  
>  extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip,
>  		xfs_off_t offset, xfs_off_t count);
> -extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t pos,
> -		xfs_off_t len);
> +extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
> +		xfs_off_t offset, xfs_off_t count);
>  extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
>  		struct xfs_bmbt_irec *imap, bool *need_alloc);
>  extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index c8fb91c..26113b9 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3320,7 +3320,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
>  
>  DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range);
>  DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
> -DEFINE_INODE_IREC_EVENT(xfs_reflink_allocate_cow_extent);
>  
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write);
>  DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
> -- 
> 2.1.4
> 

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2016-09-27 19:25 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-25 19:30 [PATCH] xfs: fix xfs_reflink_allocate_cow_range Christoph Hellwig
2016-09-27  4:37 ` Darrick J. Wong
2016-09-27 19:25 ` Darrick J. Wong

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.