All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
@ 2017-08-09 11:06 alex
  2017-08-09 13:17 ` Brian Foster
                   ` (2 more replies)
  0 siblings, 3 replies; 40+ messages in thread
From: alex @ 2017-08-09 11:06 UTC (permalink / raw)
  To: linux-xfs; +Cc: bfoster, david, darrick.wong, libor.klepac, Alex Lyakas

From: Alex Lyakas <alex@zadarastorage.com>

The new attribute leaf buffer is not held locked across
the transaction roll between the shortform->leaf modification
and the addition of the new entry. As a result, the attribute
buffer modification being made is not atomic from
an operational perspective. Hence the AIL push can grab it in
the transient state of "just created" after the initial
transaction is rolled, because the buffer has been released.
This leads to xfs_attr3_leaf_verify() asserting that
hdr.count is zero, treating this as in-memory corruption,
and shutting down the filesystem.

Signed-off-by: Alex Lyakas <alex@zadarastorage.com>
---
 fs/xfs/libxfs/xfs_attr.c      | 19 ++++++++++++++++++-
 fs/xfs/libxfs/xfs_attr_leaf.c |  4 +++-
 fs/xfs/libxfs/xfs_attr_leaf.h |  3 ++-
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index de7b9bd..982e322 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -216,10 +216,11 @@
 	struct xfs_defer_ops	dfops;
 	struct xfs_trans_res	tres;
 	xfs_fsblock_t		firstblock;
 	int			rsvd = (flags & ATTR_ROOT) != 0;
 	int			error, err2, local;
+	struct xfs_buf		*leaf_bp = NULL;
 
 	XFS_STATS_INC(mp, xs_attr_set);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return -EIO;
@@ -325,11 +326,17 @@
 		/*
 		 * It won't fit in the shortform, transform to a leaf block.
 		 * GROT: another possible req'mt for a double-split btree op.
 		 */
 		xfs_defer_init(args.dfops, args.firstblock);
-		error = xfs_attr_shortform_to_leaf(&args);
+		error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
+		/*
+		 * Prevent the leaf buffer from being unlocked
+		 * when "args.trans" transaction commits.
+		 */
+		if (leaf_bp)
+			xfs_trans_bhold(args.trans, leaf_bp);
 		if (!error)
 			error = xfs_defer_finish(&args.trans, args.dfops, dp);
 		if (error) {
 			args.trans = NULL;
 			xfs_defer_cancel(&dfops);
@@ -343,10 +350,18 @@
 
 		error = xfs_trans_roll(&args.trans, dp);
 		if (error)
 			goto out;
 
+		/*
+		 * Rejoin the leaf buffer to the new transaction.
+		 * This allows a subsequent read to find the buffer in the
+		 * transaction (and avoid a deadlock).
+		 */
+		xfs_trans_bjoin(args.trans, leaf_bp);
+		/* Prevent from being released at the end of the function */
+		leaf_bp = NULL;
 	}
 
 	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
 		error = xfs_attr_leaf_addname(&args);
 	else
@@ -374,10 +389,12 @@
 	return error;
 
 out:
 	if (args.trans)
 		xfs_trans_cancel(args.trans);
+	if (leaf_bp)
+		xfs_buf_relse(leaf_bp);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 	return error;
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index c6c15e5..ab73e4b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -738,13 +738,14 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
 	return -ENOATTR;
 }
 
 /*
  * Convert from using the shortform to the leaf.
+ * Upon success, return the leaf buffer.
  */
 int
-xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+xfs_attr_shortform_to_leaf(xfs_da_args_t *args, struct xfs_buf **bpp)
 {
 	xfs_inode_t *dp;
 	xfs_attr_shortform_t *sf;
 	xfs_attr_sf_entry_t *sfe;
 	xfs_da_args_t nargs;
@@ -820,10 +821,11 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
 		if (error)
 			goto out;
 		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 	}
 	error = 0;
+	*bpp = bp;
 
 out:
 	kmem_free(tmpbuffer);
 	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index f7dda0c..2b3c69df 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -46,11 +46,12 @@
  */
 void	xfs_attr_shortform_create(struct xfs_da_args *args);
 void	xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
 int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
 int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
-int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
+					struct xfs_buf **bpp);
 int	xfs_attr_shortform_remove(struct xfs_da_args *args);
 int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
 int	xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
 void	xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-09 11:06 [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute alex
@ 2017-08-09 13:17 ` Brian Foster
  2017-08-09 21:33 ` Dave Chinner
  2017-11-21 15:31 ` Libor Klepáč
  2 siblings, 0 replies; 40+ messages in thread
From: Brian Foster @ 2017-08-09 13:17 UTC (permalink / raw)
  To: alex; +Cc: linux-xfs, david, darrick.wong, libor.klepac

On Wed, Aug 09, 2017 at 02:06:12PM +0300, alex@zadarastorage.com wrote:
> From: Alex Lyakas <alex@zadarastorage.com>
> 
> The new attribute leaf buffer is not held locked across
> the transaction roll between the shortform->leaf modification
> and the addition of the new entry. As a result, the attribute
> buffer modification being made is not atomic from
> an operational perspective. Hence the AIL push can grab it in
> the transient state of "just created" after the initial
> transaction is rolled, because the buffer has been released.
> This leads to xfs_attr3_leaf_verify() asserting that
> hdr.count is zero, treating this as in-memory corruption,
> and shutting down the filesystem.
> 
> Signed-off-by: Alex Lyakas <alex@zadarastorage.com>
> ---

Thanks for the fixups Alex. This patch applies clean and looks correct
to me. Some minor comments..

>  fs/xfs/libxfs/xfs_attr.c      | 19 ++++++++++++++++++-
>  fs/xfs/libxfs/xfs_attr_leaf.c |  4 +++-
>  fs/xfs/libxfs/xfs_attr_leaf.h |  3 ++-
>  3 files changed, 23 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> index de7b9bd..982e322 100644
> --- a/fs/xfs/libxfs/xfs_attr.c
> +++ b/fs/xfs/libxfs/xfs_attr.c
> @@ -216,10 +216,11 @@
>  	struct xfs_defer_ops	dfops;
>  	struct xfs_trans_res	tres;
>  	xfs_fsblock_t		firstblock;
>  	int			rsvd = (flags & ATTR_ROOT) != 0;
>  	int			error, err2, local;
> +	struct xfs_buf		*leaf_bp = NULL;
>  
>  	XFS_STATS_INC(mp, xs_attr_set);
>  
>  	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
>  		return -EIO;
> @@ -325,11 +326,17 @@
>  		/*
>  		 * It won't fit in the shortform, transform to a leaf block.
>  		 * GROT: another possible req'mt for a double-split btree op.
>  		 */
>  		xfs_defer_init(args.dfops, args.firstblock);
> -		error = xfs_attr_shortform_to_leaf(&args);
> +		error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
> +		/*
> +		 * Prevent the leaf buffer from being unlocked
> +		 * when "args.trans" transaction commits.
> +		 */

I think this comment should explain why we're keeping the buffer locked
here rather than reiterating what the code is doing. For example,
something like the following:

"Keep the buffer locked across the leaf conversion transaction to
prevent it from being written back before the new xattr is added. The
buffer could be in an invalid, intermediate state (i.e., empty) and
trigger write verifier failure."

> +		if (leaf_bp)
> +			xfs_trans_bhold(args.trans, leaf_bp);
>  		if (!error)
>  			error = xfs_defer_finish(&args.trans, args.dfops, dp);
>  		if (error) {
>  			args.trans = NULL;
>  			xfs_defer_cancel(&dfops);
> @@ -343,10 +350,18 @@
>  
>  		error = xfs_trans_roll(&args.trans, dp);
>  		if (error)
>  			goto out;
>  
> +		/*
> +		 * Rejoin the leaf buffer to the new transaction.
> +		 * This allows a subsequent read to find the buffer in the
> +		 * transaction (and avoid a deadlock).
> +		 */

"Join the leaf buffer to the new transaction so addname can read it in
its locked state (and avoid a deadlock)."

> +		xfs_trans_bjoin(args.trans, leaf_bp);
> +		/* Prevent from being released at the end of the function */

Combine this comment with the one above (or just kill it, the code is
self-explanatory).

> +		leaf_bp = NULL;
>  	}
>  
>  	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
>  		error = xfs_attr_leaf_addname(&args);
>  	else
> @@ -374,10 +389,12 @@
>  	return error;
>  
>  out:
>  	if (args.trans)
>  		xfs_trans_cancel(args.trans);
> +	if (leaf_bp)
> +		xfs_buf_relse(leaf_bp);
>  	xfs_iunlock(dp, XFS_ILOCK_EXCL);
>  	return error;
>  }
>  
>  /*
> diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
> index c6c15e5..ab73e4b 100644
> --- a/fs/xfs/libxfs/xfs_attr_leaf.c
> +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
> @@ -738,13 +738,14 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
>  	return -ENOATTR;
>  }
>  
>  /*
>   * Convert from using the shortform to the leaf.
> + * Upon success, return the leaf buffer.
>   */
>  int
> -xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
> +xfs_attr_shortform_to_leaf(xfs_da_args_t *args, struct xfs_buf **bpp)
>  {
>  	xfs_inode_t *dp;
>  	xfs_attr_shortform_t *sf;
>  	xfs_attr_sf_entry_t *sfe;
>  	xfs_da_args_t nargs;
> @@ -820,10 +821,11 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
>  		if (error)
>  			goto out;
>  		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
>  	}
>  	error = 0;
> +	*bpp = bp;

This was probably lost in the discussion noise, but one thing I
mentioned for the rfc was to call xfs_trans_bhold() here rather than in
xfs_attr_set(), and do so (and set bpp) only when the leaf buffer is
actually empty. If the inode already has xattrs, then it's fine for it
to be written back in the meantime.

I don't care as much about not holding the buffer when not totally
necessary, but rather what I like about this is that it defines clear
semantics for any callers of the function that the buffer, when
returned, is the caller responsibility to put into a valid state before
it is released. This seems a bit more clear to me than always setting
the buffer on return.

That said, this is a minor, subjective design point and I'm fine with
the code as it is if that is preferred.

Brian

>  
>  out:
>  	kmem_free(tmpbuffer);
>  	return error;
>  }
> diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
> index f7dda0c..2b3c69df 100644
> --- a/fs/xfs/libxfs/xfs_attr_leaf.h
> +++ b/fs/xfs/libxfs/xfs_attr_leaf.h
> @@ -46,11 +46,12 @@
>   */
>  void	xfs_attr_shortform_create(struct xfs_da_args *args);
>  void	xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
>  int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
>  int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
> -int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
> +int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
> +					struct xfs_buf **bpp);
>  int	xfs_attr_shortform_remove(struct xfs_da_args *args);
>  int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
>  int	xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
>  void	xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
>  
> -- 
> 1.9.1
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-09 11:06 [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute alex
  2017-08-09 13:17 ` Brian Foster
@ 2017-08-09 21:33 ` Dave Chinner
  2017-08-10  8:02   ` Alex Lyakas
  2017-08-11 12:53   ` Christoph Hellwig
  2017-11-21 15:31 ` Libor Klepáč
  2 siblings, 2 replies; 40+ messages in thread
From: Dave Chinner @ 2017-08-09 21:33 UTC (permalink / raw)
  To: alex; +Cc: linux-xfs, bfoster, darrick.wong, libor.klepac

On Wed, Aug 09, 2017 at 02:06:12PM +0300, alex@zadarastorage.com wrote:
> From: Alex Lyakas <alex@zadarastorage.com>
> 
> The new attribute leaf buffer is not held locked across
> the transaction roll between the shortform->leaf modification
> and the addition of the new entry. As a result, the attribute
> buffer modification being made is not atomic from
> an operational perspective. Hence the AIL push can grab it in
> the transient state of "just created" after the initial
> transaction is rolled, because the buffer has been released.
> This leads to xfs_attr3_leaf_verify() asserting that
> hdr.count is zero, treating this as in-memory corruption,
> and shutting down the filesystem.
> 
> Signed-off-by: Alex Lyakas <alex@zadarastorage.com>
> ---
>  fs/xfs/libxfs/xfs_attr.c      | 19 ++++++++++++++++++-
>  fs/xfs/libxfs/xfs_attr_leaf.c |  4 +++-
>  fs/xfs/libxfs/xfs_attr_leaf.h |  3 ++-
>  3 files changed, 23 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> index de7b9bd..982e322 100644
> --- a/fs/xfs/libxfs/xfs_attr.c
> +++ b/fs/xfs/libxfs/xfs_attr.c
> @@ -216,10 +216,11 @@
>  	struct xfs_defer_ops	dfops;
>  	struct xfs_trans_res	tres;
>  	xfs_fsblock_t		firstblock;
>  	int			rsvd = (flags & ATTR_ROOT) != 0;
>  	int			error, err2, local;
> +	struct xfs_buf		*leaf_bp = NULL;
>  
>  	XFS_STATS_INC(mp, xs_attr_set);
>  
>  	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
>  		return -EIO;
> @@ -325,11 +326,17 @@
>  		/*
>  		 * It won't fit in the shortform, transform to a leaf block.
>  		 * GROT: another possible req'mt for a double-split btree op.
>  		 */
>  		xfs_defer_init(args.dfops, args.firstblock);
> -		error = xfs_attr_shortform_to_leaf(&args);
> +		error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
> +		/*
> +		 * Prevent the leaf buffer from being unlocked
> +		 * when "args.trans" transaction commits.
> +		 */
> +		if (leaf_bp)
> +			xfs_trans_bhold(args.trans, leaf_bp);
>  		if (!error)
>  			error = xfs_defer_finish(&args.trans, args.dfops, dp);
>  		if (error) {
>  			args.trans = NULL;
>  			xfs_defer_cancel(&dfops);

Hmmmm, looking closer at xfs_defer_finish(), just holding the buffer
here isn't sufficient. xfs_defer_finish() can roll the transaction a
number of times and holding the buffer is a one-shot deal. Hence the
buffer held buffer will have BLI_HOLD removed on the next commit
and be unlocked by the second commit, whether it be inside
xfs_defer_finish() or the roll that occurs below.

ISTR a previous discussion with Darrick that we needed something
like xfs_defer_join() with buffers instead of inodes to allow them
to be held across a call to xfs_defer_finish()....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-09 21:33 ` Dave Chinner
@ 2017-08-10  8:02   ` Alex Lyakas
  2017-08-10 11:33     ` Dave Chinner
  2017-08-11 12:53   ` Christoph Hellwig
  1 sibling, 1 reply; 40+ messages in thread
From: Alex Lyakas @ 2017-08-10  8:02 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, bfoster, darrick.wong, libor.klepac

Hi Dave,

Let's say that xfs_defer_finish() commits the current transaction and the 
buffer has been held. Then xfs_defer_finish() opens the next transaction. 
The buffer that has been held is not joined to the second transaction, i.e., 
the second transaction knows nothing about this buffer, is that correct? If 
so, the caller now holds the buffer exclusively, and he has one of two 
options:
- release the buffer explicitly
- join the buffer to some transaction

So it looks like I a missing the crux of your concern, can you please 
comment?

Thanks,
Alex.


-----Original Message----- 
From: Dave Chinner
Sent: Thursday, August 10, 2017 12:33 AM
To: alex@zadarastorage.com
Cc: linux-xfs@vger.kernel.org ; bfoster@redhat.com ; darrick.wong@oracle.com 
; libor.klepac@bcom.cz
Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf 
conversion and the addition of an attribute

On Wed, Aug 09, 2017 at 02:06:12PM +0300, alex@zadarastorage.com wrote:
> From: Alex Lyakas <alex@zadarastorage.com>
>
> The new attribute leaf buffer is not held locked across
> the transaction roll between the shortform->leaf modification
> and the addition of the new entry. As a result, the attribute
> buffer modification being made is not atomic from
> an operational perspective. Hence the AIL push can grab it in
> the transient state of "just created" after the initial
> transaction is rolled, because the buffer has been released.
> This leads to xfs_attr3_leaf_verify() asserting that
> hdr.count is zero, treating this as in-memory corruption,
> and shutting down the filesystem.
>
> Signed-off-by: Alex Lyakas <alex@zadarastorage.com>
> ---
>  fs/xfs/libxfs/xfs_attr.c      | 19 ++++++++++++++++++-
>  fs/xfs/libxfs/xfs_attr_leaf.c |  4 +++-
>  fs/xfs/libxfs/xfs_attr_leaf.h |  3 ++-
>  3 files changed, 23 insertions(+), 3 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> index de7b9bd..982e322 100644
> --- a/fs/xfs/libxfs/xfs_attr.c
> +++ b/fs/xfs/libxfs/xfs_attr.c
> @@ -216,10 +216,11 @@
>  struct xfs_defer_ops dfops;
>  struct xfs_trans_res tres;
>  xfs_fsblock_t firstblock;
>  int rsvd = (flags & ATTR_ROOT) != 0;
>  int error, err2, local;
> + struct xfs_buf *leaf_bp = NULL;
>
>  XFS_STATS_INC(mp, xs_attr_set);
>
>  if (XFS_FORCED_SHUTDOWN(dp->i_mount))
>  return -EIO;
> @@ -325,11 +326,17 @@
>  /*
>  * It won't fit in the shortform, transform to a leaf block.
>  * GROT: another possible req'mt for a double-split btree op.
>  */
>  xfs_defer_init(args.dfops, args.firstblock);
> - error = xfs_attr_shortform_to_leaf(&args);
> + error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
> + /*
> + * Prevent the leaf buffer from being unlocked
> + * when "args.trans" transaction commits.
> + */
> + if (leaf_bp)
> + xfs_trans_bhold(args.trans, leaf_bp);
>  if (!error)
>  error = xfs_defer_finish(&args.trans, args.dfops, dp);
>  if (error) {
>  args.trans = NULL;
>  xfs_defer_cancel(&dfops);

Hmmmm, looking closer at xfs_defer_finish(), just holding the buffer
here isn't sufficient. xfs_defer_finish() can roll the transaction a
number of times and holding the buffer is a one-shot deal. Hence the
held buffer will have BLI_HOLD removed on the next commit
and be unlocked by the second commit, whether it be inside
xfs_defer_finish() or the roll that occurs below.

ISTR a previous discussion with Darrick that we needed something
like xfs_defer_join() with buffers instead of inodes to allow them
to be held across a call to xfs_defer_finish()....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com 


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-10  8:02   ` Alex Lyakas
@ 2017-08-10 11:33     ` Dave Chinner
  2017-08-10 12:09       ` Alex Lyakas
  0 siblings, 1 reply; 40+ messages in thread
From: Dave Chinner @ 2017-08-10 11:33 UTC (permalink / raw)
  To: Alex Lyakas; +Cc: linux-xfs, bfoster, darrick.wong, libor.klepac

On Thu, Aug 10, 2017 at 11:02:12AM +0300, Alex Lyakas wrote:
> Hi Dave,
> 
> Let's say that xfs_defer_finish() commits the current transaction
> and the buffer has been held. Then xfs_defer_finish() opens the next
> transaction. The buffer that has been held is not joined to the
> second transaction, i.e., the second transaction knows nothing about
> this buffer, is that correct?

Yes.

> If so, the caller now holds the buffer
> exclusively, and he has one of two options:
> - release the buffer explicitly
> - join the buffer to some transaction

That's correct, but that's not the problem I see. :/

The problem is that the locked buffer is not joined and logged in
the rolling transactions run in xfs_defer_ops. Hence it can pin the
tail of the AIL, and this can prevent the transaction roll from
regranting the log space necessary to continue rolling the
transaction for the required number of transactions to complete the
deferred ops. If this happens, we end up with a log space deadlock.

Hence if we are holding an item that we logged in a transaction
locked and we roll the transaction, we have to join, hold and log it
in each subsequent transaction until we have finished with the item
and can unlock and release it.

This is documented in xfs_trans_roll():

        /*
         * Reserve space in the log for the next transaction.
         * This also pushes items in the "AIL", the list of logged items,
         * out to disk if they are taking up space at the tail of the log
         * that we want to use.  This requires that either nothing be locked
         * across this call, or that anything that is locked be logged in
         * the prior and the next transactions.
         */


Cheers,

Dave.

-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-10 11:33     ` Dave Chinner
@ 2017-08-10 12:09       ` Alex Lyakas
  2017-08-10 14:52         ` Brian Foster
  0 siblings, 1 reply; 40+ messages in thread
From: Alex Lyakas @ 2017-08-10 12:09 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, bfoster, darrick.wong, libor.klepac

Hi Dave,

Thanks for the explanation. So it seems we cannot move forward with this 
fix.

Will somebody else in XFS community be working on fixing this issue? As you 
pointed out, it exists for over two decades. Our production systems hit this 
every couple of days, and shutting down the filesystem causes outage.

Thanks,
Alex.




-----Original Message----- 
From: Dave Chinner
Sent: Thursday, August 10, 2017 2:33 PM
To: Alex Lyakas
Cc: linux-xfs@vger.kernel.org ; bfoster@redhat.com ; darrick.wong@oracle.com 
; libor.klepac@bcom.cz
Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf 
conversion and the addition of an attribute

On Thu, Aug 10, 2017 at 11:02:12AM +0300, Alex Lyakas wrote:
> Hi Dave,
>
> Let's say that xfs_defer_finish() commits the current transaction
> and the buffer has been held. Then xfs_defer_finish() opens the next
> transaction. The buffer that has been held is not joined to the
> second transaction, i.e., the second transaction knows nothing about
> this buffer, is that correct?

Yes.

> If so, the caller now holds the buffer
> exclusively, and he has one of two options:
> - release the buffer explicitly
> - join the buffer to some transaction

That's correct, but that's not the problem I see. :/

The problem is that the locked buffer is not joined and logged in
the rolling transactions run in xfs_defer_ops. Hence it can pin the
tail of the AIL, and this can prevent the transaction roll from
regranting the log space necessary to continue rolling the
transaction for the required number of transactions to complete the
deferred ops. If this happens, we end up with a log space deadlock.

Hence if we are holding an item that we logged in a transaction
locked and we roll the transaction, we have to join, hold and log it
in each subsequent transaction until we have finished with the item
and can unlock and release it.

This is documented in xfs_trans_roll():

        /*
>         * Reserve space in the log for the next transaction.
         * This also pushes items in the "AIL", the list of logged items,
         * out to disk if they are taking up space at the tail of the log
         * that we want to use.  This requires that either nothing be locked
         * across this call, or that anything that is locked be logged in
         * the prior and the next transactions.
         */


Cheers,

Dave.

-- 
Dave Chinner
david@fromorbit.com 


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-10 12:09       ` Alex Lyakas
@ 2017-08-10 14:52         ` Brian Foster
  2017-08-10 17:55           ` Darrick J. Wong
  0 siblings, 1 reply; 40+ messages in thread
From: Brian Foster @ 2017-08-10 14:52 UTC (permalink / raw)
  To: Alex Lyakas; +Cc: Dave Chinner, linux-xfs, darrick.wong, libor.klepac

On Thu, Aug 10, 2017 at 03:09:09PM +0300, Alex Lyakas wrote:
> Hi Dave,
> 
> Thanks for the explanation. So it seems we cannot move forward with this
> fix.
> 

I don't think this completely invalidates the fix.. Dave is pointing out
a flaw that the deferred ops infrastructure doesn't properly handle the
technique we want to use here. IOW, it means there's a dependency that
needs to be implemented first.

FWIW, I also think this means that your approach on the older kernel to
join/hold the buffer to the finished transaction may be the right
approach (depending on whether I follow the perm transaction code
correctly or not, see below), but I think you'd need to relog the buffer
as well.

> Will somebody else in XFS community be working on fixing this issue? As you
> pointed out, it exists for over two decades. Our production systems hit this
> every couple of days, and shutting down the filesystem causes outage.
> 

I'm guessing the defer infrastructure needs to handle relogging a buffer
similar to how it currently handles joining/relogging inodes..?

...
> -----Original Message----- From: Dave Chinner
...
> The problem is that the locked buffer is not joined and logged in
> the rolling transactions run in xfs_defer_ops. Hence it can pin the
> tail of the AIL, and this can prevent the transaction roll from
> regranting the log space necessary to continue rolling the
> transaction for the required number of transactions to complete the
> deferred ops. If this happens, we end up with a log space deadlock.
> 
> Hence if we are holding an item that we logged in a transaction
> locked and we roll the transaction, we have to join, hold and log it
> in each subsequent transaction until we have finished with the item
> and can unlock and release it.
> 
> This is documented in xfs_trans_roll():
> 
>        /*
>         * Reserve space in the log for th next transaction.
>         * This also pushes items in the "AIL", the list of logged items,
>         * out to disk if they are taking up space at the tail of the log
>         * that we want to use.  This requires that either nothing be locked
>         * across this call, or that anything that is locked be logged in
>         * the prior and the next transactions.
>         */
> 

Good catch, though I'm wondering whether it's a real enough problem atm
to block this fix. A few thoughts/questions:

1.) The transaction in this case has a t_log_count of 3, presumably to
cover the commits by the historical bmap_finish, the trans roll and the
final commit. If I'm following the permanent transaction code correctly,
doesn't that mean that we have room for at least 2 rolls (and 3 commits)
before this task would actually block on log reservation? AFAICT it
looks like the commit would dec ticket->t_cnt and replenish the current
log reservation. The subsequent xfs_trans_reserve() would just return if
t_cnt > 0.

This of course doesn't accommodate the fact the xfs_defer_finish() can
now roll a transaction an indeterminate number of times, which probably
needs to be handled in general, but...

2.) It doesn't look like we actually defer any ops in this situation
unless rmapbt is enabled, assuming that we limit holding the buffer to
the empty leaf case, at least (see my comment on the previous version).
I also don't see where a deferred rmapbt update would itself ever roll
the transaction.

3.) The buffer in this case is a new allocation, which I think means the
risk of it pinning the tail and causing a log deadlock here means that
on top of somehow depleting the initial permanent reservation, we'd have
to exhaust the log in the time between the first commit and the last
reservation.

Given the above, it seems reasonably safe enough to me to merge this
change as is and fix up the deferred ops stuff after the fact
(considering we know we need to rework the xattr stuff as such anyways).
This is still a landmine that should be fixed up, but I wouldn't be
against an ASSERT() or something for the time being if we could somehow
verify that the transaction ticket didn't require any extra reservation.

OTOH, just adding deferred ops buffer relogging might not be too much
trouble either. ;) Anyways, thoughts?

Brian

> 
> Cheers,
> 
> Dave.
> 
> -- 
> Dave Chinner
> david@fromorbit.com
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-10 14:52         ` Brian Foster
@ 2017-08-10 17:55           ` Darrick J. Wong
  2017-08-10 18:32             ` Brian Foster
  2017-08-11  2:09             ` Dave Chinner
  0 siblings, 2 replies; 40+ messages in thread
From: Darrick J. Wong @ 2017-08-10 17:55 UTC (permalink / raw)
  To: Brian Foster; +Cc: Alex Lyakas, Dave Chinner, linux-xfs, libor.klepac

On Thu, Aug 10, 2017 at 10:52:49AM -0400, Brian Foster wrote:
> On Thu, Aug 10, 2017 at 03:09:09PM +0300, Alex Lyakas wrote:
> > Hi Dave,
> > 
> > Thanks for the explanation. So it seems we cannot move forward with this
> > fix.
> > 
> 
> I don't think this completely invalidates the fix.. Dave is pointing out
> a flaw that the deferred ops infrastructure doesn't properly handle the
> technique we want to use here. IOW, it means there's a dependency that
> needs to be implemented first.
> 
> FWIW, I also think this means that your approach on the older kernel to
> join/hold the buffer to the finished transaction may be the right
> approach (depending on whether I follow the perm transaction code
> correctly or not, see below), but I think you'd need to relog the buffer
> as well.
> 
> > Will somebody else in XFS community be working on fixing this issue? As you
> > pointed out, it exists for over two decades. Our production systems hit this
> > every couple of days, and shutting down the filesystem causes outage.
> > 
> 
> I'm guessing the defer infrastructure needs to handle relogging a buffer
> similar to how it currently handles joining/relogging inodes..?
> 
> ...
> > -----Original Message----- From: Dave Chinner
> ...
> > The problem is that the locked buffer is not joined and logged in
> > the rolling transactions run in xfs_defer_ops. Hence it can pin the
> > tail of the AIL, and this can prevent the transaction roll from
> > regranting the log space necessary to continue rolling the
> > transaction for the required number of transactions to complete the
> > deferred ops. If this happens, we end up with a log space deadlock.
> > 
> > Hence if we are holding an item that we logged in a transaction
> > locked and we roll the transaction, we have to join, hold and log it
> > in each subsequent transaction until we have finished with the item
> > and can unlock and release it.
> > 
> > This is documented in xfs_trans_roll():
> > 
> >        /*
> >         * Reserve space in the log for the next transaction.
> >         * This also pushes items in the "AIL", the list of logged items,
> >         * out to disk if they are taking up space at the tail of the log
> >         * that we want to use.  This requires that either nothing be locked
> >         * across this call, or that anything that is locked be logged in
> >         * the prior and the next transactions.
> >         */
> > 
> 
> Good catch, though I'm wondering whether it's a real enough problem atm
> to block this fix. A few thoughts/questions:
> 
> 1.) The transaction in this case has a t_log_count of 3, presumably to
> cover the commits by the historical bmap_finish, the trans roll and the
> final commit. If I'm following the permanent transaction code correctly,
> doesn't that mean that we have room for at least 2 rolls (and 3 commits)
> before this task would actually block on log reservation? AFAICT it
> looks like the commit would dec ticket->t_cnt and replenish the current
> log reservation. The subsequent xfs_trans_reserve() would just return if
> t_cnt > 0.
> 
> This of course doesn't accommodate the fact the xfs_defer_finish() can
> now roll a transaction an indeterminate number of times, which probably
> needs to be handled in general, but...

I'd been wondering if tr_logcount needed upward adjusting, but so far
haven't observed any problems.

> 2.) It doesn't look like we actually defer any ops in this situation
> unless rmapbt is enabled, assuming that we limit holding the buffer to
> the empty leaf case, at least (see my comment on the previous version).
> I also don't see where a deferred rmapbt update would itself ever roll
> the transaction.

rmapbt split causes the agfl to hit the low water mark and refresh,
requiring an allocation ... but I think that's all stuffed in the same
transaction.  (So yeah I think I agree with you)

> 3.) The buffer in this case is a new allocation, which I think means the
> risk of it pinning the tail and causing a log deadlock here means that
> on top of somehow depleting the initial permanent reservation, we'd have
> to exhaust the log in the time between the first commit and the last
> reservation.
> 
> Given the above, it seems reasonably safe enough to me to merge this
> change as is and fix up the deferred ops stuff after the fact
> (considering we know we need to rework the xattr stuff as such anyways).
> This is still a landmine that should be fixed up, but I wouldn't be
> against an ASSERT() or something for the time being if we could somehow
> verify that the transaction ticket didn't require any extra reservation.
> 
> OTOH, just adding deferred ops buffer relogging might not be too much
> trouble either. ;) Anyways, thoughts?

I don't think it'd be difficult to add a _defer_bjoin operation that
maintains a list of buffers that we need to bhold across rolls.

I think xfs_buf->b_list is only used for delwri buffers, and a buffer
cannot be part of a transaction /and/ on a delwri list at the same time,
right?  So it shouldn't be hard to whip something up and couple this
patch to that.

--D

> 
> Brian
> 
> > 
> > Cheers,
> > 
> > Dave.
> > 
> > -- 
> > Dave Chinner
> > david@fromorbit.com
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-10 17:55           ` Darrick J. Wong
@ 2017-08-10 18:32             ` Brian Foster
  2017-08-11  2:22               ` Dave Chinner
  2017-08-11  2:09             ` Dave Chinner
  1 sibling, 1 reply; 40+ messages in thread
From: Brian Foster @ 2017-08-10 18:32 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Alex Lyakas, Dave Chinner, linux-xfs, libor.klepac

On Thu, Aug 10, 2017 at 10:55:48AM -0700, Darrick J. Wong wrote:
> On Thu, Aug 10, 2017 at 10:52:49AM -0400, Brian Foster wrote:
> > On Thu, Aug 10, 2017 at 03:09:09PM +0300, Alex Lyakas wrote:
...
> > OTOH, just adding deferred ops buffer relogging might not be too much
> > trouble either. ;) Anyways, thoughts?
> 
> I don't think it'd be difficult to add a _defer_bjoin operation that
> maintains a list of buffers that we need to bhold across rolls.
> 
> I think xfs_buf->b_list is only used for delwri buffers, and a buffer
> cannot be part of a transaction /and/ on a delwri list at the same time,
> right?  So it shouldn't be hard to whip something up and couple this
> patch to that.
> 

Hmm.. so if a buffer is modified, logged, committed, put on the AIL and
pushed, xfs_buf_item_push() locks it, puts it on the delwri queue and
unlocks. At that point, I _think_ it may be possible for another thread
to lock the buffer and join it to a new transaction. The delwri submit
skips the buffer if it has become pinned or locked since the delwri
queue (though I'm wondering if that unlocked pin check is racy against
locked buffer modifications. I suppose that would require a full
lock->pin->unlock cycle between the pin check and trylock however).

All that said, it looks like xfs_defer_ops uses a fixed size array for
relogged inodes. Perhaps use something similar for buffers?

The question I have for buffer relogging is what's the best way to track
the parts of the buffer that need to be relogged after a roll?
Copy/translate the dirty (xfs_buf_log_format) segment map(s)?

Brian

> --D
> 
> > 
> > Brian
> > 
> > > 
> > > Cheers,
> > > 
> > > Dave.
> > > 
> > > -- 
> > > Dave Chinner
> > > david@fromorbit.com
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-10 17:55           ` Darrick J. Wong
  2017-08-10 18:32             ` Brian Foster
@ 2017-08-11  2:09             ` Dave Chinner
  2017-08-11 14:30               ` Brian Foster
  1 sibling, 1 reply; 40+ messages in thread
From: Dave Chinner @ 2017-08-11  2:09 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Brian Foster, Alex Lyakas, linux-xfs, libor.klepac

On Thu, Aug 10, 2017 at 10:55:48AM -0700, Darrick J. Wong wrote:
> On Thu, Aug 10, 2017 at 10:52:49AM -0400, Brian Foster wrote:
> > On Thu, Aug 10, 2017 at 03:09:09PM +0300, Alex Lyakas wrote:
> > > Hi Dave,
> > > 
> > > Thanks for the explanation. So it seems we cannot move forward with this
> > > fix.
> > > 
> > 
> > I don't think this completely invalidates the fix.. Dave is pointing out
> > a flaw that the deferred ops infrastructure doesn't properly handle the
> > technique we want to use here. IOW, it means there's a dependency that
> > needs to be implemented first.
> > 
> > FWIW, I also think this means that your approach on the older kernel to
> > join/hold the buffer to the finished transaction may be the right
> > approach (depending on whether I follow the perm transaction code
> > correctly or not, see below), but I think you'd need to relog the buffer
> > as well.

Yes, the problem exists in 3.18 via the roll in xfs_bmap_finish()
so it would also need to be done there, too.

> > 
> > > Will somebody else in XFS community be working on fixing this issue? As you
> > > pointed out, it exists for over two decades. Our production systems hit this
> > > every couple of days, and shutting down the filesystem causes outage.
> > > 
> > 
> > I'm guessing the defer infrastructure needs to handle relogging a buffer
> > similar to how it currently handles joining/relogging inodes..?

Yup, pretty much identical, and only a 10-20 lines of new code, I
think.

> > > The problem is that the locked buffer is not joined and logged in
> > > the rolling transactions run in xfs_defer_ops. Hence it can pin the
> > > tail of the AIL, and this can prevent the transaction roll from
> > > regranting the log space necessary to continue rolling the
> > > transaction for the required number of transactions to complete the
> > > deferred ops. If this happens, we end up with a log space deadlock.
> > > 
> > > Hence if we are holding an item that we logged in a transaction
> > > locked and we roll the transaction, we have to join, hold and log it
> > > in each subsequent transaction until we have finished with the item
> > > and can unlock and release it.
> > > 
> > > This is documented in xfs_trans_roll():
> > > 
> > >        /*
> > >         * Reserve space in the log for the next transaction.
> > >         * This also pushes items in the "AIL", the list of logged items,
> > >         * out to disk if they are taking up space at the tail of the log
> > >         * that we want to use.  This requires that either nothing be locked
> > >         * across this call, or that anything that is locked be logged in
> > >         * the prior and the next transactions.
> > >         */
> > > 
> > 
> > Good catch, though I'm wondering whether it's a real enough problem atm
> > to block this fix. A few thoughts/questions:
> > 
> > 1.) The transaction in this case has a t_log_count of 3, presumably to
> > cover the commits by the historical bmap_finish, the trans roll and the
> > final commit. If I'm following the permanent transaction code correctly,
> > doesn't that mean that we have room for at least 2 rolls (and 3 commits)
> > before this task would actually block on log reservation? AFAICT it
> > looks like the commit would dec ticket->t_cnt and replenish the current
> > log reservation. The subsequent xfs_trans_reserve() would just return if
> > t_cnt > 0.
> > 
> > This of course doesn't accommodate the fact the xfs_defer_finish() can
> > now roll a transaction an indeterminate number of times, which probably
> > needs to be handled in general, but...
> 
> I'd been wondering if tr_logcount needed upward adjusting, but so far
> haven't observed any problems.

That won't avoid the general problem, though, just increase log
reservation pressure from active transactions.

> > 2.) It doesn't look like we actually defer any ops in this situation
> > unless rmapbt is enabled, assuming that we limit holding the buffer to
> > the empty leaf case, at least (see my comment on the previous version).
> > I also don't see where a deferred rmapbt update would itself ever roll
> > the transaction.
> 
> rmapbt split causes the agfl to hit the low water mark and refresh,
> requiring an allocation ... but I think that's all stuffed in the same
> transaction.  (So yeah I think I agree with you)

I haven't looked that far, but I'd prefer we fix the problem now
while we are looking at it because it doesn't seem that hard to
fix...

> > 3.) The buffer in this case is a new allocation, which I think means the
> > risk of it pinning the tail and causing a log deadlock here means that
> > on top of somehow depleting the initial permanent reservation, we'd have
> > to exhaust the log in the time between the first commit and the last
> > reservation.
> > 
> > Given the above, it seems reasonably safe enough to me to merge this
> > change as is and fix up the deferred ops stuff after the fact
> > (considering we know we need to rework the xattr stuff as such anyways).
> > This is still a landmine that should be fixed up, but I wouldn't be
> > against an ASSERT() or something for the time being if we could somehow
> > verify that the transaction ticket didn't require any extra reservation.
> > 
> > OTOH, just adding deferred ops buffer relogging might not be too much
> > trouble either. ;) Anyways, thoughts?
> 
> I don't think it'd be difficult to add a _defer_bjoin operation that
> maintains a list of buffers that we need to bhold across rolls.

Just a small array like inodes currently use would be sufficient.
We only need to hold one buffer right now....

> I think xfs_buf->b_list is only used for delwri buffers, and a buffer
> cannot be part of a transaction /and/ on a delwri list at the same time,
> right?  So it shouldn't be hard to whip something up and couple this
> patch to that.

Reading xfs_buf_item_push() answers that question:

        if (!xfs_buf_delwri_queue(bp, buffer_list))
                rval = XFS_ITEM_FLUSHING;
        xfs_buf_unlock(bp);
        return rval;

So, yes, a buffer can be on the delwri queue and be part of a
transaction at the same time because the buffers on the delwri queue
get unlocked once they are queued. If a transaction locks and joins
the buffer while it is on the delwri queue, the commit will pin the
buffer in memory before unlocking it and
xfs_buf_delwri_submit_nowait() will see it pinned and skip over it.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-10 18:32             ` Brian Foster
@ 2017-08-11  2:22               ` Dave Chinner
  2017-08-11 14:27                 ` Brian Foster
  0 siblings, 1 reply; 40+ messages in thread
From: Dave Chinner @ 2017-08-11  2:22 UTC (permalink / raw)
  To: Brian Foster; +Cc: Darrick J. Wong, Alex Lyakas, linux-xfs, libor.klepac

On Thu, Aug 10, 2017 at 02:32:33PM -0400, Brian Foster wrote:
> On Thu, Aug 10, 2017 at 10:55:48AM -0700, Darrick J. Wong wrote:
> > On Thu, Aug 10, 2017 at 10:52:49AM -0400, Brian Foster wrote:
> > > On Thu, Aug 10, 2017 at 03:09:09PM +0300, Alex Lyakas wrote:
> ...
> > > OTOH, just adding deferred ops buffer relogging might not be too much
> > > trouble either. ;) Anyways, thoughts?
> > 
> > I don't think it'd be difficult to add a _defer_bjoin operation that
> > maintains a list of buffers that we need to bhold across rolls.
> > 
> > I think xfs_buf->b_list is only used for delwri buffers, and a buffer
> > cannot be part of a transaction /and/ on a delwri list at the same time,
> > right?  So it shouldn't be hard to whip something up and couple this
> > patch to that.
> > 
> 
> Hmm.. so if a buffer is modified, logged, committed, put on the AIL and
> pushed, xfs_buf_item_push() locks it, puts it on the delwri queue and
> unlocks. At that point, I _think_ it may be possible for another thread
> to lock the buffer and join it to a new transaction. The delwri submit
> skips the buffer if it has become pinned or locked since the delwri
> queue (though I'm wondering if that unlocked pin check is racy against
> locked buffer modifications. I suppose that would require a full
> lock->pin->unlock cycle between the pin check and trylock however).

If it does race, we still catch pinned buffers in xfs_buf_submit() and
block there on them. So a race is just sub-optimal behaviour, not a
bug.

> The question I have for buffer relogging is what's the best way to track
> the parts of the buffer that need to be relogged after a roll?
> Copy/translate the dirty (xfs_buf_log_format) segment map(s)?

Just mark it ordered?

That way it goes through the transaction commit, pinned and put into
the CIL and gets moved forward in the AIL when the log checkpoints.
We don't need to relog the actual contents in this case, just ensure
it moves forward in the AIL appropriately while we hold it locked.
The ordered flag is removed at each commit the buffer goes through,
so as soon as we commit the final transaction it'll go back to
behaving like a normal buffer...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-09 21:33 ` Dave Chinner
  2017-08-10  8:02   ` Alex Lyakas
@ 2017-08-11 12:53   ` Christoph Hellwig
  2017-08-11 16:52     ` Darrick J. Wong
  1 sibling, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2017-08-11 12:53 UTC (permalink / raw)
  To: Dave Chinner; +Cc: alex, linux-xfs, bfoster, darrick.wong, libor.klepac

On Thu, Aug 10, 2017 at 07:33:07AM +1000, Dave Chinner wrote:
> Hmmmm, looking closer at xfs_defer_finish(), just holding the buffer
> here isn't sufficient. xfs_defer_finish() can roll the transaction a
> number of times and holding the buffer is a one-shot deal. Hence the
> buffer held buffer will have BLI_HOLD removed on the next commit
> and be unlocked by the second commit, whether it be inside
> xfs_defer_finish() or the roll that occurs below.
> 
> ISTR a previous discussion with Darrick that we needed something
> like xfs_defer_join() with buffers instead of inodes to allow them
> to be held across a call to xfs_defer_finish()....

We do.  I actually have patches lying around that remove the
xfs_trans_roll and xfs_defer_finish inode arguments, and instead have a
separate helper for rolling over an inode and adding an inode to the
defer list.

I've not added a magic helper for buffers yet, but that would be a
natural fit into that model.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-11  2:22               ` Dave Chinner
@ 2017-08-11 14:27                 ` Brian Foster
  2017-08-12  0:16                   ` Dave Chinner
  0 siblings, 1 reply; 40+ messages in thread
From: Brian Foster @ 2017-08-11 14:27 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Darrick J. Wong, Alex Lyakas, linux-xfs, libor.klepac

On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> On Thu, Aug 10, 2017 at 02:32:33PM -0400, Brian Foster wrote:
> > On Thu, Aug 10, 2017 at 10:55:48AM -0700, Darrick J. Wong wrote:
> > > On Thu, Aug 10, 2017 at 10:52:49AM -0400, Brian Foster wrote:
> > > > On Thu, Aug 10, 2017 at 03:09:09PM +0300, Alex Lyakas wrote:
> > ...
> > > > OTOH, just adding deferred ops buffer relogging might not be too much
> > > > trouble either. ;) Anyways, thoughts?
> > > 
> > > I don't think it'd be difficult to add a _defer_bjoin operation that
> > > maintains a list of buffers that we need to bhold across rolls.
> > > 
> > > I think xfs_buf->b_list is only used for delwri buffers, and a buffer
> > > cannot be part of a transaction /and/ on a delwri list at the same time,
> > > right?  So it shouldn't be hard to whip something up and couple this
> > > patch to that.
> > > 
> > 
> > Hmm.. so if a buffer is modified, logged, committed, put on the AIL and
> > pushed, xfs_buf_item_push() locks it, puts it on the delwri queue and
> > unlocks. At that point, I _think_ it may be possible for another thread
> > to lock the buffer and join it to a new transaction. The delwri submit
> > skips the buffer if it has become pinned or locked since the delwri
> > queue (though I'm wondering if that unlocked pin check is racy against
> > locked buffer modifications. I suppose that would require a full
> > lock->pin->unlock cycle between the pin check and trylock however).
> 
> If it does race, we still catch pinned buffers in xfs_buf_submit() and
> block there on them. SO a race is just sub-optimal behaviour, not a
> bug.
> 

Ah I see, thanks.

> > The question I have for buffer relogging is what's the best way to track
> > the parts of the buffer that need to be relogged after a roll?
> > Copy/translate the dirty (xfs_buf_log_format) segment map(s)?
> 
> Just mark it ordered?
> 
> That way it goes through the transaction commit, pinned and put into
> the CIL and  gets moved forward in the AIL when the log checkpoints.
> We don't need to relog the actual contents in this case, just ensure
> it moves forward in the AIL appropriately while we hold it locked.

Hmm.. is it safe to mark a previously logged and AIL resident buffer
ordered in a subsequent transaction? The problem in this particular
example is that the empty leaf buffer is logged, committed and unpinned
(and thus AIL resident). We want to relog the buffer to move it forward
in the AIL on the next transaction because we're holding it locked and
thus it cannot be written back (and thus could pin the log tail).

If we mark the buffer ordered in the subsequent transaction and that
transaction commits/checkpoints to the log, don't we push the buffer
forward in the AIL to a checkpoint that doesn't have the originally
logged data..? IOW, it seems like if this does end up pushing the tail
of the log and we crash, we've thrown away checkpointed but not written
back metadata and potentially corrupted the fs. Hm?

In reading through some of this code, perhaps it doesn't matter which
part of the buffer we relog. We just need to make sure that the buffer
is marked dirty in the next transaction (i.e., via the log item
descriptor). If the buffer is held across transactions, I think that
means the underlying log item can't go away because the buffer can't be
written back. IIUC, the bli hangs around until writeback completion
precisely to allow this kind of relogging to occur by retaining the
dirty bitmap associated with the buffer across potentially separate
transactions. If I'm following all that correctly (?), perhaps a new
xfs_trans_buf_relog() helper that just flags a buffer/transaction dirty
would suffice. Thoughts?

Brian

> The ordered flag is removed at each commit the buffer goes through,
> so as soon as we commit the final transaction it'll go back to
> behaving like a normal buffer...
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-11  2:09             ` Dave Chinner
@ 2017-08-11 14:30               ` Brian Foster
  0 siblings, 0 replies; 40+ messages in thread
From: Brian Foster @ 2017-08-11 14:30 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Darrick J. Wong, Alex Lyakas, linux-xfs, libor.klepac

On Fri, Aug 11, 2017 at 12:09:56PM +1000, Dave Chinner wrote:
> On Thu, Aug 10, 2017 at 10:55:48AM -0700, Darrick J. Wong wrote:
> > On Thu, Aug 10, 2017 at 10:52:49AM -0400, Brian Foster wrote:
> > > On Thu, Aug 10, 2017 at 03:09:09PM +0300, Alex Lyakas wrote:
> > > > Hi Dave,
> > > > 
> > > > Thanks for the explanation. So it seems we cannot move forward with this
> > > > fix.
> > > > 
> > > 
> > > I don't think this completely invalidates the fix.. Dave is pointing out
> > > a flaw that the deferred ops infrastructure doesn't properly handle the
> > > technique we want to use here. IOW, it means there's a dependency that
> > > needs to be implemented first.
> > > 
> > > FWIW, I also think this means that your approach on the older kernel to
> > > join/hold the buffer to the finished transaction may be the right
> > > approach (depending on whether I follow the perm transaction code
> > > correctly or not, see below), but I think you'd need to relog the buffer
> > > as well.
> 
> Yes, the problem exists in 3.18 via the roll in xfs_bmap_finish()
> so it would also need to be done there, too.
> 

The argument to fix the deferred ops problem in the current code first
because its fairly straightforward makes sense to me.

That said, what I wrote below in 1. suggests that this is not a problem
in the v3.18 xfs_attr_set() code. The argument is basically that the old
xfs_bmap_finish() only committed the transaction once at most and so
this codepath never waits on log reservation after the initial
xfs_trans_reserve(). IOW, the problem in this case (in principle) is
that the xfs_defer_finish() can roll the transaction an arbitrary number
of times. Am I missing something?

Brian

> > > 
> > > > Will somebody else in XFS community be working on fixing this issue? As you
> > > > pointed out, it exists for over two decades. Our production systems hit this
> > > > every couple of days, and shutting down the filesystem causes outage.
> > > > 
> > > 
> > > I'm guessing the defer infrastructure needs to handle relogging a buffer
> > > similar to how it currently handles joining/relogging inodes..?
> 
> Yup, pretty much identical, and only a 10-20 lines of new code, I
> think.
> 
> > > > The problem is that the locked buffer is not joined and logged in
> > > > the rolling transactions run in xfs_defer_ops. Hence it can pin the
> > > > tail of the AIL, and this can prevent the transaction roll from
> > > > regranting the log space necessary to continue rolling the
> > > > transaction for the required number of transactions to complete the
> > > > deferred ops. If this happens, we end up with a log space deadlock.
> > > > 
> > > > Hence if we are holding an item that we logged in a transaction
> > > > locked and we roll the transaction, we have to join, hold and log it
> > > > in each subsequent transaction until we have finished with the item
> > > > and can unlock and release it.
> > > > 
> > > > This is documented in xfs_trans_roll():
> > > > 
> > > >        /*
> > > >         * Reserve space in the log for the next transaction.
> > > >         * This also pushes items in the "AIL", the list of logged items,
> > > >         * out to disk if they are taking up space at the tail of the log
> > > >         * that we want to use.  This requires that either nothing be locked
> > > >         * across this call, or that anything that is locked be logged in
> > > >         * the prior and the next transactions.
> > > >         */
> > > > 
> > > 
> > > Good catch, though I'm wondering whether it's a real enough problem atm
> > > to block this fix. A few thoughts/questions:
> > > 
> > > 1.) The transaction in this case has a t_log_count of 3, presumably to
> > > cover the commits by the historical bmap_finish, the trans roll and the
> > > final commit. If I'm following the permanent transaction code correctly,
> > > doesn't that mean that we have room for at least 2 rolls (and 3 commits)
> > > before this task would actually block on log reservation? AFAICT it
> > > looks like the commit would dec ticket->t_cnt and replenish the current
> > > log reservation. The subsequent xfs_trans_reserve() would just return if
> > > t_cnt > 0.
> > > 
> > > This of course doesn't accommodate the fact the xfs_defer_finish() can
> > > now roll a transaction an indeterminate number of times, which probably
> > > needs to be handled in general, but...
> > 
> > I'd been wondering if tr_logcount needed upward adjusting, but so far
> > haven't observed any problems.
> 
> That won't avoid the general problem, though, just increase log
> reservation pressure from active transactions.
> 
> > > 2.) It doesn't look like we actually defer any ops in this situation
> > > unless rmapbt is enabled, assuming that we limit holding the buffer to
> > > the empty leaf case, at least (see my comment on the previous version).
> > > I also don't see where a deferred rmapbt update would itself ever roll
> > > the transaction.
> > 
> > rmapbt split causes the agfl to hit the low water mark and refresh,
> > requiring an allocation ... but I think that's all stuffed in the same
> > transaction.  (So yeah I think I agree with you)
> 
> I haven't looked that far, but I'd prefer we fix the problem now
> while we are looking at it because it doesn't seem that hard to
> fix...
> 
> > > 3.) The buffer in this case is a new allocation, which I think means the
> > > risk of it pinning the tail and causing a log deadlock here means that
> > > on top of somehow depleting the initial permanent reservation, we'd have
> > > to exhaust the log in the time between the first commit and the last
> > > reservation.
> > > 
> > > Given the above, it seems reasonably safe enough to me to merge this
> > > change as is and fix up the deferred ops stuff after the fact
> > > (considering we know we need to rework the xattr stuff as such anyways).
> > > This is still a landmine that should be fixed up, but I wouldn't be
> > > against an ASSERT() or something for the time being if we could somehow
> > > verify that the transaction ticket didn't require any extra reservation.
> > > 
> > > OTOH, just adding deferred ops buffer relogging might not be too much
> > > trouble either. ;) Anyways, thoughts?
> > 
> > I don't think it'd be difficult to add a _defer_bjoin operation that
> > maintains a list of buffers that we need to bhold across rolls.
> 
> Just a small array like inodes currently use would be sufficient.
> We only need to hold one buffer right now....
> 
> > I think xfs_buf->b_list is only used for delwri buffers, and a buffer
> > cannot be part of a transaction /and/ on a delwri list at the same time,
> > right?  So it shouldn't be hard to whip something up and couple this
> > patch to that.
> 
> Reading xfs_buf_item_push() answers that question:
> 
>         if (!xfs_buf_delwri_queue(bp, buffer_list))
>                 rval = XFS_ITEM_FLUSHING;
>         xfs_buf_unlock(bp);
>         return rval;
> 
> So, yes, a buffer can be on the delwri queue and be part of a
> transaction at the same time because the buffers on the delwri queue
> get unlocked once they are queued. If a transaction locks and joins
> the buffer while it is on the delwri queue, the commit will pin the
> buffer in memory before unlocking it and
> xfs_buf_delwri_submit_nowait() will see it pinned and skip over it.
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-11 12:53   ` Christoph Hellwig
@ 2017-08-11 16:52     ` Darrick J. Wong
  2017-08-12  7:37       ` Christoph Hellwig
  0 siblings, 1 reply; 40+ messages in thread
From: Darrick J. Wong @ 2017-08-11 16:52 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Dave Chinner, alex, linux-xfs, bfoster, libor.klepac

On Fri, Aug 11, 2017 at 05:53:10AM -0700, Christoph Hellwig wrote:
> On Thu, Aug 10, 2017 at 07:33:07AM +1000, Dave Chinner wrote:
> > Hmmmm, looking closer at xfs_defer_finish(), just holding the buffer
> > here isn't sufficient. xfs_defer_finish() can roll the transaction a
> > number of times and holding the buffer is a one-shot deal. Hence the
> > held buffer will have BLI_HOLD removed on the next commit
> > and be unlocked by the second commit, whether it be inside
> > xfs_defer_finish() or the roll that occurs below.
> > 
> > ISTR a previous discussion with Darrick that we needed something
> > like xfs_defer_join() with buffers instead of inodes to allow them
> > to be held across a call to xfs_defer_finish()....
> 
> We do.  I actually have patches lying around that remove the
> xfs_trans_roll and xfs_defer_finish inode arguments, and instead have a
> separate helper for rolling over an inode and adding an inode to the
> defer list.
> 
> I've not added a magic helper for buffers yet, but that would be a
> natural fit into that model.

Could you send them to the list, please? :)

--D

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-11 14:27                 ` Brian Foster
@ 2017-08-12  0:16                   ` Dave Chinner
  2017-08-12 14:04                     ` Brian Foster
  0 siblings, 1 reply; 40+ messages in thread
From: Dave Chinner @ 2017-08-12  0:16 UTC (permalink / raw)
  To: Brian Foster; +Cc: Darrick J. Wong, Alex Lyakas, linux-xfs, libor.klepac

On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > On Thu, Aug 10, 2017 at 02:32:33PM -0400, Brian Foster wrote:
> > > On Thu, Aug 10, 2017 at 10:55:48AM -0700, Darrick J. Wong wrote:
> > > > On Thu, Aug 10, 2017 at 10:52:49AM -0400, Brian Foster wrote:
> > > > > On Thu, Aug 10, 2017 at 03:09:09PM +0300, Alex Lyakas wrote:
> > > ...
> > > > > OTOH, just adding deferred ops buffer relogging might not be too much
> > > > > trouble either. ;) Anyways, thoughts?
> > > > 
> > > > I don't think it'd be difficult to add a _defer_bjoin operation that
> > > > maintains a list of buffers that we need to bhold across rolls.
> > > > 
> > > > I think xfs_buf->b_list is only used for delwri buffers, and a buffer
> > > > cannot be part of a transaction /and/ on a delwri list at the same time,
> > > > right?  So it shouldn't be hard to whip something up and couple this
> > > > patch to that.
> > > > 
> > > 
> > > Hmm.. so if a buffer is modified, logged, committed, put on the AIL and
> > > pushed, xfs_buf_item_push() locks it, puts it on the delwri queue and
> > > unlocks. At that point, I _think_ it may be possible for another thread
> > > to lock the buffer and join it to a new transaction. The delwri submit
> > > skips the buffer if it has become pinned or locked since the delwri
> > > queue (though I'm wondering if that unlocked pin check is racy against
> > > locked buffer modifications. I suppose that would require a full
> > > lock->pin->unlock cycle between the pin check and trylock however).
> > 
> > If it does race, we still catch pinned buffers in xfs_buf_submit() and
> > block there on them. So a race is just sub-optimal behaviour, not a
> > bug.
> > 
> 
> Ah I see, thanks.
> 
> > > The question I have for buffer relogging is what's the best way to track
> > > the parts of the buffer that need to be relogged after a roll?
> > > Copy/translate the dirty (xfs_buf_log_format) segment map(s)?
> > 
> > Just mark it ordered?
> > 
> > That way it goes through the transaction commit, pinned and put into
> > the CIL and  gets moved forward in the AIL when the log checkpoints.
> > We don't need to relog the actual contents in this case, just ensure
> > it moves forward in the AIL appropriately while we hold it locked.
> 
> Hmm.. is it safe to mark a previously logged and AIL resident buffer
> ordered in a subsequent transaction?

That's what I'm asking - can we mark it ordered and not have to
worry about what is already dirty?

> The problem in this particular
> example is that the empty leaf buffer is logged, committed and unpinned
> (and thus AIL resident). We want to relog the buffer to move it forward
> in the AIL on the next transaction because we're holding it locked and
> thus it cannot be written back (and thus could pin the log tail).

Yup.

> If we mark the buffer ordered in the subsequent transaction and that
> transaction commits/checkpoints to the log, don't we push the buffer
> forward in the AIL to a checkpoint that doesn't have the originally
> logged data..? IOW, it seems like if this does end up pushing the tail
> of the log and we crash, we've thrown away checkpointed but not written
> back metadata and potentially corrupted the fs. Hm?

Relogging of existing dirty regions is supposed to solve this
problem. i.e. while the log item is dirty in the AIL, any
transaction that logs and commits the log item will also log all the
existing dirty regions on the buffer, hence the next checkpoint will
contain everything it's supposed to.

Hence in this case, we don't need to log any new regions of the
buffer because it already has a record of all the dirty regions on
it from the prior transaction we committed.  That means we don't
actually need to mark any new ranges dirty, we just need to mark the
log item dirty again to trigger relogging of the existing dirty
ranges on the buffer.

Using XFS_BLI_ORDERED allows us to log the buffer without recording
a new dirty range on the buffer. IOWs, it retains whatever dirty range
it already had, and so after joining, marking it ordered and then
logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
in the transaction.

The question is this: what happens when a XFS_BLI_ORDERED buffer
with a pre-existing dirty region is formatted for the CIL? We
haven't done that before, so I'm betting that we don't relog the
dirty region like we should be doing....

... and we don't relog the existing dirty range because the
ordered flag takes precedence.

Ok, the ordered buffer checks in xfs_buf_item_size() and
xfs_buf_item_format() need to also check for dirty regions. If dirty
regions exist, then we treat it like a normal buffer rather than an
ordered buffer. We can factor the dirty region check out of
xfs_buf_item_unlock() for this...

Actually, check the case in xfs_buf_item_size() and remove the
ordered flag if there are dirty regions. Then xfs_buf_item_format()
will do the right thing without needing a duplicate check...

Nothing in XFS is ever simple, is it? :P

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-11 16:52     ` Darrick J. Wong
@ 2017-08-12  7:37       ` Christoph Hellwig
  0 siblings, 0 replies; 40+ messages in thread
From: Christoph Hellwig @ 2017-08-12  7:37 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, Dave Chinner, alex, linux-xfs, bfoster, libor.klepac

On Fri, Aug 11, 2017 at 09:52:41AM -0700, Darrick J. Wong wrote:
> Could you send them to the list, please? :)

I'll try to dig it out - it's on top of another larger series,
and I'm a little overloaded with work at the moment.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-12  0:16                   ` Dave Chinner
@ 2017-08-12 14:04                     ` Brian Foster
  2017-08-14  0:28                       ` Dave Chinner
  0 siblings, 1 reply; 40+ messages in thread
From: Brian Foster @ 2017-08-12 14:04 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Darrick J. Wong, Alex Lyakas, linux-xfs, libor.klepac

On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > On Thu, Aug 10, 2017 at 02:32:33PM -0400, Brian Foster wrote:
> > > > On Thu, Aug 10, 2017 at 10:55:48AM -0700, Darrick J. Wong wrote:
> > > > > On Thu, Aug 10, 2017 at 10:52:49AM -0400, Brian Foster wrote:
...
> > > > The question I have for buffer relogging is what's the best way to track
> > > > the parts of the buffer that need to be relogged after a roll?
> > > > Copy/translate the dirty (xfs_buf_log_format) segment map(s)?
> > > 
> > > Just mark it ordered?
> > > 
> > > That way it goes through the transaction commit, pinned and put into
> > > the CIL and  gets moved forward in the AIL when the log checkpoints.
> > > We don't need to relog the actual contents in this case, just ensure
> > > it moves forward in the AIL appropriately while we hold it locked.
> > 
> > Hmm.. is it safe to mark a previously logged and AIL resident buffer
> > ordered in a subsequent transaction?
> 
> That's what I'm asking - can we mark it ordered and not have to
> worry about what is already dirty?
> 

Ok..

> > The problem in this particular
> > example is that the empty leaf buffer is logged, committed and unpinned
> > (and thus AIL resident). We want to relog the buffer to move it forward
> > in the AIL on the next transaction because we're holding it locked and
> > thus it cannot be written back (and thus could pin the log tail).
> 
> Yup.
> 
> > If we mark the buffer ordered in the subsequent transaction and that
> > transaction commits/checkpoints to the log, don't we push the buffer
> > forward in the AIL to a checkpoint that doesn't have the originally
> > logged data..? IOW, it seems like if this does end up pushing the tail
> > of the log and we crash, we've thrown away checkpointed but not written
> > back metadata and potentially corrupted the fs. Hm?
> 
> Relogging of existing dirty regions is supposed to solve this
> problem. i.e. while the log item is dirty in the AIL, any
> transaction that logs and commits the log item will also log all the
> existing dirty regions on the buffer, hence the next checkpoint will
> contain everything it's supposed to.
> 
> Hence in this case, we don't need to log any new regions of the
> buffer because it already has a record of all the dirty regions on
> it from the prior transaction we committed.  That means we don't
> actually need to mark any new ranges dirty, we just need to mark the
> log item dirty again to trigger relogging of the existing dirty
> ranges on the buffer.
> 

Yep, this is what I was alluding to as an alternative solution in my
last mail. Just a nit: note that we need to mark the log item descriptor
dirty in the transaction (as opposed to the log item, which is already
dirty in this case) so it isn't thrown away at commit time.

> Using XFS_BLI_ORDERED allows us to log the buffer without recording
> a new dirty range on the buffer. IOWs, it retains whatever dirty range
> it already had, and so after joining, marking it ordered and then
> logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> in the transaction.
> 
> The question is this: what happens when a XFS_BLI_ORDERED buffer
> with a pre-existing dirty region is formatted for the CIL? We
> haven't done that before, so I'm betting that we don't relog the
> dirty region like we should be doing....
> 
> ... and we don't relog the existing dirty range because the
> ordered flag takes precedence.
> 

Right.. so it seems that the current implementation for ordered buffers
assumes a buffer is only ever used in one mode or the other.
Additionally, the AIL assumes that any reinserted item has been fully
relogged and so it moves the LSN forward unconditionally. Current
ordered buffer processing violates this constraint for an already logged
buffer.

> Ok, the ordered buffer checks in xfs_buf_item_size() and
> xfs_buf_item_format() need to also check for dirty regions. If dirty
> regions exist, then we treat it like a normal buffer rather than an
> ordered buffer. We can factor the dirty region check out of
> xfs_buf_item_unlock() for this...
> 
> Actually, check the case in xfs_buf_item_size() and remove the
> ordered flag if there are dirty regions. Then xfs_buf_item_format()
> will do the right thing without needing a duplicate check...
> 

I think that would work, assuming we actually check the
xfs_buf_log_format for dirty-ness rather than just the log item. As it
is, note that ordered buffers are still "logged" in the transaction
because otherwise the transaction infrastructure will assume it made no
change to the buf and toss the log item at commit time (we also need to
set up I/O completion on the buf and whatnot).

What concerns me about this approach is that I think we introduce the
possibility for subtle bugs. Existing ordered buffer code does this:

        xfs_trans_ordered_buf(tp, fbuf);
        xfs_trans_log_buf(tp, fbuf, 0,
                          BBTOB(fbuf->b_length) - 1);

... which should continue to work fine. Allowing ordered buffers to
physically log means that something like this:

        xfs_trans_log_buf(tp, fbuf, 0,
                          BBTOB(fbuf->b_length) - 1);
        xfs_trans_ordered_buf(tp, fbuf);

... is now a bug that is only apparent after scrutiny of xfs_trans_*()
and logging internals. Granted, the above already is incorrect, but it
technically still works as expected. I don't see the need to turn that
into a real problem by actually logging the buffer when we might not
expect to.

So while I agree that this could probably be made to work and I think it
is preferable to doing any kind of logged range tracking in the deferred
ops code, it still seems more tricky than it needs to be. To relog a held
buffer in a new transaction, why not just mark the lidp dirty in the new
transaction so it inherits all existing dirty segments? AFAICT, all we
really need to do is:

        tp->t_flags |= XFS_TRANS_DIRTY;
        lidp->lid_flags |= XFS_LID_DIRTY;

... on the new transaction and everything should just work as designed
(for a buffer that has been previously logged, held, rolled and
rejoined).

To elaborate a bit, I think we could refactor xfs_trans_log_buf() into a
new xfs_trans_dirty_buf() helper that covers all of the relevant bits
not related to actually dirtying the bli. xfs_trans_log_buf() would call
xfs_trans_dirty_buf() and thus would not change functionally.
xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
the existing ordered buf users would no longer need to log a range of
the buffer (which doesn't make much sense anyways). Finally, the
deferred infrastructure could join/dirty/hold the buffer to the new
transaction after each roll without needing to track and relog specific
regions of the buffer. Thoughts?

Unless I'm missing something as to why this is busted, I'll take a
closer look at the code and float an rfc next week since otherwise it
sounds like this is something we could actually fix up in the ordered
buffer code today.

> Nothing in XFS is ever simple, is it? :P
> 

There used to be a level of satisfaction at feeling I understood some
new corner of XFS. Nowadays I know that just means I'm not yet aware of
whatever dragons remain in that corner (is that paranoia? not if it's
true!). :P

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-12 14:04                     ` Brian Foster
@ 2017-08-14  0:28                       ` Dave Chinner
  2017-08-14  8:11                         ` Alex Lyakas
  2017-08-17 20:38                         ` Brian Foster
  0 siblings, 2 replies; 40+ messages in thread
From: Dave Chinner @ 2017-08-14  0:28 UTC (permalink / raw)
  To: Brian Foster; +Cc: Darrick J. Wong, Alex Lyakas, linux-xfs, libor.klepac

On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> > a new dirty range on the buffer. IOWs, it retains whatever dirty range
> > it already had, and so after joining, marking it ordered and then
> > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> > in the transaction.
> > 
> > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > with a pre-existing dirty region is formatted for the CIL? We
> > haven't done that before, so I'm betting that we don't relog the
> > dirty region like we should be doing....
> > 
> > ... and we don't relog the existing dirty range because the
> > ordered flag takes precedence.
> > 
> 
> Right.. so it seems that the current implementation for ordered buffers
> assumes a buffer is only ever used in one mode or the other.
> Additionally, the AIL assumes that any reinserted item has been fully
> relogged and so it moves the LSN forward unconditionally. Current
> ordered buffer processing violates this constraint for an already logged
> buffer.

Right, but it's not been a concern until now because we've only ever
used ordered buffers on newly allocated buffers that haven't been
previously logged.

> > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > xfs_buf_item_format() need to also check for dirty regions. If dirty
> > regions exist, then we treat it like a normal buffer rather than an
> > ordered buffer. We can factor the dirty region check out of
> > xfs_buf_item_unlock() for this...
> > 
> > Actually, check the case in xfs_buf_item_size() and remove the
> > ordered flag if there are dirty regions. Then xfs_buf_item_format()
> > will do the right thing without needing a duplicate check...
> > 
> 
> I think that would work, assuming we actually check the
> xfs_buf_log_format for dirty-ness rather than just the log item. As it
> is, note that ordered buffers are still "logged" in the transaction
> because otherwise the transaction infrastructure will assume it made no
> change to the buf and toss the log item at commit time (we also need to
> set up I/O completion on the buf and whatnot).

*nod*

> What concerns me about this approach is that I think we introduce the
> possibility for subtle bugs. Existing ordered buffer code does this:
> 
>         xfs_trans_ordered_buf(tp, fbuf);
>         xfs_trans_log_buf(tp, fbuf, 0,
>                           BBTOB(fbuf->b_length) - 1);
> 
> ... which should continue to work fine. Allowing ordered buffers to
> physically log means that something like this:
> 
>         xfs_trans_log_buf(tp, fbuf, 0,
>                           BBTOB(fbuf->b_length) - 1);
>         xfs_trans_ordered_buf(tp, fbuf);
> 
> ... is now a bug that is only apparent after scrutiny of xfs_trans_*()
> and logging internals. Granted, the above already is incorrect, but it
> technically still works as expected. I don't see the need to turn that
> into a real problem by actually logging the buffer when we might not
> expect to.

Well, it's not a "things go bad" bug. It's a "we screwed up an
optimisation" bug, because logging the buffer contents unnecessarily
only increases the required log bandwidth. It shouldn't affect
replay because the buffer is still correctly ordered in the log.
Hence both the transient and end states of the buffer during replay
will still be the same...

> So while I agree that this could probably be made to work and I think it
> is ideal to doing any kind of logged range tracking in the deferred ops
> code, it still seems more tricky than it needs to be. To relog a held
> buffer in a new transaction, why not just mark the lidp dirty in the new
> transaction so it inherits all existing dirty segments? AFAICT, all we
> really need to do is:
> 
>         tp->t_flags |= XFS_TRANS_DIRTY;
>         lidp->lid_flags |= XFS_LID_DIRTY;
> 
> ... on the new transaction and everything should just work as designed
> (for a buffer that has been previously logged, held, rolled and
> rejoined).

We would also need to set:

	bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;

which means we should....

> To elaborate a bit, I think we could refactor xfs_trans_log_buf() into a
> new xfs_trans_dirty_buf() helper that covers all of the relevant bits
> not related to actually dirtying the bli. xfs_trans_log_buf() would call
> xfs_trans_dirty_buf() and thus would not change functionally.
> xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
> the existing ordered buf users would no longer need to log a range of
> the buffer (which doesn't make much sense anyways).

... do this. :)

> Finally, the
> deferred infrastructure could join/dirty/hold the buffer to the new
> transaction after each roll without needing to track and relog specific
> regions of the buffer. Thoughts?

Yup, that's exactly what I was thinking should be possible by using
ordered buffers.... :)

And Christoph's rework of the transaction roll and deferred inode
handling that he just posted should make adding buffer handling
quite a bit neater and cleaner.

> Unless I'm missing something as to why this is busted, I'll take a
> closer look at the code and float an rfc next week since otherwise it
> sounds like this is something we could actually fix up in the ordered
> buffer code today.

Cool.

> > Nothing in XFS is ever simple, is it? :P
> 
> There used to be a level of satisfaction at feeling I understood some
> new corner of XFS. Nowadays I know that just means I'm not yet aware of
> whatever dragons remain in that corner (is that paranoia? not if it's
> true!). :P

Ah, the true signs of expertise: developing a knowledge base and
insight deep enough to understand that there is always another
hidden dragon poised to bite your head off. :)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-14  0:28                       ` Dave Chinner
@ 2017-08-14  8:11                         ` Alex Lyakas
  2017-08-14 12:22                           ` Brian Foster
  2017-08-17 20:38                         ` Brian Foster
  1 sibling, 1 reply; 40+ messages in thread
From: Alex Lyakas @ 2017-08-14  8:11 UTC (permalink / raw)
  To: Dave Chinner, Brian Foster; +Cc: Darrick J. Wong, linux-xfs, libor.klepac

Hello David, Brian,

I was not able to follow the details, unfortunately. Can you confirm that 
this patch is safe to go into kernel 3.18?

Thanks,
Alex.


-----Original Message----- 
From: Dave Chinner
Sent: Monday, August 14, 2017 3:28 AM
To: Brian Foster
Cc: Darrick J. Wong ; Alex Lyakas ; linux-xfs@vger.kernel.org ; 
libor.klepac@bcom.cz
Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf 
conversion and the addition of an attribute

On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> > a new dirty range on the buffer. IOWs, it retains whatever dirty range
> > it already had, and so after joining, marking it ordered and then
> > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> > in the transaction.
> >
> > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > with a pre-existing dirty region is formatted for the CIL? We
> > haven't done that before, so I'm betting that we don't relog the
> > dirty region like we should be doing....
> >
> > ... and we don't relog the existing dirty range because the
> > ordered flag takes precedence.
> >
>
> Right.. so it seems that the current implementation for ordered buffers
> assumes a buffer is only ever used in one mode or the other.
> Additionally, the AIL assumes that any reinserted item has been fully
> relogged and so it moves the LSN forward unconditionally. Current
> ordered buffer processing violates this constraint for an already logged
> buffer.

Right, but it's not been a concern until now because we've only ever
used ordered buffers on newly allocated buffers that haven't been
previously logged.

> > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > xfs_buf_item_format() need to also check for dirty regions. If dirty
> > regions exist, then we treat it like a normal buffer rather than an
> > ordered buffer. We can factor the dirty region check out of
> > xfs_buf_item_unlock() for this...
> >
> > Actually, check the case in xfs_buf_item_size() and remove the
> > ordered flag if there are dirty regions. Then xfs_buf_item_format()
> > will do the right thing without needing a duplicate check...
> >
>
> I think that would work, assuming we actually check the
> xfs_buf_log_format for dirty-ness rather than just the log item. As it
> is, note that ordered buffers are still "logged" in the transaction
> because otherwise the transaction infrastructure will assume it made no
> change to the buf and toss the log item at commit time (we also need to
> set up I/O completion on the buf and whatnot).

*nod*

> What concerns me about this approach is that I think we introduce the
> possibility for subtle bugs. Existing ordered buffer code does this:
>
>         xfs_trans_ordered_buf(tp, fbuf);
>         xfs_trans_log_buf(tp, fbuf, 0,
>                           BBTOB(fbuf->b_length) - 1);
>
> ... which should continue to work fine. Allowing ordered buffers to
> physically log means that something like this:
>
>         xfs_trans_log_buf(tp, fbuf, 0,
>                           BBTOB(fbuf->b_length) - 1);
>         xfs_trans_ordered_buf(tp, fbuf);
>
> ... is now a bug that is only apparent after scrutiny of xfs_trans_*()
> and logging internals. Granted, the above already is incorrect, but it
> technically still works as expected. I don't see the need to turn that
> into a real problem by actually logging the buffer when we might not
> expect to.

Well, it's not a "things go bad" bug. It's a "we screwed up an
optimisation" bug, because logging the buffer contents unnecessarily
only increases the required log bandwidth. It shouldn't affect
replay because the buffer is still correctly ordered in the log.
Hence both the transient and end states of the buffer during replay
will still be the same...

> So while I agree that this could probably be made to work and I think it
> is ideal to doing any kind of logged range tracking in the deferred ops
> code, it still seems more tricky than it needs to be. To relog a held
> buffer in a new transaction, why not just mark the lidp dirty in the new
> transaction so it inherits all existing dirty segments? AFAICT, all we
> really need to do is:
>
>         tp->t_flags |= XFS_TRANS_DIRTY;
>         lidp->lid_flags |= XFS_LID_DIRTY;
>
> ... on the new transaction and everything should just work as designed
> (for a buffer that has been previously logged, held, rolled and
> rejoined).

We would also need to set:

bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;

which means we should....

> To elaborate a bit, I think we could refactor xfs_trans_log_buf() into a
> new xfs_trans_dirty_buf() helper that covers all of the relevant bits
> not related to actually dirtying the bli. xfs_trans_log_buf() would call
> xfs_trans_dirty_buf() and thus would not change functionally.
> xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
> the existing ordered buf users would no longer need to log a range of
> the buffer (which doesn't make much sense anyways).

... do this. :)

> Finally, the
> deferred infrastructure could join/dirty/hold the buffer to the new
> transaction after each roll without needing to track and relog specific
> regions of the buffer. Thoughts?

Yup, that's exactly what I was thinking should be possible by using
ordered buffers.... :)

And Christoph's rework of the transaction roll and deferred inode
handling that he just posted should make adding buffer handling
quite a bit neater and cleaner.

> Unless I'm missing something as to why this is busted, I'll take a
> closer look at the code and float an rfc next week since otherwise it
> sounds like this is something we could actually fix up in the ordered
> buffer code today.

Cool.

> > Nothing in XFS is ever simple, is it? :P
>
> There used to be a level of satisfaction at feeling I understood some
> new corner of XFS. Nowadays I know that just means I'm not yet aware of
> whatever dragons remain in that corner (is that paranoia? not if it's
> true!). :P

Ah, the true signs of expertise: developing a knowledge base and
insight deep enough to understand that there is always another
hidden dragon poised to bite your head off. :)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com 


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-14  8:11                         ` Alex Lyakas
@ 2017-08-14 12:22                           ` Brian Foster
  2017-08-14 16:04                             ` Alex Lyakas
  2019-03-22  9:12                             ` Shyam Kaushik
  0 siblings, 2 replies; 40+ messages in thread
From: Brian Foster @ 2017-08-14 12:22 UTC (permalink / raw)
  To: Alex Lyakas; +Cc: Dave Chinner, Darrick J. Wong, linux-xfs, libor.klepac

On Mon, Aug 14, 2017 at 11:11:41AM +0300, Alex Lyakas wrote:
> Hello David, Brian,
> 
> I was not able to follow the details, unfortunately. Can you confirm that
> this patch is safe to go into kernel 3.18?
> 

This is the open question in the separate subthread (this one is
discussion around designing a solution for the current code):

http://marc.info/?l=linux-xfs&m=150246184413604&w=2

This could use confirmation, but my understanding is that this is safe
because v3.18 doesn't have the more advanced deferred ops
infrastructure. It uses xfs_bmap_finish() which has a max roll count of
one and a transaction with enough reservation for 2 rolls before
blocking reservation is required.

Note that doesn't mean we'd officially post a v3.18 stable patch before
this is fixed in the upstream code. We always fix upstream first and
backport from there to ensure a consistent base going forward (we don't
want to go change v3.18, end up with a slightly different upstream
patch, then have to backport more changes to fix the original patch).
This may be safe enough for you to use locally in the meantime, however.

Brian

> Thanks,
> Alex.
> 
> 
> -----Original Message----- From: Dave Chinner
> Sent: Monday, August 14, 2017 3:28 AM
> To: Brian Foster
> Cc: Darrick J. Wong ; Alex Lyakas ; linux-xfs@vger.kernel.org ;
> libor.klepac@bcom.cz
> Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> conversion and the addition of an attribute
> 
> On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> > > a new dirty range on the buffer. IOWs, it retains whatever dirty range
> > > it already had, and so after joining, marking it ordered and then
> > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> > > in the transaction.
> > >
> > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > with a pre-existing dirty region is formatted for the CIL? We
> > > haven't done that before, so I'm betting that we don't relog the
> > > dirty region like we should be doing....
> > >
> > > ... and we don't relog the existing dirty range because the
> > > ordered flag takes precedence.
> > >
> > 
> > Right.. so it seems that the current implementation for ordered buffers
> > assumes a buffer is only ever used in one mode or the other.
> > Additionally, the AIL assumes that any reinserted item has been fully
> > relogged and so it moves the LSN forward unconditionally. Current
> > ordered buffer processing violates this constraint for an already logged
> > buffer.
> 
> Right, but it's not been a concern until now because we've only ever
> used ordered buffers on newly allocated buffers that haven't been
> previously logged.
> 
> > > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > > xfs_buf_item_format() need to also check for dirty regions. If dirty
> > > regions exist, then we treat it like a normal buffer rather than an
> > > ordered buffer. We can factor the dirty region check out of
> > > xfs_buf_item_unlock() for this...
> > >
> > > Actually, check the case in xfs_buf_item_size() and remove the
> > > ordered flag if there are dirty regions. Then xfs_buf_item_format()
> > > will do the right thing without needing a duplicate check...
> > >
> > 
> > I think that would work, assuming we actually check the
> > xfs_buf_log_format for dirty-ness rather than just the log item. As it
> > is, note that ordered buffers are still "logged" in the transaction
> > because otherwise the transaction infrastructure will assume it made no
> > change to the buf and toss the log item at commit time (we also need to
> > set up I/O completion on the buf and whatnot).
> 
> *nod*
> 
> > What concerns me about this approach is that I think we introduce the
> > possibility for subtle bugs. Existing ordered buffer code does this:
> > 
> >         xfs_trans_ordered_buf(tp, fbuf);
> >         xfs_trans_log_buf(tp, fbuf, 0,
> >                           BBTOB(fbuf->b_length) - 1);
> > 
> > ... which should continue to work fine. Allowing ordered buffers to
> > physically log means that something like this:
> > 
> >         xfs_trans_log_buf(tp, fbuf, 0,
> >                           BBTOB(fbuf->b_length) - 1);
> >         xfs_trans_ordered_buf(tp, fbuf);
> > 
> > ... is now a bug that is only apparent after scrutiny of xfs_trans_*()
> > and logging internals. Granted, the above already is incorrect, but it
> > technically still works as expected. I don't see the need to turn that
> > into a real problem by actually logging the buffer when we might not
> > expect to.
> 
> Well, it's not a "things go bad" bug. It's a "we screwed up an
> optimisation" bug, because logging the buffer contents unnecessarily
> only increases the required log bandwidth. It shouldn't affect
> replay because the buffer is still correctly ordered in the log.
> Hence both the transient and end states of the buffer during replay
> will still be the same...
> 
> > So while I agree that this could probably be made to work and I think it
> > is preferable to doing any kind of logged range tracking in the deferred ops
> > code, it still seems more tricky than it needs to be. To relog a held
> > buffer in a new transaction, why not just mark the lidp dirty in the new
> > transaction so it inherits all existing dirty segments? AFAICT, all we
> > really need to do is:
> > 
> >         tp->t_flags |= XFS_TRANS_DIRTY;
> >         lidp->lid_flags |= XFS_LID_DIRTY;
> > 
> > ... on the new transaction and everything should just work as designed
> > (for a buffer that has been previously logged, held, rolled and
> > rejoined).
> 
> We would also need to set:
> 
> bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
> 
> which means we should....
> 
> > To elaborate a bit, I think we could refactor xfs_trans_log_buf() into a
> > new xfs_trans_dirty_buf() helper that covers all of the relevant bits
> > not related to actually dirtying the bli. xfs_trans_log_buf() would call
> > xfs_trans_dirty_buf() and thus would not change functionally.
> > xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
> > the existing ordered buf users would no longer need to log a range of
> > the buffer (which doesn't make much sense anyways).
> 
> ... do this. :)
> 
> > Finally, the
> > deferred infrastructure could join/dirty/hold the buffer to the new
> > transaction after each roll without needing to track and relog specific
> > regions of the buffer. Thoughts?
> 
> Yup, that's exactly what I was thinking should be possible by using
> ordered buffers.... :)
> 
> And Christoph's rework of the transaction roll and deferred inode
> handling that he just posted should make adding buffer handling
> quite a bit neater and cleaner.
> 
> > Unless I'm missing something as to why this is busted, I'll take a
> > closer look at the code and float an rfc next week since otherwise it
> > sounds like this is something we could actually fix up in the ordered
> > buffer code today.
> 
> Cool.
> 
> > > Nothing in XFS is ever simple, is it? :P
> > 
> > There used to be a level of satisfaction at feeling I understood some
> > new corner of XFS. Nowadays I know that just means I'm not yet aware of
> > whatever dragons remain in that corner (is that paranoia? not if it's
> > true!). :P
> 
> Ah, the true signs of expertise: developing a knowledge base and
> insight deep enough to understand that there is always another
> hidden dragon poised to bite your head off. :)
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-14 12:22                           ` Brian Foster
@ 2017-08-14 16:04                             ` Alex Lyakas
  2017-08-14 21:33                               ` Darrick J. Wong
  2019-03-22  9:12                             ` Shyam Kaushik
  1 sibling, 1 reply; 40+ messages in thread
From: Alex Lyakas @ 2017-08-14 16:04 UTC (permalink / raw)
  To: Brian Foster; +Cc: Dave Chinner, Darrick J. Wong, linux-xfs, libor.klepac

Hi Brian,

Thanks for confirming. 3.18 is anyways EOL, so probably no more patches will 
show up for it. We are already running with this patch on our 3.18 for about 
a week, and did not see any issues.

Alex.


-----Original Message----- 
From: Brian Foster
Sent: Monday, August 14, 2017 3:22 PM
To: Alex Lyakas
Cc: Dave Chinner ; Darrick J. Wong ; linux-xfs@vger.kernel.org ; 
libor.klepac@bcom.cz
Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf 
conversion and the addition of an attribute

On Mon, Aug 14, 2017 at 11:11:41AM +0300, Alex Lyakas wrote:
> Hello David, Brian,
>
> I was not able to follow the details, unfortunately. Can you confirm that
> this patch is safe to go into kernel 3.18?
>

This is the open question in the separate subthread (this one is
discussion around designing a solution for the current code):

http://marc.info/?l=linux-xfs&m=150246184413604&w=2

This could use confirmation, but my understanding is that this is safe
because v3.18 doesn't have the more advanced deferred ops
infrastructure. It uses xfs_bmap_finish() which has a max roll count of
one and a transaction with enough reservation for 2 rolls before
blocking reservation is required.

Note that doesn't mean we'd officially post a v3.18 stable patch before
this is fixed in the upstream code. We always fix upstream first and
backport from there to ensure a consistent base going forward (we don't
want to go change v3.18, end up with a slightly different upstream
patch, then have to backport more changes to fix the original patch).
This may be safe enough for you to use locally in the meantime, however.

Brian

> Thanks,
> Alex.
>
>
> -----Original Message----- From: Dave Chinner
> Sent: Monday, August 14, 2017 3:28 AM
> To: Brian Foster
> Cc: Darrick J. Wong ; Alex Lyakas ; linux-xfs@vger.kernel.org ;
> libor.klepac@bcom.cz
> Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> conversion and the addition of an attribute
>
> On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> > > a new dirty range on the buffer. IOWs, it retains whatever dirty range
> > > it already had, and so after joining, marking it ordered and then
> > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> > > in the transaction.
> > >
> > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > with a pre-existing dirty region is formatted for the CIL? We
> > > haven't done that before, so I'm betting that we don't relog the
> > > dirty region like we should be doing....
> > >
> > > ... and we don't relog the existing dirty range because the
> > > ordered flag takes precedence.
> > >
> >
> > Right.. so it seems that the current implementation for ordered buffers
> > assumes a buffer is only ever used in one mode or the other.
> > Additionally, the AIL assumes that any reinserted item has been fully
> > relogged and so it moves the LSN forward unconditionally. Current
> > ordered buffer processing violates this constraint for an already logged
> > buffer.
>
> Right, but it's not been a concern until now because we've only ever
> used ordered buffers on newly allocated buffers that haven't been
> previously logged.
>
> > > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > > xfs_buf_item_format() need to also check for dirty regions. If dirty
> > > regions exist, then we treat it like a normal buffer rather than an
> > > ordered buffer. We can factor the dirty region check out of
> > > xfs_buf_item_unlock() for this...
> > >
> > > Actually, check the case in xfs_buf_item_size() and remove the
> > > ordered flag if there are dirty regions. Then xfs_buf_item_format()
> > > will do the right thing without needing a duplicate check...
> > >
> >
> > I think that would work, assuming we actually check the
> > xfs_buf_log_format for dirty-ness rather than just the log item. As it
> > is, note that ordered buffers are still "logged" in the transaction
> > because otherwise the transaction infrastructure will assume it made no
> > change to the buf and toss the log item at commit time (we also need to
> > set up I/O completion on the buf and whatnot).
>
> *nod*
>
> > What concerns me about this approach is that I think we introduce the
> > possibility for subtle bugs. Existing ordered buffer code does this:
> >
> >         xfs_trans_ordered_buf(tp, fbuf);
> >         xfs_trans_log_buf(tp, fbuf, 0,
> >                           BBTOB(fbuf->b_length) - 1);
> >
> > ... which should continue to work fine. Allowing ordered buffers to
> > physically log means that something like this:
> >
> >         xfs_trans_log_buf(tp, fbuf, 0,
> >                           BBTOB(fbuf->b_length) - 1);
> >         xfs_trans_ordered_buf(tp, fbuf);
> >
> > ... is now a bug that is only apparent after scrutiny of xfs_trans_*()
> > and logging internals. Granted, the above already is incorrect, but it
> > technically still works as expected. I don't see the need to turn that
> > into a real problem by actually logging the buffer when we might not
> > expect to.
>
> Well, it's not a "things go bad" bug. It's a "we screwed up an
> optimisation" bug, because logging the buffer contents unnecessarily
> only increases the required log bandwidth. It shouldn't affect
> replay because the buffer is still correctly ordered in the log.
> Hence both the transient and end states of the buffer during replay
> will still be the same...
>
> > So while I agree that this could probably be made to work and I think it
> > is preferable to doing any kind of logged range tracking in the deferred ops
> > code, it still seems more tricky than it needs to be. To relog a held
> > buffer in a new transaction, why not just mark the lidp dirty in the new
> > transaction so it inherits all existing dirty segments? AFAICT, all we
> > really need to do is:
> >
> >         tp->t_flags |= XFS_TRANS_DIRTY;
> >         lidp->lid_flags |= XFS_LID_DIRTY;
> >
> > ... on the new transaction and everything should just work as designed
> > (for a buffer that has been previously logged, held, rolled and
> > rejoined).
>
> We would also need to set:
>
> bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
>
> which means we should....
>
> > To elaborate a bit, I think we could refactor xfs_trans_log_buf() into a
> > new xfs_trans_dirty_buf() helper that covers all of the relevant bits
> > not related to actually dirtying the bli. xfs_trans_log_buf() would call
> > xfs_trans_dirty_buf() and thus would not change functionally.
> > xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
> > the existing ordered buf users would no longer need to log a range of
> > the buffer (which doesn't make much sense anyways).
>
> ... do this. :)
>
> > Finally, the
> > deferred infrastructure could join/dirty/hold the buffer to the new
> > transaction after each roll without needing to track and relog specific
> > regions of the buffer. Thoughts?
>
> Yup, that's exactly what I was thinking should be possible by using
> ordered buffers.... :)
>
> And Christoph's rework of the transaction roll and deferred inode
> handling that he just posted should make adding buffer handling
> quite a bit neater and cleaner.
>
> > Unless I'm missing something as to why this is busted, I'll take a
> > closer look at the code and float an rfc next week since otherwise it
> > sounds like this is something we could actually fix up in the ordered
> > buffer code today.
>
> Cool.
>
> > > Nothing in XFS is ever simple, is it? :P
> >
> > There used to be a level of satisfaction at feeling I understood some
> > new corner of XFS. Nowadays I know that just means I'm not yet aware of
> > whatever dragons remain in that corner (is that paranoia? not if it's
> > true!). :P
>
> Ah, the true signs of expertise: developing a knowledge base and
> insight deep enough to understand that there is always another
> hidden dragon poised to bite your head off. :)
>
> Cheers,
>
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html 


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-14 16:04                             ` Alex Lyakas
@ 2017-08-14 21:33                               ` Darrick J. Wong
  0 siblings, 0 replies; 40+ messages in thread
From: Darrick J. Wong @ 2017-08-14 21:33 UTC (permalink / raw)
  To: Alex Lyakas; +Cc: Brian Foster, Dave Chinner, linux-xfs, libor.klepac

On Mon, Aug 14, 2017 at 07:04:19PM +0300, Alex Lyakas wrote:
> Hi Brian,
> 
> Thanks for confirming. 3.18 is anyways EOL, so probably no more patches will
> show up for it. We are already running with this patch on our 3.18 for about
> a week, and did not see any issues.

Cool!  Thanks for supplying the testing data point!

--D

> 
> Alex.
> 
> 
> -----Original Message----- From: Brian Foster
> Sent: Monday, August 14, 2017 3:22 PM
> To: Alex Lyakas
> Cc: Dave Chinner ; Darrick J. Wong ; linux-xfs@vger.kernel.org ;
> libor.klepac@bcom.cz
> Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> conversion and the addition of an attribute
> 
> On Mon, Aug 14, 2017 at 11:11:41AM +0300, Alex Lyakas wrote:
> >Hello David, Brian,
> >
> >I was not able to follow the details, unfortunately. Can you confirm that
> >this patch is safe to go into kernel 3.18?
> >
> 
> This is the open question in the separate subthread (this one is
> discussion around designing a solution for the current code):
> 
> http://marc.info/?l=linux-xfs&m=150246184413604&w=2
> 
> This could use confirmation, but my understanding is that this is safe
> because v3.18 doesn't have the more advanced deferred ops
> infrastructure. It uses xfs_bmap_finish() which has a max roll count of
> one and a transaction with enough reservation for 2 rolls before
> blocking reservation is required.
> 
> Note that doesn't mean we'd officially post a v3.18 stable patch before
> this is fixed in the upstream code. We always fix upstream first and
> backport from there to ensure a consistent base going forward (we don't
> want to go change v3.18, end up with a slightly different upstream
> patch, then have to backport more changes to fix the original patch).
> This may be safe enough for you to use locally in the meantime, however.
> 
> Brian
> 
> >Thanks,
> >Alex.
> >
> >
> >-----Original Message----- From: Dave Chinner
> >Sent: Monday, August 14, 2017 3:28 AM
> >To: Brian Foster
> >Cc: Darrick J. Wong ; Alex Lyakas ; linux-xfs@vger.kernel.org ;
> >libor.klepac@bcom.cz
> >Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> >conversion and the addition of an attribute
> >
> >On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> >> On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> >> > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> >> > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> >> > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> >> > a new dirty range on the buffer. IOWs, it retains whatever dirty range
> >> > it already had, and so after joining, marking it ordered and then
> >> > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> >> > in the transaction.
> >> >
> >> > The question is this: what happens when a XFS_BLI_ORDERED buffer
> >> > with a pre-existing dirty region is formatted for the CIL? We
> >> > haven't done that before, so I'm betting that we don't relog the
> >> > dirty region like we should be doing....
> >> >
> >> > ... and we don't relog the existing dirty range because the
> >> > ordered flag takes precedence.
> >> >
> >>
> >> Right.. so it seems that the current implementation for ordered buffers
> >> assumes a buffer is only ever used in one mode or the other.
> >> Additionally, the AIL assumes that any reinserted item has been fully
> >> relogged and so it moves the LSN forward unconditionally. Current
> >> ordered buffer processing violates this constraint for an already logged
> >> buffer.
> >
> >Right, but it's not been a concern until now because we've only ever
> >used ordered buffers on newly allocated buffers that haven't been
> >previously logged.
> >
> >> > Ok, the ordered buffer checks in xfs_buf_item_size() and
> >> > xfs_buf_item_format() need to also check for dirty regions. If dirty
> >> > regions exist, then we treat it like a normal buffer rather than an
> >> > ordered buffer. We can factor the dirty region check out of
> >> > xfs_buf_item_unlock() for this...
> >> >
> >> > Actually, check the case in xfs_buf_item_size() and remove the
> >> > ordered flag if there are dirty regions. Then xfs_buf_item_format()
> >> > will do the right thing without needing a duplicate check...
> >> >
> >>
> >> I think that would work, assuming we actually check the
> >> xfs_buf_log_format for dirty-ness rather than just the log item. As it
> >> is, note that ordered buffers are still "logged" in the transaction
> >> because otherwise the transaction infrastructure will assume it made no
> >> change to the buf and toss the log item at commit time (we also need to
> >> set up I/O completion on the buf and whatnot).
> >
> >*nod*
> >
> >> What concerns me about this approach is that I think we introduce the
> >> possibility for subtle bugs. Existing ordered buffer code does this:
> >>
> >>         xfs_trans_ordered_buf(tp, fbuf);
> >>         xfs_trans_log_buf(tp, fbuf, 0,
> >>                           BBTOB(fbuf->b_length) - 1);
> >>
> >> ... which should continue to work fine. Allowing ordered buffers to
> >> physically log means that something like this:
> >>
> >>         xfs_trans_log_buf(tp, fbuf, 0,
> >>                           BBTOB(fbuf->b_length) - 1);
> >>         xfs_trans_ordered_buf(tp, fbuf);
> >>
> >> ... is now a bug that is only apparent after scrutiny of xfs_trans_*()
> >> and logging internals. Granted, the above already is incorrect, but it
> >> technically still works as expected. I don't see the need to turn that
> >> into a real problem by actually logging the buffer when we might not
> >> expect to.
> >
> >Well, it's not a "things go bad" bug. It's a "we screwed up an
> >optimisation" bug, because logging the buffer contents unnecessarily
> >only increases the required log bandwidth. It shouldn't affect
> >replay because the buffer is still correctly ordered in the log.
> >Hence both the transient and end states of the buffer during replay
> >will still be the same...
> >
> >> So while I agree that this could probably be made to work and I think it
> >> is preferable to doing any kind of logged range tracking in the deferred ops
> >> code, it still seems more tricky than it needs to be. To relog a held
> >> buffer in a new transaction, why not just mark the lidp dirty in the new
> >> transaction so it inherits all existing dirty segments? AFAICT, all we
> >> really need to do is:
> >>
> >>         tp->t_flags |= XFS_TRANS_DIRTY;
> >>         lidp->lid_flags |= XFS_LID_DIRTY;
> >>
> >> ... on the new transaction and everything should just work as designed
> >> (for a buffer that has been previously logged, held, rolled and
> >> rejoined).
> >
> >We would also need to set:
> >
> >bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
> >
> >which means we should....
> >
> >> To elaborate a bit, I think we could refactor xfs_trans_log_buf() into a
> >> new xfs_trans_dirty_buf() helper that covers all of the relevant bits
> >> not related to actually dirtying the bli. xfs_trans_log_buf() would call
> >> xfs_trans_dirty_buf() and thus would not change functionally.
> >> xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
> >> the existing ordered buf users would no longer need to log a range of
> >> the buffer (which doesn't make much sense anyways).
> >
> >... do this. :)
> >
> >> Finally, the
> >> deferred infrastructure could join/dirty/hold the buffer to the new
> >> transaction after each roll without needing to track and relog specific
> >> regions of the buffer. Thoughts?
> >
> >Yup, that's exactly what I was thinking should be possible by using
> >ordered buffers.... :)
> >
> >And Christoph's rework of the transaction roll and deferred inode
> >handling that he just posted should make adding buffer handling
> >quite a bit neater and cleaner.
> >
> >> Unless I'm missing something as to why this is busted, I'll take a
> >> closer look at the code and float an rfc next week since otherwise it
> >> sounds like this is something we could actually fix up in the ordered
> >> buffer code today.
> >
> >Cool.
> >
> >> > Nothing in XFS is ever simple, is it? :P
> >>
> >> There used to be a level of satisfaction at feeling I understood some
> >> new corner of XFS. Nowadays I know that just means I'm not yet aware of
> >> whatever dragons remain in that corner (is that paranoia? not if it's
> >> true!). :P
> >
> >Ah, the true signs of expertise: developing a knowledge base and
> >insight deep enough to understand that there is always another
> >hidden dragon poised to bite your head off. :)
> >
> >Cheers,
> >
> >Dave.
> >-- 
> >Dave Chinner
> >david@fromorbit.com
> >
> >--
> >To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> >the body of a message to majordomo@vger.kernel.org
> >More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-14  0:28                       ` Dave Chinner
  2017-08-14  8:11                         ` Alex Lyakas
@ 2017-08-17 20:38                         ` Brian Foster
  2017-08-17 22:31                           ` Darrick J. Wong
  2017-08-18  2:04                           ` Dave Chinner
  1 sibling, 2 replies; 40+ messages in thread
From: Brian Foster @ 2017-08-17 20:38 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Darrick J. Wong, Alex Lyakas, linux-xfs, libor.klepac

On Mon, Aug 14, 2017 at 10:28:09AM +1000, Dave Chinner wrote:
> On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> > > a new dirty range on the buffer. IOWs, it retains whatever dirty range
> > > it already had, and so after joining, marking it ordered and then
> > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> > > in the transaction.
> > > 
> > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > with a pre-existing dirty region is formatted for the CIL? We
> > > haven't done that before, so I'm betting that we don't relog the
> > > dirty region like we should be doing....
> > > 
> > > ... and we don't relog the existing dirty range because the
> > > ordered flag takes precedence.
> > > 
> > 
> > Right.. so it seems that the current implementation for ordered buffers
> > assumes a buffer is only ever used in one mode or the other.
> > Additionally, the AIL assumes that any reinserted item has been fully
> > relogged and so it moves the LSN forward unconditionally. Current
> > ordered buffer processing violates this constraint for an already logged
> > buffer.
> 
> Right, but it's not been a concern until now because we've only ever
> used ordered buffers on newly allocated buffers that haven't been
> previously logged.
> 

Shortly after starting to test the latest refactoring with some asserts
that enforce "strict" buffer ordering, I (sure enough ;)) ran into an
instance of the above.

The issue is with the bmbt owner change operation that is part of the
swap extent sequence. The owner change code ends up setting a buffer
ordered that is currently dirty due to an unrelated previous
transaction. The potential side effects of this are basically what we've
already laid out above. Note that this is only relevant on
crc=1,rmapbt=0 filesystems.

I'm currently testing out something that uses the rmapbt based swap
implementation for all filesystems. IIUC, that algorithm doesn't
actually require rmapbt support, it is just unfortunately named as such
because we only use it on rmapbt filesystems. I suspect that is because
rmapbt management requires the unmap/remapping of space to keep the tree
consistent (and I suspect this is limited to rmapbt fs' for performance
reasons, Darrick?) vs. it being something that actually depends on the
rmapbt.

FWIW, I've so far at least considered some of the following alternatives
to try and preserve the existing bmbt owner change algorithm before
moving on to testing the rmapbt approach:

1.) Don't use ordered buffers for the bmbt owner change. My
understanding is that this is a non-starter as we'd need to log every
buffer in the tree in a single transaction.

2.) Update the extent swap operation to push the log and AIL based on
the most recent LSN of any buffer in the bmbt before the bmbt owner
change takes place. I've experimented with this enough to hack together
an xfs_ail_push_sync() prototype and also observe that using the
ili_item.li_lsn of the inode is not sufficient.

IOW, this basically requires a second bmbt walk per-inode to discover
the max lsn of all bmbt blocks. That may or may not be a performance
issue since we have to walk the bmbt anyways. Otherwise I think this is
a viable option from a correctness perspective.

A simplified variant of this (that just crossed my mind while writing
this up) may be to do an in-core check for dirty bmbt buffers on an
inode and just do an xfs_ail_push_all_sync() if any are found.

3.) Synchronous write the affected buffers at xfs_trans_ordered_buf()
time. This is a bit ugly because it requires to xfs_trans_brelse() the
buf from the current transaction, _bwrite() it and then _bjoin() it back
to the transaction all within xfs_trans_ordered_buf(). The advantage
over #2 is that this only occurs for buffers with logged segments in the
bli. It also effectively implements a form of generic support for
ordering previously logged buffers.

4.) Implement the ordered buffer relogging logic discussed below: detect
ordered buffers with previously logged segments and relog them at
->iop_format() time. I suppose I could alleviate my aforementioned
concerns with an assert that verifies an ordered buffer isn't already
dirtied by the _current_ transaction (i.e., to catch the case of the
"screwed up optimization" I was concerned about).

That aside, I think there is another problem here that we missed
previously: transactions that use ordered buffers don't reserve log
space for them, but relogged items require log reservation. If I follow
the code correctly, a relogged item that is presently in the CIL may not
use reservation in a subsequent tx. If the item resides in the AIL at
relog commit time, however, the relogging transaction must have
reservation for the item (the physical log space for the item is freed
and the space used for the relog is accounted out of the reservation).

So I think the only way we can support the ability to relog previously
dirty buffers that have been marked ordered is to require log
reservation for them as for normal physically logged buffers, which kind
of defeats the purpose.

5.) A deferred op variant of the bmbt owner change algorithm. I haven't
fully thought this one through yet so it may not be viable, but the
general idea is to use deferred ops to conditionally physically log
previously dirty buffers where required. For example, use a transaction
with log reservation for one full buffer, log a new bmbt owner change
intent and start running through the bmbt scan attempting to order
buffers. Update xfs_trans_ordered_buf() to explicitly fail on buffers
with dirty segments. When said failure occurs during the bmbt scan,
physically log the buffer and terminate the scan with -EAGAIN. The
deferred infra rolls the tx, relogs the intent and has to restart the
bmbt scan over again. This repeats until we've processed the entire
tree.

I had a couple more thoughts that are similarly not yet thought out and
thus not worth rambling about. Thoughts on any of the above? On using
the rmapbt algorithm? Other ideas?

Brian

> > > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > > xfs_buf_item_format() need to also check for dirty regions. If dirty
> > > regions exist, then we treat it like a normal buffer rather than an
> > > ordered buffer. We can factor the dirty region check out of
> > > xfs_buf_item_unlock() for this...
> > > 
> > > Actually, check the case in xfs_buf_item_size() and remove the
> > > ordered flag if there are dirty regions. Then xfs_buf_item_format()
> > > will do the right thing without needing a duplicate check...
> > > 
> > 
> > I think that would work, assuming we actually check the
> > xfs_buf_log_format for dirty-ness rather than just the log item. As it
> > is, note that ordered buffers are still "logged" in the transaction
> > because otherwise the transaction infrastructure will assume it made no
> > change to the buf and toss the log item at commit time (we also need to
> > set up I/O completion on the buf and whatnot).
> 
> *nod*
> 
> > What concerns me about this approach is that I think we introduce the
> > possibility for subtle bugs. Existing ordered buffer code does this:
> > 
> >         xfs_trans_ordered_buf(tp, fbuf);
> >         xfs_trans_log_buf(tp, fbuf, 0,
> >                           BBTOB(fbuf->b_length) - 1);
> > 
> > ... which should continue to work fine. Allowing ordered buffers to
> > physically log means that something like this:
> > 
> >         xfs_trans_log_buf(tp, fbuf, 0,
> >                           BBTOB(fbuf->b_length) - 1);
> >         xfs_trans_ordered_buf(tp, fbuf);
> > 
> > ... is now a bug that is only apparent after scrutiny of xfs_trans_*()
> > and logging internals. Granted, the above already is incorrect, but it
> > technically still works as expected. I don't see the need to turn that
> > into a real problem by actually logging the buffer when we might not
> > expect to.
> 
> Well, it's not a "things go bad" bug. It's a "we screwed up an
> optimisation" bug, because logging the buffer contents unnecessarily
> only increases the required log bandwidth. It shouldn't affect
> replay because the buffer is still correctly ordered in the log.
> Hence both the transient and end states of the buffer during replay
> will still be the same...
> 
> > So while I agree that this could probably be made to work and I think it
> > is preferable to doing any kind of logged range tracking in the deferred ops
> > code, it still seems more tricky than it needs to be. To relog a held
> > buffer in a new transaction, why not just mark the lidp dirty in the new
> > transaction so it inherits all existing dirty segments? AFAICT, all we
> > really need to do is:
> > 
> >         tp->t_flags |= XFS_TRANS_DIRTY;
> >         lidp->lid_flags |= XFS_LID_DIRTY;
> > 
> > ... on the new transaction and everything should just work as designed
> > (for a buffer that has been previously logged, held, rolled and
> > rejoined).
> 
> We would also need to set:
> 
> 	bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
> 
> which means we should....
> 
> > To elaborate a bit, I think we could refactor xfs_trans_log_buf() into a
> > new xfs_trans_dirty_buf() helper that covers all of the relevant bits
> > not related to actually dirtying the bli. xfs_trans_log_buf() would call
> > xfs_trans_dirty_buf() and thus would not change functionally.
> > xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
> > the existing ordered buf users would no longer need to log a range of
> > the buffer (which doesn't make much sense anyways).
> 
> ... do this. :)
> 
> > Finally, the
> > deferred infrastructure could join/dirty/hold the buffer to the new
> > transaction after each roll without needing to track and relog specific
> > regions of the buffer. Thoughts?
> 
> Yup, that's exactly what I was thinking should be possible by using
> ordered buffers.... :)
> 
> And Christoph's rework of the transaction roll and deferred inode
> handling that he just posted should make adding buffer handling
> quite a bit neater and cleaner.
> 
> > Unless I'm missing something as to why this is busted, I'll take a
> > closer look at the code and float an rfc next week since otherwise it
> > sounds like this is something we could actually fix up in the ordered
> > buffer code today.
> 
> Cool.
> 
> > > Nothing in XFS is ever simple, is it? :P
> > 
> > There used to be a level of satisfaction at feeling I understood some
> > new corner of XFS. Nowadays I know that just means I'm not yet aware of
> > whatever dragons remain in that corner (is that paranoia? not if it's
> > true!). :P
> 
> Ah, the true signs of expertise: developing a knowledge base and
> insight deep enough to understand that there is always another
> hidden dragon poised to bite your head off. :)
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-17 20:38                         ` Brian Foster
@ 2017-08-17 22:31                           ` Darrick J. Wong
  2017-08-18 11:39                             ` Brian Foster
  2017-08-18  2:04                           ` Dave Chinner
  1 sibling, 1 reply; 40+ messages in thread
From: Darrick J. Wong @ 2017-08-17 22:31 UTC (permalink / raw)
  To: Brian Foster; +Cc: Dave Chinner, Alex Lyakas, linux-xfs, libor.klepac

On Thu, Aug 17, 2017 at 04:38:26PM -0400, Brian Foster wrote:
> On Mon, Aug 14, 2017 at 10:28:09AM +1000, Dave Chinner wrote:
> > On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> > > > a new dirty range on the buffer. IOWs, it retains whatever dirty range
> > > > it already had, and so after joining, marking it ordered and then
> > > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> > > > in the transaction.
> > > > 
> > > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > > with a pre-existing dirty region is formatted for the CIL? We
> > > > haven't done that before, so I'm betting that we don't relog the
> > > > dirty region like we should be doing....
> > > > 
> > > > ... and we don't relog the existing dirty range because the
> > > > ordered flag takes precedence.
> > > > 
> > > 
> > > Right.. so it seems that the current implementation for ordered buffers
> > > assumes a buffer is only ever used in one mode or the other.
> > > Additionally, the AIL assumes that any reinserted item has been fully
> > > relogged and so it moves the LSN forward unconditionally. Current
> > > ordered buffer processing violates this constraint for an already logged
> > > buffer.
> > 
> > Right, but it's not been a concern until now because we've only ever
> > used ordered buffers on newly allocated buffers that haven't been
> > previously logged.
> > 
> 
> Shortly after starting to test the latest refactoring with some asserts
> that enforce "strict" buffer ordering, I (sure enough ;)) ran into an
> instance of the above.
> 
> The issue is with the bmbt owner change operation that is part of the
> swap extent sequence. The owner change code ends up setting a buffer
> ordered that is currently dirty due to an unrelated previous
> transaction. The potential side effects of this are basically what we've
> already laid out above. Note that this is only relevant on
> crc=1,rmapbt=0 filesystems.
> 
> I'm currently testing out something that uses the rmapbt based swap
> implementation for all filesystems. IIUC, that algorithm doesn't
> actually require rmapbt support, it is just unfortunately named as such
> because we only use it on rmapbt filesystems. I suspect that is because
> rmapbt management requires the unmap/remapping of space to keep the tree
> consistent (and I suspect this is limited to rmapbt fs' for performance
> reasons, Darrick?) vs. it being something that actually depends on the

The rmapbt extent swapping function requires rmapbt because it writes
rmap and (more importantly) bmap intent items to the log.  An older
kernel cannot be allowed to recover such log items, which the rocompat
feature check in xfs_mount_validate_sb prevents.

I've never tried to see what happens when log recovery hits an item with
an unknown magic number -- I think we just bail out with EIO and leave
the log for someone else to recover?  In theory you could use the
rmap swapextents function all the time, but anyone trying to recover
after a crash with an old kernel will faceplant hard.  We could also
implement a new log-incompat flag if !rmapbt and someone tries to log a
bmap item... though iirc log_incompat doesn't exist for v4 filesystems,
so that still doesn't help us.

Conceptually, at least, the rmap swapext function should be capable of
swapping /any/ two inodes, not just the particular set of circumstances
required by xfs_swap_extents.

> rmapbt.
> 
> FWIW, I've so far at least considered some of the following alternatives
> to try and preserve the existing bmbt owner change algorithm before
> moving on to testing the rmapbt approach:
> 
> 1.) Don't use ordered buffers for the bmbt owner change. My
> understanding is that this is a non-starter as we'd need to log every
> buffer in the tree in a single transaction.

Yep.

> 2.) Update the extent swap operation to push the log and AIL based on
> the most recent LSN of any buffer in the bmbt before the bmbt owner
> change takes place. I've experimented with this enough to hack together
> an xfs_ail_push_sync() prototype and also observe that using the
> ili_item.li_lsn of the inode is not sufficient.
> 
> IOW, this basically requires a second bmbt walk per-inode to discover
> the max lsn of all bmbt blocks. That may or may not be a performance
> issue since we have to walk the bmbt anyways. Otherwise I think this is
> a viable option from a correctness perspective.
> 
> A simplified variant of this (that just crossed my mind while writing
> this up) may be to do an in-core check for dirty bmbt buffers on an
> inode and just do an xfs_ail_push_all_sync() if any are found.

I imagine you have to keep the inodes locked throughout all this to
prevent something else from wandering in and re-logging the bmbt block?

> 3.) Synchronously write the affected buffers at xfs_trans_ordered_buf()
> time. This is a bit ugly because it requires us to xfs_trans_brelse() the
> buf from the current transaction, _bwrite() it and then _bjoin() it back
> to the transaction all within xfs_trans_ordered_buf(). The advantage
> over #2 is that this only occurs for buffers with logged segments in the
> bli. It also effectively implements a form of generic support for
> ordering previously logged buffers.
> 
> 4.) Implement the ordered buffer relogging logic discussed below: detect
> ordered buffers with previously logged segments and relog them at
> ->iop_format() time. I suppose I could alleviate my aforementioned
> concerns with an assert that verifies an ordered buffer isn't already
> dirtied by the _current_ transaction (i.e., to catch the case of the
> "screwed up optimization" I was concerned about).
> 
> That aside, I think there is another problem here that we missed
> previously: transactions that use ordered buffers don't reserve log
> space for them, but relogged items require log reservation. If I follow
> the code correctly, a relogged item that is presently in the CIL may not
> use reservation in a subsequent tx. If the item resides in the AIL at
> relog commit time, however, the relogging transaction must have
> reservation for the item (the physical log space for the item is freed
> and the space used for the relog is accounted out of the reservation).
> 
> So I think the only way we can support the ability to relog previously
> dirty buffers that have been marked ordered is to require log
> reservation for them as for normal physically logged buffers, which kind
> of defeats the purpose.
> 
> 5.) A deferred op variant of the bmbt owner change algorithm. I haven't
> fully thought this one through yet so it may not be viable, but the
> general idea is to use deferred ops to conditionally physically log
> previously dirty buffers where required. For example, use a transaction
> with log reservation for one full buffer, log a new bmbt owner change
> intent and start running through the bmbt scan attempting to order
> buffers. Update xfs_trans_ordered_buf() to explicitly fail on buffers
> with dirty segments. When said failure occurs during the bmbt scan,
> physically log the buffer and terminate the scan with -EAGAIN. The
> deferred infra rolls the tx, relogs the intent and has to restart the
> bmbt scan over again. This repeats until we've processed the entire
> tree.
> 
> I had a couple more thoughts that are similarly not yet thought out and
> thus not worth rambling about. Thoughts on any of the above? On using
> the rmapbt algorithm? Other ideas?
> 
> Brian
> 
> > > > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > > > xfs_buf_item_format() need to also check for dirty regions. If dirty
> > > > regions exist, then we treat it like a normal buffer rather than an
> > > > ordered buffer. We can factor the dirty region check out of
> > > > xfs_buf_item_unlock() for this...
> > > > 
> > > > Actually, check the case in xfs_buf_item_size() and remove the
> > > > ordered flag if there are dirty regions. Then xfs_buf_item_format()
> > > > will do the right thing without needing a duplicate check...
> > > > 
> > > 
> > > I think that would work, assuming we actually check the
> > > xfs_buf_log_format for dirty-ness rather than just the log item. As it
> > > is, note that ordered buffers are still "logged" in the transaction
> > > because otherwise the transaction infrastructure will assume it made no
> > > change to the buf and toss the log item at commit time (we also need to
> > > set up I/O completion on the buf and whatnot).
> > 
> > *nod*
> > 
> > > What concerns me about this approach is that I think we introduce the
> > > possibility for subtle bugs. Existing ordered buffer code does this:
> > > 
> > >         xfs_trans_ordered_buf(tp, fbuf);
> > >         xfs_trans_log_buf(tp, fbuf, 0,
> > >                           BBTOB(fbuf->b_length) - 1);
> > > 
> > > ... which should continue to work fine. Allowing ordered buffers to
> > > physically log means that something like this:
> > > 
> > >         xfs_trans_log_buf(tp, fbuf, 0,
> > >                           BBTOB(fbuf->b_length) - 1);
> > >         xfs_trans_ordered_buf(tp, fbuf);
> > > 
> > > ... is now a bug that is only apparent after scrutiny of xfs_trans_*()
> > > and logging internals. Granted, the above already is incorrect, but it
> > > technically still works as expected. I don't see the need to turn that
> > > into a real problem by actually logging the buffer when we might not
> > > expect to.
> > 
> > Well, it's not a "things go bad" bug. It's a "we screwed up an
> > optimisation" bug, because logging the buffer contents unnecessarily
> > only increases the required log bandwidth. It shouldn't affect
> > replay because the buffer is still correctly ordered in the log.
> > Hence both the transient and end states of the buffer during replay
> > will still be the same...
> > 
> > > So while I agree that this could probably be made to work and I think it
> > > is preferable to doing any kind of logged range tracking in the deferred ops
> > > code, it still seems more tricky than it needs to be. To relog a held
> > > buffer in a new transaction, why not just mark the lidp dirty in the new
> > > transaction so it inherits all existing dirty segments? AFAICT, all we
> > > really need to do is:
> > > 
> > >         tp->t_flags |= XFS_TRANS_DIRTY;
> > >         lidp->lid_flags |= XFS_LID_DIRTY;
> > > 
> > > ... on the new transaction and everything should just work as designed
> > > (for a buffer that has been previously logged, held, rolled and
> > > rejoined).
> > 
> > We would also need to set:
> > 
> > 	bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
> > 
> > which means we should....
> > 
> > > To elaborate a bit, I think we could refactor xfs_trans_log_buf() into a
> > > new xfs_trans_dirty_buf() helper that covers all of the relevant bits
> > > not related to actually dirtying the bli. xfs_trans_log_buf() would call
> > > xfs_trans_dirty_buf() and thus would not change functionally.
> > > xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
> > > the existing ordered buf users would no longer need to log a range of
> > > the buffer (which doesn't make much sense anyways).
> > 
> > ... do this. :)
> > 
> > > Finally, the
> > > deferred infrastructure could join/dirty/hold the buffer to the new
> > > transaction after each roll without needing to track and relog specific
> > > regions of the buffer. Thoughts?
> > 
> > Yup, that's exactly what I was thinking should be possible by using
> > ordered buffers.... :)
> > 
> > And Christoph's rework of the transaction roll and deferred inode
> > handling that he just posted should make adding buffer handling
> > quite a bit neater and cleaner.
> > 
> > > Unless I'm missing something as to why this is busted, I'll take a
> > > closer look at the code and float an rfc next week since otherwise it
> > > sounds like this is something we could actually fix up in the ordered
> > > buffer code today.
> > 
> > Cool.
> > 
> > > > Nothing in XFS is ever simple, is it? :P
> > > 
> > > There used to be a level of satisfaction at feeling I understood some
> > > new corner of XFS. Nowadays I know that just means I'm not yet aware of
> > > whatever dragons remain in that corner (is that paranoia? not if it's
> > > true!). :P
> > 
> > Ah, the true signs of expertise: developing a knowledge base and
> > insight deep enough to understand that there is always another
> > hidden dragon poised to bite your head off. :)
> > 
> > Cheers,
> > 
> > Dave.
> > -- 
> > Dave Chinner
> > david@fromorbit.com
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-17 20:38                         ` Brian Foster
  2017-08-17 22:31                           ` Darrick J. Wong
@ 2017-08-18  2:04                           ` Dave Chinner
  2017-08-18 11:42                             ` Brian Foster
  1 sibling, 1 reply; 40+ messages in thread
From: Dave Chinner @ 2017-08-18  2:04 UTC (permalink / raw)
  To: Brian Foster; +Cc: Darrick J. Wong, Alex Lyakas, linux-xfs, libor.klepac

On Thu, Aug 17, 2017 at 04:38:26PM -0400, Brian Foster wrote:
> On Mon, Aug 14, 2017 at 10:28:09AM +1000, Dave Chinner wrote:
> > On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> > > > a new dirty range on the buffer. IOWs, it retains whatever dirty range
> > > > it already had, and so after joining, marking it ordered and then
> > > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> > > > in the transaction.
> > > > 
> > > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > > with a pre-existing dirty region is formatted for the CIL? We
> > > > haven't done that before, so I'm betting that we don't relog the
> > > > dirty region like we should be doing....
> > > > 
> > > > ... and we don't relog the existing dirty range because the
> > > > ordered flag takes precedence.
> > > > 
> > > 
> > > Right.. so it seems that the current implementation for ordered buffers
> > > assumes a buffer is only ever used in one mode or the other.
> > > Additionally, the AIL assumes that any reinserted item has been fully
> > > relogged and so it moves the LSN forward unconditionally. Current
> > > ordered buffer processing violates this constraint for an already logged
> > > buffer.
> > 
> > Right, but it's not been a concern until now because we've only ever
> > used ordered buffers on newly allocated buffers that haven't been
> > previously logged.
> > 
> 
> Shortly after starting to test the latest refactoring with some asserts
> that enforce "strict" buffer ordering, I (sure enough ;)) ran into an
> instance of the above.
> 
> The issue is with the bmbt owner change operation that is part of the
> swap extent sequence.

Ah, I forgot about that. I was thinking that it went away with
rmapbt....

> The owner change code ends up setting a buffer
> ordered that is currently dirty due to an unrelated previous
> transaction. The potential side effects of this are basically what we've
> already laid out above. Note that this is only relevant on
> crc=1,rmapbt=0 filesystems.

.... but obviously I was wrong. :/

[....]

> 
> 4.) Implement the ordered buffer relogging logic discussed below: detect
> ordered buffers with previously logged segments and relog them at
> ->iop_format() time. I suppose I could alleviate my aforementioned
> concerns with an assert that verifies an ordered buffer isn't already
> dirtied by the _current_ transaction (i.e., to catch the case of the
> "screwed up optimization" I was concerned about).

Yeah, that would work, except .....

> That aside, I think there is another problem here that we missed
> previously: transactions that use ordered buffers don't reserve log
> space for them, but relogged items require log reservation. If I follow
> the code correctly, a relogged item that is presently in the CIL may not
> use reservation in a subsequent tx. If the item resides in the AIL at
> relog commit time, however, the relogging transaction must have
> reservation for the item (the physical log space for the item is freed
> and the space used for the relog is accounted out of the reservation).

... this.

Yes, you are right, Brian, that if the item is in the CIL then we
don't need a new reservation because the space is already accounted
for in the current checkpoint. This, however, is still racy as the
item in the CIL can be committed while we have it locked and so
when we commit it's the same as "in the AIL and we need a
reservation".

> So I think the only way we can support the ability to relog previously
> dirty buffers that have been marked ordered is to require log
> reservation for them as for normal physically logged buffers, which kind
> of defeats the purpose.
> 
> 5.) A deferred op variant of the bmbt owner change algorithm. I haven't
> fully thought this one through yet so it may not be viable, but the
> general idea is to use deferred ops to conditionally physically log
> previously dirty buffers where required. For example, use a transaction
> with log reservation for one full buffer, log a new bmbt owner change
> intent and start running through the bmbt scan attempting to order
> buffers. Update xfs_trans_ordered_buf() to explicitly fail on buffers
> with dirty segments. When said failure occurs during the bmbt scan,
> physically log the buffer and terminate the scan with -EAGAIN. The
> deferred infra rolls the tx, relogs the intent and has to restart the
> bmbt scan over again. This repeats until we've processed the entire
> tree.

Problem is that a deferred op variant isn't backwards compatible,
and given this is a v4 filesystem issue, that's a big problem
because we can't just use a log incompat flag while the op is active
in the log.

However, I think we can do this with just normal rolling
transactions. Like you suggest, we can create the transaction with a
small amount of buffer space that can be logged (e.g. enough for N
complete buffers). Then we modify xfs_trans_ordered_buf() to "try
ordering the buffer". If the buffer needs relogging, then it fails
and we log the owner field in the buffer as per a normal buffer. (or
maybe just take the range we need to log and log that instead of
ordering the buffer - as long as we can count the buffers that do
this.)

On the N-th relogged buffer, roll the transaction to trigger
regranting of the log space for another N buffers and keep going
with the owner change.  The transaction roll will then commit the
buffer owner change to the log.

In recovery, we just replay the owner change as we currently do,
knowing that if the buffer was logged during the owner change and
written to a later checkpoint, the replay of that buffer will
contain the owner change we just did and so we won't lose anything.

I think this should work on all v4 filesystems without any need to
modify log recovery or new log items....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-17 22:31                           ` Darrick J. Wong
@ 2017-08-18 11:39                             ` Brian Foster
  2017-08-18 15:37                               ` Darrick J. Wong
  0 siblings, 1 reply; 40+ messages in thread
From: Brian Foster @ 2017-08-18 11:39 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Dave Chinner, Alex Lyakas, linux-xfs, libor.klepac

On Thu, Aug 17, 2017 at 03:31:21PM -0700, Darrick J. Wong wrote:
> On Thu, Aug 17, 2017 at 04:38:26PM -0400, Brian Foster wrote:
> > On Mon, Aug 14, 2017 at 10:28:09AM +1000, Dave Chinner wrote:
> > > On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > > > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > > > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> > > > > a new dirty range on the buffer. IOWs, it retains whatever dirty range
> > > > > it already had, and so after joining, marking it ordered and then
> > > > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> > > > > in the transaction.
> > > > > 
> > > > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > > > with a pre-existing dirty region is formatted for the CIL? We
> > > > > haven't done that before, so I'm betting that we don't relog the
> > > > > dirty region like we should be doing....
> > > > > 
> > > > > ... and we don't relog the existing dirty range because the
> > > > > ordered flag takes precedence.
> > > > > 
> > > > 
> > > > Right.. so it seems that the current implementation for ordered buffers
> > > > assumes a buffer is only ever used in one mode or the other.
> > > > Additionally, the AIL assumes that any reinserted item has been fully
> > > > relogged and so it moves the LSN forward unconditionally. Current
> > > > ordered buffer processing violates this constraint for an already logged
> > > > buffer.
> > > 
> > > Right, but it's not been a concern until now because we've only ever
> > > used ordered buffers on newly allocated buffers that haven't been
> > > previously logged.
> > > 
> > 
> > Shortly after starting to test the latest refactoring with some asserts
> > that enforce "strict" buffer ordering, I (sure enough ;)) ran into an
> > instance of the above.
> > 
> > The issue is with the bmbt owner change operation that is part of the
> > swap extent sequence. The owner change code ends up setting a buffer
> > ordered that is currently dirty due to an unrelated previous
> > transaction. The potential side effects of this are basically what we've
> > already laid out above. Note that this is only relevant on
> > crc=1,rmapbt=0 filesystems.
> > 
> > I'm currently testing out something that uses the rmapbt based swap
> > implementation for all filesystems. IIUC, that algorithm doesn't
> > actually require rmapbt support, it is just unfortunately named as such
> > because we only use it on rmapbt filesystems. I suspect that is because
> > rmapbt management requires the unmap/remapping of space to keep the tree
> > consistent (and I suspect this is limited to rmapbt fs' for performance
> > reasons, Darrick?) vs. it being something that actually depends on the
> 
> The rmapbt extent swapping function requires rmapbt because it writes
> rmap and (more importantly) bmap intent items to the log.  An older
> kernel cannot be allowed to recover such log items, which the rocompat
> feature check in xfs_mount_validate_sb prevents.
> 

Hmm, that's not what I've observed. I've tested recovery of both the
rmapbt algorithm (with and without rmapbt actually enabled) as well as
the bmbt owner change algorithm. bmbt owner change recovery is actually
currently broken due to the inode owner check in
xfs_btree_lookup_get_block(), but I've been waiting to determine how we
end up resolving this higher level issue before digging any further into
that (I suspect it just needs to be bypassed during recovery). I hadn't
hit any issues with the former cases.

Looking at the code, I see the high level algorithm basically queues
bmap unmap/map deferred ops across the file address space. That
eventually resolves to __xfs_bunmapi()/xfs_bmapi_remap() calls that
migrate the mappings. __xfs_bunmapi() clearly has to work in general for
!rmapbt filesystems. AFAICT, all the XFS_BMAPI_REMAP flag does here is
prevent us from actually freeing the blocks.

Moving along, we next end up in xfs_bmapi_remap() to map said blocks
back into the other file... where I also don't see anything feature
specific in the implementation. It uses xfs_bmap_add_extent_hole_real(),
which calls xfs_rmap_map_extent(), which calls
xfs_rmap_update_is_needed(), which checks xfs_sb_version_hasrmapbt().

Looking back at the unmap side, it looks like we have the same pattern
down in xfs_bmap_del_extent() -> xfs_rmap_unmap_extent() -> ...

So AFAICT we have feature checks in the appropriate places. Am I missing
something else?

> I've never tried to see what happens when log recovery hits an item with
> an unknown magic number -- I think we just bail out with EIO and leave
> the log for someone else to recover?  In theory you could use the
> rmap swapextents function all the time, but anyone trying to recover
> after a crash with an old kernel will faceplant hard.  We could also
> implement a new log-incompat flag if !rmapbt and someone tries to log a
> bmap item... though iirc log_incompat doesn't exist for v4 filesystems,
> so that still doesn't help us.
> 
> Conceptually, at least, the rmap swapext function should be capable of
> swapping /any/ two inodes, not just the particular set of circumstances
> required by xfs_swap_extents.
> 

Indeed, thanks. If there is some dependency here it seems like it
shouldn't be too difficult to break.

> > rmapbt.
> > 
> > FWIW, I've so far at least considered some of the following alternatives
> > to try and preserve the existing bmbt owner change algorithm before
> > moving on to testing the rmapbt approach:
> > 
> > 1.) Don't use ordered buffers for the bmbt owner change. My
> > understanding is that this is a non-starter as we'd need to log every
> > buffer in the tree in a single transaction.
> 
> Yep.
> 
> > 2.) Update the extent swap operation to push the log and AIL based on
> > the most recent LSN of any buffer in the bmbt before the bmbt owner
> > change takes place. I've experimented with this enough to hack together
> > an xfs_ail_push_sync() prototype and also observe that using the
> > ili_item.li_lsn of the inode is not sufficient.
> > 
> > IOW, this basically requires a second bmbt walk per-inode to discover
> > the max lsn of all bmbt blocks. That may or may not be a performance
> > issue since we have to walk the bmbt anyways. Otherwise I think this is
> > a viable option from a correctness perspective.
> > 
> > A simplified variant of this (that just crossed my mind while writing
> > this up) may be to do an in-core check for dirty bmbt buffers on an
> > inode and just do an xfs_ail_push_all_sync() if any are found.
> 
> I imagine you have to keep the inodes locked throughout all this to
> prevent something else from wandering in and re-logging the bmbt block?
> 

Yeah, or retry or fail with -EAGAIN or something if we hit the
problematic state, and/or see if there's something we can do to
effectively quiesce the bmbt while under iolock. I hadn't really got
that far tbh because this didn't seem like a great first option.

Brian

> > 3.) Synchronous write the affected buffers at xfs_trans_ordered_buf()
> > time. This is a bit ugly because it requires us to xfs_trans_brelse() the
> > buf from the current transaction, _bwrite() it and then _bjoin() it back
> > to the transaction all within xfs_trans_ordered_buf(). The advantage
> > over #2 is that this only occurs for buffers with logged segments in the
> > bli. It also effectively implements a form of generic support for
> > ordering previously logged buffers.
> > 
> > 4.) Implement the ordered buffer relogging logic discussed below: detect
> > ordered buffers with previously logged segments and relog them at
> > ->iop_format() time. I suppose I could alleviate my aforementioned
> > concerns with an assert that verifies an ordered buffer isn't already
> > dirtied by the _current_ transaction (i.e., to catch the case of the
> > "screwed up optimization" I was concerned about).
> > 
> > That aside, I think there is another problem here that we missed
> > previously: transactions that use ordered buffers don't reserve log
> > space for them, but relogged items require log reservation. If I follow
> > the code correctly, a relogged item that is presently in the CIL may not
> > use reservation in a subsequent tx. If the item resides in the AIL at
> > relog commit time, however, the relogging transaction must have
> > reservation for the item (the physical log space for the item is freed
> > and the space used for the relog is accounted out of the reservation).
> > 
> > So I think the only way we can support the ability to relog previously
> > dirty buffers that have been marked ordered is to require log
> > reservation for them as for normal physically logged buffers, which kind
> > of defeats the purpose.
> > 
> > 5.) A deferred op variant of the bmbt owner change algorithm. I haven't
> > fully thought this one through yet so it may not be viable, but the
> > general idea is to use deferred ops to conditionally physically log
> > previously dirty buffers where required. For example, use a transaction
> > with log reservation for one full buffer, log a new bmbt owner change
> > intent and start running through the bmbt scan attempting to order
> > buffers. Update xfs_trans_ordered_buf() to explicitly fail on buffers
> > with dirty segments. When said failure occurs during the bmbt scan,
> > physically log the buffer and terminate the scan with -EAGAIN. The
> > deferred infra rolls the tx, relogs the intent and has to restart the
> > bmbt scan over again. This repeats until we've processed the entire
> > tree.
> > 
> > I had a couple more thoughts that are similarly not yet thought out and
> > thus not worth rambling about. Thoughts on any of the above? On using
> > the rmapbt algorithm? Other ideas?
> > 
> > Brian
> > 
> > > > > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > > > > xfs_buf_item_format() need to also check for dirty regions. If dirty
> > > > > regions exist, then we treat it like a normal buffer rather than an
> > > > > ordered buffer. We can factor the dirty region check out of
> > > > > xfs_buf_item_unlock() for this...
> > > > > 
> > > > > Actually, check the case in xfs_buf_item_size() and remove the
> > > > > ordered flag if there are dirty regions. Then xfs_buf_item_format()
> > > > > will do the right thing without needing a duplicate check...
> > > > > 
> > > > 
> > > > I think that would work, assuming we actually check the
> > > > xfs_buf_log_format for dirty-ness rather than just the log item. As it
> > > > is, note that ordered buffers are still "logged" in the transaction
> > > > because otherwise the transaction infrastructure will assume it made no
> > > > change to the buf and toss the log item at commit time (we also need to
> > > > set up I/O completion on the buf and whatnot).
> > > 
> > > *nod*
> > > 
> > > > What concerns me about this approach is that I think we introduce the
> > > > possibility for subtle bugs. Existing ordered buffer code does this:
> > > > 
> > > >         xfs_trans_ordered_buf(tp, fbuf);
> > > >         xfs_trans_log_buf(tp, fbuf, 0,
> > > >                           BBTOB(fbuf->b_length) - 1);
> > > > 
> > > > ... which should continue to work fine. Allowing ordered buffers to
> > > > physically log means that something like this:
> > > > 
> > > >         xfs_trans_log_buf(tp, fbuf, 0,
> > > >                           BBTOB(fbuf->b_length) - 1);
> > > >         xfs_trans_ordered_buf(tp, fbuf);
> > > > 
> > > > ... is now a bug that is only apparent after scrutiny of xfs_trans_*()
> > > > and logging internals. Granted, the above already is incorrect, but it
> > > > technically still works as expected. I don't see the need to turn that
> > > > into a real problem by actually logging the buffer when we might not
> > > > expect to.
> > > 
> > > Well, it's not a "things go bad" bug. It's a "we screwed up an
> > > optimisation" bug, because logging the buffer contents unnecessarily
> > > only increases the required log bandwidth. It shouldn't affect
> > > replay because the buffer is still correctly ordered in the log.
> > > Hence both the transient and end states of the buffer during replay
> > > will still be the same...
> > > 
> > > > So while I agree that this could probably be made to work and I think it
> > > > is ideal to doing any kind of logged range tracking in the deferred ops
> > > > code, it still seems more tricky than it needs to be. To relog a held
> > > > buffer in a new transaction, why not just mark the lidp dirty in the new
> > > > transaction so it inherits all existing dirty segments? AFAICT, all we
> > > > really need to do is:
> > > > 
> > > >         tp->t_flags |= XFS_TRANS_DIRTY;
> > > >         lidp->lid_flags |= XFS_LID_DIRTY;
> > > > 
> > > > ... on the new transaction and everything should just work as designed
> > > > (for a buffer that has been previously logged, held, rolled and
> > > > rejoined).
> > > 
> > > We would also need to set:
> > > 
> > > 	bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
> > > 
> > > which means we should....
> > > 
> > > > To elaborate a bit, I think we could refactor xfs_trans_log_buf() into a
> > > > new xfs_trans_dirty_buf() helper that covers all of the relevant bits
> > > > not related to actually dirtying the bli. xfs_trans_log_buf() would call
> > > > xfs_trans_dirty_buf() and thus would not change functionally.
> > > > xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
> > > > the existing ordered buf users would no longer need to log a range of
> > > > the buffer (which doesn't make much sense anyways).
> > > 
> > > ... do this. :)
> > > 
> > > > Finally, the
> > > > deferred infrastructure could join/dirty/hold the buffer to the new
> > > > transaction after each roll without needing to track and relog specific
> > > > regions of the buffer. Thoughts?
> > > 
> > > Yup, that's exactly what I was thinking should be possible by using
> > > ordered buffers.... :)
> > > 
> > > And Christoph's rework of the transaction roll and deferred inode
> > > handling that he just posted should make adding buffer handling
> > > quite a bit neater and cleaner.
> > > 
> > > > Unless I'm missing something as to why this is busted, I'll take a
> > > > closer look at the code and float an rfc next week since otherwise it
> > > > sounds like this is something we could actually fix up in the ordered
> > > > buffer code today.
> > > 
> > > Cool.
> > > 
> > > > > Nothing in XFS is ever simple, is it? :P
> > > > 
> > > > There used to be a level of satisfaction at feeling I understood some
> > > > new corner of XFS. Nowadays I know that just means I'm not yet aware of
> > > > whatever dragons remain in that corner (is that paranoia? not if it's
> > > > true!). :P
> > > 
> > > Ah, the true signs of expertise: developing a knowledge base and
> > > insight deep enough to understand that there is always another
> > > hidden dragon poised to bite your head off. :)
> > > 
> > > Cheers,
> > > 
> > > Dave.
> > > -- 
> > > Dave Chinner
> > > david@fromorbit.com
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-18  2:04                           ` Dave Chinner
@ 2017-08-18 11:42                             ` Brian Foster
  0 siblings, 0 replies; 40+ messages in thread
From: Brian Foster @ 2017-08-18 11:42 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Darrick J. Wong, Alex Lyakas, linux-xfs, libor.klepac

On Fri, Aug 18, 2017 at 12:04:42PM +1000, Dave Chinner wrote:
> On Thu, Aug 17, 2017 at 04:38:26PM -0400, Brian Foster wrote:
> > On Mon, Aug 14, 2017 at 10:28:09AM +1000, Dave Chinner wrote:
> > > On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > > > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
...
> > 4.) Implement the ordered buffer relogging logic discussed below: detect
> > ordered buffers with previously logged segments and relog them at
> > ->iop_format() time. I suppose I could alleviate my aforementioned
> > concerns with an assert that verifies an ordered buffer isn't already
> > dirtied by the _current_ transaction (i.e., to catch the case of the
> > "screwed up optimization" I was concerned about).
> 
> Yeah, that would work, except .....
> 
> > That aside, I think there is another problem here that we missed
> > previously: transactions that use ordered buffers don't reserve log
> > space for them, but relogged items require log reservation. If I follow
> > the code correctly, a relogged item that is presently in the CIL may not
> > use reservation in a subsequent tx. If the item resides in the AIL at
> > relog commit time, however, the relogging transaction must have
> > reservation for the item (the physical log space for the item is freed
> > and the space used for the relog is accounted out of the reservation).
> 
> ... this.
> 
> Yes, you are right, Brian, that if the item is in the CIL then we
> don't need a new reservation because the space is already accounted
> for in the current checkpoint. This, however, is still racy as the
> item in the CIL can be committed while we have it locked and so
> when we commit it's the same as "in the AIL and we need a
> reservation".
> 

Indeed.

> > So I think the only way we can support the ability to relog previously
> > dirty buffers that have been marked ordered is to require log
> > reservation for them as for normal physically logged buffers, which kind
> > of defeats the purpose.
> > 
> > 5.) A deferred op variant of the bmbt owner change algorithm. I haven't
> > fully thought this one through yet so it may not be viable, but the
> > general idea is to use deferred ops to conditionally physically log
> > previously dirty buffers where required. For example, use a transaction
> > with log reservation for one full buffer, log a new bmbt owner change
> > intent and start running through the bmbt scan attempting to order
> > buffers. Update xfs_trans_ordered_buf() to explicitly fail on buffers
> > with dirty segments. When said failure occurs during the bmbt scan,
> > physically log the buffer and terminate the scan with -EAGAIN. The
> > deferred infra rolls the tx, relogs the intent and has to restart the
> > bmbt scan over again. This repeats until we've processed the entire
> > tree.
> 
> Problem is that a deferred op variant isn't backwards compatible,
> and given this is a v4 filesystem issue, that's a big problem
> because we can't just use a log incompat flag while the op is active
> in the log.
> 

Yeah, I was thinking we'd be able to continue to support the old
recovery mechanism for forward compatibility, but apparently glossed
over the backwards compatibility requirement. 

> However, I think we can do this with just normal rolling
> transactions. Like you suggest, we can create the transaction with a
> small amount of buffer space that can be logged (e.g. enough for N
> complete buffers). Then we modify xfs_trans_ordered_buf() to "try
> ordering the buffer". If the buffer needs relogging, then it fails
> and we log the owner field in the buffer as per a normal buffer. (or
> maybe just take the range we need to log and log that instead of
> ordering the buffer - as long as we can count the buffers that do
> this.)
> 

This is basically how I was thinking it through at first, I just went
the next logical step and defined it as a deferred operation. Walking
back from that, I still think something like this could work.

One subtle difference I think we may need to accommodate is that once
this algorithm can start committing/rolling transactions, we'd have to
make sure the inode owner change log item makes it to the log first. I
think that means we'd have to commit/roll the initial swap transaction
first and then start the bmbt change sequence while we still have the
inode(s) locked (which then implies we should be relogging the inode as
well). We also have to make sure it's Ok to roll transactions with
ordered buffers, which as of today means those buffers are free to write
back. I _think_ this is safe here so long as the owner change item is
logged and effectively pinned. Anyways, the details will probably be
easier to reason about after playing with it a bit.

Hmm, this does make me wonder whether we already have a similar
atomicity issue with the rmapbt algorithm. The map/unmap sequence is
going to roll transactions as it progresses, which I think should be
fine in and of itself. We don't actually change the reflink flags if
necessary until afterwards, however, so I'm wondering whether we could
end up inconsistent if we crash after having swapped enough reflinked
extents to require a flag state change but not having updated the flag.
If so, it still may not be a problem in practice given the semantics of
the reflink flag and how xfs_fsr works.

> On the N-th relogged buffer, roll the transaction to trigger
> regranting of the log space for another N buffers and keep going
> with the owner change.  The transaction roll will then commit the
> buffer owner change to the log.
> 
> In recovery, we just replay the owner change as we currently do,
> knowing that if the buffer was logged during the owner change and
> written to a later checkpoint, the replay of that buffer will
> contain the owner change we just did and so we won't lose anything.
> 
> I think this should work on all v4 filesystems without any need to
> modify log recovery or new log items....
> 

All of the above aside, any thoughts on just using the "rmap" algorithm
universally? I hadn't hit any issues with that so far, but worst case it
sounds like we could break whatever tenuous rmapbt feature dependencies
might exist.

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-18 11:39                             ` Brian Foster
@ 2017-08-18 15:37                               ` Darrick J. Wong
  0 siblings, 0 replies; 40+ messages in thread
From: Darrick J. Wong @ 2017-08-18 15:37 UTC (permalink / raw)
  To: Brian Foster; +Cc: Dave Chinner, Alex Lyakas, linux-xfs, libor.klepac

On Fri, Aug 18, 2017 at 07:39:34AM -0400, Brian Foster wrote:
> On Thu, Aug 17, 2017 at 03:31:21PM -0700, Darrick J. Wong wrote:
> > On Thu, Aug 17, 2017 at 04:38:26PM -0400, Brian Foster wrote:
> > > On Mon, Aug 14, 2017 at 10:28:09AM +1000, Dave Chinner wrote:
> > > > On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > > > > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > > > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > > > > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> > > > > > a new dirty range on the buffer. IOWs, it retains whatever dirty range
> > > > > > it already had, and so after joining, marking it ordered and then
> > > > > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> > > > > > in the transaction.
> > > > > > 
> > > > > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > > > > with a pre-existing dirty region is formatted for the CIL? We
> > > > > > haven't done that before, so I'm betting that we don't relog the
> > > > > > dirty region like we should be doing....
> > > > > > 
> > > > > > ... and we don't relog the existing dirty range because the
> > > > > > ordered flag takes precedence.
> > > > > > 
> > > > > 
> > > > > Right.. so it seems that the current implementation for ordered buffers
> > > > > assumes a buffer is only ever used in one mode or the other.
> > > > > Additionally, the AIL assumes that any reinserted item has been fully
> > > > > relogged and so it moves the LSN forward unconditionally. Current
> > > > > ordered buffer processing violates this constraint for an already logged
> > > > > buffer.
> > > > 
> > > > Right, but it's not been a concern until now because we've only ever
> > > > used ordered buffers on newly allocated buffers that haven't been
> > > > previously logged.
> > > > 
> > > 
> > > Shortly after starting to test the latest refactoring with some asserts
> > > that enforce "strict" buffer ordering, I (sure enough ;)) ran into an
> > > instance of the above.
> > > 
> > > The issue is with the bmbt owner change operation that is part of the
> > > swap extent sequence. The owner change code ends up setting a buffer
> > > ordered that is currently dirty due to an unrelated previous
> > > transaction. The potential side effects of this are basically what we've
> > > already laid out above. Note that this is only relevant on
> > > crc=1,rmapbt=0 filesystems.
> > > 
> > > I'm currently testing out something that uses the rmapbt based swap
> > > implementation for all filesystems. IIUC, that algorithm doesn't
> > > actually require rmapbt support, it is just unfortunately named as such
> > > because we only use it on rmapbt filesystems. I suspect that is because
> > > rmapbt management requires the unmap/remapping of space to keep the tree
> > > consistent (and I suspect this is limited to rmapbt fs' for performance
> > > reasons, Darrick?) vs. it being something that actually depends on the
> > 
> > The rmapbt extent swapping function requires rmapbt because it writes
> > rmap and (more importantly) bmap intent items to the log.  An older
> > kernel cannot be allowed to recover such log items, which the rocompat
> > feature check in xfs_mount_validate_sb prevents.
> > 
> 
> Hmm, that's not what I've observed. I've tested recovery of both the
> rmapbt algorithm (with and without rmapbt actually enabled) as well as

But did you try recovery of the rmapbt algorithm on a pre-4.8 kernel?
That's what I meant by 'older kernel cannot be allowed...'.

IOWs, it's ok for log recovery on a 4.9+ kernel to process bmap intent
items on a non-rmap/non-reflink fs.

> the bmbt owner change algorithm. bmbt owner change recovery is actually
> currently broken due to the inode owner check in
> xfs_btree_lookup_get_block(), but I've been waiting to determine how we
> end up resolving this higher level issue before digging any further into
> that (I suspect it just needs to be bypassed during recovery). I hadn't
> hit any issues with the former cases.
>
> Looking at the code, I see the high level algorithm basically queues
> bmap unmap/map deferred ops across the file address space. That
> eventually resolves to __xfs_bunmapi()/xfs_bmapi_remap() calls that
> migrate the mappings. __xfs_bunmapi() clearly has to work in general for
> !rmapbt filesystems. AFAICT, all the XFS_BMAPI_REMAP flag does here is
> prevent us from actually freeing the blocks.
> 
> Moving along, we next end up in xfs_bmapi_remap() to map said blocks
> back into the other file... where I also don't see anything feature
> specific in the implementation. It uses xfs_bmap_add_extent_hole_real(),
> which calls xfs_rmap_map_extent(), which calls
> xfs_rmap_update_is_needed(), which checks xfs_sb_version_hasrmapbt().
> 
> Looking back at the unmap side, it looks like we have the same pattern
> down in xfs_bmap_del_extent() -> xfs_rmap_unmap_extent() -> ...
> 
> So AFAICT we have feature checks in the appropriate places. Am I missing
> something else?

Nope.

--D

> > I've never tried to see what happens when log recovery hits an item with
> > an unknown magic number -- I think we just bail out with EIO and leave
> > the log for someone else to recover?  In theory you could use the
> > rmap swapextents function all the time, but anyone trying to recover
> > after a crash with an old kernel will faceplant hard.  We could also
> > implement a new log-incompat flag if !rmapbt and someone tries to log a
> > bmap item... though iirc log_incompat doesn't exist for v4 filesystems,
> > so that still doesn't help us.
> > 
> > Conceptually, at least, the rmap swapext function should be capable of
> > swapping /any/ two inodes, not just the particular set of circumstances
> > required by xfs_swap_extents.
> > 
> 
> Indeed, thanks. If there is some dependency here it seems like it
> shouldn't be too difficult to break.
> 
> > > rmapbt.
> > > 
> > > FWIW, I've so far at least considered some of the following alternatives
> > > to try and preserve the existing bmbt owner change algorithm before
> > > moving on to testing the rmapbt approach:
> > > 
> > > 1.) Don't use ordered buffers for the bmbt owner change. My
> > > understanding is that this is a non-starter as we'd need to log every
> > > buffer in the tree in a single transaction.
> > 
> > Yep.
> > 
> > > 2.) Update the extent swap operation to push the log and AIL based on
> > > the most recent LSN of any buffer in the bmbt before the bmbt owner
> > > change takes place. I've experimented with this enough to hack together
> > > an xfs_ail_push_sync() prototype and also observe that using the
> > > ili_item.li_lsn of the inode is not sufficient.
> > > 
> > > IOW, this basically requires a second bmbt walk per-inode to discover
> > > the max lsn of all bmbt blocks. That may or may not be a performance
> > > issue since we have to walk the bmbt anyways. Otherwise I think this is
> > > a viable option from a correctness perspective.
> > > 
> > > A simplified variant of this (that just crossed my mind while writing
> > > this up) may be to do an in-core check for dirty bmbt buffers on an
> > > inode and just do an xfs_ail_push_all_sync() if any are found.
> > 
> > I imagine you have to keep the inodes locked throughout all this to
> > prevent something else from wandering in and re-logging the bmbt block?
> > 
> 
> Yeah, or retry or fail with -EAGAIN or something if we hit the
> problematic state, and/or see if there's something we can do to
> effectively quiesce the bmbt while under iolock. I hadn't really got
> that far tbh because this didn't seem like a great first option.
> 
> Brian
> 
> > > 3.) Synchronous write the affected buffers at xfs_trans_ordered_buf()
> > > time. This is a bit ugly because it requires us to xfs_trans_brelse() the
> > > buf from the current transaction, _bwrite() it and then _bjoin() it back
> > > to the transaction all within xfs_trans_ordered_buf(). The advantage
> > > over #2 is that this only occurs for buffers with logged segments in the
> > > bli. It also effectively implements a form of generic support for
> > > ordering previously logged buffers.
> > > 
> > > 4.) Implement the ordered buffer relogging logic discussed below: detect
> > > ordered buffers with previously logged segments and relog them at
> > > ->iop_format() time. I suppose I could alleviate my aforementioned
> > > concerns with an assert that verifies an ordered buffer isn't already
> > > dirtied by the _current_ transaction (i.e., to catch the case of the
> > > "screwed up optimization" I was concerned about).
> > > 
> > > That aside, I think there is another problem here that we missed
> > > previously: transactions that use ordered buffers don't reserve log
> > > space for them, but relogged items require log reservation. If I follow
> > > the code correctly, a relogged item that is presently in the CIL may not
> > > use reservation in a subsequent tx. If the item resides in the AIL at
> > > relog commit time, however, the relogging transaction must have
> > > reservation for the item (the physical log space for the item is freed
> > > and the space used for the relog is accounted out of the reservation).
> > > 
> > > So I think the only way we can support the ability to relog previously
> > > dirty buffers that have been marked ordered is to require log
> > > reservation for them as for normal physically logged buffers, which kind
> > > of defeats the purpose.
> > > 
> > > 5.) A deferred op variant of the bmbt owner change algorithm. I haven't
> > > fully thought this one through yet so it may not be viable, but the
> > > general idea is to use deferred ops to conditionally physically log
> > > previously dirty buffers where required. For example, use a transaction
> > > with log reservation for one full buffer, log a new bmbt owner change
> > > intent and start running through the bmbt scan attempting to order
> > > buffers. Update xfs_trans_ordered_buf() to explicitly fail on buffers
> > > with dirty segments. When said failure occurs during the bmbt scan,
> > > physically log the buffer and terminate the scan with -EAGAIN. The
> > > deferred infra rolls the tx, relogs the intent and has to restart the
> > > bmbt scan over again. This repeats until we've processed the entire
> > > tree.
> > > 
> > > I had a couple more thoughts that are similarly not yet thought out and
> > > thus not worth rambling about. Thoughts on any of the above? On using
> > > the rmapbt algorithm? Other ideas?
> > > 
> > > Brian
> > > 
> > > > > > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > > > > > xfs_buf_item_format() need to also check for dirty regions. If dirty
> > > > > > regions exist, then we treat it like a normal buffer rather than an
> > > > > > ordered buffer. We can factor the dirty region check out of
> > > > > > xfs_buf_item_unlock() for this...
> > > > > > 
> > > > > > Actually, check the case in xfs_buf_item_size() and remove the
> > > > > > ordered flag if there are dirty regions. Then xfs_buf_item_format()
> > > > > > will do the right thing without needing a duplicate check...
> > > > > > 
> > > > > 
> > > > > I think that would work, assuming we actually check the
> > > > > xfs_buf_log_format for dirty-ness rather than just the log item. As it
> > > > > is, note that ordered buffers are still "logged" in the transaction
> > > > > because otherwise the transaction infrastructure will assume it made no
> > > > > change to the buf and toss the log item at commit time (we also need to
> > > > > set up I/O completion on the buf and whatnot).
> > > > 
> > > > *nod*
> > > > 
> > > > > What concerns me about this approach is that I think we introduce the
> > > > > possibility for subtle bugs. Existing ordered buffer code does this:
> > > > > 
> > > > >         xfs_trans_ordered_buf(tp, fbuf);
> > > > >         xfs_trans_log_buf(tp, fbuf, 0,
> > > > >                           BBTOB(fbuf->b_length) - 1);
> > > > > 
> > > > > ... which should continue to work fine. Allowing ordered buffers to
> > > > > physically log means that something like this:
> > > > > 
> > > > >         xfs_trans_log_buf(tp, fbuf, 0,
> > > > >                           BBTOB(fbuf->b_length) - 1);
> > > > >         xfs_trans_ordered_buf(tp, fbuf);
> > > > > 
> > > > > ... is now a bug that is only apparent after scrutiny of xfs_trans_*()
> > > > > and logging internals. Granted, the above already is incorrect, but it
> > > > > technically still works as expected. I don't see the need to turn that
> > > > > into a real problem by actually logging the buffer when we might not
> > > > > expect to.
> > > > 
> > > > Well, it's not a "things go bad" bug. It's a "we screwed up an
> > > > optimisation" bug, because logging the buffer contents unnecessarily
> > > > only increases the required log bandwidth. It shouldn't affect
> > > > replay because the buffer is still correctly ordered in the log.
> > > > Hence both the transient and end states of the buffer during replay
> > > > will still be the same...
> > > > 
> > > > > So while I agree that this could probably be made to work and I think it
> > > > > is ideal to doing any kind of logged range tracking in the deferred ops
> > > > > code, it still seems more tricky than it needs to be. To relog a held
> > > > > buffer in a new transaction, why not just mark the lidp dirty in the new
> > > > > transaction so it inherits all existing dirty segments? AFAICT, all we
> > > > > really need to do is:
> > > > > 
> > > > >         tp->t_flags |= XFS_TRANS_DIRTY;
> > > > >         lidp->lid_flags |= XFS_LID_DIRTY;
> > > > > 
> > > > > ... on the new transaction and everything should just work as designed
> > > > > (for a buffer that has been previously logged, held, rolled and
> > > > > rejoined).
> > > > 
> > > > We would also need to set:
> > > > 
> > > > 	bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
> > > > 
> > > > which means we should....
> > > > 
> > > > > To elaborate a bit, I think we could refactor xfs_trans_log_buf() into a
> > > > > new xfs_trans_dirty_buf() helper that covers all of the relevant bits
> > > > > not related to actually dirtying the bli. xfs_trans_log_buf() would call
> > > > > xfs_trans_dirty_buf() and thus would not change functionally.
> > > > > xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
> > > > > the existing ordered buf users would no longer need to log a range of
> > > > > the buffer (which doesn't make much sense anyways).
> > > > 
> > > > ... do this. :)
> > > > 
> > > > > Finally, the
> > > > > deferred infrastructure could join/dirty/hold the buffer to the new
> > > > > transaction after each roll without needing to track and relog specific
> > > > > regions of the buffer. Thoughts?
> > > > 
> > > > Yup, that's exactly what I was thinking should be possible by using
> > > > ordered buffers.... :)
> > > > 
> > > > And Christoph's rework of the transaction roll and deferred inode
> > > > handling that he just posted should make adding buffer handling
> > > > quite a bit neater and cleaner.
> > > > 
> > > > > Unless I'm missing something as to why this is busted, I'll take a
> > > > > closer look at the code and float an rfc next week since otherwise it
> > > > > sounds like this is something we could actually fix up in the ordered
> > > > > buffer code today.
> > > > 
> > > > Cool.
> > > > 
> > > > > > Nothing in XFS is ever simple, is it? :P
> > > > > 
> > > > > There used to be a level of satisfaction at feeling I understood some
> > > > > new corner of XFS. Nowadays I know that just means I'm not yet aware of
> > > > > whatever dragons remain in that corner (is that paranoia? not if it's
> > > > > true!). :P
> > > > 
> > > > Ah, the true signs of expertise: developing a knowledge base and
> > > > insight deep enough to understand that there is always another
> > > > hidden dragon poised to bite your head off. :)
> > > > 
> > > > Cheers,
> > > > 
> > > > Dave.
> > > > -- 
> > > > Dave Chinner
> > > > david@fromorbit.com
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-09 11:06 [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute alex
  2017-08-09 13:17 ` Brian Foster
  2017-08-09 21:33 ` Dave Chinner
@ 2017-11-21 15:31 ` Libor Klepáč
  2017-11-21 16:24   ` Brian Foster
  2 siblings, 1 reply; 40+ messages in thread
From: Libor Klepáč @ 2017-11-21 15:31 UTC (permalink / raw)
  To: alex; +Cc: linux-xfs, bfoster, david, darrick.wong

Hello again,
I'm sorry to bug you, but I would like to ask if this patch has made it into the kernel
yet?
I was looking on pull request messages and did not see it.

We were struck by "Metadata corruption detected at 
xfs_attr3_leaf_write_verify" under high load, after months without problem.

But we are still on 4.9.30 on this server, there is possibility to upgrade to 
4.9.51 from backports.

With regards,
Libor


On středa 9. srpna 2017 13:06:12 CET alex@zadarastorage.com wrote:
> From: Alex Lyakas <alex@zadarastorage.com>
> 
> The new attribute leaf buffer is not held locked across
> the transaction roll between the shortform->leaf modification
> and the addition of the new entry. As a result, the attribute
> buffer modification being made is not atomic from
> an operational perspective. Hence the AIL push can grab it in
> the transient state of "just created" after the initial
> transaction is rolled, because the buffer has been released.
> This leads to xfs_attr3_leaf_verify() asserting that
> hdr.count is zero, treating this as in-memory corruption,
> and shutting down the filesystem.
> 
> Signed-off-by: Alex Lyakas <alex@zadarastorage.com>
> ---
>  fs/xfs/libxfs/xfs_attr.c      | 19 ++++++++++++++++++-
>  fs/xfs/libxfs/xfs_attr_leaf.c |  4 +++-
>  fs/xfs/libxfs/xfs_attr_leaf.h |  3 ++-
>  3 files changed, 23 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> index de7b9bd..982e322 100644
> --- a/fs/xfs/libxfs/xfs_attr.c
> +++ b/fs/xfs/libxfs/xfs_attr.c
> @@ -216,10 +216,11 @@
>  	struct xfs_defer_ops	dfops;
>  	struct xfs_trans_res	tres;
>  	xfs_fsblock_t		firstblock;
>  	int			rsvd = (flags & ATTR_ROOT) != 0;
>  	int			error, err2, local;
> +	struct xfs_buf		*leaf_bp = NULL;
>  
>  	XFS_STATS_INC(mp, xs_attr_set);
>  
>  	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
>  		return -EIO;
> @@ -325,11 +326,17 @@
>  		/*
>  		 * It won't fit in the shortform, transform to a leaf block.
>  		 * GROT: another possible req'mt for a double-split btree op.
>  		 */
>  		xfs_defer_init(args.dfops, args.firstblock);
> -		error = xfs_attr_shortform_to_leaf(&args);
> +		error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
> +		/*
> +		 * Prevent the leaf buffer from being unlocked
> +		 * when "args.trans" transaction commits.
> +		 */
> +		if (leaf_bp)
> +			xfs_trans_bhold(args.trans, leaf_bp);
>  		if (!error)
>  			error = xfs_defer_finish(&args.trans, args.dfops, dp);
>  		if (error) {
>  			args.trans = NULL;
>  			xfs_defer_cancel(&dfops);
> @@ -343,10 +350,18 @@
>  
>  		error = xfs_trans_roll(&args.trans, dp);
>  		if (error)
>  			goto out;
>  
> +		/*
> +		 * Rejoin the leaf buffer to the new transaction.
> +		 * This allows a subsequent read to find the buffer in the
> +		 * transaction (and avoid a deadlock).
> +		 */
> +		xfs_trans_bjoin(args.trans, leaf_bp);
> +		/* Prevent from being released at the end of the function */
> +		leaf_bp = NULL;
>  	}
>  
>  	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
>  		error = xfs_attr_leaf_addname(&args);
>  	else
> @@ -374,10 +389,12 @@
>  	return error;
>  
>  out:
>  	if (args.trans)
>  		xfs_trans_cancel(args.trans);
> +	if (leaf_bp)
> +		xfs_buf_relse(leaf_bp);
>  	xfs_iunlock(dp, XFS_ILOCK_EXCL);
>  	return error;
>  }
>  
>  /*
> diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
> index c6c15e5..ab73e4b 100644
> --- a/fs/xfs/libxfs/xfs_attr_leaf.c
> +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
> @@ -738,13 +738,14 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args 
*args,
>  	return -ENOATTR;
>  }
>  
>  /*
>   * Convert from using the shortform to the leaf.
> + * Upon success, return the leaf buffer.
>   */
>  int
> -xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
> +xfs_attr_shortform_to_leaf(xfs_da_args_t *args, struct xfs_buf **bpp)
>  {
>  	xfs_inode_t *dp;
>  	xfs_attr_shortform_t *sf;
>  	xfs_attr_sf_entry_t *sfe;
>  	xfs_da_args_t nargs;
> @@ -820,10 +821,11 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args 
*args,
>  		if (error)
>  			goto out;
>  		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
>  	}
>  	error = 0;
> +	*bpp = bp;
>  
>  out:
>  	kmem_free(tmpbuffer);
>  	return error;
>  }
> diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
> index f7dda0c..2b3c69df 100644
> --- a/fs/xfs/libxfs/xfs_attr_leaf.h
> +++ b/fs/xfs/libxfs/xfs_attr_leaf.h
> @@ -46,11 +46,12 @@
>   */
>  void	xfs_attr_shortform_create(struct xfs_da_args *args);
>  void	xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
>  int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
>  int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
> -int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
> +int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
> +					struct xfs_buf **bpp);
>  int	xfs_attr_shortform_remove(struct xfs_da_args *args);
>  int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
>  int	xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
>  void	xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
>  
> 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-11-21 15:31 ` Libor Klepáč
@ 2017-11-21 16:24   ` Brian Foster
  2017-11-21 18:50     ` Darrick J. Wong
  0 siblings, 1 reply; 40+ messages in thread
From: Brian Foster @ 2017-11-21 16:24 UTC (permalink / raw)
  To: Libor Klepáč; +Cc: alex, linux-xfs, david, darrick.wong

On Tue, Nov 21, 2017 at 04:31:20PM +0100, Libor Klepáč wrote:
> Hello again,
> i'm sorry to bug you, but i would like to ask, if this patch made it to kernel 
> yet?
> I was looking on pull request messages and did not see it.
> 
> We were struck by "Metadata corruption detected at 
> xfs_attr3_leaf_write_verify" under high load, after months without problem.
> 
> But we are still on 4.9.30 on this server, there is possibility to upgrade to 
> 4.9.51 from backports.
> 

IIRC, the issue that prevented this patch from being merged was that the
buffer needed to be joined and relogged across deferred operations. That
had a dependency on some refactoring of the buffer logging code, which has
since been merged, but I don't recall seeing anything regarding a
deferred op buffer relogging mechanism and using it here. Alex?

Brian

> With regards,
> Libor
> 
> 
> On středa 9. srpna 2017 13:06:12 CET alex@zadarastorage.com wrote:
> > From: Alex Lyakas <alex@zadarastorage.com>
> > 
> > The new attribute leaf buffer is not held locked across
> > the transaction roll between the shortform->leaf modification
> > and the addition of the new entry. As a result, the attribute
> > buffer modification being made is not atomic from
> > an operational perspective. Hence the AIL push can grab it in
> > the transient state of "just created" after the initial
> > transaction is rolled, because the buffer has been released.
> > This leads to xfs_attr3_leaf_verify() asserting that
> > hdr.count is zero, treating this as in-memory corruption,
> > and shutting down the filesystem.
> > 
> > Signed-off-by: Alex Lyakas <alex@zadarastorage.com>
> > ---
> >  fs/xfs/libxfs/xfs_attr.c      | 19 ++++++++++++++++++-
> >  fs/xfs/libxfs/xfs_attr_leaf.c |  4 +++-
> >  fs/xfs/libxfs/xfs_attr_leaf.h |  3 ++-
> >  3 files changed, 23 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> > index de7b9bd..982e322 100644
> > --- a/fs/xfs/libxfs/xfs_attr.c
> > +++ b/fs/xfs/libxfs/xfs_attr.c
> > @@ -216,10 +216,11 @@
> >  	struct xfs_defer_ops	dfops;
> >  	struct xfs_trans_res	tres;
> >  	xfs_fsblock_t		firstblock;
> >  	int			rsvd = (flags & ATTR_ROOT) != 0;
> >  	int			error, err2, local;
> > +	struct xfs_buf		*leaf_bp = NULL;
> >  
> >  	XFS_STATS_INC(mp, xs_attr_set);
> >  
> >  	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
> >  		return -EIO;
> > @@ -325,11 +326,17 @@
> >  		/*
> >  		 * It won't fit in the shortform, transform to a leaf block.
> >  		 * GROT: another possible req'mt for a double-split btree op.
> >  		 */
> >  		xfs_defer_init(args.dfops, args.firstblock);
> > -		error = xfs_attr_shortform_to_leaf(&args);
> > +		error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
> > +		/*
> > +		 * Prevent the leaf buffer from being unlocked
> > +		 * when "args.trans" transaction commits.
> > +		 */
> > +		if (leaf_bp)
> > +			xfs_trans_bhold(args.trans, leaf_bp);
> >  		if (!error)
> >  			error = xfs_defer_finish(&args.trans, args.dfops, dp);
> >  		if (error) {
> >  			args.trans = NULL;
> >  			xfs_defer_cancel(&dfops);
> > @@ -343,10 +350,18 @@
> >  
> >  		error = xfs_trans_roll(&args.trans, dp);
> >  		if (error)
> >  			goto out;
> >  
> > +		/*
> > +		 * Rejoin the leaf buffer to the new transaction.
> > +		 * This allows a subsequent read to find the buffer in the
> > +		 * transaction (and avoid a deadlock).
> > +		 */
> > +		xfs_trans_bjoin(args.trans, leaf_bp);
> > +		/* Prevent from being released at the end of the function */
> > +		leaf_bp = NULL;
> >  	}
> >  
> >  	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
> >  		error = xfs_attr_leaf_addname(&args);
> >  	else
> > @@ -374,10 +389,12 @@
> >  	return error;
> >  
> >  out:
> >  	if (args.trans)
> >  		xfs_trans_cancel(args.trans);
> > +	if (leaf_bp)
> > +		xfs_buf_relse(leaf_bp);
> >  	xfs_iunlock(dp, XFS_ILOCK_EXCL);
> >  	return error;
> >  }
> >  
> >  /*
> > diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
> > index c6c15e5..ab73e4b 100644
> > --- a/fs/xfs/libxfs/xfs_attr_leaf.c
> > +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
> > @@ -738,13 +738,14 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args 
> *args,
> >  	return -ENOATTR;
> >  }
> >  
> >  /*
> >   * Convert from using the shortform to the leaf.
> > + * Upon success, return the leaf buffer.
> >   */
> >  int
> > -xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
> > +xfs_attr_shortform_to_leaf(xfs_da_args_t *args, struct xfs_buf **bpp)
> >  {
> >  	xfs_inode_t *dp;
> >  	xfs_attr_shortform_t *sf;
> >  	xfs_attr_sf_entry_t *sfe;
> >  	xfs_da_args_t nargs;
> > @@ -820,10 +821,11 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args 
> *args,
> >  		if (error)
> >  			goto out;
> >  		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
> >  	}
> >  	error = 0;
> > +	*bpp = bp;
> >  
> >  out:
> >  	kmem_free(tmpbuffer);
> >  	return error;
> >  }
> > diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
> > index f7dda0c..2b3c69df 100644
> > --- a/fs/xfs/libxfs/xfs_attr_leaf.h
> > +++ b/fs/xfs/libxfs/xfs_attr_leaf.h
> > @@ -46,11 +46,12 @@
> >   */
> >  void	xfs_attr_shortform_create(struct xfs_da_args *args);
> >  void	xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
> >  int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
> >  int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
> > -int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
> > +int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
> > +					struct xfs_buf **bpp);
> >  int	xfs_attr_shortform_remove(struct xfs_da_args *args);
> >  int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
> >  int	xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
> >  void	xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
> >  
> > 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-11-21 16:24   ` Brian Foster
@ 2017-11-21 18:50     ` Darrick J. Wong
  2017-11-30 17:55       ` Darrick J. Wong
  0 siblings, 1 reply; 40+ messages in thread
From: Darrick J. Wong @ 2017-11-21 18:50 UTC (permalink / raw)
  To: Brian Foster; +Cc: Libor Klepáč, alex, linux-xfs, david

On Tue, Nov 21, 2017 at 11:24:29AM -0500, Brian Foster wrote:
> On Tue, Nov 21, 2017 at 04:31:20PM +0100, Libor Klepáč wrote:
> > Hello again,
> > i'm sorry to bug you, but i would like to ask, if this patch made it to kernel 
> > yet?
> > I was looking on pull request messages and did not see it.
> > 
> > We were struck by "Metadata corruption detected at 
> > xfs_attr3_leaf_write_verify" under high load, after months without problem.
> > 
> > But we are still on 4.9.30 on this server, there is possibility to upgrade to 
> > 4.9.51 from backports.
> > 
> 
> IIRC, the issue that prevented this patch from being merged was that the
> buffer needed to be joined and relogged across deferred operations. That
> had a dependency on some refactoring of the buffer logging code, which has
> since been merged, but I don't recall seeing anything regarding a
> deferred op buffer relogging mechanism and using it here. Alex?

Correct, there's no such thing as deferred op buffer relogging.  If I
have time this week or next I can try to revisit what exactly we needed
to do to retain the buffer lock across transaction rolls in
defer_finish and try to cough up a patch, after which the original attr
fix from Alex should be refactored to use that mechanism.

(Not sure what we do about backporting to pre-defer_ops kernels...)

--D

> Brian
> 
> > With regards,
> > Libor
> > 
> > 
> > On středa 9. srpna 2017 13:06:12 CET alex@zadarastorage.com wrote:
> > > From: Alex Lyakas <alex@zadarastorage.com>
> > > 
> > > The new attribute leaf buffer is not held locked across
> > > the transaction roll between the shortform->leaf modification
> > > and the addition of the new entry. As a result, the attribute
> > > buffer modification being made is not atomic from
> > > an operational perspective. Hence the AIL push can grab it in
> > > the transient state of "just created" after the initial
> > > transaction is rolled, because the buffer has been released.
> > > This leads to xfs_attr3_leaf_verify() asserting that
> > > hdr.count is zero, treating this as in-memory corruption,
> > > and shutting down the filesystem.
> > > 
> > > Signed-off-by: Alex Lyakas <alex@zadarastorage.com>
> > > ---
> > >  fs/xfs/libxfs/xfs_attr.c      | 19 ++++++++++++++++++-
> > >  fs/xfs/libxfs/xfs_attr_leaf.c |  4 +++-
> > >  fs/xfs/libxfs/xfs_attr_leaf.h |  3 ++-
> > >  3 files changed, 23 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> > > index de7b9bd..982e322 100644
> > > --- a/fs/xfs/libxfs/xfs_attr.c
> > > +++ b/fs/xfs/libxfs/xfs_attr.c
> > > @@ -216,10 +216,11 @@
> > >  	struct xfs_defer_ops	dfops;
> > >  	struct xfs_trans_res	tres;
> > >  	xfs_fsblock_t		firstblock;
> > >  	int			rsvd = (flags & ATTR_ROOT) != 0;
> > >  	int			error, err2, local;
> > > +	struct xfs_buf		*leaf_bp = NULL;
> > >  
> > >  	XFS_STATS_INC(mp, xs_attr_set);
> > >  
> > >  	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
> > >  		return -EIO;
> > > @@ -325,11 +326,17 @@
> > >  		/*
> > >  		 * It won't fit in the shortform, transform to a leaf block.
> > >  		 * GROT: another possible req'mt for a double-split btree op.
> > >  		 */
> > >  		xfs_defer_init(args.dfops, args.firstblock);
> > > -		error = xfs_attr_shortform_to_leaf(&args);
> > > +		error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
> > > +		/*
> > > +		 * Prevent the leaf buffer from being unlocked
> > > +		 * when "args.trans" transaction commits.
> > > +		 */
> > > +		if (leaf_bp)
> > > +			xfs_trans_bhold(args.trans, leaf_bp);
> > >  		if (!error)
> > >  			error = xfs_defer_finish(&args.trans, args.dfops, dp);
> > >  		if (error) {
> > >  			args.trans = NULL;
> > >  			xfs_defer_cancel(&dfops);
> > > @@ -343,10 +350,18 @@
> > >  
> > >  		error = xfs_trans_roll(&args.trans, dp);
> > >  		if (error)
> > >  			goto out;
> > >  
> > > +		/*
> > > +		 * Rejoin the leaf buffer to the new transaction.
> > > +		 * This allows a subsequent read to find the buffer in the
> > > +		 * transaction (and avoid a deadlock).
> > > +		 */
> > > +		xfs_trans_bjoin(args.trans, leaf_bp);
> > > +		/* Prevent from being released at the end of the function */
> > > +		leaf_bp = NULL;
> > >  	}
> > >  
> > >  	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
> > >  		error = xfs_attr_leaf_addname(&args);
> > >  	else
> > > @@ -374,10 +389,12 @@
> > >  	return error;
> > >  
> > >  out:
> > >  	if (args.trans)
> > >  		xfs_trans_cancel(args.trans);
> > > +	if (leaf_bp)
> > > +		xfs_buf_relse(leaf_bp);
> > >  	xfs_iunlock(dp, XFS_ILOCK_EXCL);
> > >  	return error;
> > >  }
> > >  
> > >  /*
> > > diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
> > > index c6c15e5..ab73e4b 100644
> > > --- a/fs/xfs/libxfs/xfs_attr_leaf.c
> > > +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
> > > @@ -738,13 +738,14 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args 
> > *args,
> > >  	return -ENOATTR;
> > >  }
> > >  
> > >  /*
> > >   * Convert from using the shortform to the leaf.
> > > + * Upon success, return the leaf buffer.
> > >   */
> > >  int
> > > -xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
> > > +xfs_attr_shortform_to_leaf(xfs_da_args_t *args, struct xfs_buf **bpp)
> > >  {
> > >  	xfs_inode_t *dp;
> > >  	xfs_attr_shortform_t *sf;
> > >  	xfs_attr_sf_entry_t *sfe;
> > >  	xfs_da_args_t nargs;
> > > @@ -820,10 +821,11 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args 
> > *args,
> > >  		if (error)
> > >  			goto out;
> > >  		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
> > >  	}
> > >  	error = 0;
> > > +	*bpp = bp;
> > >  
> > >  out:
> > >  	kmem_free(tmpbuffer);
> > >  	return error;
> > >  }
> > > diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
> > > index f7dda0c..2b3c69df 100644
> > > --- a/fs/xfs/libxfs/xfs_attr_leaf.h
> > > +++ b/fs/xfs/libxfs/xfs_attr_leaf.h
> > > @@ -46,11 +46,12 @@
> > >   */
> > >  void	xfs_attr_shortform_create(struct xfs_da_args *args);
> > >  void	xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
> > >  int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
> > >  int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
> > > -int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
> > > +int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
> > > +					struct xfs_buf **bpp);
> > >  int	xfs_attr_shortform_remove(struct xfs_da_args *args);
> > >  int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
> > >  int	xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
> > >  void	xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
> > >  
> > > 
> > 
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-11-21 18:50     ` Darrick J. Wong
@ 2017-11-30 17:55       ` Darrick J. Wong
  0 siblings, 0 replies; 40+ messages in thread
From: Darrick J. Wong @ 2017-11-30 17:55 UTC (permalink / raw)
  To: Brian Foster; +Cc: Libor Klepáč, alex, linux-xfs, david

On Tue, Nov 21, 2017 at 10:50:18AM -0800, Darrick J. Wong wrote:
> On Tue, Nov 21, 2017 at 11:24:29AM -0500, Brian Foster wrote:
> > On Tue, Nov 21, 2017 at 04:31:20PM +0100, Libor Klepáč wrote:
> > > Hello again,
> > > i'm sorry to bug you, but i would like to ask, if this patch made it to kernel 
> > > yet?
> > > I was looking on pull request messages and did not see it.
> > > 
> > > We were struck by "Metadata corruption detected at 
> > > xfs_attr3_leaf_write_verify" under high load, after months without problem.
> > > 
> > > But we are still on 4.9.30 on this server, there is possibility to upgrade to 
> > > 4.9.51 from backports.
> > > 
> > 
> > IIRC, the issue that prevented this patch from being merged was that the
> > buffer needed to be joined and relogged across deferred operations. That
> > had a dependency on some rectoring of the buffer logging code, which has
> > since been merged, but I don't recall seeing anything regarding a
> > deferred op buffer relogging mechanism and using it here. Alex?
> 
> Correct, there's no such thing as deferred op buffer relogging.  If I
> have time this week or next I can try to revisit what exactly we needed
> to do to retain the buffer lock across transaction rolls in
> defer_finish and try to cough up a patch, after which the original attr
> fix from Alex should be refactored to use that mechanism.
> 
> (Not sure what we do about backporting to pre-defer_ops kernels...)

FWIW I will be sending some RFC patches to the list shortly that fix
this problem on 4.15 kernels.  The new xfs_defer_bjoin code could use
some careful scrutiny to make sure that I captured the gist of this
thread accurately with regard to bhold/join/dirty'ing the bjoin'd buffer
across the transaction roll in xfs_defer_finish.

It doesn't blow up on any of the attr fstests, but that doesn't mean
much. :)

--D

> 
> --D
> 
> > Brian
> > 
> > > With regards,
> > > Libor
> > > 
> > > 
> > > On středa 9. srpna 2017 13:06:12 CET alex@zadarastorage.com wrote:
> > > > From: Alex Lyakas <alex@zadarastorage.com>
> > > > 
> > > > The new attribute leaf buffer is not held locked across
> > > > the transaction roll between the shortform->leaf modification
> > > > and the addition of the new entry. As a result, the attribute
> > > > buffer modification being made is not atomic from
> > > > an operational perspective. Hence the AIL push can grab it in
> > > > the transient state of "just created" after the initial
> > > > transaction is rolled, because the buffer has been released.
> > > > This leads to xfs_attr3_leaf_verify() asserting that
> > > > hdr.count is zero, treating this as in-memory corruption,
> > > > and shutting down the filesystem.
> > > > 
> > > > Signed-off-by: Alex Lyakas <alex@zadarastorage.com>
> > > > ---
> > > >  fs/xfs/libxfs/xfs_attr.c      | 19 ++++++++++++++++++-
> > > >  fs/xfs/libxfs/xfs_attr_leaf.c |  4 +++-
> > > >  fs/xfs/libxfs/xfs_attr_leaf.h |  3 ++-
> > > >  3 files changed, 23 insertions(+), 3 deletions(-)
> > > > 
> > > > diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> > > > index de7b9bd..982e322 100644
> > > > --- a/fs/xfs/libxfs/xfs_attr.c
> > > > +++ b/fs/xfs/libxfs/xfs_attr.c
> > > > @@ -216,10 +216,11 @@
> > > >  	struct xfs_defer_ops	dfops;
> > > >  	struct xfs_trans_res	tres;
> > > >  	xfs_fsblock_t		firstblock;
> > > >  	int			rsvd = (flags & ATTR_ROOT) != 0;
> > > >  	int			error, err2, local;
> > > > +	struct xfs_buf		*leaf_bp = NULL;
> > > >  
> > > >  	XFS_STATS_INC(mp, xs_attr_set);
> > > >  
> > > >  	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
> > > >  		return -EIO;
> > > > @@ -325,11 +326,17 @@
> > > >  		/*
> > > >  		 * It won't fit in the shortform, transform to a leaf block.
> > > >  		 * GROT: another possible req'mt for a double-split btree op.
> > > >  		 */
> > > >  		xfs_defer_init(args.dfops, args.firstblock);
> > > > -		error = xfs_attr_shortform_to_leaf(&args);
> > > > +		error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
> > > > +		/*
> > > > +		 * Prevent the leaf buffer from being unlocked
> > > > +		 * when "args.trans" transaction commits.
> > > > +		 */
> > > > +		if (leaf_bp)
> > > > +			xfs_trans_bhold(args.trans, leaf_bp);
> > > >  		if (!error)
> > > >  			error = xfs_defer_finish(&args.trans, args.dfops, dp);
> > > >  		if (error) {
> > > >  			args.trans = NULL;
> > > >  			xfs_defer_cancel(&dfops);
> > > > @@ -343,10 +350,18 @@
> > > >  
> > > >  		error = xfs_trans_roll(&args.trans, dp);
> > > >  		if (error)
> > > >  			goto out;
> > > >  
> > > > +		/*
> > > > +		 * Rejoin the leaf buffer to the new transaction.
> > > > +		 * This allows a subsequent read to find the buffer in the
> > > > +		 * transaction (and avoid a deadlock).
> > > > +		 */
> > > > +		xfs_trans_bjoin(args.trans, leaf_bp);
> > > > +		/* Prevent from being released at the end of the function */
> > > > +		leaf_bp = NULL;
> > > >  	}
> > > >  
> > > >  	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
> > > >  		error = xfs_attr_leaf_addname(&args);
> > > >  	else
> > > > @@ -374,10 +389,12 @@
> > > >  	return error;
> > > >  
> > > >  out:
> > > >  	if (args.trans)
> > > >  		xfs_trans_cancel(args.trans);
> > > > +	if (leaf_bp)
> > > > +		xfs_buf_relse(leaf_bp);
> > > >  	xfs_iunlock(dp, XFS_ILOCK_EXCL);
> > > >  	return error;
> > > >  }
> > > >  
> > > >  /*
> > > > diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
> > > > index c6c15e5..ab73e4b 100644
> > > > --- a/fs/xfs/libxfs/xfs_attr_leaf.c
> > > > +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
> > > > @@ -738,13 +738,14 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args 
> > > *args,
> > > >  	return -ENOATTR;
> > > >  }
> > > >  
> > > >  /*
> > > >   * Convert from using the shortform to the leaf.
> > > > + * Upon success, return the leaf buffer.
> > > >   */
> > > >  int
> > > > -xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
> > > > +xfs_attr_shortform_to_leaf(xfs_da_args_t *args, struct xfs_buf **bpp)
> > > >  {
> > > >  	xfs_inode_t *dp;
> > > >  	xfs_attr_shortform_t *sf;
> > > >  	xfs_attr_sf_entry_t *sfe;
> > > >  	xfs_da_args_t nargs;
> > > > @@ -820,10 +821,11 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args 
> > > *args,
> > > >  		if (error)
> > > >  			goto out;
> > > >  		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
> > > >  	}
> > > >  	error = 0;
> > > > +	*bpp = bp;
> > > >  
> > > >  out:
> > > >  	kmem_free(tmpbuffer);
> > > >  	return error;
> > > >  }
> > > > diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
> > > > index f7dda0c..2b3c69df 100644
> > > > --- a/fs/xfs/libxfs/xfs_attr_leaf.h
> > > > +++ b/fs/xfs/libxfs/xfs_attr_leaf.h
> > > > @@ -46,11 +46,12 @@
> > > >   */
> > > >  void	xfs_attr_shortform_create(struct xfs_da_args *args);
> > > >  void	xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
> > > >  int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
> > > >  int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
> > > > -int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
> > > > +int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
> > > > +					struct xfs_buf **bpp);
> > > >  int	xfs_attr_shortform_remove(struct xfs_da_args *args);
> > > >  int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
> > > >  int	xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
> > > >  void	xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
> > > >  
> > > > 
> > > 
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* RE: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2017-08-14 12:22                           ` Brian Foster
  2017-08-14 16:04                             ` Alex Lyakas
@ 2019-03-22  9:12                             ` Shyam Kaushik
  2019-03-22 16:08                               ` Darrick J. Wong
  1 sibling, 1 reply; 40+ messages in thread
From: Shyam Kaushik @ 2019-03-22  9:12 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Dave Chinner, Brian Foster, Alex Lyakas, linux-xfs, libor.klepac

Hi Darrick,

We are trying to port your patch
https://github.com/torvalds/linux/commit/6e643cd094de3bd0f97edcc1db0089afa
24d909f to 4.14 LTS kernel. In 4.14 there is no xfs_defer_bjoin(). Can you
please comment if the below 4.14 LTS kernel patch looks ok to you? Do you
see any issues with it?

Thanks.

--Shyam

PATCH
-----

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index ea66f04f46f7..f7316138a8db 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -218,6 +218,7 @@ xfs_attr_set(
        xfs_fsblock_t           firstblock;
        int                     rsvd = (flags & ATTR_ROOT) != 0;
        int                     error, err2, local;
+       struct xfs_buf          *leaf_bp = NULL;

        XFS_STATS_INC(mp, xs_attr_set);

@@ -327,9 +328,15 @@ xfs_attr_set(
                 * GROT: another possible req'mt for a double-split btree
op.
                 */
                xfs_defer_init(args.dfops, args.firstblock);
-               error = xfs_attr_shortform_to_leaf(&args);
+               error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
                if (error)
                        goto out_defer_cancel;
+               /*
+                * Prevent the leaf buffer from being unlocked so that a
+                * concurrent AIL push cannot grab the half-baked leaf
+                * buffer and run into problems with the write verifier.
+                */
+               xfs_trans_bhold(args.trans, leaf_bp);
                xfs_defer_ijoin(args.dfops, dp);
                error = xfs_defer_finish(&args.trans, args.dfops);
                if (error)
@@ -337,13 +344,15 @@ xfs_attr_set(

                /*
                 * Commit the leaf transformation.  We'll need another
(linked)
-                * transaction to add the new attribute to the leaf.
+                * transaction to add the new attribute to the leaf, which
+                * means that we have to hold & join the leaf buffer here
too.
                 */

                error = xfs_trans_roll_inode(&args.trans, dp);
                if (error)
                        goto out;
-
+               xfs_trans_bjoin(args.trans, leaf_bp);
+               leaf_bp = NULL;
        }

        if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
@@ -374,8 +383,9 @@ xfs_attr_set(

 out_defer_cancel:
        xfs_defer_cancel(&dfops);
-       args.trans = NULL;
 out:
+       if (leaf_bp)
+               xfs_buf_relse(leaf_bp);
        if (args.trans)
                xfs_trans_cancel(args.trans);
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 40e53a4fc0a6..92ae04ac413a 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -739,10 +739,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
 }

 /*
- * Convert from using the shortform to the leaf.
+ * Convert from using the shortform to the leaf.  On success, return the
+ * buffer so that we can keep it locked until we're totally done with it.
  */
 int
-xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+xfs_attr_shortform_to_leaf(
+       xfs_da_args_t *args,
+       struct xfs_buf **leaf_bp)
 {
        xfs_inode_t *dp;
        xfs_attr_shortform_t *sf;
@@ -821,6 +824,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
                sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
        }
        error = 0;
+       *leaf_bp = bp;

 out:
        kmem_free(tmpbuffer);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index f7dda0c237b0..894124efb421 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -48,7 +48,8 @@ void  xfs_attr_shortform_create(struct xfs_da_args
*args);
 void   xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
 int    xfs_attr_shortform_lookup(struct xfs_da_args *args);
 int    xfs_attr_shortform_getvalue(struct xfs_da_args *args);
-int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
+                       struct xfs_buf **leaf_bp);
 int    xfs_attr_shortform_remove(struct xfs_da_args *args);
 int    xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode
*dp);
 int    xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);



-----Original Message-----
From: linux-xfs-owner@vger.kernel.org
[mailto:linux-xfs-owner@vger.kernel.org] On Behalf Of Brian Foster
Sent: 14 August 2017 17:52
To: Alex Lyakas
Cc: Dave Chinner; Darrick J. Wong; linux-xfs@vger.kernel.org;
libor.klepac@bcom.cz
Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
conversion and the addition of an attribute

On Mon, Aug 14, 2017 at 11:11:41AM +0300, Alex Lyakas wrote:
> Hello David, Brian,
>
> I was not able to follow the details, unfortunately. Can you confirm
that
> this patch is safe to go into kernel 3.18?
>

This is the open question in the separate subthread (this one is
discussion around designing a solution for the current code):

http://marc.info/?l=linux-xfs&m=150246184413604&w=2

This could use confirmation, but my understanding is that this is safe
because v3.18 doesn't have the more advanced deferred ops
infrastructure. It uses xfs_bmap_finish() which has a max roll count of
one and a transaction with enough reservation for 2 rolls before
blocking reservation is required.

Note that doesn't mean we'd officially post a v3.18 stable patch before
this is fixed in the upstream code. We always fix upstream first and
backport from there to ensure a consistent base going forward (we don't
want to go change v3.18, end up with a slightly different upstream
patch, then have to backport more changes to fix the original patch).
This may be safe enough for you to use locally in the meantime, however.

Brian

> Thanks,
> Alex.
>
>
> -----Original Message----- From: Dave Chinner
> Sent: Monday, August 14, 2017 3:28 AM
> To: Brian Foster
> Cc: Darrick J. Wong ; Alex Lyakas ; linux-xfs@vger.kernel.org ;
> libor.klepac@bcom.cz
> Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> conversion and the addition of an attribute
>
> On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> > > a new dirty range on the buffer. IOWs, it retains whatever dirty
range
> > > it already had, and so after joining, marking it ordered and then
> > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> > > in the transaction.
> > >
> > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > with a pre-existing dirty region is formatted for the CIL? We
> > > haven't done that before, so I'm betting that we don't relog the
> > > dirty region like we should be doing....
> > >
> > > ... and we don't relog the existing dirty range because the
> > > ordered flag takes precedence.
> > >
> >
> > Right.. so it seems that the current implementation for ordered
buffers
> > assumes a buffer is only ever used in one mode or the other.
> > Additionally, the AIL assumes that any reinserted item has been fully
> > relogged and so it moves the LSN forward unconditionally. Current
> > ordered buffer processing violates this constraint for an already
logged
> > buffer.
>
> Right, but it's not been a concern until now because we've only ever
> used ordered buffers on newly allocated buffers that haven't been
> previously logged.
>
> > > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > > xfs_buf_item_format() need to also check for dirty regions. If dirty
> > > regions exist, then we treat it like a normal buffer rather than an
> > > ordered buffer. We can factor the dirty region check out of
> > > xfs_buf_item_unlock() for this...
> > >
> > > Actually, check the case in xfs_buf_item_size() and remove the
> > > ordered flag if there are dirty regions. Then xfs_buf_item_format()
> > > will do the right thing without needing a duplicate check...
> > >
> >
> > I think that would work, assuming we actually check the
> > xfs_buf_log_format for dirty-ness rather than just the log item. As it
> > is, note that ordered buffers are still "logged" in the transaction
> > because otherwise the transaction infrastructure will assume it made
no
> > change to the buf and toss the log item at commit time (we also need
to
> > set up I/O completion on the buf and whatnot).
>
> *nod*
>
> > What concerns me about this approach is that I think we introduce the
> > possibility for subtle bugs. Existing ordered buffer code does this:
> >
> >         xfs_trans_ordered_buf(tp, fbuf);
> >         xfs_trans_log_buf(tp, fbuf, 0,
> >                           BBTOB(fbuf->b_length) - 1);
> >
> > ... which should continue to work fine. Allowing ordered buffers to
> > physically log means that something like this:
> >
> >         xfs_trans_log_buf(tp, fbuf, 0,
> >                           BBTOB(fbuf->b_length) - 1);
> >         xfs_trans_ordered_buf(tp, fbuf);
> >
> > ... is now a bug that is only apparent after scrutiny of xfs_trans_*()
> > and logging internals. Granted, the above already is incorrect, but it
> > technically still works as expected. I don't see the need to turn that
> > into a real problem by actually logging the buffer when we might not
> > expect to.
>
> Well, it's not a "things go bad" bug. It's a "we screwed up an
> optimisation" bug, because logging the buffer contents unnecessarily
> only increases the required log bandwidth. It shouldn't affect
> replay because the buffer is still correctly ordered in the log.
> Hence both the transient and end states of the buffer during replay
> will still be the same...
>
> > So while I agree that this could probably be made to work and I think
it
> > is ideal to doing any kind of logged range tracking in the deferred
ops
> > code, it still seems more tricky than it needs to be. To relog a held
> > buffer in a new transaction, why not just mark the lidp dirty in the
new
> > transaction so it inherits all existing dirty segments? AFAICT, all we
> > really need to do is:
> >
> >         tp->t_flags |= XFS_TRANS_DIRTY;
> >         lidp->lid_flags |= XFS_LID_DIRTY;
> >
> > ... on the new transaction and everything should just work as designed
> > (for a buffer that has been previously logged, held, rolled and
> > rejoined).
>
> We would also need to set:
>
> bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
>
> which means we should....
>
> > To elaborate a bit, I think we could refactor xfs_trans_log_buf() into
a
> > new xfs_trans_dirty_buf() helper that covers all of the relevant bits
> > not related to actually dirtying the bli. xfs_trans_log_buf() would
call
> > xfs_trans_dirty_buf() and thus would not change functionally.
> > xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
> > the existing ordered buf users would no longer need to log a range of
> > the buffer (which doesn't make much sense anyways).
>
> ... do this. :)
>
> > Finally, the
> > deferred infrastructure could join/dirty/hold the buffer to the new
> > transaction after each roll without needing to track and relog
specific
> > regions of the buffer. Thoughts?
>
> Yup, that's exactly what I was thinking should be possible by using
> ordered buffers.... :)
>
> And Christoph's rework of the transaction roll and deferred inode
> handling that he just posted should make adding buffer handling
> quite a bit neater and cleaner.
>
> > Unless I'm missing something as to why this is busted, I'll take a
> > closer look at the code and float an rfc next week since otherwise it
> > sounds like this is something we could actually fix up in the ordered
> > buffer code today.
>
> Cool.
>
> > > Nothing in XFS is ever simple, is it? :P
> >
> > There used to be a level of satisfaction at feeling I understood some
> > new corner of XFS. Nowadays I know that just means I'm not yet aware
of
> > whatever dragons remain in that corner (is that paranoia? not if it's
> > true!). :P
>
> Ah, the true signs of expertise: developing a knowledge base and
> insight deep enough to understand that there is always another
> hidden dragon poised to bite your head off. :)
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2019-03-22  9:12                             ` Shyam Kaushik
@ 2019-03-22 16:08                               ` Darrick J. Wong
  2019-03-25 13:49                                 ` Shyam Kaushik
  0 siblings, 1 reply; 40+ messages in thread
From: Darrick J. Wong @ 2019-03-22 16:08 UTC (permalink / raw)
  To: Shyam Kaushik
  Cc: Dave Chinner, Brian Foster, Alex Lyakas, linux-xfs, libor.klepac

On Fri, Mar 22, 2019 at 02:42:36PM +0530, Shyam Kaushik wrote:
> Hi Darrick,
> 
> We are trying to port your patch
> https://github.com/torvalds/linux/commit/6e643cd094de3bd0f97edcc1db0089afa
> 24d909f to 4.14 LTS kernel. In 4.14 there is no xfs_defer_bjoin(). Can you
> please comment if the below 4.14 LTS kernel patch looks ok to you? Do you
> see any issues with it?

I don't see anything that resembles what xfs_defer_bjoin used to do
here, so it's hard to say without knowing if you've already backported
the pieces that made that function unnecessary or if you simply dropped
the call to satisfy the compiler...

--D

> Thanks.
> 
> --Shyam
> 
> PATCH
> -----
> 
> diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> index ea66f04f46f7..f7316138a8db 100644
> --- a/fs/xfs/libxfs/xfs_attr.c
> +++ b/fs/xfs/libxfs/xfs_attr.c
> @@ -218,6 +218,7 @@ xfs_attr_set(
>         xfs_fsblock_t           firstblock;
>         int                     rsvd = (flags & ATTR_ROOT) != 0;
>         int                     error, err2, local;
> +       struct xfs_buf          *leaf_bp = NULL;
> 
>         XFS_STATS_INC(mp, xs_attr_set);
> 
> @@ -327,9 +328,15 @@ xfs_attr_set(
>                  * GROT: another possible req'mt for a double-split btree
> op.
>                  */
>                 xfs_defer_init(args.dfops, args.firstblock);
> -               error = xfs_attr_shortform_to_leaf(&args);
> +               error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
>                 if (error)
>                         goto out_defer_cancel;
> +               /*
> +                * Prevent the leaf buffer from being unlocked so that a
> +                * concurrent AIL push cannot grab the half-baked leaf
> +                * buffer and run into problems with the write verifier.
> +                */
> +               xfs_trans_bhold(args.trans, leaf_bp);
>                 xfs_defer_ijoin(args.dfops, dp);
>                 error = xfs_defer_finish(&args.trans, args.dfops);
>                 if (error)
> @@ -337,13 +344,15 @@ xfs_attr_set(
> 
>                 /*
>                  * Commit the leaf transformation.  We'll need another
> (linked)
> -                * transaction to add the new attribute to the leaf.
> +                * transaction to add the new attribute to the leaf, which
> +                * means that we have to hold & join the leaf buffer here
> too.
>                  */
> 
>                 error = xfs_trans_roll_inode(&args.trans, dp);
>                 if (error)
>                         goto out;
> -
> +               xfs_trans_bjoin(args.trans, leaf_bp);
> +               leaf_bp = NULL;
>         }
> 
>         if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
> @@ -374,8 +383,9 @@ xfs_attr_set(
> 
>  out_defer_cancel:
>         xfs_defer_cancel(&dfops);
> -       args.trans = NULL;
>  out:
> +       if (leaf_bp)
> +               xfs_buf_relse(leaf_bp);
>         if (args.trans)
>                 xfs_trans_cancel(args.trans);
>         xfs_iunlock(dp, XFS_ILOCK_EXCL);
> diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
> index 40e53a4fc0a6..92ae04ac413a 100644
> --- a/fs/xfs/libxfs/xfs_attr_leaf.c
> +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
> @@ -739,10 +739,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
>  }
> 
>  /*
> - * Convert from using the shortform to the leaf.
> + * Convert from using the shortform to the leaf.  On success, return the
> + * buffer so that we can keep it locked until we're totally done with it.
>   */
>  int
> -xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
> +xfs_attr_shortform_to_leaf(
> +       xfs_da_args_t *args,
> +       struct xfs_buf **leaf_bp)
>  {
>         xfs_inode_t *dp;
>         xfs_attr_shortform_t *sf;
> @@ -821,6 +824,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
>                 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
>         }
>         error = 0;
> +       *leaf_bp = bp;
> 
>  out:
>         kmem_free(tmpbuffer);
> diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
> index f7dda0c237b0..894124efb421 100644
> --- a/fs/xfs/libxfs/xfs_attr_leaf.h
> +++ b/fs/xfs/libxfs/xfs_attr_leaf.h
> @@ -48,7 +48,8 @@ void  xfs_attr_shortform_create(struct xfs_da_args
> *args);
>  void   xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
>  int    xfs_attr_shortform_lookup(struct xfs_da_args *args);
>  int    xfs_attr_shortform_getvalue(struct xfs_da_args *args);
> -int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
> +int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
> +                       struct xfs_buf **leaf_bp);
>  int    xfs_attr_shortform_remove(struct xfs_da_args *args);
>  int    xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode
> *dp);
>  int    xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
> 
> 
> 
> -----Original Message-----
> From: linux-xfs-owner@vger.kernel.org
> [mailto:linux-xfs-owner@vger.kernel.org] On Behalf Of Brian Foster
> Sent: 14 August 2017 17:52
> To: Alex Lyakas
> Cc: Dave Chinner; Darrick J. Wong; linux-xfs@vger.kernel.org;
> libor.klepac@bcom.cz
> Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> conversion and the addition of an attribute
> 
> On Mon, Aug 14, 2017 at 11:11:41AM +0300, Alex Lyakas wrote:
> > Hello David, Brian,
> >
> > I was not able to follow the details, unfortunately. Can you confirm
> that
> > this patch is safe to go into kernel 3.18?
> >
> 
> This is the open question in the separate subthread (this one is
> discussion around designing a solution for the current code):
> 
> http://marc.info/?l=linux-xfs&m=150246184413604&w=2
> 
> This could use confirmation, but my understanding is that this is safe
> because v3.18 doesn't have the more advanced deferred ops
> infrastructure. It uses xfs_bmap_finish() which has a max roll count of
> one and a transaction with enough reservation for 2 rolls before
> blocking reservation is required.
> 
> Note that doesn't mean we'd officially post a v3.18 stable patch before
> this is fixed in the upstream code. We always fix upstream first and
> backport from there to ensure a consistent base going forward (we don't
> want to go change v3.18, end up with a slightly different upstream
> patch, then have to backport more changes to fix the original patch).
> This may be safe enough for you to use locally in the meantime, however.
> 
> Brian
> 
> > Thanks,
> > Alex.
> >
> >
> > -----Original Message----- From: Dave Chinner
> > Sent: Monday, August 14, 2017 3:28 AM
> > To: Brian Foster
> > Cc: Darrick J. Wong ; Alex Lyakas ; linux-xfs@vger.kernel.org ;
> > libor.klepac@bcom.cz
> > Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> > conversion and the addition of an attribute
> >
> > On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > > Using XFS_BLI_ORDERED allows us to log the buffer without recording
> > > > a new dirty range on the buffer. IOWs, it retains whatever dirty
> range
> > > > it already had, and so after joining, marking it ordered and then
> > > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED buffer
> > > > in the transaction.
> > > >
> > > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > > with a pre-existing dirty region is formatted for the CIL? We
> > > > haven't done that before, so I'm betting that we don't relog the
> > > > dirty region like we should be doing....
> > > >
> > > > ... and we don't relog the existing dirty range because the
> > > > ordered flag takes precedence.
> > > >
> > >
> > > Right.. so it seems that the current implementation for ordered
> buffers
> > > assumes a buffer is only ever used in one mode or the other.
> > > Additionally, the AIL assumes that any reinserted item has been fully
> > > relogged and so it moves the LSN forward unconditionally. Current
> > > ordered buffer processing violates this constraint for an already
> logged
> > > buffer.
> >
> > Right, but it's not been a concern until now because we've only ever
> > used ordered buffers on newly allocated buffers that haven't been
> > previously logged.
> >
> > > > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > > > xfs_buf_item_format() need to also check for dirty regions. If dirty
> > > > regions exist, then we treat it like a normal buffer rather than an
> > > > ordered buffer. We can factor the dirty region check out of
> > > > xfs_buf_item_unlock() for this...
> > > >
> > > > Actually, check the case in xfs_buf_item_size() and remove the
> > > > ordered flag if there are dirty regions. Then xfs_buf_item_format()
> > > > will do the right thing without needing a duplicate check...
> > > >
> > >
> > > I think that would work, assuming we actually check the
> > > xfs_buf_log_format for dirty-ness rather than just the log item. As it
> > > is, note that ordered buffers are still "logged" in the transaction
> > > because otherwise the transaction infrastructure will assume it made
> no
> > > change to the buf and toss the log item at commit time (we also need
> to
> > > set up I/O completion on the buf and whatnot).
> >
> > *nod*
> >
> > > What concerns me about this approach is that I think we introduce the
> > > possibility for subtle bugs. Existing ordered buffer code does this:
> > >
> > >         xfs_trans_ordered_buf(tp, fbuf);
> > >         xfs_trans_log_buf(tp, fbuf, 0,
> > >                           BBTOB(fbuf->b_length) - 1);
> > >
> > > ... which should continue to work fine. Allowing ordered buffers to
> > > physically log means that something like this:
> > >
> > >         xfs_trans_log_buf(tp, fbuf, 0,
> > >                           BBTOB(fbuf->b_length) - 1);
> > >         xfs_trans_ordered_buf(tp, fbuf);
> > >
> > > ... is now a bug that is only apparent after scrutiny of xfs_trans_*()
> > > and logging internals. Granted, the above already is incorrect, but it
> > > technically still works as expected. I don't see the need to turn that
> > > into a real problem by actually logging the buffer when we might not
> > > expect to.
> >
> > Well, it's not a "things go bad" bug. It's a "we screwed up an
> > optimisation" bug, because logging the buffer contents unnecessarily
> > only increases the required log bandwidth. It shouldn't affect
> > replay because the buffer is still correctly ordered in the log.
> > Hence both the transient and end states of the buffer during replay
> > will still be the same...
> >
> > > So while I agree that this could probably be made to work and I think
> it
> > > is ideal to doing any kind of logged range tracking in the deferred
> ops
> > > code, it still seems more tricky than it needs to be. To relog a held
> > > buffer in a new transaction, why not just mark the lidp dirty in the
> new
> > > transaction so it inherits all existing dirty segments? AFAICT, all we
> > > really need to do is:
> > >
> > >         tp->t_flags |= XFS_TRANS_DIRTY;
> > >         lidp->lid_flags |= XFS_LID_DIRTY;
> > >
> > > ... on the new transaction and everything should just work as designed
> > > (for a buffer that has been previously logged, held, rolled and
> > > rejoined).
> >
> > We would also need to set:
> >
> > bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
> >
> > which means we should....
> >
> > > To elaborate a bit, I think we could refactor xfs_trans_log_buf() into
> a
> > > new xfs_trans_dirty_buf() helper that covers all of the relevant bits
> > > not related to actually dirtying the bli. xfs_trans_log_buf() would
> call
> > > xfs_trans_dirty_buf() and thus would not change functionally.
> > > xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and thus
> > > the existing ordered buf users would no longer need to log a range of
> > > the buffer (which doesn't make much sense anyways).
> >
> > ... do this. :)
> >
> > > Finally, the
> > > deferred infrastructure could join/dirty/hold the buffer to the new
> > > transaction after each roll without needing to track and relog
> specific
> > > regions of the buffer. Thoughts?
> >
> > Yup, that's exactly what I was thinking should be possible by using
> > ordered buffers.... :)
> >
> > And Christoph's rework of the transaction roll and deferred inode
> > handling that he just posted should make adding buffer handling
> > quite a bit neater and cleaner.
> >
> > > Unless I'm missing something as to why this is busted, I'll take a
> > > closer look at the code and float an rfc next week since otherwise it
> > > sounds like this is something we could actually fix up in the ordered
> > > buffer code today.
> >
> > Cool.
> >
> > > > Nothing in XFS is ever simple, is it? :P
> > >
> > > There used to be a level of satisfaction at feeling I understood some
> > > new corner of XFS. Nowadays I know that just means I'm not yet aware
> of
> > > whatever dragons remain in that corner (is that paranoia? not if it's
> > > true!). :P
> >
> > Ah, the true signs of expertise: developing a knowledge base and
> > insight deep enough to understand that there is always another
> > hidden dragon poised to bite your head off. :)
> >
> > Cheers,
> >
> > Dave.
> > --
> > Dave Chinner
> > david@fromorbit.com
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* RE: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2019-03-22 16:08                               ` Darrick J. Wong
@ 2019-03-25 13:49                                 ` Shyam Kaushik
  2019-03-25 18:17                                   ` Darrick J. Wong
  0 siblings, 1 reply; 40+ messages in thread
From: Shyam Kaushik @ 2019-03-25 13:49 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Dave Chinner, Brian Foster, Alex Lyakas, linux-xfs, libor.klepac

Hi Darrick,

The original patch that was posted for 3.18-stable kernel
https://patchwork.kernel.org/patch/9885843/ didn't use xfs_defer_bjoin().

Question is, is it safe to port the original patch to the 4.14 kernel (without
xfs_defer_bjoin()), or do you think it's mandatory to get an equivalent of
xfs_defer_bjoin() into the 4.14 kernel to have this patch?

Can you please suggest? Thanks.

--Shyam

-----Original Message-----
From: Darrick J. Wong [mailto:darrick.wong@oracle.com]
Sent: 22 March 2019 21:39
To: Shyam Kaushik
Cc: Dave Chinner; Brian Foster; Alex Lyakas; linux-xfs@vger.kernel.org;
libor.klepac@bcom.cz
Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
conversion and the addition of an attribute

On Fri, Mar 22, 2019 at 02:42:36PM +0530, Shyam Kaushik wrote:
> Hi Darrick,
>
> We are trying to port your patch
>
https://github.com/torvalds/linux/commit/6e643cd094de3bd0f97edcc1db0089afa
> 24d909f to 4.14 LTS kernel. In 4.14 there is no xfs_defer_bjoin(). Can
you
> please comment if the below 4.14 LTS kernel patch looks ok to you? Do
you
> see any issues with it?

I don't see anything that resembles what xfs_defer_bjoin used to do
here, so it's hard to say without knowing if you've already backported
the pieces that made that function unnecessary or if you simply dropped
the call to satisfy the compiler...

--D

> Thanks.
>
> --Shyam
>
> PATCH
> -----
>
> diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> index ea66f04f46f7..f7316138a8db 100644
> --- a/fs/xfs/libxfs/xfs_attr.c
> +++ b/fs/xfs/libxfs/xfs_attr.c
> @@ -218,6 +218,7 @@ xfs_attr_set(
>         xfs_fsblock_t           firstblock;
>         int                     rsvd = (flags & ATTR_ROOT) != 0;
>         int                     error, err2, local;
> +       struct xfs_buf          *leaf_bp = NULL;
>
>         XFS_STATS_INC(mp, xs_attr_set);
>
> @@ -327,9 +328,15 @@ xfs_attr_set(
>                  * GROT: another possible req'mt for a double-split
btree
> op.
>                  */
>                 xfs_defer_init(args.dfops, args.firstblock);
> -               error = xfs_attr_shortform_to_leaf(&args);
> +               error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
>                 if (error)
>                         goto out_defer_cancel;
> +               /*
> +                * Prevent the leaf buffer from being unlocked so that a
> +                * concurrent AIL push cannot grab the half-baked leaf
> +                * buffer and run into problems with the write verifier.
> +                */
> +               xfs_trans_bhold(args.trans, leaf_bp);
>                 xfs_defer_ijoin(args.dfops, dp);
>                 error = xfs_defer_finish(&args.trans, args.dfops);
>                 if (error)
> @@ -337,13 +344,15 @@ xfs_attr_set(
>
>                 /*
>                  * Commit the leaf transformation.  We'll need another
> (linked)
> -                * transaction to add the new attribute to the leaf.
> +                * transaction to add the new attribute to the leaf,
which
> +                * means that we have to hold & join the leaf buffer
here
> too.
>                  */
>
>                 error = xfs_trans_roll_inode(&args.trans, dp);
>                 if (error)
>                         goto out;
> -
> +               xfs_trans_bjoin(args.trans, leaf_bp);
> +               leaf_bp = NULL;
>         }
>
>         if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
> @@ -374,8 +383,9 @@ xfs_attr_set(
>
>  out_defer_cancel:
>         xfs_defer_cancel(&dfops);
> -       args.trans = NULL;
>  out:
> +       if (leaf_bp)
> +               xfs_buf_relse(leaf_bp);
>         if (args.trans)
>                 xfs_trans_cancel(args.trans);
>         xfs_iunlock(dp, XFS_ILOCK_EXCL);
> diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c
b/fs/xfs/libxfs/xfs_attr_leaf.c
> index 40e53a4fc0a6..92ae04ac413a 100644
> --- a/fs/xfs/libxfs/xfs_attr_leaf.c
> +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
> @@ -739,10 +739,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
>  }
>
>  /*
> - * Convert from using the shortform to the leaf.
> + * Convert from using the shortform to the leaf.  On success, return
the
> + * buffer so that we can keep it locked until we're totally done with
it.
>   */
>  int
> -xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
> +xfs_attr_shortform_to_leaf(
> +       xfs_da_args_t *args,
> +       struct xfs_buf **leaf_bp)
>  {
>         xfs_inode_t *dp;
>         xfs_attr_shortform_t *sf;
> @@ -821,6 +824,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
>                 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
>         }
>         error = 0;
> +       *leaf_bp = bp;
>
>  out:
>         kmem_free(tmpbuffer);
> diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h
b/fs/xfs/libxfs/xfs_attr_leaf.h
> index f7dda0c237b0..894124efb421 100644
> --- a/fs/xfs/libxfs/xfs_attr_leaf.h
> +++ b/fs/xfs/libxfs/xfs_attr_leaf.h
> @@ -48,7 +48,8 @@ void  xfs_attr_shortform_create(struct xfs_da_args
> *args);
>  void   xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
>  int    xfs_attr_shortform_lookup(struct xfs_da_args *args);
>  int    xfs_attr_shortform_getvalue(struct xfs_da_args *args);
> -int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
> +int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
> +                       struct xfs_buf **leaf_bp);
>  int    xfs_attr_shortform_remove(struct xfs_da_args *args);
>  int    xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode
> *dp);
>  int    xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
>
>
>
> -----Original Message-----
> From: linux-xfs-owner@vger.kernel.org
> [mailto:linux-xfs-owner@vger.kernel.org] On Behalf Of Brian Foster
> Sent: 14 August 2017 17:52
> To: Alex Lyakas
> Cc: Dave Chinner; Darrick J. Wong; linux-xfs@vger.kernel.org;
> libor.klepac@bcom.cz
> Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> conversion and the addition of an attribute
>
> On Mon, Aug 14, 2017 at 11:11:41AM +0300, Alex Lyakas wrote:
> > Hello David, Brian,
> >
> > I was not able to follow the details, unfortunately. Can you confirm
> that
> > this patch is safe to go into kernel 3.18?
> >
>
> This is the open question in the separate subthread (this one is
> discussion around designing a solution for the current code):
>
> http://marc.info/?l=linux-xfs&m=150246184413604&w=2
>
> This could use confirmation, but my understanding is that this is safe
> because v3.18 doesn't have the more advanced deferred ops
> infrastructure. It uses xfs_bmap_finish() which has a max roll count of
> one and a transaction with enough reservation for 2 rolls before
> blocking reservation is required.
>
> Note that doesn't mean we'd officially post a v3.18 stable patch before
> this is fixed in the upstream code. We always fix upstream first and
> backport from there to ensure a consistent base going forward (we don't
> want to go change v3.18, end up with a slightly different upstream
> patch, then have to backport more changes to fix the original patch).
> This may be safe enough for you to use locally in the meantime, however.
>
> Brian
>
> > Thanks,
> > Alex.
> >
> >
> > -----Original Message----- From: Dave Chinner
> > Sent: Monday, August 14, 2017 3:28 AM
> > To: Brian Foster
> > Cc: Darrick J. Wong ; Alex Lyakas ; linux-xfs@vger.kernel.org ;
> > libor.klepac@bcom.cz
> > Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> > conversion and the addition of an attribute
> >
> > On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > > Using XFS_BLI_ORDERED allows us to log the buffer without
recording
> > > > a new dirty range on the buffer. IOWs, it retains whatever dirty
> range
> > > > it already had, and so after joining, marking it ordered and then
> > > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED
buffer
> > > > in the transaction.
> > > >
> > > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > > with a pre-existing dirty region is formatted for the CIL? We
> > > > haven't done that before, so I'm betting that we don't relog the
> > > > dirty region like we should be doing....
> > > >
> > > > ... and we don't relog the existing dirty range because the
> > > > ordered flag takes precedence.
> > > >
> > >
> > > Right.. so it seems that the current implementation for ordered
> buffers
> > > assumes a buffer is only ever used in one mode or the other.
> > > Additionally, the AIL assumes that any reinserted item has been
fully
> > > relogged and so it moves the LSN forward unconditionally. Current
> > > ordered buffer processing violates this constraint for an already
> logged
> > > buffer.
> >
> > Right, but it's not been a concern until now because we've only ever
> > used ordered buffers on newly allocated buffers that haven't been
> > previously logged.
> >
> > > > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > > > xfs_buf_item_format() need to also check for dirty regions. If
dirty
> > > > regions exist, then we treat it like a normal buffer rather than
an
> > > > ordered buffer. We can factor the dirty region check out of
> > > > xfs_buf_item_unlock() for this...
> > > >
> > > > Actually, check the case in xfs_buf_item_size() and remove the
> > > > ordered flag if there are dirty regions. Then
xfs_buf_item_format()
> > > > will do the right thing without needing a duplicate check...
> > > >
> > >
> > > I think that would work, assuming we actually check the
> > > xfs_buf_log_format for dirty-ness rather than just the log item. As
it
> > > is, note that ordered buffers are still "logged" in the transaction
> > > because otherwise the transaction infrastructure will assume it made
> no
> > > change to the buf and toss the log item at commit time (we also need
> to
> > > set up I/O completion on the buf and whatnot).
> >
> > *nod*
> >
> > > What concerns me about this approach is that I think we introduce
the
> > > possibility for subtle bugs. Existing ordered buffer code does this:
> > >
> > >         xfs_trans_ordered_buf(tp, fbuf);
> > >         xfs_trans_log_buf(tp, fbuf, 0,
> > >                           BBTOB(fbuf->b_length) - 1);
> > >
> > > ... which should continue to work fine. Allowing ordered buffers to
> > > physically log means that something like this:
> > >
> > >         xfs_trans_log_buf(tp, fbuf, 0,
> > >                           BBTOB(fbuf->b_length) - 1);
> > >         xfs_trans_ordered_buf(tp, fbuf);
> > >
> > > ... is now a bug that is only apparent after scrutiny of
xfs_trans_*()
> > > and logging internals. Granted, the above already is incorrect, but
it
> > > technically still works as expected. I don't see the need to turn
that
> > > into a real problem by actually logging the buffer when we might not
> > > expect to.
> >
> > Well, it's not a "things go bad" bug. It's a "we screwed up an
> > optimisation" bug, because logging the buffer contents unnecessarily
> > only increases the required log bandwidth. It shouldn't affect
> > replay because the buffer is still correctly ordered in the log.
> > Hence both the transient and end states of the buffer during replay
> > will still be the same...
> >
> > > So while I agree that this could probably be made to work and I
think
> it
> > > is ideal to doing any kind of logged range tracking in the deferred
> ops
> > > code, it still seems more tricky than it needs to be. To relog a
held
> > > buffer in a new transaction, why not just mark the lidp dirty in the
> new
> > > transaction so it inherits all existing dirty segments? AFAICT, all
we
> > > really need to do is:
> > >
> > >         tp->t_flags |= XFS_TRANS_DIRTY;
> > >         lidp->lid_flags |= XFS_LID_DIRTY;
> > >
> > > ... on the new transaction and everything should just work as
designed
> > > (for a buffer that has been previously logged, held, rolled and
> > > rejoined).
> >
> > We would also need to set:
> >
> > bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
> >
> > which means we should....
> >
> > > To elaborate a bit, I think we could refactor xfs_trans_log_buf()
into
> a
> > > new xfs_trans_dirty_buf() helper that covers all of the relevant
bits
> > > not related to actually dirtying the bli. xfs_trans_log_buf() would
> call
> > > xfs_trans_dirty_buf() and thus would not change functionally.
> > > xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and
thus
> > > the existing ordered buf users would no longer need to log a range
of
> > > the buffer (which doesn't make much sense anyways).
> >
> > ... do this. :)
> >
> > > Finally, the
> > > deferred infrastructure could join/dirty/hold the buffer to the new
> > > transaction after each roll without needing to track and relog
> specific
> > > regions of the buffer. Thoughts?
> >
> > Yup, that's exactly what I was thinking should be possible by using
> > ordered buffers.... :)
> >
> > And Christoph's rework of the transaction roll and deferred inode
> > handling that he just posted should make adding buffer handling
> > quite a bit neater and cleaner.
> >
> > > Unless I'm missing something as to why this is busted, I'll take a
> > > closer look at the code and float an rfc next week since otherwise
it
> > > sounds like this is something we could actually fix up in the
ordered
> > > buffer code today.
> >
> > Cool.
> >
> > > > Nothing in XFS is ever simple, is it? :P
> > >
> > > There used to be a level of satisfaction at feeling I understood
some
> > > new corner of XFS. Nowadays I know that just means I'm not yet aware
> of
> > > whatever dragons remain in that corner (is that paranoia? not if
it's
> > > true!). :P
> >
> > Ah, the true signs of expertise: developing a knowledge base and
> > insight deep enough to understand that there is always another
> > hidden dragon poised to bite your head off. :)
> >
> > Cheers,
> >
> > Dave.
> > --
> > Dave Chinner
> > david@fromorbit.com
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs"
in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2019-03-25 13:49                                 ` Shyam Kaushik
@ 2019-03-25 18:17                                   ` Darrick J. Wong
  2019-03-27 16:03                                     ` Alex Lyakas
  0 siblings, 1 reply; 40+ messages in thread
From: Darrick J. Wong @ 2019-03-25 18:17 UTC (permalink / raw)
  To: Shyam Kaushik
  Cc: Dave Chinner, Brian Foster, Alex Lyakas, linux-xfs, libor.klepac

On Mon, Mar 25, 2019 at 07:19:18PM +0530, Shyam Kaushik wrote:
> Hi Darrick,
> 
> The original patch that was posted for 3.18-stable kernel
> https://patchwork.kernel.org/patch/9885843/ didn't use xfs_defer_bjoin().
> 
> Question is, is it safe to port the original patch to 4.14 kernel (without
> xfs_defer_bjoin()) or do you think its mandatory to get equivalent of
> xfs_defer_bjoin() in 4.14 kernel to have this patch?
> 
> Can you please suggest? Thanks.

I have no idea.  It depends entirely on whether your kernel and intended
configuration require the functionality that xfs_defer_bjoin provided.

--D

> --Shyam
> 
> -----Original Message-----
> From: Darrick J. Wong [mailto:darrick.wong@oracle.com]
> Sent: 22 March 2019 21:39
> To: Shyam Kaushik
> Cc: Dave Chinner; Brian Foster; Alex Lyakas; linux-xfs@vger.kernel.org;
> libor.klepac@bcom.cz
> Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> conversion and the addition of an attribute
> 
> On Fri, Mar 22, 2019 at 02:42:36PM +0530, Shyam Kaushik wrote:
> > Hi Darrick,
> >
> > We are trying to port your patch
> >
> https://github.com/torvalds/linux/commit/6e643cd094de3bd0f97edcc1db0089afa
> > 24d909f to 4.14 LTS kernel. In 4.14 there is no xfs_defer_bjoin(). Can
> you
> > please comment if the below 4.14 LTS kernel patch looks ok to you? Do
> you
> > see any issues with it?
> 
> I don't see anything that resembles what xfs_defer_bjoin used to do
> here, so it's hard to say without knowing if you've already backported
> the pieces that made that function unnecessary or if you simply dropped
> the call to satisfy the compiler...
> 
> --D
> 
> > Thanks.
> >
> > --Shyam
> >
> > PATCH
> > -----
> >
> > diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> > index ea66f04f46f7..f7316138a8db 100644
> > --- a/fs/xfs/libxfs/xfs_attr.c
> > +++ b/fs/xfs/libxfs/xfs_attr.c
> > @@ -218,6 +218,7 @@ xfs_attr_set(
> >         xfs_fsblock_t           firstblock;
> >         int                     rsvd = (flags & ATTR_ROOT) != 0;
> >         int                     error, err2, local;
> > +       struct xfs_buf          *leaf_bp = NULL;
> >
> >         XFS_STATS_INC(mp, xs_attr_set);
> >
> > @@ -327,9 +328,15 @@ xfs_attr_set(
> >                  * GROT: another possible req'mt for a double-split
> btree
> > op.
> >                  */
> >                 xfs_defer_init(args.dfops, args.firstblock);
> > -               error = xfs_attr_shortform_to_leaf(&args);
> > +               error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
> >                 if (error)
> >                         goto out_defer_cancel;
> > +               /*
> > +                * Prevent the leaf buffer from being unlocked so that a
> > +                * concurrent AIL push cannot grab the half-baked leaf
> > +                * buffer and run into problems with the write verifier.
> > +                */
> > +               xfs_trans_bhold(args.trans, leaf_bp);
> >                 xfs_defer_ijoin(args.dfops, dp);
> >                 error = xfs_defer_finish(&args.trans, args.dfops);
> >                 if (error)
> > @@ -337,13 +344,15 @@ xfs_attr_set(
> >
> >                 /*
> >                  * Commit the leaf transformation.  We'll need another
> > (linked)
> > -                * transaction to add the new attribute to the leaf.
> > +                * transaction to add the new attribute to the leaf,
> which
> > +                * means that we have to hold & join the leaf buffer
> here
> > too.
> >                  */
> >
> >                 error = xfs_trans_roll_inode(&args.trans, dp);
> >                 if (error)
> >                         goto out;
> > -
> > +               xfs_trans_bjoin(args.trans, leaf_bp);
> > +               leaf_bp = NULL;
> >         }
> >
> >         if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
> > @@ -374,8 +383,9 @@ xfs_attr_set(
> >
> >  out_defer_cancel:
> >         xfs_defer_cancel(&dfops);
> > -       args.trans = NULL;
> >  out:
> > +       if (leaf_bp)
> > +               xfs_buf_relse(leaf_bp);
> >         if (args.trans)
> >                 xfs_trans_cancel(args.trans);
> >         xfs_iunlock(dp, XFS_ILOCK_EXCL);
> > diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c
> b/fs/xfs/libxfs/xfs_attr_leaf.c
> > index 40e53a4fc0a6..92ae04ac413a 100644
> > --- a/fs/xfs/libxfs/xfs_attr_leaf.c
> > +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
> > @@ -739,10 +739,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
> >  }
> >
> >  /*
> > - * Convert from using the shortform to the leaf.
> > + * Convert from using the shortform to the leaf.  On success, return
> the
> > + * buffer so that we can keep it locked until we're totally done with
> it.
> >   */
> >  int
> > -xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
> > +xfs_attr_shortform_to_leaf(
> > +       xfs_da_args_t *args,
> > +       struct xfs_buf **leaf_bp)
> >  {
> >         xfs_inode_t *dp;
> >         xfs_attr_shortform_t *sf;
> > @@ -821,6 +824,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
> >                 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
> >         }
> >         error = 0;
> > +       *leaf_bp = bp;
> >
> >  out:
> >         kmem_free(tmpbuffer);
> > diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h
> b/fs/xfs/libxfs/xfs_attr_leaf.h
> > index f7dda0c237b0..894124efb421 100644
> > --- a/fs/xfs/libxfs/xfs_attr_leaf.h
> > +++ b/fs/xfs/libxfs/xfs_attr_leaf.h
> > @@ -48,7 +48,8 @@ void  xfs_attr_shortform_create(struct xfs_da_args
> > *args);
> >  void   xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
> >  int    xfs_attr_shortform_lookup(struct xfs_da_args *args);
> >  int    xfs_attr_shortform_getvalue(struct xfs_da_args *args);
> > -int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
> > +int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
> > +                       struct xfs_buf **leaf_bp);
> >  int    xfs_attr_shortform_remove(struct xfs_da_args *args);
> >  int    xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode
> > *dp);
> >  int    xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
> >
> >
> >
> > -----Original Message-----
> > From: linux-xfs-owner@vger.kernel.org
> > [mailto:linux-xfs-owner@vger.kernel.org] On Behalf Of Brian Foster
> > Sent: 14 August 2017 17:52
> > To: Alex Lyakas
> > Cc: Dave Chinner; Darrick J. Wong; linux-xfs@vger.kernel.org;
> > libor.klepac@bcom.cz
> > Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> > conversion and the addition of an attribute
> >
> > On Mon, Aug 14, 2017 at 11:11:41AM +0300, Alex Lyakas wrote:
> > > Hello David, Brian,
> > >
> > > I was not able to follow the details, unfortunately. Can you confirm
> > that
> > > this patch is safe to go into kernel 3.18?
> > >
> >
> > This is the open question in the separate subthread (this one is
> > discussion around designing a solution for the current code):
> >
> > http://marc.info/?l=linux-xfs&m=150246184413604&w=2
> >
> > This could use confirmation, but my understanding is that this is safe
> > because v3.18 doesn't have the more advanced deferred ops
> > infrastructure. It uses xfs_bmap_finish() which has a max roll count of
> > one and a transaction with enough reservation for 2 rolls before
> > blocking reservation is required.
> >
> > Note that doesn't mean we'd officially post a v3.18 stable patch before
> > this is fixed in the upstream code. We always fix upstream first and
> > backport from there to ensure a consistent base going forward (we don't
> > want to go change v3.18, end up with a slightly different upstream
> > patch, then have to backport more changes to fix the original patch).
> > This may be safe enough for you to use locally in the meantime, however.
> >
> > Brian
> >
> > > Thanks,
> > > Alex.
> > >
> > >
> > > -----Original Message----- From: Dave Chinner
> > > Sent: Monday, August 14, 2017 3:28 AM
> > > To: Brian Foster
> > > Cc: Darrick J. Wong ; Alex Lyakas ; linux-xfs@vger.kernel.org ;
> > > libor.klepac@bcom.cz
> > > Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> > > conversion and the addition of an attribute
> > >
> > > On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > > > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > > > Using XFS_BLI_ORDERED allows us to log the buffer without
> recording
> > > > > a new dirty range on the buffer. IOWs, it retains whatever dirty
> > range
> > > > > it already had, and so after joining, marking it ordered and then
> > > > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED
> buffer
> > > > > in the transaction.
> > > > >
> > > > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > > > with a pre-existing dirty region is formatted for the CIL? We
> > > > > haven't done that before, so I'm betting that we don't relog the
> > > > > dirty region like we should be doing....
> > > > >
> > > > > ... and we don't relog the existing dirty range because the
> > > > > ordered flag takes precedence.
> > > > >
> > > >
> > > > Right.. so it seems that the current implementation for ordered
> > buffers
> > > > assumes a buffer is only ever used in one mode or the other.
> > > > Additionally, the AIL assumes that any reinserted item has been
> fully
> > > > relogged and so it moves the LSN forward unconditionally. Current
> > > > ordered buffer processing violates this constraint for an already
> > logged
> > > > buffer.
> > >
> > > Right, but it's not been a concern until now because we've only ever
> > > used ordered buffers on newly allocated buffers that haven't been
> > > previously logged.
> > >
> > > > > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > > > > xfs_buf_item_format() need to also check for dirty regions. If
> dirty
> > > > > regions exist, then we treat it like a normal buffer rather than
> an
> > > > > ordered buffer. We can factor the dirty region check out of
> > > > > xfs_buf_item_unlock() for this...
> > > > >
> > > > > Actually, check the case in xfs_buf_item_size() and remove the
> > > > > ordered flag if there are dirty regions. Then
> xfs_buf_item_format()
> > > > > will do the right thing without needing a duplicate check...
> > > > >
> > > >
> > > > I think that would work, assuming we actually check the
> > > > xfs_buf_log_format for dirty-ness rather than just the log item. As
> it
> > > > is, note that ordered buffers are still "logged" in the transaction
> > > > because otherwise the transaction infrastructure will assume it made
> > no
> > > > change to the buf and toss the log item at commit time (we also need
> > to
> > > > set up I/O completion on the buf and whatnot).
> > >
> > > *nod*
> > >
> > > > What concerns me about this approach is that I think we introduce
> the
> > > > possibility for subtle bugs. Existing ordered buffer code does this:
> > > >
> > > >         xfs_trans_ordered_buf(tp, fbuf);
> > > >         xfs_trans_log_buf(tp, fbuf, 0,
> > > >                           BBTOB(fbuf->b_length) - 1);
> > > >
> > > > ... which should continue to work fine. Allowing ordered buffers to
> > > > physically log means that something like this:
> > > >
> > > >         xfs_trans_log_buf(tp, fbuf, 0,
> > > >                           BBTOB(fbuf->b_length) - 1);
> > > >         xfs_trans_ordered_buf(tp, fbuf);
> > > >
> > > > ... is now a bug that is only apparent after scrutiny of
> xfs_trans_*()
> > > > and logging internals. Granted, the above already is incorrect, but
> it
> > > > technically still works as expected. I don't see the need to turn
> that
> > > > into a real problem by actually logging the buffer when we might not
> > > > expect to.
> > >
> > > Well, it's not a "things go bad" bug. It's a "we screwed up an
> > > optimisation" bug, because logging the buffer contents unnecessarily
> > > only increases the required log bandwidth. It shouldn't affect
> > > replay because the buffer is still correctly ordered in the log.
> > > Hence both the transient and end states of the buffer during replay
> > > will still be the same...
> > >
> > > > So while I agree that this could probably be made to work and I
> think
> > it
> > > > is ideal to doing any kind of logged range tracking in the deferred
> > ops
> > > > code, it still seems more tricky than it needs to be. To relog a
> held
> > > > buffer in a new transaction, why not just mark the lidp dirty in the
> > new
> > > > transaction so it inherits all existing dirty segments? AFAICT, all
> we
> > > > really need to do is:
> > > >
> > > >         tp->t_flags |= XFS_TRANS_DIRTY;
> > > >         lidp->lid_flags |= XFS_LID_DIRTY;
> > > >
> > > > ... on the new transaction and everything should just work as
> designed
> > > > (for a buffer that has been previously logged, held, rolled and
> > > > rejoined).
> > >
> > > We would also need to set:
> > >
> > > bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
> > >
> > > which means we should....
> > >
> > > > To elaborate a bit, I think we could refactor xfs_trans_log_buf()
> into
> > a
> > > > new xfs_trans_dirty_buf() helper that covers all of the relevant
> bits
> > > > not related to actually dirtying the bli. xfs_trans_log_buf() would
> > call
> > > > xfs_trans_dirty_buf() and thus would not change functionally.
> > > > xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and
> thus
> > > > the existing ordered buf users would no longer need to log a range
> of
> > > > the buffer (which doesn't make much sense anyways).
> > >
> > > ... do this. :)
> > >
> > > > Finally, the
> > > > deferred infrastructure could join/dirty/hold the buffer to the new
> > > > transaction after each roll without needing to track and relog
> > specific
> > > > regions of the buffer. Thoughts?
> > >
> > > Yup, that's exactly what I was thinking should be possible by using
> > > ordered buffers.... :)
> > >
> > > And Christoph's rework of the transaction roll and deferred inode
> > > handling that he just posted should make adding buffer handling
> > > quite a bit neater and cleaner.
> > >
> > > > Unless I'm missing something as to why this is busted, I'll take a
> > > > closer look at the code and float an rfc next week since otherwise
> it
> > > > sounds like this is something we could actually fix up in the
> ordered
> > > > buffer code today.
> > >
> > > Cool.
> > >
> > > > > Nothing in XFS is ever simple, is it? :P
> > > >
> > > > There used to be a level of satisfaction at feeling I understood
> some
> > > > new corner of XFS. Nowadays I know that just means I'm not yet aware
> > of
> > > > whatever dragons remain in that corner (is that paranoia? not if
> it's
> > > > true!). :P
> > >
> > > Ah, the true signs of expertise: developing a knowledge base and
> > > insight deep enough to understand that there is always another
> > > hidden dragon poised to bite your head off. :)
> > >
> > > Cheers,
> > >
> > > Dave.
> > > --
> > > Dave Chinner
> > > david@fromorbit.com
> > >
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs"
> in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2019-03-25 18:17                                   ` Darrick J. Wong
@ 2019-03-27 16:03                                     ` Alex Lyakas
  2019-03-27 20:46                                       ` Dave Chinner
  0 siblings, 1 reply; 40+ messages in thread
From: Alex Lyakas @ 2019-03-27 16:03 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Shyam Kaushik, Dave Chinner, Brian Foster, linux-xfs

Hi Darrick,

I started this long email thread originally, and posted a patch with
the proposed fix to the "Metadata corruption at
xfs_attr3_leaf_write_verify" problem. We reported this problem
originally. Eventually we found a stable reproducer for the issue,
added different prints in the code, and posted our analysis to
community in https://www.spinics.net/lists/linux-xfs/msg08752.html.
The community (Dave) confirmed that we found a "zero day" bug, and
gave us some hints on how to fix it. Hence this thread.

After reviewing my patch, Dave expressed the following concern:

"The problem is that the locked buffer is not joined and logged in
the rolling transactions run in xfs_defer_ops. Hence it can pin the
tail of the AIL, and this can prevent the transaction roll from
regranting the log space necessary to continue rolling the
transaction for the required number of transactions to complete the
deferred ops. If this happens, we end up with a log space deadlock."

However, after more discussions, there was more or less a consensus
that for kernel 3.18 this fix should be safe. We went ahead, applied
and qualified the fix. With this fix we did not see the issue in any
of the production systems, which were hitting the issue frequently.

We are now in the process of moving to long-term kernel 4.14.x. We
see, however, that this problem was fixed by the community only for
kernels 4.15 and later. Since we had several production systems
hitting this issue frequently, we need a fix for it in kernel 4.14.

Hence our question: whether our original patch should be safe to apply
to kernel 4.14?

Brian, Dave, can you perhaps also comment?

Thanks,
Alex.


On Mon, Mar 25, 2019 at 8:18 PM Darrick J. Wong <darrick.wong@oracle.com> wrote:
>
> On Mon, Mar 25, 2019 at 07:19:18PM +0530, Shyam Kaushik wrote:
> > Hi Darrick,
> >
> > The original patch that was posted for 3.18-stable kernel
> > https://patchwork.kernel.org/patch/9885843/ didn't use xfs_defer_bjoin().
> >
> > Question is, is it safe to port the original patch to 4.14 kernel (without
> > xfs_defer_bjoin()) or do you think its mandatory to get equivalent of
> > xfs_defer_bjoin() in 4.14 kernel to have this patch?
> >
> > Can you please suggest? Thanks.
>
> I have no idea.  It depends entirely on whether your kernel and intended
> configuration require the functionality that xfs_defer_bjoin provided.
>
> --D
>
> > --Shyam
> >
> > -----Original Message-----
> > From: Darrick J. Wong [mailto:darrick.wong@oracle.com]
> > Sent: 22 March 2019 21:39
> > To: Shyam Kaushik
> > Cc: Dave Chinner; Brian Foster; Alex Lyakas; linux-xfs@vger.kernel.org;
> > libor.klepac@bcom.cz
> > Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> > conversion and the addition of an attribute
> >
> > On Fri, Mar 22, 2019 at 02:42:36PM +0530, Shyam Kaushik wrote:
> > > Hi Darrick,
> > >
> > > We are trying to port your patch
> > >
> > https://github.com/torvalds/linux/commit/6e643cd094de3bd0f97edcc1db0089afa
> > > 24d909f to 4.14 LTS kernel. In 4.14 there is no xfs_defer_bjoin(). Can
> > you
> > > please comment if the below 4.14 LTS kernel patch looks ok to you? Do
> > you
> > > see any issues with it?
> >
> > I don't see anything that resembles what xfs_defer_bjoin used to do
> > here, so it's hard to say without knowing if you've already backported
> > the pieces that made that function unnecessary or if you simply dropped
> > the call to satisfy the compiler...
> >
> > --D
> >
> > > Thanks.
> > >
> > > --Shyam
> > >
> > > PATCH
> > > -----
> > >
> > > diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
> > > index ea66f04f46f7..f7316138a8db 100644
> > > --- a/fs/xfs/libxfs/xfs_attr.c
> > > +++ b/fs/xfs/libxfs/xfs_attr.c
> > > @@ -218,6 +218,7 @@ xfs_attr_set(
> > >         xfs_fsblock_t           firstblock;
> > >         int                     rsvd = (flags & ATTR_ROOT) != 0;
> > >         int                     error, err2, local;
> > > +       struct xfs_buf          *leaf_bp = NULL;
> > >
> > >         XFS_STATS_INC(mp, xs_attr_set);
> > >
> > > @@ -327,9 +328,15 @@ xfs_attr_set(
> > >                  * GROT: another possible req'mt for a double-split
> > btree
> > > op.
> > >                  */
> > >                 xfs_defer_init(args.dfops, args.firstblock);
> > > -               error = xfs_attr_shortform_to_leaf(&args);
> > > +               error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
> > >                 if (error)
> > >                         goto out_defer_cancel;
> > > +               /*
> > > +                * Prevent the leaf buffer from being unlocked so that a
> > > +                * concurrent AIL push cannot grab the half-baked leaf
> > > +                * buffer and run into problems with the write verifier.
> > > +                */
> > > +               xfs_trans_bhold(args.trans, leaf_bp);
> > >                 xfs_defer_ijoin(args.dfops, dp);
> > >                 error = xfs_defer_finish(&args.trans, args.dfops);
> > >                 if (error)
> > > @@ -337,13 +344,15 @@ xfs_attr_set(
> > >
> > >                 /*
> > >                  * Commit the leaf transformation.  We'll need another
> > > (linked)
> > > -                * transaction to add the new attribute to the leaf.
> > > +                * transaction to add the new attribute to the leaf,
> > which
> > > +                * means that we have to hold & join the leaf buffer
> > here
> > > too.
> > >                  */
> > >
> > >                 error = xfs_trans_roll_inode(&args.trans, dp);
> > >                 if (error)
> > >                         goto out;
> > > -
> > > +               xfs_trans_bjoin(args.trans, leaf_bp);
> > > +               leaf_bp = NULL;
> > >         }
> > >
> > >         if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
> > > @@ -374,8 +383,9 @@ xfs_attr_set(
> > >
> > >  out_defer_cancel:
> > >         xfs_defer_cancel(&dfops);
> > > -       args.trans = NULL;
> > >  out:
> > > +       if (leaf_bp)
> > > +               xfs_buf_relse(leaf_bp);
> > >         if (args.trans)
> > >                 xfs_trans_cancel(args.trans);
> > >         xfs_iunlock(dp, XFS_ILOCK_EXCL);
> > > diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c
> > b/fs/xfs/libxfs/xfs_attr_leaf.c
> > > index 40e53a4fc0a6..92ae04ac413a 100644
> > > --- a/fs/xfs/libxfs/xfs_attr_leaf.c
> > > +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
> > > @@ -739,10 +739,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
> > >  }
> > >
> > >  /*
> > > - * Convert from using the shortform to the leaf.
> > > + * Convert from using the shortform to the leaf.  On success, return
> > the
> > > + * buffer so that we can keep it locked until we're totally done with
> > it.
> > >   */
> > >  int
> > > -xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
> > > +xfs_attr_shortform_to_leaf(
> > > +       xfs_da_args_t *args,
> > > +       struct xfs_buf **leaf_bp)
> > >  {
> > >         xfs_inode_t *dp;
> > >         xfs_attr_shortform_t *sf;
> > > @@ -821,6 +824,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
> > >                 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
> > >         }
> > >         error = 0;
> > > +       *leaf_bp = bp;
> > >
> > >  out:
> > >         kmem_free(tmpbuffer);
> > > diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h
> > b/fs/xfs/libxfs/xfs_attr_leaf.h
> > > index f7dda0c237b0..894124efb421 100644
> > > --- a/fs/xfs/libxfs/xfs_attr_leaf.h
> > > +++ b/fs/xfs/libxfs/xfs_attr_leaf.h
> > > @@ -48,7 +48,8 @@ void  xfs_attr_shortform_create(struct xfs_da_args
> > > *args);
> > >  void   xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
> > >  int    xfs_attr_shortform_lookup(struct xfs_da_args *args);
> > >  int    xfs_attr_shortform_getvalue(struct xfs_da_args *args);
> > > -int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
> > > +int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
> > > +                       struct xfs_buf **leaf_bp);
> > >  int    xfs_attr_shortform_remove(struct xfs_da_args *args);
> > >  int    xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode
> > > *dp);
> > >  int    xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
> > >
> > >
> > >
> > > -----Original Message-----
> > > From: linux-xfs-owner@vger.kernel.org
> > > [mailto:linux-xfs-owner@vger.kernel.org] On Behalf Of Brian Foster
> > > Sent: 14 August 2017 17:52
> > > To: Alex Lyakas
> > > Cc: Dave Chinner; Darrick J. Wong; linux-xfs@vger.kernel.org;
> > > libor.klepac@bcom.cz
> > > Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> > > conversion and the addition of an attribute
> > >
> > > On Mon, Aug 14, 2017 at 11:11:41AM +0300, Alex Lyakas wrote:
> > > > Hello David, Brian,
> > > >
> > > > I was not able to follow the details, unfortunately. Can you confirm
> > > that
> > > > this patch is safe to go into kernel 3.18?
> > > >
> > >
> > > This is the open question in the separate subthread (this one is
> > > discussion around designing a solution for the current code):
> > >
> > > http://marc.info/?l=linux-xfs&m=150246184413604&w=2
> > >
> > > This could use confirmation, but my understanding is that this is safe
> > > because v3.18 doesn't have the more advanced deferred ops
> > > infrastructure. It uses xfs_bmap_finish() which has a max roll count of
> > > one and a transaction with enough reservation for 2 rolls before
> > > blocking reservation is required.
> > >
> > > Note that doesn't mean we'd officially post a v3.18 stable patch before
> > > this is fixed in the upstream code. We always fix upstream first and
> > > backport from there to ensure a consistent base going forward (we don't
> > > want to go change v3.18, end up with a slightly different upstream
> > > patch, then have to backport more changes to fix the original patch).
> > > This may be safe enough for you to use locally in the meantime, however.
> > >
> > > Brian
> > >
> > > > Thanks,
> > > > Alex.
> > > >
> > > >
> > > > -----Original Message----- From: Dave Chinner
> > > > Sent: Monday, August 14, 2017 3:28 AM
> > > > To: Brian Foster
> > > > Cc: Darrick J. Wong ; Alex Lyakas ; linux-xfs@vger.kernel.org ;
> > > > libor.klepac@bcom.cz
> > > > Subject: Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf
> > > > conversion and the addition of an attribute
> > > >
> > > > On Sat, Aug 12, 2017 at 10:04:34AM -0400, Brian Foster wrote:
> > > > > On Sat, Aug 12, 2017 at 10:16:37AM +1000, Dave Chinner wrote:
> > > > > > On Fri, Aug 11, 2017 at 10:27:43AM -0400, Brian Foster wrote:
> > > > > > > On Fri, Aug 11, 2017 at 12:22:04PM +1000, Dave Chinner wrote:
> > > > > > Using XFS_BLI_ORDERED allows us to log the buffer without
> > recording
> > > > > > a new dirty range on the buffer. IOWs, it retains whatever dirty
> > > range
> > > > > > it already had, and so after joining, marking it ordered and then
> > > > > > logging the buffer, we have a XFS_BLI_DIRTY | XFS_BLI_ORDERED
> > buffer
> > > > > > in the transaction.
> > > > > >
> > > > > > The question is this: what happens when a XFS_BLI_ORDERED buffer
> > > > > > with a pre-existing dirty region is formatted for the CIL? We
> > > > > > haven't done that before, so I'm betting that we don't relog the
> > > > > > dirty region like we should be doing....
> > > > > >
> > > > > > ... and we don't relog the existing dirty range because the
> > > > > > ordered flag takes precedence.
> > > > > >
> > > > >
> > > > > Right.. so it seems that the current implementation for ordered
> > > buffers
> > > > > assumes a buffer is only ever used in one mode or the other.
> > > > > Additionally, the AIL assumes that any reinserted item has been
> > fully
> > > > > relogged and so it moves the LSN forward unconditionally. Current
> > > > > ordered buffer processing violates this constraint for an already
> > > logged
> > > > > buffer.
> > > >
> > > > Right, but it's not been a concern until now because we've only ever
> > > > used ordered buffers on newly allocated buffers that haven't been
> > > > previously logged.
> > > >
> > > > > > Ok, the ordered buffer checks in xfs_buf_item_size() and
> > > > > > xfs_buf_item_format() need to also check for dirty regions. If
> > dirty
> > > > > > regions exist, then we treat it like a normal buffer rather than
> > an
> > > > > > ordered buffer. We can factor the dirty region check out of
> > > > > > xfs_buf_item_unlock() for this...
> > > > > >
> > > > > > Actually, check the case in xfs_buf_item_size() and remove the
> > > > > > ordered flag if there are dirty regions. Then
> > xfs_buf_item_format()
> > > > > > will do the right thing without needing a duplicate check...
> > > > > >
> > > > >
> > > > > I think that would work, assuming we actually check the
> > > > > xfs_buf_log_format for dirty-ness rather than just the log item. As
> > it
> > > > > is, note that ordered buffers are still "logged" in the transaction
> > > > > because otherwise the transaction infrastructure will assume it made
> > > no
> > > > > change to the buf and toss the log item at commit time (we also need
> > > to
> > > > > set up I/O completion on the buf and whatnot).
> > > >
> > > > *nod*
> > > >
> > > > > What concerns me about this approach is that I think we introduce
> > the
> > > > > possibility for subtle bugs. Existing ordered buffer code does this:
> > > > >
> > > > >         xfs_trans_ordered_buf(tp, fbuf);
> > > > >         xfs_trans_log_buf(tp, fbuf, 0,
> > > > >                           BBTOB(fbuf->b_length) - 1);
> > > > >
> > > > > ... which should continue to work fine. Allowing ordered buffers to
> > > > > physically log means that something like this:
> > > > >
> > > > >         xfs_trans_log_buf(tp, fbuf, 0,
> > > > >                           BBTOB(fbuf->b_length) - 1);
> > > > >         xfs_trans_ordered_buf(tp, fbuf);
> > > > >
> > > > > ... is now a bug that is only apparent after scrutiny of
> > xfs_trans_*()
> > > > > and logging internals. Granted, the above already is incorrect, but
> > it
> > > > > technically still works as expected. I don't see the need to turn
> > that
> > > > > into a real problem by actually logging the buffer when we might not
> > > > > expect to.
> > > >
> > > > Well, it's not a "things go bad" bug. It's a "we screwed up an
> > > > optimisation" bug, because logging the buffer contents unnecessarily
> > > > only increases the required log bandwidth. It shouldn't affect
> > > > replay because the buffer is still correctly ordered in the log.
> > > > Hence both the transient and end states of the buffer during replay
> > > > will still be the same...
> > > >
> > > > > So while I agree that this could probably be made to work and I
> > think
> > > it
> > > > > is ideal to avoid doing any kind of logged range tracking in the deferred
> > > ops
> > > > > code, it still seems more tricky than it needs to be. To relog a
> > held
> > held
> > > > > buffer in a new transaction, why not just mark the lidp dirty in the
> > > new
> > > > > transaction so it inherits all existing dirty segments? AFAICT, all
> > we
> > > > > really need to do is:
> > > > >
> > > > >         tp->t_flags |= XFS_TRANS_DIRTY;
> > > > >         lidp->lid_flags |= XFS_LID_DIRTY;
> > > > >
> > > > > ... on the new transaction and everything should just work as
> > designed
> > > > > (for a buffer that has been previously logged, held, rolled and
> > > > > rejoined).
> > > >
> > > > We would also need to set:
> > > >
> > > > bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
> > > >
> > > > which means we should....
> > > >
> > > > > To elaborate a bit, I think we could refactor xfs_trans_log_buf()
> > into
> > > a
> > > > > new xfs_trans_dirty_buf() helper that covers all of the relevant
> > bits
> > > > > not related to actually dirtying the bli. xfs_trans_log_buf() would
> > > call
> > > > > xfs_trans_dirty_buf() and thus would not change functionally.
> > > > > xfs_trans_ordered_buf() could now call xfs_trans_dirty_buf() and
> > thus
> > > > > the existing ordered buf users would no longer need to log a range
> > of
> > > > > the buffer (which doesn't make much sense anyways).
> > > >
> > > > ... do this. :)
> > > >
> > > > > Finally, the
> > > > > deferred infrastructure could join/dirty/hold the buffer to the new
> > > > > transaction after each roll without needing to track and relog
> > > specific
> > > > > regions of the buffer. Thoughts?
> > > >
> > > > Yup, that's exactly what I was thinking should be possible by using
> > > > ordered buffers.... :)
> > > >
> > > > And Christoph's rework of the transaction roll and deferred inode
> > > > handling that he just posted should make adding buffer handling
> > > > quite a bit neater and cleaner.
> > > >
> > > > > Unless I'm missing something as to why this is busted, I'll take a
> > > > > closer look at the code and float an rfc next week since otherwise
> > it
> > > > > sounds like this is something we could actually fix up in the
> > ordered
> > > > > buffer code today.
> > > >
> > > > Cool.
> > > >
> > > > > > Nothing in XFS is ever simple, is it? :P
> > > > >
> > > > > There used to be a level of satisfaction at feeling I understood
> > some
> > > > > new corner of XFS. Nowadays I know that just means I'm not yet aware
> > > of
> > > > > whatever dragons remain in that corner (is that paranoia? not if
> > it's
> > > > > true!). :P
> > > >
> > > > Ah, the true signs of expertise: developing a knowledge base and
> > > > insight deep enough to understand that there is always another
> > > > hidden dragon poised to bite your head off. :)
> > > >
> > > > Cheers,
> > > >
> > > > Dave.
> > > > --
> > > > Dave Chinner
> > > > david@fromorbit.com
> > > >
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs"
> > in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2019-03-27 16:03                                     ` Alex Lyakas
@ 2019-03-27 20:46                                       ` Dave Chinner
  2019-03-28 11:26                                         ` Alex Lyakas
  0 siblings, 1 reply; 40+ messages in thread
From: Dave Chinner @ 2019-03-27 20:46 UTC (permalink / raw)
  To: Alex Lyakas; +Cc: Darrick J. Wong, Shyam Kaushik, Brian Foster, linux-xfs

On Wed, Mar 27, 2019 at 06:03:38PM +0200, Alex Lyakas wrote:
> Hi Darrick,
> 
> I started this long email thread originally, and posted a patch with
> the proposed fix to the "Metadata corruption at
> xfs_attr3_leaf_write_verify" problem. We reported this problem
> originally. Eventually we found a stable reproducer for the issue,
> added different prints in the code, and posted our analysis to
> community in https://www.spinics.net/lists/linux-xfs/msg08752.html.
> The community (Dave) confirmed that we found a "zero day" bug, and
> gave us some hints on how to fix it. Hence this thread.
> 
> After reviewing my patch, Dave expressed the following concern:
> 
> "The problem is that the locked buffer is not joined and logged in
> the rolling transactions run in xfs_defer_ops. Hence it can pin the
> tail of the AIL, and this can prevent the transaction roll from
> regranting the log space necessary to continue rolling the
> transaction for the required number of transactions to complete the
> deferred ops. If this happens, we end up with a log space deadlock."
> 
> However, after more discussions, there was more or less a consensus
> that for kernel 3.18 this fix should be safe. We went ahead, applied
> and qualified the fix. With this fix we did not see the issue in any
> of the production systems, which were hitting the issue frequently.
> 
> We are now in the process of moving to long-term kernel 4.14.x. We
> see, however, that this problem was fixed by the community only for
> kernels 4.15 and later. Since we had several production systems
> hitting this issue frequently, we need a fix for it in kernel 4.14.
> 
> Hence our question: whether our original patch should be safe to apply
> to kernel 4.14?
> 
> Brian, Dave, can you perhaps also comment?

The right thing to do is to backport the upstream fix and all its
dependencies to the LTS kernel. If it's 4.15 to 4.14, everything
should pretty much just drop in without too much hassle. Then test
the backport fixes the problem it was intended to fix, post the
patch series to the XFS list as [STABLE PATCH X/Y] with a cc to
stable@kernel.org, and if it passes review (shouldn't be an issue if
it's a straight backport) it will get merged into the 4.14-LTS kernel
tree and go through the stable kernel QA process.

This gets the problem fixed for all users of the LTS kernel, and you
do not have to maintain the backport yourself as you update to new
LTS kernels over the life of your product....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  2019-03-27 20:46                                       ` Dave Chinner
@ 2019-03-28 11:26                                         ` Alex Lyakas
  0 siblings, 0 replies; 40+ messages in thread
From: Alex Lyakas @ 2019-03-28 11:26 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Darrick J. Wong, Shyam Kaushik, Brian Foster, linux-xfs

Hi Dave,
Thank you for your response.

On Wed, Mar 27, 2019 at 10:46 PM Dave Chinner <david@fromorbit.com> wrote:
>
> On Wed, Mar 27, 2019 at 06:03:38PM +0200, Alex Lyakas wrote:
> > Hi Darrick,
> >
> > I started this long email thread originally, and posted a patch with
> > the proposed fix to the "Metadata corruption at
> > xfs_attr3_leaf_write_verify" problem. We reported this problem
> > originally. Eventually we found a stable reproducer for the issue,
> > added different prints in the code, and posted our analysis to
> > community in https://www.spinics.net/lists/linux-xfs/msg08752.html.
> > The community (Dave) confirmed that we found a "zero day" bug, and
> > gave us some hints on how to fix it. Hence this thread.
> >
> > After reviewing my patch, Dave expressed the following concern:
> >
> > "The problem is that the locked buffer is not joined and logged in
> > the rolling transactions run in xfs_defer_ops. Hence it can pin the
> > tail of the AIL, and this can prevent the transaction roll from
> > regranting the log space necessary to continue rolling the
> > transaction for the required number of transactions to complete the
> > deferred ops. If this happens, we end up with a log space deadlock."
> >
> > However, after more discussions, there was more or less a consensus
> > that for kernel 3.18 this fix should be safe. We went ahead, applied
> > and qualified the fix. With this fix we did not see the issue in any
> > of the production systems, which were hitting the issue frequently.
> >
> > We are now in the process of moving to long-term kernel 4.14.x. We
> > see, however, that this problem was fixed by the community only for
> > kernels 4.15 and later. Since we had several production systems
> > hitting this issue frequently, we need a fix for it in kernel 4.14.
> >
> > Hence our question: whether our original patch should be safe to apply
> > to kernel 4.14?
> >
> > Brian, Dave, can you perhaps also comment?
>
> The right thing to do is to backport the upstream fix and all its
> dependencies to the LTS kernel. If it's 4.15 to 4.14, everything
> should pretty much just drop in without too much hassle. Then test
> the backport fixes the problem it was intended to fix, post the
> patch series to the XFS list as [STABLE PATCH X/Y] with a cc to
> stable@kernel.org, and if it passes review (shouldn't be an issue if
> it's a straight backport) it will get merged into the 4.14-LTS kernel
> tree and go through the stable kernel QA process.
We will work on backporting the fix to kernel 4.14. We will post the
fix for community review once it is ready.

Thanks,
Alex.



>
> This gets the problem fixed for all users of the LTS kernel, and you
> do not have to maintain the backport yourself as you update to new
> LTS kernels over the life of your product....
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2019-03-28 11:26 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-09 11:06 [PATCH] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute alex
2017-08-09 13:17 ` Brian Foster
2017-08-09 21:33 ` Dave Chinner
2017-08-10  8:02   ` Alex Lyakas
2017-08-10 11:33     ` Dave Chinner
2017-08-10 12:09       ` Alex Lyakas
2017-08-10 14:52         ` Brian Foster
2017-08-10 17:55           ` Darrick J. Wong
2017-08-10 18:32             ` Brian Foster
2017-08-11  2:22               ` Dave Chinner
2017-08-11 14:27                 ` Brian Foster
2017-08-12  0:16                   ` Dave Chinner
2017-08-12 14:04                     ` Brian Foster
2017-08-14  0:28                       ` Dave Chinner
2017-08-14  8:11                         ` Alex Lyakas
2017-08-14 12:22                           ` Brian Foster
2017-08-14 16:04                             ` Alex Lyakas
2017-08-14 21:33                               ` Darrick J. Wong
2019-03-22  9:12                             ` Shyam Kaushik
2019-03-22 16:08                               ` Darrick J. Wong
2019-03-25 13:49                                 ` Shyam Kaushik
2019-03-25 18:17                                   ` Darrick J. Wong
2019-03-27 16:03                                     ` Alex Lyakas
2019-03-27 20:46                                       ` Dave Chinner
2019-03-28 11:26                                         ` Alex Lyakas
2017-08-17 20:38                         ` Brian Foster
2017-08-17 22:31                           ` Darrick J. Wong
2017-08-18 11:39                             ` Brian Foster
2017-08-18 15:37                               ` Darrick J. Wong
2017-08-18  2:04                           ` Dave Chinner
2017-08-18 11:42                             ` Brian Foster
2017-08-11  2:09             ` Dave Chinner
2017-08-11 14:30               ` Brian Foster
2017-08-11 12:53   ` Christoph Hellwig
2017-08-11 16:52     ` Darrick J. Wong
2017-08-12  7:37       ` Christoph Hellwig
2017-11-21 15:31 ` Libor Klepáč
2017-11-21 16:24   ` Brian Foster
2017-11-21 18:50     ` Darrick J. Wong
2017-11-30 17:55       ` Darrick J. Wong

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.