Linux-XFS Archive on lore.kernel.org
 help / color / Atom feed
* [RFC PATCH] xfs: support shrinking unused space in the last AG
       [not found] <20201014005809.6619-1-hsiangkao.ref@aol.com>
@ 2020-10-14  0:58 ` Gao Xiang
  2020-10-14  1:06   ` Gao Xiang
                     ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Gao Xiang @ 2020-10-14  0:58 UTC (permalink / raw)
  To: linux-xfs; +Cc: Gao Xiang

From: Gao Xiang <hsiangkao@redhat.com>

At the first step of shrinking, this attempts to enable shrinking
unused space in the last allocation group by fixing up freespace
btree, agi, agf and adjusting super block.

This can be all done in one transaction for now, so I think no
additional protection is needed.

Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
---

Honestly, I've got a headache about shrinking entire AGs
since the codebase doesn't expect agcount can be decreased
suddenly, I got some ideas from people but the modification
seems all over the codebase, I will keep on this at least
to learn more about XFS internals.

It might be worth sending out shrinking the last AG first
since users might need to shrink a little unused space
occasionally, yet I'm not quite sure the log space reservation
calculation in this patch and other details are correct.
I've done some manual testing and it seems to work. Yeah, as a
formal patch it needs more test to be done but I'd like
to hear more ideas about this first since I'm not quite
familiar with XFS for now and this topic involves a lot of
new XFS-specific implementation details.

Kindly point out all strange places and what I'm missing
so I can revise it. It would be of great help for me to
learn more about XFS. At least just as a record on this
topic for further discussion.

Thanks,
Gao Xiang

 fs/xfs/libxfs/xfs_ag.c    | 46 ++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_ag.h    |  2 ++
 fs/xfs/libxfs/xfs_alloc.c | 55 +++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_alloc.h | 10 +++++++
 fs/xfs/xfs_fsops.c        | 49 ++++++++++++++++++++++++----------
 fs/xfs/xfs_trans.c        |  1 -
 6 files changed, 148 insertions(+), 15 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 9331f3516afa..cac7b715e90b 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -485,6 +485,52 @@ xfs_ag_init_headers(
 	return error;
 }
 
+int
+xfs_ag_shrink_space(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct aghdr_init_data	*id,
+	xfs_extlen_t		len)
+{
+	struct xfs_buf		*agibp, *agfbp;
+	struct xfs_agi		*agi;
+	struct xfs_agf		*agf;
+	int			error;
+
+	ASSERT(id->agno == mp->m_sb.sb_agcount - 1);
+	error = xfs_ialloc_read_agi(mp, tp, id->agno, &agibp);
+	if (error)
+		return error;
+
+	agi = agibp->b_addr;
+
+	/* Cannot touch the log space */
+	if (is_log_ag(mp, id) &&
+	    XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) +
+	    mp->m_sb.sb_logblocks > be32_to_cpu(agi->agi_length) - len)
+		return -EINVAL;
+
+	error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &agfbp);
+	if (error)
+		return error;
+
+	error = xfs_alloc_vextent_shrink(tp, agfbp,
+			be32_to_cpu(agi->agi_length) - len, len);
+	if (error)
+		return error;
+
+	/* Change the agi length */
+	be32_add_cpu(&agi->agi_length, -len);
+	xfs_ialloc_log_agi(tp, agibp, XFS_AGI_LENGTH);
+
+	/* Change agf length */
+	agf = agfbp->b_addr;
+	be32_add_cpu(&agf->agf_length, -len);
+	ASSERT(agf->agf_length == agi->agi_length);
+	xfs_alloc_log_agf(tp, agfbp, XFS_AGF_LENGTH);
+	return 0;
+}
+
 /*
  * Extent the AG indicated by the @id by the length passed in
  */
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 5166322807e7..f3b5bbfeadce 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -24,6 +24,8 @@ struct aghdr_init_data {
 };
 
 int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
+int xfs_ag_shrink_space(struct xfs_mount *mp, struct xfs_trans *tp,
+			struct aghdr_init_data *id, xfs_extlen_t len);
 int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
 			struct aghdr_init_data *id, xfs_extlen_t len);
 int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno,
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 852b536551b5..681357bb2701 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1118,6 +1118,61 @@ xfs_alloc_ag_vextent_small(
 	return error;
 }
 
+/*
+ * Allocate an extent for shrinking the last allocation group
+ * to fix the freespace btree.
+ * agfl fix is avoided in order to keep from dirtying the
+ * transaction unnecessarily compared with xfs_alloc_vextent().
+ */
+int
+xfs_alloc_vextent_shrink(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		len)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_agnumber_t		agno = agbp->b_pag->pag_agno;
+	struct xfs_alloc_arg	args = {
+		.tp = tp,
+		.mp = mp,
+		.type = XFS_ALLOCTYPE_THIS_BNO,
+		.agbp = agbp,
+		.agno = agno,
+		.agbno = agbno,
+		.fsbno = XFS_AGB_TO_FSB(mp, agno, agbno),
+		.minlen = len,
+		.maxlen = len,
+		.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
+		.resv = XFS_AG_RESV_NONE,
+		.prod = 1,
+		.alignment = 1,
+		.pag = agbp->b_pag
+	};
+	int			error;
+
+	error = xfs_alloc_ag_vextent_exact(&args);
+	if (error || args.agbno == NULLAGBLOCK)
+		return -EBUSY;
+
+	ASSERT(args.agbno == agbno);
+	ASSERT(args.len == len);
+	ASSERT(!args.wasfromfl || args.resv != XFS_AG_RESV_AGFL);
+
+	if (!args.wasfromfl) {
+		error = xfs_alloc_update_counters(tp, agbp, -(long)len);
+		if (error)
+			return error;
+
+		ASSERT(!xfs_extent_busy_search(mp, args.agno, agbno, args.len));
+	}
+	xfs_ag_resv_alloc_extent(args.pag, args.resv, &args);
+
+	XFS_STATS_INC(mp, xs_allocx);
+	XFS_STATS_ADD(mp, xs_allocb, args.len);
+	return 0;
+}
+
 /*
  * Allocate a variable extent in the allocation group agno.
  * Type and bno are used to determine where in the allocation group the
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6c22b12176b8..6080140bcb56 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -160,6 +160,16 @@ int				/* error */
 xfs_alloc_vextent(
 	xfs_alloc_arg_t	*args);	/* allocation argument structure */
 
+/*
+ * Allocate an extent for shrinking the last AG
+ */
+int
+xfs_alloc_vextent_shrink(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		len);	/* length of extent */
+
 /*
  * Free an extent.
  */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ef1d5bb88b93..80927d323939 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -36,19 +36,21 @@ xfs_growfs_data_private(
 	xfs_rfsblock_t		new;
 	xfs_agnumber_t		oagcount;
 	xfs_trans_t		*tp;
+	bool			extend;
 	struct aghdr_init_data	id = {};
 
 	nb = in->newblocks;
-	if (nb < mp->m_sb.sb_dblocks)
-		return -EINVAL;
 	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
 		return error;
-	error = xfs_buf_read_uncached(mp->m_ddev_targp,
+
+	if (nb > mp->m_sb.sb_dblocks) {
+		error = xfs_buf_read_uncached(mp->m_ddev_targp,
 				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
 				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
-	if (error)
-		return error;
-	xfs_buf_relse(bp);
+		if (error)
+			return error;
+		xfs_buf_relse(bp);
+	}
 
 	new = nb;	/* use new as a temporary here */
 	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
@@ -56,10 +58,18 @@ xfs_growfs_data_private(
 	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
 		nagcount--;
 		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
-		if (nb < mp->m_sb.sb_dblocks)
+		if (!nagcount)
 			return -EINVAL;
 	}
-	new = nb - mp->m_sb.sb_dblocks;
+
+	if (nb > mp->m_sb.sb_dblocks) {
+		new = nb - mp->m_sb.sb_dblocks;
+		extend = true;
+	} else {
+		new = mp->m_sb.sb_dblocks - nb;
+		extend = false;
+	}
+
 	oagcount = mp->m_sb.sb_agcount;
 
 	/* allocate the new per-ag structures */
@@ -67,10 +77,14 @@ xfs_growfs_data_private(
 		error = xfs_initialize_perag(mp, nagcount, &nagimax);
 		if (error)
 			return error;
+	} else if (nagcount != oagcount) {
+		/* TODO: shrinking a whole AG hasn't yet implemented */
+		return -EINVAL;
 	}
 
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
-			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+			(extend ? 0 : new) + XFS_GROWFS_SPACE_RES(mp), 0,
+			XFS_TRANS_RESERVE, &tp);
 	if (error)
 		return error;
 
@@ -103,15 +117,22 @@ xfs_growfs_data_private(
 			goto out_trans_cancel;
 		}
 	}
-	error = xfs_buf_delwri_submit(&id.buffer_list);
-	if (error)
-		goto out_trans_cancel;
+
+	if (!list_empty(&id.buffer_list)) {
+		error = xfs_buf_delwri_submit(&id.buffer_list);
+		if (error)
+			goto out_trans_cancel;
+	}
 
 	xfs_trans_agblocks_delta(tp, id.nfree);
 
 	/* If there are new blocks in the old last AG, extend it. */
 	if (new) {
-		error = xfs_ag_extend_space(mp, tp, &id, new);
+		if (extend)
+			error = xfs_ag_extend_space(mp, tp, &id, new);
+		else
+			error = xfs_ag_shrink_space(mp, tp, &id, new);
+
 		if (error)
 			goto out_trans_cancel;
 	}
@@ -123,7 +144,7 @@ xfs_growfs_data_private(
 	 */
 	if (nagcount > oagcount)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
-	if (nb > mp->m_sb.sb_dblocks)
+	if (nb != mp->m_sb.sb_dblocks)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
 				 nb - mp->m_sb.sb_dblocks);
 	if (id.nfree)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c94e71f741b6..81b9c32f9bef 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -419,7 +419,6 @@ xfs_trans_mod_sb(
 		tp->t_res_frextents_delta += delta;
 		break;
 	case XFS_TRANS_SB_DBLOCKS:
-		ASSERT(delta > 0);
 		tp->t_dblocks_delta += delta;
 		break;
 	case XFS_TRANS_SB_AGCOUNT:
-- 
2.24.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH] xfs: support shrinking unused space in the last AG
  2020-10-14  0:58 ` [RFC PATCH] xfs: support shrinking unused space in the last AG Gao Xiang
@ 2020-10-14  1:06   ` Gao Xiang
  2020-10-14 16:06   ` Darrick J. Wong
  2020-10-14 17:01   ` Brian Foster
  2 siblings, 0 replies; 11+ messages in thread
From: Gao Xiang @ 2020-10-14  1:06 UTC (permalink / raw)
  To: linux-xfs; +Cc: Gao Xiang

On Wed, Oct 14, 2020 at 08:58:09AM +0800, Gao Xiang wrote:
> From: Gao Xiang <hsiangkao@redhat.com>
> 
> At the first step of shrinking, this attempts to enable shrinking
> unused space in the last allocation group by fixing up freespace
> btree, agi, agf and adjusting super block.
> 
> This can be all done in one transaction for now, so I think no
> additional protection is needed.
> 
> Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> ---
> 
> Honestly, I've got headache about shrinking entire AGs
> since the codebase doesn't expect agcount can be decreased
> suddenly, I got some ideas from people but the modification
> seems all over the codebase, I will keep on this at least
> to learn more about XFS internals.
> 
> It might be worth sending out shrinking the last AG first
> since users might need to shrink a little unused space
> occasionally, yet I'm not quite sure the log space reservation
> calculation in this patch and other details are correct.
> I've done some manual test and it seems work. Yeah, as a
> formal patch it needs more test to be done but I'd like
> to hear more ideas about this first since I'm not quite
> familiar with XFS for now and this topic involves a lot
> new XFS-specific implementation details.
> 
> Kindly point out all strange places and what I'm missing
> so I can revise it. It would be of great help for me to
> learn more about XFS. At least just as a record on this
> topic for further discussion.
> 

and a simple user prog is attached:
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* ioctl command to grow (or, with this patch, shrink) the data section */
#define XFS_IOC_FSGROWFSDATA         _IOW ('X', 110, struct xfs_growfs_data)

/*
 * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
 * NOTE(review): field layout must match the kernel's struct xfs_growfs_data
 * exactly, since this struct is passed through the ioctl ABI.
 */
typedef struct xfs_growfs_data {
	int64_t		newblocks;	/* new data subvol size, fsblocks */
	uint32_t	imaxpct;	/* new inode space percentage limit */
} xfs_growfs_data_t;

/*
 * Usage: prog <mountpoint> <newblocks>
 *
 * Issues XFS_IOC_FSGROWFSDATA against the given mountpoint to resize the
 * data section to <newblocks> filesystem blocks.  Returns 0 on success,
 * 1 on any error.
 */
int main(int argc, char *argv[])
{
	xfs_growfs_data_t in;
	char *end;
	int fd;

	if (argc < 3) {
		fprintf(stderr, "usage: %s <mountpoint> <newblocks>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		/* the original ignored open() failure and ioctl'd on fd -1 */
		fprintf(stderr, "cannot open %s: %s\n", argv[1],
			strerror(errno));
		return 1;
	}

	/* strtoll reports errors via endptr/errno; atoi-style silent 0 is not OK */
	errno = 0;
	in.newblocks = strtoll(argv[2], &end, 10);
	if (end == argv[2] || *end != '\0' || errno == ERANGE) {
		fprintf(stderr, "invalid block count: %s\n", argv[2]);
		close(fd);
		return 1;
	}
	in.imaxpct = 0;

	if (ioctl(fd, XFS_IOC_FSGROWFSDATA, &in) < 0) {
		fprintf(stderr, "XFS_IOC_FSGROWFSDATA xfsctl failed: %s\n",
			strerror(errno));
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH] xfs: support shrinking unused space in the last AG
  2020-10-14  0:58 ` [RFC PATCH] xfs: support shrinking unused space in the last AG Gao Xiang
  2020-10-14  1:06   ` Gao Xiang
@ 2020-10-14 16:06   ` Darrick J. Wong
  2020-10-15  1:11     ` Gao Xiang
  2020-10-14 17:01   ` Brian Foster
  2 siblings, 1 reply; 11+ messages in thread
From: Darrick J. Wong @ 2020-10-14 16:06 UTC (permalink / raw)
  To: Gao Xiang; +Cc: linux-xfs, Gao Xiang

On Wed, Oct 14, 2020 at 08:58:09AM +0800, Gao Xiang wrote:
> From: Gao Xiang <hsiangkao@redhat.com>
> 
> At the first step of shrinking, this attempts to enable shrinking
> unused space in the last allocation group by fixing up freespace
> btree, agi, agf and adjusting super block.
> 
> This can be all done in one transaction for now, so I think no
> additional protection is needed.
> 
> Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> ---
> 
> Honestly, I've got headache about shrinking entire AGs
> since the codebase doesn't expect agcount can be decreased
> suddenly, I got some ideas from people but the modification
> seems all over the codebase, I will keep on this at least
> to learn more about XFS internals.
> 
> It might be worth sending out shrinking the last AG first
> since users might need to shrink a little unused space
> occasionally, yet I'm not quite sure the log space reservation
> calculation in this patch and other details are correct.
> I've done some manual test and it seems work. Yeah, as a
> formal patch it needs more test to be done but I'd like
> to hear more ideas about this first since I'm not quite
> familiar with XFS for now and this topic involves a lot
> new XFS-specific implementation details.
> 
> Kindly point out all strange places and what I'm missing
> so I can revise it. It would be of great help for me to
> learn more about XFS. At least just as a record on this
> topic for further discussion.
> 
> Thanks,
> Gao Xiang
> 
>  fs/xfs/libxfs/xfs_ag.c    | 46 ++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_ag.h    |  2 ++
>  fs/xfs/libxfs/xfs_alloc.c | 55 +++++++++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_alloc.h | 10 +++++++
>  fs/xfs/xfs_fsops.c        | 49 ++++++++++++++++++++++++----------
>  fs/xfs/xfs_trans.c        |  1 -
>  6 files changed, 148 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> index 9331f3516afa..cac7b715e90b 100644
> --- a/fs/xfs/libxfs/xfs_ag.c
> +++ b/fs/xfs/libxfs/xfs_ag.c
> @@ -485,6 +485,52 @@ xfs_ag_init_headers(
>  	return error;
>  }
>  
> +int
> +xfs_ag_shrink_space(
> +	struct xfs_mount	*mp,
> +	struct xfs_trans	*tp,
> +	struct aghdr_init_data	*id,
> +	xfs_extlen_t		len)
> +{
> +	struct xfs_buf		*agibp, *agfbp;
> +	struct xfs_agi		*agi;
> +	struct xfs_agf		*agf;
> +	int			error;
> +
> +	ASSERT(id->agno == mp->m_sb.sb_agcount - 1);
> +	error = xfs_ialloc_read_agi(mp, tp, id->agno, &agibp);
> +	if (error)
> +		return error;
> +
> +	agi = agibp->b_addr;
> +
> +	/* Cannot touch the log space */
> +	if (is_log_ag(mp, id) &&
> +	    XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) +
> +	    mp->m_sb.sb_logblocks > be32_to_cpu(agi->agi_length) - len)
> +		return -EINVAL;

The space used by the internal log shouldn't also show up in the free
space btrees, so I think you could rely on the _alloc_vextent_shrink
call to return ENOSPC, right?

> +
> +	error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &agfbp);
> +	if (error)
> +		return error;
> +
> +	error = xfs_alloc_vextent_shrink(tp, agfbp,
> +			be32_to_cpu(agi->agi_length) - len, len);
> +	if (error)
> +		return error;
> +
> +	/* Change the agi length */
> +	be32_add_cpu(&agi->agi_length, -len);
> +	xfs_ialloc_log_agi(tp, agibp, XFS_AGI_LENGTH);
> +
> +	/* Change agf length */
> +	agf = agfbp->b_addr;
> +	be32_add_cpu(&agf->agf_length, -len);
> +	ASSERT(agf->agf_length == agi->agi_length);
> +	xfs_alloc_log_agf(tp, agfbp, XFS_AGF_LENGTH);
> +	return 0;
> +}
> +
>  /*
>   * Extent the AG indicated by the @id by the length passed in

"Extend..." ?

>   */
> diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
> index 5166322807e7..f3b5bbfeadce 100644
> --- a/fs/xfs/libxfs/xfs_ag.h
> +++ b/fs/xfs/libxfs/xfs_ag.h
> @@ -24,6 +24,8 @@ struct aghdr_init_data {
>  };
>  
>  int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
> +int xfs_ag_shrink_space(struct xfs_mount *mp, struct xfs_trans *tp,
> +			struct aghdr_init_data *id, xfs_extlen_t len);
>  int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
>  			struct aghdr_init_data *id, xfs_extlen_t len);
>  int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno,
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index 852b536551b5..681357bb2701 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -1118,6 +1118,61 @@ xfs_alloc_ag_vextent_small(
>  	return error;
>  }
>  
> +/*
> + * Allocate an extent for shrinking the last allocation group
> + * to fix the freespace btree.
> + * agfl fix is avoided in order to keep from dirtying the
> + * transaction unnecessarily compared with xfs_alloc_vextent().
> + */
> +int
> +xfs_alloc_vextent_shrink(
> +	struct xfs_trans	*tp,
> +	struct xfs_buf		*agbp,
> +	xfs_agblock_t		agbno,
> +	xfs_extlen_t		len)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	xfs_agnumber_t		agno = agbp->b_pag->pag_agno;
> +	struct xfs_alloc_arg	args = {
> +		.tp = tp,
> +		.mp = mp,
> +		.type = XFS_ALLOCTYPE_THIS_BNO,
> +		.agbp = agbp,
> +		.agno = agno,
> +		.agbno = agbno,
> +		.fsbno = XFS_AGB_TO_FSB(mp, agno, agbno),

/me would have thought you could compute agbno from the AGF buffer
passed in and the length parameter.

> +		.minlen = len,
> +		.maxlen = len,
> +		.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
> +		.resv = XFS_AG_RESV_NONE,

Hmm.  Using AG_RESV_NONE here means that the allocation can fail because
there won't be enough free space left to satisfy the AG metadata
preallocations.  I think you have to call xfs_ag_resv_free before
calling this allocation function to remove the preallocations, and then
call xfs_ag_resv_init afterwards (regardless of whether the shrink
succeeds) to re-establish the preallocations to fit the new AG size.

> +		.prod = 1,
> +		.alignment = 1,
> +		.pag = agbp->b_pag
> +	};
> +	int			error;
> +
> +	error = xfs_alloc_ag_vextent_exact(&args);
> +	if (error || args.agbno == NULLAGBLOCK)
> +		return -EBUSY;
> +
> +	ASSERT(args.agbno == agbno);
> +	ASSERT(args.len == len);
> +	ASSERT(!args.wasfromfl || args.resv != XFS_AG_RESV_AGFL);

Can wasfromfl==true actually happen?  I think for this shrink case we
need to prevent that from ever happening.

> +
> +	if (!args.wasfromfl) {
> +		error = xfs_alloc_update_counters(tp, agbp, -(long)len);
> +		if (error)
> +			return error;
> +
> +		ASSERT(!xfs_extent_busy_search(mp, args.agno, agbno, args.len));
> +	}
> +	xfs_ag_resv_alloc_extent(args.pag, args.resv, &args);

(I think you can skip this call if you free the AG's preallocations
before allocating the last freespace.)

> +
> +	XFS_STATS_INC(mp, xs_allocx);
> +	XFS_STATS_ADD(mp, xs_allocb, args.len);
> +	return 0;
> +}
> +
>  /*
>   * Allocate a variable extent in the allocation group agno.
>   * Type and bno are used to determine where in the allocation group the
> diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
> index 6c22b12176b8..6080140bcb56 100644
> --- a/fs/xfs/libxfs/xfs_alloc.h
> +++ b/fs/xfs/libxfs/xfs_alloc.h
> @@ -160,6 +160,16 @@ int				/* error */
>  xfs_alloc_vextent(
>  	xfs_alloc_arg_t	*args);	/* allocation argument structure */
>  
> +/*
> + * Allocate an extent for shrinking the last AG
> + */
> +int
> +xfs_alloc_vextent_shrink(
> +	struct xfs_trans	*tp,
> +	struct xfs_buf		*agbp,
> +	xfs_agblock_t		agbno,
> +	xfs_extlen_t		len);	/* length of extent */
> +
>  /*
>   * Free an extent.
>   */
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index ef1d5bb88b93..80927d323939 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -36,19 +36,21 @@ xfs_growfs_data_private(
>  	xfs_rfsblock_t		new;
>  	xfs_agnumber_t		oagcount;
>  	xfs_trans_t		*tp;
> +	bool			extend;
>  	struct aghdr_init_data	id = {};
>  
>  	nb = in->newblocks;
> -	if (nb < mp->m_sb.sb_dblocks)
> -		return -EINVAL;
>  	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
>  		return error;

Please convert this to the more normal format:

error = xfs_sb_validate...();
if (error)
	return error;

> -	error = xfs_buf_read_uncached(mp->m_ddev_targp,
> +
> +	if (nb > mp->m_sb.sb_dblocks) {
> +		error = xfs_buf_read_uncached(mp->m_ddev_targp,
>  				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
>  				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
> -	if (error)
> -		return error;
> -	xfs_buf_relse(bp);
> +		if (error)
> +			return error;
> +		xfs_buf_relse(bp);
> +	}
>  
>  	new = nb;	/* use new as a temporary here */
>  	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
> @@ -56,10 +58,18 @@ xfs_growfs_data_private(
>  	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
>  		nagcount--;
>  		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
> -		if (nb < mp->m_sb.sb_dblocks)
> +		if (!nagcount)
>  			return -EINVAL;

Realistically, we don't ever want fewer than 2 AGs since that would
leave us with no superblock redundancy.  Repair doesn't really support
the 1 AG case.

>  	}
> -	new = nb - mp->m_sb.sb_dblocks;
> +
> +	if (nb > mp->m_sb.sb_dblocks) {
> +		new = nb - mp->m_sb.sb_dblocks;
> +		extend = true;
> +	} else {
> +		new = mp->m_sb.sb_dblocks - nb;
> +		extend = false;
> +	}

Er... maybe you should start by hoisting all the "make data device
larger" code into a separate function so that you can add the "make data
device smaller" code as a second function.  Then xfs_growfs_data_private
can decide which of the two to call based on the new size.

> +
>  	oagcount = mp->m_sb.sb_agcount;
>  
>  	/* allocate the new per-ag structures */
> @@ -67,10 +77,14 @@ xfs_growfs_data_private(
>  		error = xfs_initialize_perag(mp, nagcount, &nagimax);
>  		if (error)
>  			return error;
> +	} else if (nagcount != oagcount) {
> +		/* TODO: shrinking a whole AG hasn't yet implemented */
> +		return -EINVAL;
>  	}
>  
>  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
> -			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
> +			(extend ? 0 : new) + XFS_GROWFS_SPACE_RES(mp), 0,

XFS_GROWFS_SPACE_RES is defined as twice m_ag_maxlevels, which I think
means (in the grow case) that it's reserving space for one full split
for each free space btree.

Shrink, by contrast, is removing the rightmost record from the bnobt and
some arbitrary record from the cntbt.  Can removing 1 record from each
btree actually cause a split?

> +			XFS_TRANS_RESERVE, &tp);
>  	if (error)
>  		return error;
>  
> @@ -103,15 +117,22 @@ xfs_growfs_data_private(
>  			goto out_trans_cancel;
>  		}
>  	}
> -	error = xfs_buf_delwri_submit(&id.buffer_list);
> -	if (error)
> -		goto out_trans_cancel;
> +
> +	if (!list_empty(&id.buffer_list)) {
> +		error = xfs_buf_delwri_submit(&id.buffer_list);
> +		if (error)
> +			goto out_trans_cancel;
> +	}
>  
>  	xfs_trans_agblocks_delta(tp, id.nfree);
>  
>  	/* If there are new blocks in the old last AG, extend it. */
>  	if (new) {
> -		error = xfs_ag_extend_space(mp, tp, &id, new);
> +		if (extend)
> +			error = xfs_ag_extend_space(mp, tp, &id, new);
> +		else
> +			error = xfs_ag_shrink_space(mp, tp, &id, new);

This logic is getting harder for me to track because the patch combines
grow and shrink logic in the same function....

--D

> +
>  		if (error)
>  			goto out_trans_cancel;
>  	}
> @@ -123,7 +144,7 @@ xfs_growfs_data_private(
>  	 */
>  	if (nagcount > oagcount)
>  		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
> -	if (nb > mp->m_sb.sb_dblocks)
> +	if (nb != mp->m_sb.sb_dblocks)
>  		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
>  				 nb - mp->m_sb.sb_dblocks);
>  	if (id.nfree)
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index c94e71f741b6..81b9c32f9bef 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c
> @@ -419,7 +419,6 @@ xfs_trans_mod_sb(
>  		tp->t_res_frextents_delta += delta;
>  		break;
>  	case XFS_TRANS_SB_DBLOCKS:
> -		ASSERT(delta > 0);
>  		tp->t_dblocks_delta += delta;
>  		break;
>  	case XFS_TRANS_SB_AGCOUNT:
> -- 
> 2.24.0
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH] xfs: support shrinking unused space in the last AG
  2020-10-14  0:58 ` [RFC PATCH] xfs: support shrinking unused space in the last AG Gao Xiang
  2020-10-14  1:06   ` Gao Xiang
  2020-10-14 16:06   ` Darrick J. Wong
@ 2020-10-14 17:01   ` Brian Foster
  2020-10-15  1:49     ` Gao Xiang
  2 siblings, 1 reply; 11+ messages in thread
From: Brian Foster @ 2020-10-14 17:01 UTC (permalink / raw)
  To: Gao Xiang; +Cc: linux-xfs, Gao Xiang

On Wed, Oct 14, 2020 at 08:58:09AM +0800, Gao Xiang wrote:
> From: Gao Xiang <hsiangkao@redhat.com>
> 
> At the first step of shrinking, this attempts to enable shrinking
> unused space in the last allocation group by fixing up freespace
> btree, agi, agf and adjusting super block.
> 
> This can be all done in one transaction for now, so I think no
> additional protection is needed.
> 
> Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> ---
> 
> Honestly, I've got headache about shrinking entire AGs
> since the codebase doesn't expect agcount can be decreased
> suddenly, I got some ideas from people but the modification
> seems all over the codebase, I will keep on this at least
> to learn more about XFS internals.
> 
> It might be worth sending out shrinking the last AG first
> since users might need to shrink a little unused space
> occasionally, yet I'm not quite sure the log space reservation
> calculation in this patch and other details are correct.
> I've done some manual test and it seems work. Yeah, as a
> formal patch it needs more test to be done but I'd like
> to hear more ideas about this first since I'm not quite
> familiar with XFS for now and this topic involves a lot
> new XFS-specific implementation details.
> 
> Kindly point out all strange places and what I'm missing
> so I can revise it. It would be of great help for me to
> learn more about XFS. At least just as a record on this
> topic for further discussion.
> 

Interesting... this seems fundamentally sane when narrowing the scope
down to tail AG shrinking. Does xfs_repair flag any issues in the simple
tail AG shrink case?

Some random initial thoughts..

> Thanks,
> Gao Xiang
> 
>  fs/xfs/libxfs/xfs_ag.c    | 46 ++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_ag.h    |  2 ++
>  fs/xfs/libxfs/xfs_alloc.c | 55 +++++++++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_alloc.h | 10 +++++++
>  fs/xfs/xfs_fsops.c        | 49 ++++++++++++++++++++++++----------
>  fs/xfs/xfs_trans.c        |  1 -
>  6 files changed, 148 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> index 9331f3516afa..cac7b715e90b 100644
> --- a/fs/xfs/libxfs/xfs_ag.c
> +++ b/fs/xfs/libxfs/xfs_ag.c
> @@ -485,6 +485,52 @@ xfs_ag_init_headers(
>  	return error;
>  }
>  
> +int
> +xfs_ag_shrink_space(
> +	struct xfs_mount	*mp,
> +	struct xfs_trans	*tp,
> +	struct aghdr_init_data	*id,
> +	xfs_extlen_t		len)
> +{
> +	struct xfs_buf		*agibp, *agfbp;
> +	struct xfs_agi		*agi;
> +	struct xfs_agf		*agf;
> +	int			error;
> +
> +	ASSERT(id->agno == mp->m_sb.sb_agcount - 1);
> +	error = xfs_ialloc_read_agi(mp, tp, id->agno, &agibp);
> +	if (error)
> +		return error;
> +
> +	agi = agibp->b_addr;
> +
> +	/* Cannot touch the log space */
> +	if (is_log_ag(mp, id) &&
> +	    XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) +
> +	    mp->m_sb.sb_logblocks > be32_to_cpu(agi->agi_length) - len)
> +		return -EINVAL;
> +
> +	error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &agfbp);
> +	if (error)
> +		return error;
> +
> +	error = xfs_alloc_vextent_shrink(tp, agfbp,
> +			be32_to_cpu(agi->agi_length) - len, len);
> +	if (error)
> +		return error;
> +
> +	/* Change the agi length */
> +	be32_add_cpu(&agi->agi_length, -len);
> +	xfs_ialloc_log_agi(tp, agibp, XFS_AGI_LENGTH);
> +
> +	/* Change agf length */
> +	agf = agfbp->b_addr;
> +	be32_add_cpu(&agf->agf_length, -len);
> +	ASSERT(agf->agf_length == agi->agi_length);
> +	xfs_alloc_log_agf(tp, agfbp, XFS_AGF_LENGTH);
> +	return 0;
> +}
> +
>  /*
>   * Extent the AG indicated by the @id by the length passed in
>   */
> diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
> index 5166322807e7..f3b5bbfeadce 100644
> --- a/fs/xfs/libxfs/xfs_ag.h
> +++ b/fs/xfs/libxfs/xfs_ag.h
> @@ -24,6 +24,8 @@ struct aghdr_init_data {
>  };
>  
>  int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
> +int xfs_ag_shrink_space(struct xfs_mount *mp, struct xfs_trans *tp,
> +			struct aghdr_init_data *id, xfs_extlen_t len);
>  int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
>  			struct aghdr_init_data *id, xfs_extlen_t len);
>  int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno,
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index 852b536551b5..681357bb2701 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -1118,6 +1118,61 @@ xfs_alloc_ag_vextent_small(
>  	return error;
>  }
>  
> +/*
> + * Allocate an extent for shrinking the last allocation group
> + * to fix the freespace btree.
> + * agfl fix is avoided in order to keep from dirtying the
> + * transaction unnecessarily compared with xfs_alloc_vextent().
> + */
> +int
> +xfs_alloc_vextent_shrink(
> +	struct xfs_trans	*tp,
> +	struct xfs_buf		*agbp,
> +	xfs_agblock_t		agbno,
> +	xfs_extlen_t		len)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	xfs_agnumber_t		agno = agbp->b_pag->pag_agno;
> +	struct xfs_alloc_arg	args = {
> +		.tp = tp,
> +		.mp = mp,
> +		.type = XFS_ALLOCTYPE_THIS_BNO,
> +		.agbp = agbp,
> +		.agno = agno,
> +		.agbno = agbno,
> +		.fsbno = XFS_AGB_TO_FSB(mp, agno, agbno),
> +		.minlen = len,
> +		.maxlen = len,
> +		.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
> +		.resv = XFS_AG_RESV_NONE,
> +		.prod = 1,
> +		.alignment = 1,
> +		.pag = agbp->b_pag
> +	};
> +	int			error;
> +
> +	error = xfs_alloc_ag_vextent_exact(&args);
> +	if (error || args.agbno == NULLAGBLOCK)
> +		return -EBUSY;

I think it's generally better to call into the top-level allocator API
(xfs_alloc_vextent()) because it will handle internal allocator business
like fixing up the AGFL and whatnot. Then you probably don't have to
specify as much in the args structure as well. The allocation mode
you've specified (THIS_BNO) will fall into the exact allocation codepath
and should enforce the semantics we need here (i.e. grant the exact
allocation or fail).

I also wonder if we'll eventually have to be more intelligent here in
scenarios where ag metadata (i.e., free space root blocks, etc.) or the
agfl holds blocks in a range we're asked to shrink. I think those are
scenarios where such an allocation request would fail even though the
blocks are internal or technically free. Have you explored such
scenarios so far? I know we're trying to be opportunistic here, but if
the AG (or subset) is otherwise empty it seems a bit random to fail.
Hmm, maybe scrub/repair could help to reinit/defrag such an AG if we
could otherwise determine that blocks beyond a certain range are unused
externally.

> +
> +	ASSERT(args.agbno == agbno);
> +	ASSERT(args.len == len);
> +	ASSERT(!args.wasfromfl || args.resv != XFS_AG_RESV_AGFL);
> +
> +	if (!args.wasfromfl) {
> +		error = xfs_alloc_update_counters(tp, agbp, -(long)len);
> +		if (error)
> +			return error;
> +
> +		ASSERT(!xfs_extent_busy_search(mp, args.agno, agbno, args.len));
> +	}
> +	xfs_ag_resv_alloc_extent(args.pag, args.resv, &args);
> +
> +	XFS_STATS_INC(mp, xs_allocx);
> +	XFS_STATS_ADD(mp, xs_allocb, args.len);
> +	return 0;
> +}
> +
>  /*
>   * Allocate a variable extent in the allocation group agno.
>   * Type and bno are used to determine where in the allocation group the
> diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
> index 6c22b12176b8..6080140bcb56 100644
> --- a/fs/xfs/libxfs/xfs_alloc.h
> +++ b/fs/xfs/libxfs/xfs_alloc.h
> @@ -160,6 +160,16 @@ int				/* error */
>  xfs_alloc_vextent(
>  	xfs_alloc_arg_t	*args);	/* allocation argument structure */
>  
> +/*
> + * Allocate an extent for shrinking the last AG
> + */
> +int
> +xfs_alloc_vextent_shrink(
> +	struct xfs_trans	*tp,
> +	struct xfs_buf		*agbp,
> +	xfs_agblock_t		agbno,
> +	xfs_extlen_t		len);	/* length of extent */
> +
>  /*
>   * Free an extent.
>   */
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index ef1d5bb88b93..80927d323939 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -36,19 +36,21 @@ xfs_growfs_data_private(
>  	xfs_rfsblock_t		new;
>  	xfs_agnumber_t		oagcount;
>  	xfs_trans_t		*tp;
> +	bool			extend;
>  	struct aghdr_init_data	id = {};
>  
>  	nb = in->newblocks;
> -	if (nb < mp->m_sb.sb_dblocks)
> -		return -EINVAL;
>  	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
>  		return error;
> -	error = xfs_buf_read_uncached(mp->m_ddev_targp,
> +
> +	if (nb > mp->m_sb.sb_dblocks) {
> +		error = xfs_buf_read_uncached(mp->m_ddev_targp,
>  				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
>  				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
> -	if (error)
> -		return error;
> -	xfs_buf_relse(bp);
> +		if (error)
> +			return error;
> +		xfs_buf_relse(bp);
> +	}
>  
>  	new = nb;	/* use new as a temporary here */
>  	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
> @@ -56,10 +58,18 @@ xfs_growfs_data_private(
>  	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
>  		nagcount--;
>  		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
> -		if (nb < mp->m_sb.sb_dblocks)
> +		if (!nagcount)
>  			return -EINVAL;
>  	}

We probably need to rethink the bit of logic above this check for
shrinking. It looks like the current code checks for the minimum
supported AG size and if not satisfied, reduces the size of the grow to the
next smaller AG count. That would actually increase the size of the
shrink from what the user requested, so we'd probably want to do the
opposite and reduce the size of the requested shrink. For now it
probably doesn't matter much since we fail to shrink the agcount.

That said, if I'm following the growfs behavior correctly it might be
worth considering analogous behavior for shrink. E.g., if the user asks
to trim 10GB off the last AG but only the last 4GB are free, then shrink
the fs by 4GB and report the new size to the user.

> -	new = nb - mp->m_sb.sb_dblocks;
> +
> +	if (nb > mp->m_sb.sb_dblocks) {
> +		new = nb - mp->m_sb.sb_dblocks;
> +		extend = true;
> +	} else {
> +		new = mp->m_sb.sb_dblocks - nb;
> +		extend = false;
> +	}
> +

s/new/delta (or something along those lines) might be more readable if
we go this route.

>  	oagcount = mp->m_sb.sb_agcount;
>  
>  	/* allocate the new per-ag structures */
> @@ -67,10 +77,14 @@ xfs_growfs_data_private(
>  		error = xfs_initialize_perag(mp, nagcount, &nagimax);
>  		if (error)
>  			return error;
> +	} else if (nagcount != oagcount) {
> +		/* TODO: shrinking a whole AG hasn't yet implemented */
> +		return -EINVAL;
>  	}
>  
>  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
> -			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
> +			(extend ? 0 : new) + XFS_GROWFS_SPACE_RES(mp), 0,
> +			XFS_TRANS_RESERVE, &tp);
>  	if (error)
>  		return error;
>  
> @@ -103,15 +117,22 @@ xfs_growfs_data_private(
>  			goto out_trans_cancel;
>  		}
>  	}
> -	error = xfs_buf_delwri_submit(&id.buffer_list);
> -	if (error)
> -		goto out_trans_cancel;
> +
> +	if (!list_empty(&id.buffer_list)) {
> +		error = xfs_buf_delwri_submit(&id.buffer_list);
> +		if (error)
> +			goto out_trans_cancel;
> +	}

The list check seems somewhat superfluous since we won't do anything
with an empty list anyways. Presumably it would be incorrect to ever
init a new AG on shrink so it might be cleaner to eventually refactor
this bit of logic out into a helper that we only call on extend since
this is a new AG initialization mechanism.

Brian

>  
>  	xfs_trans_agblocks_delta(tp, id.nfree);
>  
>  	/* If there are new blocks in the old last AG, extend it. */
>  	if (new) {
> -		error = xfs_ag_extend_space(mp, tp, &id, new);
> +		if (extend)
> +			error = xfs_ag_extend_space(mp, tp, &id, new);
> +		else
> +			error = xfs_ag_shrink_space(mp, tp, &id, new);
> +
>  		if (error)
>  			goto out_trans_cancel;
>  	}
> @@ -123,7 +144,7 @@ xfs_growfs_data_private(
>  	 */
>  	if (nagcount > oagcount)
>  		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
> -	if (nb > mp->m_sb.sb_dblocks)
> +	if (nb != mp->m_sb.sb_dblocks)
>  		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
>  				 nb - mp->m_sb.sb_dblocks);
>  	if (id.nfree)
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index c94e71f741b6..81b9c32f9bef 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c
> @@ -419,7 +419,6 @@ xfs_trans_mod_sb(
>  		tp->t_res_frextents_delta += delta;
>  		break;
>  	case XFS_TRANS_SB_DBLOCKS:
> -		ASSERT(delta > 0);
>  		tp->t_dblocks_delta += delta;
>  		break;
>  	case XFS_TRANS_SB_AGCOUNT:
> -- 
> 2.24.0
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH] xfs: support shrinking unused space in the last AG
  2020-10-14 16:06   ` Darrick J. Wong
@ 2020-10-15  1:11     ` Gao Xiang
  0 siblings, 0 replies; 11+ messages in thread
From: Gao Xiang @ 2020-10-15  1:11 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs, Gao Xiang

Hi Darrick,

On Wed, Oct 14, 2020 at 09:06:33AM -0700, Darrick J. Wong wrote:
> On Wed, Oct 14, 2020 at 08:58:09AM +0800, Gao Xiang wrote:
> > From: Gao Xiang <hsiangkao@redhat.com>
> > 

...

> > +xfs_ag_shrink_space(
> > +	struct xfs_mount	*mp,
> > +	struct xfs_trans	*tp,
> > +	struct aghdr_init_data	*id,
> > +	xfs_extlen_t		len)
> > +{
> > +	struct xfs_buf		*agibp, *agfbp;
> > +	struct xfs_agi		*agi;
> > +	struct xfs_agf		*agf;
> > +	int			error;
> > +
> > +	ASSERT(id->agno == mp->m_sb.sb_agcount - 1);
> > +	error = xfs_ialloc_read_agi(mp, tp, id->agno, &agibp);
> > +	if (error)
> > +		return error;
> > +
> > +	agi = agibp->b_addr;
> > +
> > +	/* Cannot touch the log space */
> > +	if (is_log_ag(mp, id) &&
> > +	    XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) +
> > +	    mp->m_sb.sb_logblocks > be32_to_cpu(agi->agi_length) - len)
> > +		return -EINVAL;
> 
> The space used by the internal log shouldn't also show up in the free
> space btrees, so I think you could rely on the _alloc_vextent_shrink
> call to return ENOSPC, right?

Yeah, it makes sense since the freespace btree doesn't contain the log space
at all. I will drop this and add a comment about it.

> 
> > +
> > +	error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &agfbp);
> > +	if (error)
> > +		return error;
> > +
> > +	error = xfs_alloc_vextent_shrink(tp, agfbp,
> > +			be32_to_cpu(agi->agi_length) - len, len);
> > +	if (error)
> > +		return error;
> > +
> > +	/* Change the agi length */
> > +	be32_add_cpu(&agi->agi_length, -len);
> > +	xfs_ialloc_log_agi(tp, agibp, XFS_AGI_LENGTH);
> > +
> > +	/* Change agf length */
> > +	agf = agfbp->b_addr;
> > +	be32_add_cpu(&agf->agf_length, -len);
> > +	ASSERT(agf->agf_length == agi->agi_length);
> > +	xfs_alloc_log_agf(tp, agfbp, XFS_AGF_LENGTH);
> > +	return 0;
> > +}
> > +
> >  /*
> >   * Extent the AG indicated by the @id by the length passed in
> 
> "Extend..." ?

Yeah, yet not related to this patch, I could fix it independently
(maybe together with other typos)...

> 
> >   */
> > diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
> > index 5166322807e7..f3b5bbfeadce 100644
> > --- a/fs/xfs/libxfs/xfs_ag.h
> > +++ b/fs/xfs/libxfs/xfs_ag.h
> > @@ -24,6 +24,8 @@ struct aghdr_init_data {
> >  };
> >  
> >  int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
> > +int xfs_ag_shrink_space(struct xfs_mount *mp, struct xfs_trans *tp,
> > +			struct aghdr_init_data *id, xfs_extlen_t len);
> >  int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
> >  			struct aghdr_init_data *id, xfs_extlen_t len);
> >  int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno,
> > diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> > index 852b536551b5..681357bb2701 100644
> > --- a/fs/xfs/libxfs/xfs_alloc.c
> > +++ b/fs/xfs/libxfs/xfs_alloc.c
> > @@ -1118,6 +1118,61 @@ xfs_alloc_ag_vextent_small(
> >  	return error;
> >  }
> >  
> > +/*
> > + * Allocate an extent for shrinking the last allocation group
> > + * to fix the freespace btree.
> > + * agfl fix is avoided in order to keep from dirtying the
> > + * transaction unnecessarily compared with xfs_alloc_vextent().
> > + */
> > +int
> > +xfs_alloc_vextent_shrink(
> > +	struct xfs_trans	*tp,
> > +	struct xfs_buf		*agbp,
> > +	xfs_agblock_t		agbno,
> > +	xfs_extlen_t		len)
> > +{
> > +	struct xfs_mount	*mp = tp->t_mountp;
> > +	xfs_agnumber_t		agno = agbp->b_pag->pag_agno;
> > +	struct xfs_alloc_arg	args = {
> > +		.tp = tp,
> > +		.mp = mp,
> > +		.type = XFS_ALLOCTYPE_THIS_BNO,
> > +		.agbp = agbp,
> > +		.agno = agno,
> > +		.agbno = agbno,
> > +		.fsbno = XFS_AGB_TO_FSB(mp, agno, agbno),
> 
> /me would have thought you could compute agbno from the AGF buffer
> passed in and the length parameter.

Yeah, that would be fine since agfbp has been passed in here. will update.

> 
> > +		.minlen = len,
> > +		.maxlen = len,
> > +		.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
> > +		.resv = XFS_AG_RESV_NONE,
> 
> Hmm.  Using AG_RESV_NONE here means that the allocation can fail because
> there won't be enough free space left to satisfy the AG metadata
> preallocations.  I think you have to call xfs_ag_resv_free before
> calling this allocation function to remove the preallocations, and then
> call xfs_ag_resv_init afterwards (regardless of whether the shrink
> succeeds) to re-establish the preallocations to fit the new AG size.

Ok, currently I'm not quite familiar with code in libxfs/xfs_ag_resv.c.
will look into that and try to follow your suggestion.

> 
> > +		.prod = 1,
> > +		.alignment = 1,
> > +		.pag = agbp->b_pag
> > +	};
> > +	int			error;
> > +
> > +	error = xfs_alloc_ag_vextent_exact(&args);
> > +	if (error || args.agbno == NULLAGBLOCK)
> > +		return -EBUSY;
> > +
> > +	ASSERT(args.agbno == agbno);
> > +	ASSERT(args.len == len);
> > +	ASSERT(!args.wasfromfl || args.resv != XFS_AG_RESV_AGFL);
> 
> Can wasfromfl==true actually happen?  I think for this shrink case we
> need to prevent that from ever happening.

I don't think it will happen either, since my idea currently is to avoid
any redundant step, e.g. adjusting the agfl or moving/defragging metadata. I think
later userspace could do that with another ioctl (?) and if it fails
(e.g. due to concurrent changes....) retry the process again.

> 
> > +
> > +	if (!args.wasfromfl) {
> > +		error = xfs_alloc_update_counters(tp, agbp, -(long)len);
> > +		if (error)
> > +			return error;
> > +
> > +		ASSERT(!xfs_extent_busy_search(mp, args.agno, agbno, args.len));
> > +	}
> > +	xfs_ag_resv_alloc_extent(args.pag, args.resv, &args);
> 
> (I think you can skip this call if you free the AG's preallocations
> before allocating the last freespace.)

Ok, will look into that as well. Currently the logic is mostly taken from
xfs_alloc_vextent() but without the agfl fix, so a separate entry point seems more
straightforward than burying the shrinking logic deep inside the large
xfs_alloc_vextent() details...

> 
> > +
> > +	XFS_STATS_INC(mp, xs_allocx);
> > +	XFS_STATS_ADD(mp, xs_allocb, args.len);
> > +	return 0;
> > +}
> > +
> >  /*
> >   * Allocate a variable extent in the allocation group agno.
> >   * Type and bno are used to determine where in the allocation group the
> > diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
> > index 6c22b12176b8..6080140bcb56 100644
> > --- a/fs/xfs/libxfs/xfs_alloc.h
> > +++ b/fs/xfs/libxfs/xfs_alloc.h
> > @@ -160,6 +160,16 @@ int				/* error */
> >  xfs_alloc_vextent(
> >  	xfs_alloc_arg_t	*args);	/* allocation argument structure */
> >  
> > +/*
> > + * Allocate an extent for shrinking the last AG
> > + */
> > +int
> > +xfs_alloc_vextent_shrink(
> > +	struct xfs_trans	*tp,
> > +	struct xfs_buf		*agbp,
> > +	xfs_agblock_t		agbno,
> > +	xfs_extlen_t		len);	/* length of extent */
> > +
> >  /*
> >   * Free an extent.
> >   */
> > diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> > index ef1d5bb88b93..80927d323939 100644
> > --- a/fs/xfs/xfs_fsops.c
> > +++ b/fs/xfs/xfs_fsops.c
> > @@ -36,19 +36,21 @@ xfs_growfs_data_private(
> >  	xfs_rfsblock_t		new;
> >  	xfs_agnumber_t		oagcount;
> >  	xfs_trans_t		*tp;
> > +	bool			extend;
> >  	struct aghdr_init_data	id = {};
> >  
> >  	nb = in->newblocks;
> > -	if (nb < mp->m_sb.sb_dblocks)
> > -		return -EINVAL;
> >  	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
> >  		return error;
> 
> Please convert this to the more normal format:
> 
> error = xfs_sb_validate...();
> if (error)
> 	return error;

Ok, will update it with this patch.

> 
> > -	error = xfs_buf_read_uncached(mp->m_ddev_targp,
> > +
> > +	if (nb > mp->m_sb.sb_dblocks) {
> > +		error = xfs_buf_read_uncached(mp->m_ddev_targp,
> >  				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
> >  				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
> > -	if (error)
> > -		return error;
> > -	xfs_buf_relse(bp);
> > +		if (error)
> > +			return error;
> > +		xfs_buf_relse(bp);
> > +	}
> >  
> >  	new = nb;	/* use new as a temporary here */
> >  	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
> > @@ -56,10 +58,18 @@ xfs_growfs_data_private(
> >  	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
> >  		nagcount--;
> >  		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
> > -		if (nb < mp->m_sb.sb_dblocks)
> > +		if (!nagcount)
> >  			return -EINVAL;
> 
> Realistically, we don't ever want fewer than 2 AGs since that would
> leave us with no superblock redundancy.  Repair doesn't really support
> the 1 AG case.

yeah, I notice that userspace warning, will update it.

> 
> >  	}
> > -	new = nb - mp->m_sb.sb_dblocks;
> > +
> > +	if (nb > mp->m_sb.sb_dblocks) {
> > +		new = nb - mp->m_sb.sb_dblocks;
> > +		extend = true;
> > +	} else {
> > +		new = mp->m_sb.sb_dblocks - nb;
> > +		extend = false;
> > +	}
> 
> Er... maybe you should start by hoisting all the "make data device
> larger" code into a separate function so that you can add the "make data
> device smaller" code as a second function.  Then xfs_growfs_data_private
> can decide which of the two to call based on the new size.

Yeah, that is fine, I will try to do in the next version.

> 
> > +
> >  	oagcount = mp->m_sb.sb_agcount;
> >  
> >  	/* allocate the new per-ag structures */
> > @@ -67,10 +77,14 @@ xfs_growfs_data_private(
> >  		error = xfs_initialize_perag(mp, nagcount, &nagimax);
> >  		if (error)
> >  			return error;
> > +	} else if (nagcount != oagcount) {
> > +		/* TODO: shrinking a whole AG hasn't yet implemented */
> > +		return -EINVAL;
> >  	}
> >  
> >  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
> > -			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
> > +			(extend ? 0 : new) + XFS_GROWFS_SPACE_RES(mp), 0,
> 
> XFS_GROWFS_SPACE_RES is defined as twice m_ag_maxlevels, which I think
> means (in the grow case) that it's reserving space for one full split
> for each free space btree.
> 
> Shrink, by contrast, is removing the rightmost record from the bnobt and
> some arbitrary record from the cntbt.  Can removing 1 record from each
> btree actually cause a split?

Yeah, I don't think so, judging from the btree implementation itself, since it
removes/adjusts the record; I will think about it more deeply later.

> 
> > +			XFS_TRANS_RESERVE, &tp);
> >  	if (error)
> >  		return error;
> >  
> > @@ -103,15 +117,22 @@ xfs_growfs_data_private(
> >  			goto out_trans_cancel;
> >  		}
> >  	}
> > -	error = xfs_buf_delwri_submit(&id.buffer_list);
> > -	if (error)
> > -		goto out_trans_cancel;
> > +
> > +	if (!list_empty(&id.buffer_list)) {
> > +		error = xfs_buf_delwri_submit(&id.buffer_list);
> > +		if (error)
> > +			goto out_trans_cancel;
> > +	}
> >  
> >  	xfs_trans_agblocks_delta(tp, id.nfree);
> >  
> >  	/* If there are new blocks in the old last AG, extend it. */
> >  	if (new) {
> > -		error = xfs_ag_extend_space(mp, tp, &id, new);
> > +		if (extend)
> > +			error = xfs_ag_extend_space(mp, tp, &id, new);
> > +		else
> > +			error = xfs_ag_shrink_space(mp, tp, &id, new);
> 
> This logic is getting harder for me to track because the patch combines
> grow and shrink logic in the same function....

Yeah, currently it shares almost the same logic; I will try to separate them
in the next version.


It's a rough preliminary version for now; if this direction is roughly
acceptable for upstream, I will look into it further...

Thanks,
Gao Xiang


> 
> --D
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH] xfs: support shrinking unused space in the last AG
  2020-10-14 17:01   ` Brian Foster
@ 2020-10-15  1:49     ` Gao Xiang
  2020-10-20 14:50       ` Brian Foster
  0 siblings, 1 reply; 11+ messages in thread
From: Gao Xiang @ 2020-10-15  1:49 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs, Gao Xiang

Hi Brian,

On Wed, Oct 14, 2020 at 01:01:39PM -0400, Brian Foster wrote:
> On Wed, Oct 14, 2020 at 08:58:09AM +0800, Gao Xiang wrote:
> > From: Gao Xiang <hsiangkao@redhat.com>
> > 
> > At the first step of shrinking, this attempts to enable shrinking
> > unused space in the last allocation group by fixing up freespace
> > btree, agi, agf and adjusting super block.
> > 
> > This can be all done in one transaction for now, so I think no
> > additional protection is needed.
> > 
> > Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> > ---
> > 
> > Honestly, I've got headache about shrinking entire AGs
> > since the codebase doesn't expect agcount can be decreased
> > suddenly, I got some ideas from people but the modification
> > seems all over the codebase, I will keep on this at least
> > to learn more about XFS internals.
> > 
> > It might be worth sending out shrinking the last AG first
> > since users might need to shrink a little unused space
> > occasionally, yet I'm not quite sure the log space reservation
> > calculation in this patch and other details are correct.
> > I've done some manual test and it seems work. Yeah, as a
> > formal patch it needs more test to be done but I'd like
> > to hear more ideas about this first since I'm not quite
> > familiar with XFS for now and this topic involves a lot
> > new XFS-specific implementation details.
> > 
> > Kindly point out all strange places and what I'm missing
> > so I can revise it. It would be of great help for me to
> > learn more about XFS. At least just as a record on this
> > topic for further discussion.
> > 
> 
> Interesting... this seems fundamentally sane when narrowing the scope
> down to tail AG shrinking. Does xfs_repair flag any issues in the simple
> tail AG shrink case?

Yeah, I ran xfs_repair together as well. For smaller sizes, it seems
all fine, but I did observe some failures when much larger values were
passed in, so as a formal patch, that really needs to be solved later.

Anyway, this patch tries to show whether the overall direction is acceptable
for further development / upstream. And I could get some more
suggestions from it..

> 
> Some random initial thoughts..
> 

...

> > +int
> > +xfs_alloc_vextent_shrink(
> > +	struct xfs_trans	*tp,
> > +	struct xfs_buf		*agbp,
> > +	xfs_agblock_t		agbno,
> > +	xfs_extlen_t		len)
> > +{
> > +	struct xfs_mount	*mp = tp->t_mountp;
> > +	xfs_agnumber_t		agno = agbp->b_pag->pag_agno;
> > +	struct xfs_alloc_arg	args = {
> > +		.tp = tp,
> > +		.mp = mp,
> > +		.type = XFS_ALLOCTYPE_THIS_BNO,
> > +		.agbp = agbp,
> > +		.agno = agno,
> > +		.agbno = agbno,
> > +		.fsbno = XFS_AGB_TO_FSB(mp, agno, agbno),
> > +		.minlen = len,
> > +		.maxlen = len,
> > +		.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
> > +		.resv = XFS_AG_RESV_NONE,
> > +		.prod = 1,
> > +		.alignment = 1,
> > +		.pag = agbp->b_pag
> > +	};
> > +	int			error;
> > +
> > +	error = xfs_alloc_ag_vextent_exact(&args);
> > +	if (error || args.agbno == NULLAGBLOCK)
> > +		return -EBUSY;
> 
> I think it's generally better to call into the top-level allocator API
> (xfs_alloc_vextent()) because it will handle internal allocator business
> like fixing up the AGFL and whatnot. Then you probably don't have to
> specify as much in the args structure as well. The allocation mode
> you've specified (THIS_BNO) will fall into the exact allocation codepath
> and should enforce the semantics we need here (i.e. grant the exact
> allocation or fail).

Actually, I did it the same way (using xfs_alloc_vextent()) in my previous
hack version
https://git.kernel.org/pub/scm/linux/kernel/git/xiang/linux.git/commit/?id=65d87d223a4d984441453659f1baeca560f07de4

yet Dave pointed out in private that the agfl fix could dirty the transaction,
and if the later allocation fails, it would be unsafe to cancel
the dirty transaction. So as far as my current XFS knowledge goes, I think
that makes sense, so I introduced a separate helper
xfs_alloc_vextent_shrink()...

I tend to avoid burying all the shrinking-specific logic too deep in the large
xfs_alloc_vextent() logic by using another new bool or something,
since it's rather complicated for now.

Introducing a new helper would make this process more straightforward
and less bug-prone in the future IMO... Just my own current thought about
this...

> 
> I also wonder if we'll eventually have to be more intelligent here in
> scenarios where ag metadata (i.e., free space root blocks, etc.) or the
> agfl holds blocks in a range we're asked to shrink. I think those are
> scenarios where such an allocation request would fail even though the
> blocks are internal or technically free. Have you explored such
> scenarios so far? I know we're trying to be opportunistic here, but if
> the AG (or subset) is otherwise empty it seems a bit random to fail.
> Hmm, maybe scrub/repair could help to reinit/defrag such an AG if we
> could otherwise determine that blocks beyond a certain range are unused
> externally.

Yeah, currently I don't tend to defrag or fix the agfl in the process but rather
focus on shrinking unused space (not in the AGFL) and keep the kernel side simpler,
since I think in the long term we could have 2 options by some combination
with the userspace prog:
 - lock the AG, defrag / move the AG, shrink, unlock the AG;
 - defrag / move the AG, shrink, retry.

> > +

...

> >  	new = nb;	/* use new as a temporary here */
> >  	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
> > @@ -56,10 +58,18 @@ xfs_growfs_data_private(
> >  	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
> >  		nagcount--;
> >  		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
> > -		if (nb < mp->m_sb.sb_dblocks)
> > +		if (!nagcount)
> >  			return -EINVAL;
> >  	}
> 
> We probably need to rethink the bit of logic above this check for
> shrinking. It looks like the current code checks for the minimum
> supported AG size and if not satisfied, reduces the size the grow to the
> next smaller AG count. That would actually increase the size of the
> shrink from what the user requested, so we'd probably want to do the
> opposite and reduce the size of the requested shrink. For now it
> probably doesn't matter much since we fail to shrink the agcount.
> 
> That said, if I'm following the growfs behavior correctly it might be
> worth considering analogous behavior for shrink. E.g., if the user asks
> to trim 10GB off the last AG but only the last 4GB are free, then shrink
> the fs by 4GB and report the new size to the user.

I thought about this topic as well; anyway, I think it needs
some clearer documentation about the behavior (round down or round
up). My original idea was to unify them. But yeah, increasing the size
of the shrink might cause an unexpected failure.
> 
> > -	new = nb - mp->m_sb.sb_dblocks;
> > +
> > +	if (nb > mp->m_sb.sb_dblocks) {
> > +		new = nb - mp->m_sb.sb_dblocks;
> > +		extend = true;
> > +	} else {
> > +		new = mp->m_sb.sb_dblocks - nb;
> > +		extend = false;
> > +	}
> > +
> 
> s/new/delta (or something along those lines) might be more readable if
> we go this route.

In my previous random version, I once renamed it to bdelta, but I found
the modification is large, I might need to clean up growfs naming first.

> 
> >  	oagcount = mp->m_sb.sb_agcount;
> >  
> >  	/* allocate the new per-ag structures */
> > @@ -67,10 +77,14 @@ xfs_growfs_data_private(
> >  		error = xfs_initialize_perag(mp, nagcount, &nagimax);
> >  		if (error)
> >  			return error;
> > +	} else if (nagcount != oagcount) {
> > +		/* TODO: shrinking a whole AG hasn't yet implemented */
> > +		return -EINVAL;
> >  	}
> >  
> >  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
> > -			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
> > +			(extend ? 0 : new) + XFS_GROWFS_SPACE_RES(mp), 0,
> > +			XFS_TRANS_RESERVE, &tp);
> >  	if (error)
> >  		return error;
> >  
> > @@ -103,15 +117,22 @@ xfs_growfs_data_private(
> >  			goto out_trans_cancel;
> >  		}
> >  	}
> > -	error = xfs_buf_delwri_submit(&id.buffer_list);
> > -	if (error)
> > -		goto out_trans_cancel;
> > +
> > +	if (!list_empty(&id.buffer_list)) {
> > +		error = xfs_buf_delwri_submit(&id.buffer_list);
> > +		if (error)
> > +			goto out_trans_cancel;
> > +	}
> 
> The list check seems somewhat superfluous since we won't do anything
> with an empty list anyways. Presumably it would be incorrect to ever
> init a new AG on shrink so it might be cleaner to eventually refactor
> this bit of logic out into a helper that we only call on extend since
> this is a new AG initialization mechanism.

Yeah, actually my previous hack version
https://git.kernel.org/pub/scm/linux/kernel/git/xiang/linux.git/commit/?id=65d87d223a4d984441453659f1baeca560f07de4

did it like this, but in this version I'd like to avoid touching unrelated
topics as much as possible.

xfs_buf_delwri_submit() is not a no-op for empty lists. Anyway, I will
use 2 independent code paths for the entire extend / shrink separately.

Thanks for your suggestion!

Thanks,
Gao Xiang

> 
> Brian


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH] xfs: support shrinking unused space in the last AG
  2020-10-15  1:49     ` Gao Xiang
@ 2020-10-20 14:50       ` Brian Foster
  2020-10-21  3:19         ` Gao Xiang
  0 siblings, 1 reply; 11+ messages in thread
From: Brian Foster @ 2020-10-20 14:50 UTC (permalink / raw)
  To: Gao Xiang; +Cc: linux-xfs, Gao Xiang

On Thu, Oct 15, 2020 at 09:49:15AM +0800, Gao Xiang wrote:
> Hi Brian,
> 
> On Wed, Oct 14, 2020 at 01:01:39PM -0400, Brian Foster wrote:
> > On Wed, Oct 14, 2020 at 08:58:09AM +0800, Gao Xiang wrote:
> > > From: Gao Xiang <hsiangkao@redhat.com>
> > > 
> > > At the first step of shrinking, this attempts to enable shrinking
> > > unused space in the last allocation group by fixing up freespace
> > > btree, agi, agf and adjusting super block.
> > > 
> > > This can be all done in one transaction for now, so I think no
> > > additional protection is needed.
> > > 
> > > Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> > > ---
> > > 
> > > Honestly, I've got headache about shrinking entire AGs
> > > since the codebase doesn't expect agcount can be decreased
> > > suddenly, I got some ideas from people but the modification
> > > seems all over the codebase, I will keep on this at least
> > > to learn more about XFS internals.
> > > 
> > > It might be worth sending out shrinking the last AG first
> > > since users might need to shrink a little unused space
> > > occasionally, yet I'm not quite sure the log space reservation
> > > calculation in this patch and other details are correct.
> > > I've done some manual test and it seems work. Yeah, as a
> > > formal patch it needs more test to be done but I'd like
> > > to hear more ideas about this first since I'm not quite
> > > familiar with XFS for now and this topic involves a lot
> > > new XFS-specific implementation details.
> > > 
> > > Kindly point out all strange places and what I'm missing
> > > so I can revise it. It would be of great help for me to
> > > learn more about XFS. At least just as a record on this
> > > topic for further discussion.
> > > 
> > 
> > Interesting... this seems fundamentally sane when narrowing the scope
> > down to tail AG shrinking. Does xfs_repair flag any issues in the simple
> > tail AG shrink case?
> 
> Yeah, I ran xfs_repair together as well, For smaller sizes, it seems
> all fine, but I did observe some failure when much larger values
> passed in, so as a formal patch, it really needs to be solved later.
> 

I'm curious to see what xfs_repair complained about if you have a record
of it. That might call out some other things we could be overlooking.

> Anyway, this patch tries to show if the overall direction is acceptable
> for further development / upstream. And I could get some more
> suggestion from it..
> 

Sure.

> > 
> > Some random initial thoughts..
> > 
> 
> ...
> 
> > > +int
> > > +xfs_alloc_vextent_shrink(
> > > +	struct xfs_trans	*tp,
> > > +	struct xfs_buf		*agbp,
> > > +	xfs_agblock_t		agbno,
> > > +	xfs_extlen_t		len)
> > > +{
> > > +	struct xfs_mount	*mp = tp->t_mountp;
> > > +	xfs_agnumber_t		agno = agbp->b_pag->pag_agno;
> > > +	struct xfs_alloc_arg	args = {
> > > +		.tp = tp,
> > > +		.mp = mp,
> > > +		.type = XFS_ALLOCTYPE_THIS_BNO,
> > > +		.agbp = agbp,
> > > +		.agno = agno,
> > > +		.agbno = agbno,
> > > +		.fsbno = XFS_AGB_TO_FSB(mp, agno, agbno),
> > > +		.minlen = len,
> > > +		.maxlen = len,
> > > +		.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
> > > +		.resv = XFS_AG_RESV_NONE,
> > > +		.prod = 1,
> > > +		.alignment = 1,
> > > +		.pag = agbp->b_pag
> > > +	};
> > > +	int			error;
> > > +
> > > +	error = xfs_alloc_ag_vextent_exact(&args);
> > > +	if (error || args.agbno == NULLAGBLOCK)
> > > +		return -EBUSY;
> > 
> > I think it's generally better to call into the top-level allocator API
> > (xfs_alloc_vextent()) because it will handle internal allocator business
> > like fixing up the AGFL and whatnot. Then you probably don't have to
> > specify as much in the args structure as well. The allocation mode
> > you've specified (THIS_BNO) will fall into the exact allocation codepath
> > and should enforce the semantics we need here (i.e. grant the exact
> > allocation or fail).
> 
> Actually, I did in the same way (use xfs_alloc_vextent()) in my previous
> hack version
> https://git.kernel.org/pub/scm/linux/kernel/git/xiang/linux.git/commit/?id=65d87d223a4d984441453659f1baeca560f07de4
> 
> yet Dave pointed out in private agfl fix could dirty the transaction
> and if the later allocation fails, it would be unsafe to cancel
> the dirty transaction. So as far as my current XFS knowledge, I think
> that makes sense so I introduce a separate helper
> xfs_alloc_vextent_shrink()...
> 

Yeah, I could see that being an issue. I'm curious if we're exposed to
that problem with exact allocation requests in other places.  We only
use it in a couple places that look like they have fallback allocation
requests. Combine that with the pre-allocation space checks and perhaps
this isn't something we'd currently hit in practice.

That said, I don't think this justifies diving directly into the lower
levels of the allocator (or branching any of the code, etc.). I suspect
not doing the agfl fixups and whatnot could cause other problems if they
are ultimately required for the subsequent allocation. The easiest
workaround is to just commit the transaction instead of cancelling it
once the allocation call is made. A more involved follow-on fix might be
to improve the early checks for exact allocations, but it's not clear at
this stage if that's really worth the extra code. We might also
eventually want to handle that another way to ensure that the agfl fixup
doesn't actually do an allocation that conflicts with the shrink itself.

> I tend to avoid bury all shrinking specfic logic too deep in the large
> xfs_alloc_vextent() logic by using another new bool or something
> since it's rather complicated for now.
> 
> Intoduce a new helper would make this process more straight-forward
> and bug less in the future IMO... Just my own current thought about
> this...
> 

I'd just work around it as above for now. It's a fairly minor
distraction with respect to the feature.

> > 
> > I also wonder if we'll eventually have to be more intelligent here in
> > scenarios where ag metadata (i.e., free space root blocks, etc.) or the
> > agfl holds blocks in a range we're asked to shrink. I think those are
> > scenarios where such an allocation request would fail even though the
> > blocks are internal or technically free. Have you explored such
> > scenarios so far? I know we're trying to be opportunistic here, but if
> > the AG (or subset) is otherwise empty it seems a bit random to fail.
> > Hmm, maybe scrub/repair could help to reinit/defrag such an AG if we
> > could otherwise determine that blocks beyond a certain range are unused
> > externally.
> 
> Yeah, currently I don't tend to defrag or fix agfl in the process but rather
> on shrinking unused space (not in AGFL) and make the kernel side simplier,
> since I think for the long term we could have 2 options by some combination
> with the userspace prog:
>  - lock the AG, defrag / move the AG, shrinking, unlock the AG;
>  - defrag / move the AG, shrinking, retry.
> 
> > > +
> 
> ...
> 
> > >  	new = nb;	/* use new as a temporary here */
> > >  	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
> > > @@ -56,10 +58,18 @@ xfs_growfs_data_private(
> > >  	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
> > >  		nagcount--;
> > >  		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
> > > -		if (nb < mp->m_sb.sb_dblocks)
> > > +		if (!nagcount)
> > >  			return -EINVAL;
> > >  	}
> > 
> > We probably need to rethink the bit of logic above this check for
> > shrinking. It looks like the current code checks for the minimum
> > supported AG size and if not satisfied, reduces the size the grow to the
> > next smaller AG count. That would actually increase the size of the
> > shrink from what the user requested, so we'd probably want to do the
> > opposite and reduce the size of the requested shrink. For now it
> > probably doesn't matter much since we fail to shrink the agcount.
> > 
> > That said, if I'm following the growfs behavior correctly it might be
> > worth considering analogous behavior for shrink. E.g., if the user asks
> > to trim 10GB off the last AG but only the last 4GB are free, then shrink
> > the fs by 4GB and report the new size to the user.
> 
> I thought about this topic as well, yeah, anyway, I think it needs
> some clearer documented words about the behavior (round down or round
> up). My original idea is to unify them. But yeah, increase the size
> of the shrink might cause unexpected fail.
> 

It's probably debatable as to whether we should reduce the size of the
shrink or just fail the operation, but I think to increase the size of
the shrink from what the user requested (even if it occurs "by accident"
due to the AG size rules) is inappropriate. With regard to the former,
have you looked into how shrink behaves on other filesystems (ext4)? I
think one advantage of shrinking what's available is to at least give
the user an opportunity to make incremental progress.

> > 
> > > -	new = nb - mp->m_sb.sb_dblocks;
> > > +
> > > +	if (nb > mp->m_sb.sb_dblocks) {
> > > +		new = nb - mp->m_sb.sb_dblocks;
> > > +		extend = true;
> > > +	} else {
> > > +		new = mp->m_sb.sb_dblocks - nb;
> > > +		extend = false;
> > > +	}
> > > +
> > 
> > s/new/delta (or something along those lines) might be more readable if
> > we go this route.
> 
> In my previous random version, I once renamed it to bdelta, but I found
> the modification is large, I might need to clean up growfs naming first.
> 
> > 
> > >  	oagcount = mp->m_sb.sb_agcount;
> > >  
> > >  	/* allocate the new per-ag structures */
> > > @@ -67,10 +77,14 @@ xfs_growfs_data_private(
> > >  		error = xfs_initialize_perag(mp, nagcount, &nagimax);
> > >  		if (error)
> > >  			return error;
> > > +	} else if (nagcount != oagcount) {
> > > +		/* TODO: shrinking a whole AG hasn't yet implemented */
> > > +		return -EINVAL;
> > >  	}
> > >  
> > >  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
> > > -			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
> > > +			(extend ? 0 : new) + XFS_GROWFS_SPACE_RES(mp), 0,
> > > +			XFS_TRANS_RESERVE, &tp);
> > >  	if (error)
> > >  		return error;
> > >  
> > > @@ -103,15 +117,22 @@ xfs_growfs_data_private(
> > >  			goto out_trans_cancel;
> > >  		}
> > >  	}
> > > -	error = xfs_buf_delwri_submit(&id.buffer_list);
> > > -	if (error)
> > > -		goto out_trans_cancel;
> > > +
> > > +	if (!list_empty(&id.buffer_list)) {
> > > +		error = xfs_buf_delwri_submit(&id.buffer_list);
> > > +		if (error)
> > > +			goto out_trans_cancel;
> > > +	}
> > 
> > The list check seems somewhat superfluous since we won't do anything
> > with an empty list anyways. Presumably it would be incorrect to ever
> > init a new AG on shrink so it might be cleaner to eventually refactor
> > this bit of logic out into a helper that we only call on extend since
> > this is a new AG initialization mechanism.
> 
> Yeah, actually my previous hack version
> https://git.kernel.org/pub/scm/linux/kernel/git/xiang/linux.git/commit/?id=65d87d223a4d984441453659f1baeca560f07de4
> 
> did like this, but in this version I'd like to avoid touching unrelated
> topic as much as possible.
> 
> xfs_buf_delwri_submit() is not no-op for empty lists. Anyway, I will
> use 2 independent logic for entire extend / shrink seperately.
> 

I'm not sure we need to split out the entire function. It just might
make some sense to refactor the existing code a bit so the common code
is clearly readable for shrink/grow and that any larger hunks of code
specific to either grow or shrink are factored out into separate
functions.

Brian

> Thanks for your suggestion!
> 
> Thanks,
> Gao Xiang
> 
> > 
> > Brian
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH] xfs: support shrinking unused space in the last AG
  2020-10-20 14:50       ` Brian Foster
@ 2020-10-21  3:19         ` Gao Xiang
  2020-10-21  9:55           ` Brian Foster
  0 siblings, 1 reply; 11+ messages in thread
From: Gao Xiang @ 2020-10-21  3:19 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs, Gao Xiang

On Tue, Oct 20, 2020 at 10:50:12AM -0400, Brian Foster wrote:
> On Thu, Oct 15, 2020 at 09:49:15AM +0800, Gao Xiang wrote:

...

> > > 
> > > Interesting... this seems fundamentally sane when narrowing the scope
> > > down to tail AG shrinking. Does xfs_repair flag any issues in the simple
> > > tail AG shrink case?
> > 
> > Yeah, I ran xfs_repair together as well, For smaller sizes, it seems
> > all fine, but I did observe some failure when much larger values
> > passed in, so as a formal patch, it really needs to be solved later.
> > 
> 
> I'm curious to see what xfs_repair complained about if you have a record
> of it. That might call out some other things we could be overlooking.

Sorry for the somewhat slow progress...

It could show a random "SB summary counter sanity check failed" runtime message
when the shrink size is large (very close to the AG start).

# mkfs.xfs -mrmapbt=1 -f /dev/sdb
meta-data=/dev/sdb               isize=512    agcount=4, agsize=98304 blks
         =                       sectsz=512   attr=2, projid32bit=1
         =                       crc=1        finobt=1, sparse=1, rmapbt=1
         =                       reflink=1
data     =                       bsize=4096   blocks=393216, imaxpct=25
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0, ftype=1
log      =internal log           bsize=4096   blocks=3693, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0

(98304*3 = 294912)

# /tmp/mnt1/shrink /tmp/mnt2 297000
# /tmp/mnt1/shrink /tmp/mnt2 296999
# /tmp/mnt1/shrink /tmp/mnt2 296998
# /tmp/mnt1/shrink /tmp/mnt2 296997
[   76.852075] XFS (sdb): SB summary counter sanity check failed
[   76.854958] XFS (sdb): Metadata corruption detected at xfs_sb_write_verify+0x57/0x100, xfs_sb block 0xffffffffffffffff                                                                                         
[   76.859602] XFS (sdb): Unmount and run xfs_repair
[   76.861311] XFS (sdb): First 128 bytes of corrupted metadata buffer:
[   76.864794] 00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 04 88 25  XFSB...........%
[   76.869368] 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
[   76.871662] 00000020: 01 fa 92 14 04 e7 4f 7e 8e 65 77 bd ff d9 d2 f9  ......O~.ew.....
[   76.873636] 00000030: 00 00 00 00 00 04 00 07 00 00 00 00 00 00 00 80  ................
[   76.875544] 00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82  ................
[   76.877050] 00000050: 00 00 00 01 00 01 80 00 00 00 00 04 00 00 00 00  ................
[   76.878473] 00000060: 00 00 0e 6d b4 a5 02 00 02 00 00 08 00 00 00 00  ...m............
[   76.879634] 00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 11 00 00 00  ................
[   76.880818] XFS (sdb): xfs_do_force_shutdown(0x8) called from line 1582 of file fs/xfs/xfs_buf.c. Return address = ffffffffbbbb7e84                                                                            
[   76.882223] XFS (sdb): Corruption of in-memory data detected.  Shutting down filesystem
[   76.883187] XFS (sdb): Please unmount the filesystem and rectify the problem(s)

and

# xfs_repair /dev/sdb
Phase 1 - find and verify superblock...
Phase 2 - using internal log
        - zero log...
ERROR: The filesystem has valuable metadata changes in a log which needs to
be replayed.  Mount the filesystem to replay the log, and unmount it before
re-running xfs_repair.  If you are unable to mount the filesystem, then use
the -L option to destroy the log and attempt a repair.
Note that destroying the log may cause corruption -- please attempt a mount
of the filesystem before doing this.

cannot replay the log as well, as follows:
# mount /dev/sdb /tmp/mnt2
[  298.811478] XFS (sdb): Mounting V5 Filesystem
[  298.888455] XFS (sdb): Starting recovery (logdev: internal)
[  298.892614] XFS (sdb): SB summary counter sanity check failed
[  298.893201] XFS (sdb): Metadata corruption detected at xfs_sb_write_verify+0x57/0x100, xfs_sb block 0x0
[  298.894113] XFS (sdb): Unmount and run xfs_repair
[  298.894572] XFS (sdb): First 128 bytes of corrupted metadata buffer:
[  298.895180] 00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 04 88 26  XFSB...........&
[  298.895965] 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
[  298.896777] 00000020: 01 fa 92 14 04 e7 4f 7e 8e 65 77 bd ff d9 d2 f9  ......O~.ew.....
[  298.897564] 00000030: 00 00 00 00 00 04 00 07 00 00 00 00 00 00 00 80  ................
[  298.898330] 00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82  ................
[  298.899111] 00000050: 00 00 00 01 00 01 80 00 00 00 00 04 00 00 00 00  ................
[  298.900075] 00000060: 00 00 0e 6d b4 a5 02 00 02 00 00 08 00 00 00 00  ...m............
[  298.901130] 00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 11 00 00 00  ................
[  298.902142] XFS (sdb): xfs_do_force_shutdown(0x8) called from line 1582 of file fs/xfs/xfs_buf.c. Return address = ffffffffbbbb7e84
[  298.903564] XFS (sdb): Corruption of in-memory data detected.  Shutting down filesystem
[  298.904559] XFS (sdb): Please unmount the filesystem and rectify the problem(s)
[  298.905477] XFS (sdb): log mount/recovery failed: error -117
[  298.906258] XFS (sdb): log mount failed

and ignoring the log does not work either.
# xfs_repair -L /dev/sdb
Phase 1 - find and verify superblock...
Phase 2 - using internal log
        - zero log...
        - scan filesystem freespace and inode maps...
invalid length 98291 in record 0 of bno btree block 3/1
invalid length 98291 in record 0 of cnt btree block 3/2
agf_freeblks 98291, counted 0 in ag 3
agf_longest 98291, counted 0 in ag 3
sb_icount 0, counted 64
sb_ifree 0, counted 61
sb_fdblocks 0, counted 291196
        - found root inode chunk
Phase 3 - for each AG...
        - scan and clear agi unlinked lists...
        - process known inodes and perform inode discovery...
        - agno = 0
        - agno = 1
        - agno = 2
        - agno = 3
        - process newly discovered inodes...
Phase 4 - check for duplicate blocks...
        - setting up duplicate extent list...
        - check for inodes claiming duplicate blocks...
        - agno = 0
        - agno = 1
        - agno = 2
        - agno = 3
Phase 5 - rebuild AG headers and trees...

fatal error -- unable to add AG 0 reverse-mapping data to btree.


I haven't dug into that for now; I will look into it later.

> 

...

> > 
> > > > +int
> > > > +xfs_alloc_vextent_shrink(
> > > > +	struct xfs_trans	*tp,
> > > > +	struct xfs_buf		*agbp,
> > > > +	xfs_agblock_t		agbno,
> > > > +	xfs_extlen_t		len)
> > > > +{
> > > > +	struct xfs_mount	*mp = tp->t_mountp;
> > > > +	xfs_agnumber_t		agno = agbp->b_pag->pag_agno;
> > > > +	struct xfs_alloc_arg	args = {
> > > > +		.tp = tp,
> > > > +		.mp = mp,
> > > > +		.type = XFS_ALLOCTYPE_THIS_BNO,
> > > > +		.agbp = agbp,
> > > > +		.agno = agno,
> > > > +		.agbno = agbno,
> > > > +		.fsbno = XFS_AGB_TO_FSB(mp, agno, agbno),
> > > > +		.minlen = len,
> > > > +		.maxlen = len,
> > > > +		.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
> > > > +		.resv = XFS_AG_RESV_NONE,
> > > > +		.prod = 1,
> > > > +		.alignment = 1,
> > > > +		.pag = agbp->b_pag
> > > > +	};
> > > > +	int			error;
> > > > +
> > > > +	error = xfs_alloc_ag_vextent_exact(&args);
> > > > +	if (error || args.agbno == NULLAGBLOCK)
> > > > +		return -EBUSY;
> > > 
> > > I think it's generally better to call into the top-level allocator API
> > > (xfs_alloc_vextent()) because it will handle internal allocator business
> > > like fixing up the AGFL and whatnot. Then you probably don't have to
> > > specify as much in the args structure as well. The allocation mode
> > > you've specified (THIS_BNO) will fall into the exact allocation codepath
> > > and should enforce the semantics we need here (i.e. grant the exact
> > > allocation or fail).
> > 
> > Actually, I did in the same way (use xfs_alloc_vextent()) in my previous
> > hack version
> > https://git.kernel.org/pub/scm/linux/kernel/git/xiang/linux.git/commit/?id=65d87d223a4d984441453659f1baeca560f07de4
> > 
> > yet Dave pointed out in private agfl fix could dirty the transaction
> > and if the later allocation fails, it would be unsafe to cancel
> > the dirty transaction. So as far as my current XFS knowledge, I think
> > that makes sense so I introduce a separate helper
> > xfs_alloc_vextent_shrink()...
> > 
> 
> Yeah, I could see that being an issue. I'm curious if we're exposed to
> that problem with exact allocation requests in other places.  We only
> use it in a couple places that look like they have fallback allocation
> requests. Combine that with the pre-allocation space checks and perhaps
> this isn't something we'd currently hit in practice.
> 
> That said, I don't think this justifies diving directly into the lower
> levels of the allocator (or branching any of the code, etc.). I suspect
> not doing the agfl fixups and whatnot could cause other problems if they
> are ultimately required for the subsequent allocation. The easiest
> workaround is to just commit the transaction instead of cancelling it
> once the allocation call is made. A more involved followon fix might be
> to improve the early checks for exact allocations, but it's not clear at
> this stage if that's really worth the extra code. We might also
> eventually want to handle that another way to ensure that the agfl fixup
> doesn't actually do an allocation that conflicts with the shrink itself.

Ok, I will commit the transaction if the allocation fails and
the transaction is dirty.

> 
> > > 
> > > We probably need to rethink the bit of logic above this check for
> > > shrinking. It looks like the current code checks for the minimum
> > > supported AG size and if not satisfied, reduces the size the grow to the
> > > next smaller AG count. That would actually increase the size of the
> > > shrink from what the user requested, so we'd probably want to do the
> > > opposite and reduce the size of the requested shrink. For now it
> > > probably doesn't matter much since we fail to shrink the agcount.
> > > 
> > > That said, if I'm following the growfs behavior correctly it might be
> > > worth considering analogous behavior for shrink. E.g., if the user asks
> > > to trim 10GB off the last AG but only the last 4GB are free, then shrink
> > > the fs by 4GB and report the new size to the user.
> > 
> > I thought about this topic as well, yeah, anyway, I think it needs
> > some clearer documented words about the behavior (round down or round
> > up). My original idea is to unify them. But yeah, increase the size
> > of the shrink might cause unexpected fail.
> > 
> 
> It's probably debatable as to whether we should reduce the size of the
> shrink or just fail the operation, but I think to increase the size of
> the shrink from what the user requested (even if it occurs "by accident"
> due to the AG size rules) is inappropriate. With regard to the former,
> have you looked into how shrink behaves on other filesystems (ext4)? I
> think one advantage of shrinking what's available is to at least give
> the user an opportunity to make incremental progress.

I quickly checked what resize2fs does.

errcode_t adjust_fs_info(ext2_filsys fs, ext2_filsys old_fs,
			 ext2fs_block_bitmap reserve_blocks, blk64_t new_size)
...
	ext2fs_blocks_count_set(fs->super, new_size);
	fs->super->s_overhead_clusters = 0;

retry:
...
	/*
	 * Overhead is the number of bookkeeping blocks per group.  It
	 * includes the superblock backup, the group descriptor
	 * backups, the inode bitmap, the block bitmap, and the inode
	 * table.
	 */
	overhead = (int) (2 + fs->inode_blocks_per_group);
...
	/*
	 * See if the last group is big enough to support the
	 * necessary data structures.  If not, we need to get rid of
	 * it.
	 */
	rem = (ext2fs_blocks_count(fs->super) - fs->super->s_first_data_block) %
		fs->super->s_blocks_per_group;
	if ((fs->group_desc_count == 1) && rem && (rem < overhead))
		return EXT2_ET_TOOSMALL;
	if ((fs->group_desc_count > 1) && rem && (rem < overhead+50)) {
		ext2fs_blocks_count_set(fs->super,
					ext2fs_blocks_count(fs->super) - rem);
		goto retry;
	}

From the code itself, it seems that in some cases it increases the size of
the shrink from what the user requested, and in the other cases it
just errors out.

I also tried with the following configuration:

First block:              1
Block size:               1024
Fragment size:            1024
Group descriptor size:    64
Reserved GDT blocks:      256
Blocks per group:         8192
Fragments per group:      8192
Inodes per group:         2016
Inode blocks per group:   252

# resize2fs test.ext4.img 262500
resize2fs 1.44.5 (15-Dec-2018)
Resizing the filesystem on test.ext4.img to 262500 (1k) blocks.
The filesystem on test.ext4.img is now 262500 (1k) blocks long.

# resize2fs test.ext4.img 262403
resize2fs 1.44.5 (15-Dec-2018)
Resizing the filesystem on test.ext4.img to 262403 (1k) blocks.
The filesystem on test.ext4.img is now 262145 (1k) blocks long.

> 

...

> > 
> > > 
> > > >  	oagcount = mp->m_sb.sb_agcount;
> > > >  
> > > >  	/* allocate the new per-ag structures */
> > > > @@ -67,10 +77,14 @@ xfs_growfs_data_private(
> > > >  		error = xfs_initialize_perag(mp, nagcount, &nagimax);
> > > >  		if (error)
> > > >  			return error;
> > > > +	} else if (nagcount != oagcount) {
> > > > +		/* TODO: shrinking a whole AG hasn't yet implemented */
> > > > +		return -EINVAL;
> > > >  	}
> > > >  
> > > >  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
> > > > -			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
> > > > +			(extend ? 0 : new) + XFS_GROWFS_SPACE_RES(mp), 0,
> > > > +			XFS_TRANS_RESERVE, &tp);
> > > >  	if (error)
> > > >  		return error;
> > > >  
> > > > @@ -103,15 +117,22 @@ xfs_growfs_data_private(
> > > >  			goto out_trans_cancel;
> > > >  		}
> > > >  	}
> > > > -	error = xfs_buf_delwri_submit(&id.buffer_list);
> > > > -	if (error)
> > > > -		goto out_trans_cancel;
> > > > +
> > > > +	if (!list_empty(&id.buffer_list)) {
> > > > +		error = xfs_buf_delwri_submit(&id.buffer_list);
> > > > +		if (error)
> > > > +			goto out_trans_cancel;
> > > > +	}
> > > 
> > > The list check seems somewhat superfluous since we won't do anything
> > > with an empty list anyways. Presumably it would be incorrect to ever
> > > init a new AG on shrink so it might be cleaner to eventually refactor
> > > this bit of logic out into a helper that we only call on extend since
> > > this is a new AG initialization mechanism.
> > 
> > Yeah, actually my previous hack version
> > https://git.kernel.org/pub/scm/linux/kernel/git/xiang/linux.git/commit/?id=65d87d223a4d984441453659f1baeca560f07de4
> > 
> > did like this, but in this version I'd like to avoid touching unrelated
> > topic as much as possible.
> > 
> > xfs_buf_delwri_submit() is not no-op for empty lists. Anyway, I will
> > use 2 independent logic for entire extend / shrink seperately.
> > 
> 
> I'm not sure we need to split out the entire function. It just might
> make some sense to refactor the existing code a bit so the common code
> is clearly readable for shrink/grow and that any larger hunks of code
> specific to either grow or shrink are factored out into separate
> functions.

Ok.

Thanks,
Gao Xiang

> 
> Brian
> 
> > Thanks for your suggestion!
> > 
> > Thanks,
> > Gao Xiang
> > 
> > > 
> > > Brian
> > 
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH] xfs: support shrinking unused space in the last AG
  2020-10-21  3:19         ` Gao Xiang
@ 2020-10-21  9:55           ` Brian Foster
  2020-10-21 13:21             ` Gao Xiang
  0 siblings, 1 reply; 11+ messages in thread
From: Brian Foster @ 2020-10-21  9:55 UTC (permalink / raw)
  To: Gao Xiang; +Cc: linux-xfs, Gao Xiang

On Wed, Oct 21, 2020 at 11:19:28AM +0800, Gao Xiang wrote:
> On Tue, Oct 20, 2020 at 10:50:12AM -0400, Brian Foster wrote:
> > On Thu, Oct 15, 2020 at 09:49:15AM +0800, Gao Xiang wrote:
> 
> ...
> 
> > > > 
> > > > Interesting... this seems fundamentally sane when narrowing the scope
> > > > down to tail AG shrinking. Does xfs_repair flag any issues in the simple
> > > > tail AG shrink case?
> > > 
> > > Yeah, I ran xfs_repair together as well, For smaller sizes, it seems
> > > all fine, but I did observe some failure when much larger values
> > > passed in, so as a formal patch, it really needs to be solved later.
> > > 
> > 
> > I'm curious to see what xfs_repair complained about if you have a record
> > of it. That might call out some other things we could be overlooking.
> 
> Sorry for somewhat slow progress...
> 
> it could show random "SB summary counter sanity check failed" runtime message
> when the shrink size is large (much close to ag start).
> 

Ok. That error looks associated with a few different checks:

        if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && !sbp->sb_inprogress &&
            (sbp->sb_fdblocks > sbp->sb_dblocks ||
             !xfs_verify_icount(mp, sbp->sb_icount) ||
             sbp->sb_ifree > sbp->sb_icount)) {
                xfs_warn(mp, "SB summary counter sanity check failed");
                return -EFSCORRUPTED;
        }

Though I think the inode counters should be a subset of allocated space
(i.e. inode chunks) so are unlikely to be impacted by a removal of free
space. Without looking into details, I'd guess it's most likely just an
accounting bug and it's easiest to dump the relevant values that land in
the superblock and work backwards from there. FWIW, the follow-on
shutdown, repair (dirty log) and log recovery behavior (write and read
verifier failures) are typical and to be expected on metadata
corruption. IOW, I suspect that if we address the write verifier
failure, the follow-on issues will likely be resolved as well.

> # mkfs.xfs -mrmapbt=1 -f /dev/sdb
> meta-data=/dev/sdb               isize=512    agcount=4, agsize=98304 blks
>          =                       sectsz=512   attr=2, projid32bit=1
>          =                       crc=1        finobt=1, sparse=1, rmapbt=1
>          =                       reflink=1
> data     =                       bsize=4096   blocks=393216, imaxpct=25
>          =                       sunit=0      swidth=0 blks
> naming   =version 2              bsize=4096   ascii-ci=0, ftype=1
> log      =internal log           bsize=4096   blocks=3693, version=2
>          =                       sectsz=512   sunit=0 blks, lazy-count=1
> realtime =none                   extsz=4096   blocks=0, rtextents=0
> 
> (98304*3 = 294912)
> 
> # /tmp/mnt1/shrink /tmp/mnt2 297000
> # /tmp/mnt1/shrink /tmp/mnt2 296999
> # /tmp/mnt1/shrink /tmp/mnt2 296998
> # /tmp/mnt1/shrink /tmp/mnt2 296997
> [   76.852075] XFS (sdb): SB summary counter sanity check failed
> [   76.854958] XFS (sdb): Metadata corruption detected at xfs_sb_write_verify+0x57/0x100, xfs_sb block 0xffffffffffffffff                                                                                         
> [   76.859602] XFS (sdb): Unmount and run xfs_repair
> [   76.861311] XFS (sdb): First 128 bytes of corrupted metadata buffer:
> [   76.864794] 00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 04 88 25  XFSB...........%
> [   76.869368] 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
> [   76.871662] 00000020: 01 fa 92 14 04 e7 4f 7e 8e 65 77 bd ff d9 d2 f9  ......O~.ew.....
> [   76.873636] 00000030: 00 00 00 00 00 04 00 07 00 00 00 00 00 00 00 80  ................
> [   76.875544] 00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82  ................
> [   76.877050] 00000050: 00 00 00 01 00 01 80 00 00 00 00 04 00 00 00 00  ................
> [   76.878473] 00000060: 00 00 0e 6d b4 a5 02 00 02 00 00 08 00 00 00 00  ...m............
> [   76.879634] 00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 11 00 00 00  ................
> [   76.880818] XFS (sdb): xfs_do_force_shutdown(0x8) called from line 1582 of file fs/xfs/xfs_buf.c. Return address = ffffffffbbbb7e84                                                                            
> [   76.882223] XFS (sdb): Corruption of in-memory data detected.  Shutting down filesystem
> [   76.883187] XFS (sdb): Please unmount the filesystem and rectify the problem(s)
> 
> and
> 
> # xfs_repair /dev/sdb
> Phase 1 - find and verify superblock...
> Phase 2 - using internal log
>         - zero log...
> ERROR: The filesystem has valuable metadata changes in a log which needs to
> be replayed.  Mount the filesystem to replay the log, and unmount it before
> re-running xfs_repair.  If you are unable to mount the filesystem, then use
> the -L option to destroy the log and attempt a repair.
> Note that destroying the log may cause corruption -- please attempt a mount
> of the filesystem before doing this.
> 
> cannot replay the log as well, as follows:
> # mount /dev/sdb /tmp/mnt2
> [  298.811478] XFS (sdb): Mounting V5 Filesystem
> [  298.888455] XFS (sdb): Starting recovery (logdev: internal)
> [  298.892614] XFS (sdb): SB summary counter sanity check failed
> [  298.893201] XFS (sdb): Metadata corruption detected at xfs_sb_write_verify+0x57/0x100, xfs_sb block 0x0
> [  298.894113] XFS (sdb): Unmount and run xfs_repair
> [  298.894572] XFS (sdb): First 128 bytes of corrupted metadata buffer:
> [  298.895180] 00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 04 88 26  XFSB...........&
> [  298.895965] 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
> [  298.896777] 00000020: 01 fa 92 14 04 e7 4f 7e 8e 65 77 bd ff d9 d2 f9  ......O~.ew.....
> [  298.897564] 00000030: 00 00 00 00 00 04 00 07 00 00 00 00 00 00 00 80  ................
> [  298.898330] 00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82  ................
> [  298.899111] 00000050: 00 00 00 01 00 01 80 00 00 00 00 04 00 00 00 00  ................
> [  298.900075] 00000060: 00 00 0e 6d b4 a5 02 00 02 00 00 08 00 00 00 00  ...m............
> [  298.901130] 00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 11 00 00 00  ................
> [  298.902142] XFS (sdb): xfs_do_force_shutdown(0x8) called from line 1582 of file fs/xfs/xfs_buf.c. Return address = ffffffffbbbb7e84
> [  298.903564] XFS (sdb): Corruption of in-memory data detected.  Shutting down filesystem
> [  298.904559] XFS (sdb): Please unmount the filesystem and rectify the problem(s)
> [  298.905477] XFS (sdb): log mount/recovery failed: error -117
> [  298.906258] XFS (sdb): log mount failed
> 
> and ignore the log is not working as well.
> # xfs_repair -L /dev/sdb
> Phase 1 - find and verify superblock...
> Phase 2 - using internal log
>         - zero log...
>         - scan filesystem freespace and inode maps...
> invalid length 98291 in record 0 of bno btree block 3/1
> invalid length 98291 in record 0 of cnt btree block 3/2
> agf_freeblks 98291, counted 0 in ag 3
> agf_longest 98291, counted 0 in ag 3
> sb_icount 0, counted 64
> sb_ifree 0, counted 61
> sb_fdblocks 0, counted 291196
>         - found root inode chunk
> Phase 3 - for each AG...
>         - scan and clear agi unlinked lists...
>         - process known inodes and perform inode discovery...
>         - agno = 0
>         - agno = 1
>         - agno = 2
>         - agno = 3
>         - process newly discovered inodes...
> Phase 4 - check for duplicate blocks...
>         - setting up duplicate extent list...
>         - check for inodes claiming duplicate blocks...
>         - agno = 0
>         - agno = 1
>         - agno = 2
>         - agno = 3
> Phase 5 - rebuild AG headers and trees...
> 
> fatal error -- unable to add AG 0 reverse-mapping data to btree.
> 
> 
> I haven't digged into that for now, will look into that later.
> 
> > 
> 
> ...
> 
> > > 
> > > > > +int
> > > > > +xfs_alloc_vextent_shrink(
> > > > > +	struct xfs_trans	*tp,
> > > > > +	struct xfs_buf		*agbp,
> > > > > +	xfs_agblock_t		agbno,
> > > > > +	xfs_extlen_t		len)
> > > > > +{
> > > > > +	struct xfs_mount	*mp = tp->t_mountp;
> > > > > +	xfs_agnumber_t		agno = agbp->b_pag->pag_agno;
> > > > > +	struct xfs_alloc_arg	args = {
> > > > > +		.tp = tp,
> > > > > +		.mp = mp,
> > > > > +		.type = XFS_ALLOCTYPE_THIS_BNO,
> > > > > +		.agbp = agbp,
> > > > > +		.agno = agno,
> > > > > +		.agbno = agbno,
> > > > > +		.fsbno = XFS_AGB_TO_FSB(mp, agno, agbno),
> > > > > +		.minlen = len,
> > > > > +		.maxlen = len,
> > > > > +		.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
> > > > > +		.resv = XFS_AG_RESV_NONE,
> > > > > +		.prod = 1,
> > > > > +		.alignment = 1,
> > > > > +		.pag = agbp->b_pag
> > > > > +	};
> > > > > +	int			error;
> > > > > +
> > > > > +	error = xfs_alloc_ag_vextent_exact(&args);
> > > > > +	if (error || args.agbno == NULLAGBLOCK)
> > > > > +		return -EBUSY;
> > > > 
> > > > I think it's generally better to call into the top-level allocator API
> > > > (xfs_alloc_vextent()) because it will handle internal allocator business
> > > > like fixing up the AGFL and whatnot. Then you probably don't have to
> > > > specify as much in the args structure as well. The allocation mode
> > > > you've specified (THIS_BNO) will fall into the exact allocation codepath
> > > > and should enforce the semantics we need here (i.e. grant the exact
> > > > allocation or fail).
> > > 
> > > Actually, I did in the same way (use xfs_alloc_vextent()) in my previous
> > > hack version
> > > https://git.kernel.org/pub/scm/linux/kernel/git/xiang/linux.git/commit/?id=65d87d223a4d984441453659f1baeca560f07de4
> > > 
> > > yet Dave pointed out in private agfl fix could dirty the transaction
> > > and if the later allocation fails, it would be unsafe to cancel
> > > the dirty transaction. So as far as my current XFS knowledge, I think
> > > that makes sense so I introduce a separate helper
> > > xfs_alloc_vextent_shrink()...
> > > 
> > 
> > Yeah, I could see that being an issue. I'm curious if we're exposed to
> > that problem with exact allocation requests in other places.  We only
> > use it in a couple places that look like they have fallback allocation
> > requests. Combine that with the pre-allocation space checks and perhaps
> > this isn't something we'd currently hit in practice.
> > 
> > That said, I don't think this justifies diving directly into the lower
> > levels of the allocator (or branching any of the code, etc.). I suspect
> > not doing the agfl fixups and whatnot could cause other problems if they
> > are ultimately required for the subsequent allocation. The easiest
> > workaround is to just commit the transaction instead of cancelling it
> > once the allocation call is made. A more involved followon fix might be
> > to improve the early checks for exact allocations, but it's not clear at
> > this stage if that's really worth the extra code. We might also
> > eventually want to handle that another way to ensure that the agfl fixup
> > doesn't actually do an allocation that conflicts with the shrink itself.
> 
> Ok, I will commit the transaction if the allocation fails and
> the transaction is dirty.
> 
> > 
> > > > 
> > > > We probably need to rethink the bit of logic above this check for
> > > > shrinking. It looks like the current code checks for the minimum
> > > > supported AG size and if not satisfied, reduces the size the grow to the
> > > > next smaller AG count. That would actually increase the size of the
> > > > shrink from what the user requested, so we'd probably want to do the
> > > > opposite and reduce the size of the requested shrink. For now it
> > > > probably doesn't matter much since we fail to shrink the agcount.
> > > > 
> > > > That said, if I'm following the growfs behavior correctly it might be
> > > > worth considering analogous behavior for shrink. E.g., if the user asks
> > > > to trim 10GB off the last AG but only the last 4GB are free, then shrink
> > > > the fs by 4GB and report the new size to the user.
> > > 
> > > I thought about this topic as well, yeah, anyway, I think it needs
> > > some clearer documented words about the behavior (round down or round
> > > up). My original idea is to unify them. But yeah, increase the size
> > > of the shrink might cause unexpected fail.
> > > 
> > 
> > It's probably debatable as to whether we should reduce the size of the
> > shrink or just fail the operation, but I think to increase the size of
> > the shrink from what the user requested (even if it occurs "by accident"
> > due to the AG size rules) is inappropriate. With regard to the former,
> > have you looked into how shrink behaves on other filesystems (ext4)? I
> > think one advantage of shrinking what's available is to at least give
> > the user an opportunity to make incremental progress.
> 
> I quickly check what resize2fs does.
> 
> errcode_t adjust_fs_info(ext2_filsys fs, ext2_filsys old_fs,
> 			 ext2fs_block_bitmap reserve_blocks, blk64_t new_size)
> ...
> 	ext2fs_blocks_count_set(fs->super, new_size);
> 	fs->super->s_overhead_clusters = 0;
> 
> retry:
> ...
> 	/*
> 	 * Overhead is the number of bookkeeping blocks per group.  It
> 	 * includes the superblock backup, the group descriptor
> 	 * backups, the inode bitmap, the block bitmap, and the inode
> 	 * table.
> 	 */
> 	overhead = (int) (2 + fs->inode_blocks_per_group);
> ...
> 	/*
> 	 * See if the last group is big enough to support the
> 	 * necessary data structures.  If not, we need to get rid of
> 	 * it.
> 	 */
> 	rem = (ext2fs_blocks_count(fs->super) - fs->super->s_first_data_block) %
> 		fs->super->s_blocks_per_group;
> 	if ((fs->group_desc_count == 1) && rem && (rem < overhead))
> 		return EXT2_ET_TOOSMALL;
> 	if ((fs->group_desc_count > 1) && rem && (rem < overhead+50)) {
> 		ext2fs_blocks_count_set(fs->super,
> 					ext2fs_blocks_count(fs->super) - rem);
> 		goto retry;
> 	}
> 
> from the code itself it seems for some cases it increases the size of
> the shrink from what the user requested. and for the other cases, it
> just errors out.
> 
> and I also tried with some configuration:
> 
> First block:              1
> Block size:               1024
> Fragment size:            1024
> Group descriptor size:    64
> Reserved GDT blocks:      256
> Blocks per group:         8192
> Fragments per group:      8192
> Inodes per group:         2016
> Inode blocks per group:   252
> 
> # resize2fs test.ext4.img 262500
> resize2fs 1.44.5 (15-Dec-2018)
> Resizing the filesystem on test.ext4.img to 262500 (1k) blocks.
> The filesystem on test.ext4.img is now 262500 (1k) blocks long.
> 
> # resize2fs test.ext4.img 262403
> resize2fs 1.44.5 (15-Dec-2018)
> Resizing the filesystem on test.ext4.img to 262403 (1k) blocks.
> The filesystem on test.ext4.img is now 262145 (1k) blocks long.
> 

Interesting. It looks like there's similar logic around having a minimum
size "allocation group" to support internal structures, but I really
don't know enough about ext4 to comment further. I suppose this behavior
does make sense if you consider that a common purpose of shrink is to
inform the filesystem of a pending block device size change. In that
case, the desired result is to ensure the fs fits within the new,
smaller device and thus it might make sense to either increase the size
of the shrink and otherwise have straightforward success or failure
semantics. Thanks for the research.

Brian

> > 
> 
> ...
> 
> > > 
> > > > 
> > > > >  	oagcount = mp->m_sb.sb_agcount;
> > > > >  
> > > > >  	/* allocate the new per-ag structures */
> > > > > @@ -67,10 +77,14 @@ xfs_growfs_data_private(
> > > > >  		error = xfs_initialize_perag(mp, nagcount, &nagimax);
> > > > >  		if (error)
> > > > >  			return error;
> > > > > +	} else if (nagcount != oagcount) {
> > > > > +		/* TODO: shrinking a whole AG hasn't yet implemented */
> > > > > +		return -EINVAL;
> > > > >  	}
> > > > >  
> > > > >  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
> > > > > -			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
> > > > > +			(extend ? 0 : new) + XFS_GROWFS_SPACE_RES(mp), 0,
> > > > > +			XFS_TRANS_RESERVE, &tp);
> > > > >  	if (error)
> > > > >  		return error;
> > > > >  
> > > > > @@ -103,15 +117,22 @@ xfs_growfs_data_private(
> > > > >  			goto out_trans_cancel;
> > > > >  		}
> > > > >  	}
> > > > > -	error = xfs_buf_delwri_submit(&id.buffer_list);
> > > > > -	if (error)
> > > > > -		goto out_trans_cancel;
> > > > > +
> > > > > +	if (!list_empty(&id.buffer_list)) {
> > > > > +		error = xfs_buf_delwri_submit(&id.buffer_list);
> > > > > +		if (error)
> > > > > +			goto out_trans_cancel;
> > > > > +	}
> > > > 
> > > > The list check seems somewhat superfluous since we won't do anything
> > > > with an empty list anyways. Presumably it would be incorrect to ever
> > > > init a new AG on shrink so it might be cleaner to eventually refactor
> > > > this bit of logic out into a helper that we only call on extend since
> > > > this is a new AG initialization mechanism.
> > > 
> > > Yeah, actually my previous hack version
> > > https://git.kernel.org/pub/scm/linux/kernel/git/xiang/linux.git/commit/?id=65d87d223a4d984441453659f1baeca560f07de4
> > > 
> > > did like this, but in this version I'd like to avoid touching unrelated
> > > topic as much as possible.
> > > 
> > > xfs_buf_delwri_submit() is not no-op for empty lists. Anyway, I will
> > > use 2 independent logic for entire extend / shrink seperately.
> > > 
> > 
> > I'm not sure we need to split out the entire function. It just might
> > make some sense to refactor the existing code a bit so the common code
> > is clearly readable for shrink/grow and that any larger hunks of code
> > specific to either grow or shrink are factored out into separate
> > functions.
> 
> Ok.
> 
> Thanks,
> Gao Xiang
> 
> > 
> > Brian
> > 
> > > Thanks for your suggestion!
> > > 
> > > Thanks,
> > > Gao Xiang
> > > 
> > > > 
> > > > Brian
> > > 
> > 
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH] xfs: support shrinking unused space in the last AG
  2020-10-21  9:55           ` Brian Foster
@ 2020-10-21 13:21             ` Gao Xiang
  2020-10-21 14:22               ` Gao Xiang
  0 siblings, 1 reply; 11+ messages in thread
From: Gao Xiang @ 2020-10-21 13:21 UTC (permalink / raw)
  To: Brian Foster; +Cc: Gao Xiang, linux-xfs

On Wed, Oct 21, 2020 at 05:55:19AM -0400, Brian Foster wrote:
> On Wed, Oct 21, 2020 at 11:19:28AM +0800, Gao Xiang wrote:
> > On Tue, Oct 20, 2020 at 10:50:12AM -0400, Brian Foster wrote:
> > > On Thu, Oct 15, 2020 at 09:49:15AM +0800, Gao Xiang wrote:
> > 
> > ...
> > 
> > > > > 
> > > > > Interesting... this seems fundamentally sane when narrowing the scope
> > > > > down to tail AG shrinking. Does xfs_repair flag any issues in the simple
> > > > > tail AG shrink case?
> > > > 
> > > > Yeah, I ran xfs_repair together as well, For smaller sizes, it seems
> > > > all fine, but I did observe some failure when much larger values
> > > > passed in, so as a formal patch, it really needs to be solved later.
> > > > 
> > > 
> > > I'm curious to see what xfs_repair complained about if you have a record
> > > of it. That might call out some other things we could be overlooking.
> > 
> > Sorry for somewhat slow progress...
> > 
> > it could show random "SB summary counter sanity check failed" runtime message
> > when the shrink size is large (much close to ag start).
> > 
> 
> Ok. That error looks associated with a few different checks:
> 
>         if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && !sbp->sb_inprogress &&
>             (sbp->sb_fdblocks > sbp->sb_dblocks ||
>              !xfs_verify_icount(mp, sbp->sb_icount) ||
>              sbp->sb_ifree > sbp->sb_icount)) {
>                 xfs_warn(mp, "SB summary counter sanity check failed");
>                 return -EFSCORRUPTED;
>         }
> 
> Though I think the inode counters should be a subset of allocated space
> (i.e. inode chunks) so are unlikely to be impacted by a removal of free
> space. Without looking into details, I'd guess it's most likely just an
> accounting bug and it's easiest to dump the relevant values that land in
> the superblock and work backwards from there. FWIW, the followon
> shutdown, repair (dirty log) and log recovery behavior (write and read
> verifier failures) are typical and to be expected on metadata
> corruption. IOW, I suspect that if we address the write verifier
> failure, the followon issues will likely be resolved as well.

After looking into it a little bit, the exact failure condition is
sbp->sb_fdblocks > sbp->sb_dblocks,

and it seems sbp->sb_fdblocks doesn't decrease as expected when the shrink
size is large (in fact, it's still the same number as the original, compared
with a correct small shrink size). I'm still looking into what exactly is
happening.

> 

...

> > > 
> > > It's probably debatable as to whether we should reduce the size of the
> > > shrink or just fail the operation, but I think to increase the size of
> > > the shrink from what the user requested (even if it occurs "by accident"
> > > due to the AG size rules) is inappropriate. With regard to the former,
> > > have you looked into how shrink behaves on other filesystems (ext4)? I
> > > think one advantage of shrinking what's available is to at least give
> > > the user an opportunity to make incremental progress.
> > 
> > I quickly check what resize2fs does.
> > 
> > errcode_t adjust_fs_info(ext2_filsys fs, ext2_filsys old_fs,
> > 			 ext2fs_block_bitmap reserve_blocks, blk64_t new_size)
> > ...
> > 	ext2fs_blocks_count_set(fs->super, new_size);
> > 	fs->super->s_overhead_clusters = 0;
> > 
> > retry:
> > ...
> > 	/*
> > 	 * Overhead is the number of bookkeeping blocks per group.  It
> > 	 * includes the superblock backup, the group descriptor
> > 	 * backups, the inode bitmap, the block bitmap, and the inode
> > 	 * table.
> > 	 */
> > 	overhead = (int) (2 + fs->inode_blocks_per_group);
> > ...
> > 	/*
> > 	 * See if the last group is big enough to support the
> > 	 * necessary data structures.  If not, we need to get rid of
> > 	 * it.
> > 	 */
> > 	rem = (ext2fs_blocks_count(fs->super) - fs->super->s_first_data_block) %
> > 		fs->super->s_blocks_per_group;
> > 	if ((fs->group_desc_count == 1) && rem && (rem < overhead))
> > 		return EXT2_ET_TOOSMALL;
> > 	if ((fs->group_desc_count > 1) && rem && (rem < overhead+50)) {
> > 		ext2fs_blocks_count_set(fs->super,
> > 					ext2fs_blocks_count(fs->super) - rem);
> > 		goto retry;
> > 	}
> > 
> > from the code itself it seems for some cases it increases the size of
> > the shrink from what the user requested. and for the other cases, it
> > just errors out.
> > 
> > and I also tried with some configuration:
> > 
> > First block:              1
> > Block size:               1024
> > Fragment size:            1024
> > Group descriptor size:    64
> > Reserved GDT blocks:      256
> > Blocks per group:         8192
> > Fragments per group:      8192
> > Inodes per group:         2016
> > Inode blocks per group:   252
> > 
> > # resize2fs test.ext4.img 262500
> > resize2fs 1.44.5 (15-Dec-2018)
> > Resizing the filesystem on test.ext4.img to 262500 (1k) blocks.
> > The filesystem on test.ext4.img is now 262500 (1k) blocks long.
> > 
> > # resize2fs test.ext4.img 262403
> > resize2fs 1.44.5 (15-Dec-2018)
> > Resizing the filesystem on test.ext4.img to 262403 (1k) blocks.
> > The filesystem on test.ext4.img is now 262145 (1k) blocks long.
> > 
> 
> Interesting. It looks like there's similar logic around having a minimum
> size "allocation group" to support internal structures, but I really
> don't know enough about ext4 to comment further. I suppose this behavior
> does make sense if you consider that a common purpose of shrink is to
> inform the filesystem of a pending block device size change. In that
> case, the desired result is to ensure the fs fits within the new,
> smaller device and thus it might make sense to either increase the size
> of the shrink and otherwise have straightforward success or failure
> semantics. Thanks for the research.

Yeah, I'm fine with either of those as long as we document the behavior well.

Thanks,
Gao Xiang

> 
> Brian
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH] xfs: support shrinking unused space in the last AG
  2020-10-21 13:21             ` Gao Xiang
@ 2020-10-21 14:22               ` Gao Xiang
  0 siblings, 0 replies; 11+ messages in thread
From: Gao Xiang @ 2020-10-21 14:22 UTC (permalink / raw)
  To: Brian Foster; +Cc: Gao Xiang, linux-xfs

On Wed, Oct 21, 2020 at 09:21:08PM +0800, Gao Xiang wrote:
> On Wed, Oct 21, 2020 at 05:55:19AM -0400, Brian Foster wrote:
...
> > > > > > 
> > > > > > Interesting... this seems fundamentally sane when narrowing the scope
> > > > > > down to tail AG shrinking. Does xfs_repair flag any issues in the simple
> > > > > > tail AG shrink case?
> > > > > 
> > > > > Yeah, I ran xfs_repair together as well, For smaller sizes, it seems
> > > > > all fine, but I did observe some failure when much larger values
> > > > > passed in, so as a formal patch, it really needs to be solved later.
> > > > > 
> > > > 
> > > > I'm curious to see what xfs_repair complained about if you have a record
> > > > of it. That might call out some other things we could be overlooking.
> > > 
> > > Sorry for somewhat slow progress...
> > > 
> > > it could show random "SB summary counter sanity check failed" runtime message
> > > when the shrink size is large (much close to ag start).
> > > 
> > 
> > Ok. That error looks associated with a few different checks:
> > 
> >         if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && !sbp->sb_inprogress &&
> >             (sbp->sb_fdblocks > sbp->sb_dblocks ||
> >              !xfs_verify_icount(mp, sbp->sb_icount) ||
> >              sbp->sb_ifree > sbp->sb_icount)) {
> >                 xfs_warn(mp, "SB summary counter sanity check failed");
> >                 return -EFSCORRUPTED;
> >         }
> > 
> > Though I think the inode counters should be a subset of allocated space
> > (i.e. inode chunks) so are unlikely to be impacted by a removal of free
> > space. Without looking into details, I'd guess it's most likely just an
> > accounting bug and it's easiest to dump the relevant values that land in
> > the superblock and work backwards from there. FWIW, the followon
> > shutdown, repair (dirty log) and log recovery behavior (write and read
> > verifier failures) are typical and to be expected on metadata
> > corruption. IOW, I suspect that if we address the write verifier
> > failure, the followon issues will likely be resolved as well.
> 
> After looking into a little bit, the exact failure condition is
> sbp->sb_fdblocks > sbp->sb_dblocks,
> 
> and it seems sbp->sb_fdblocks doesn't decrease as expected when the shrink
> size is large (in fact, it's still the number as the origin compared with
> correct small shrink size) I'm still looking into what's exactly happening.
>

Update: the following incremental patch fixes the issue, though I'm not sure
whether it's the correct approach...

Thanks,
Gao Xiang

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 80927d323939..0a395901bc3f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -149,6 +149,14 @@ xfs_growfs_data_private(
                                 nb - mp->m_sb.sb_dblocks);
        if (id.nfree)
                xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
+
+       /*
+        * update in-core counters (especially sb_fdblocks) now
+        * so xfs_validate_sb_write() can pass.
+        */
+       if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+               xfs_log_sb(tp);
+
        xfs_trans_set_sync(tp);
        error = xfs_trans_commit(tp);
        if (error)


 


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, back to index

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20201014005809.6619-1-hsiangkao.ref@aol.com>
2020-10-14  0:58 ` [RFC PATCH] xfs: support shrinking unused space in the last AG Gao Xiang
2020-10-14  1:06   ` Gao Xiang
2020-10-14 16:06   ` Darrick J. Wong
2020-10-15  1:11     ` Gao Xiang
2020-10-14 17:01   ` Brian Foster
2020-10-15  1:49     ` Gao Xiang
2020-10-20 14:50       ` Brian Foster
2020-10-21  3:19         ` Gao Xiang
2020-10-21  9:55           ` Brian Foster
2020-10-21 13:21             ` Gao Xiang
2020-10-21 14:22               ` Gao Xiang

Linux-XFS Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-xfs/0 linux-xfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-xfs linux-xfs/ https://lore.kernel.org/linux-xfs \
		linux-xfs@vger.kernel.org
	public-inbox-index linux-xfs

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-xfs


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git