* [PATCH v2 01/11] xfs: clean up small allocation helper
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
@ 2019-05-22 18:05 ` Brian Foster
2019-06-21 23:57 ` Darrick J. Wong
2019-05-22 18:05 ` [PATCH v2 02/11] xfs: move " Brian Foster
` (11 subsequent siblings)
12 siblings, 1 reply; 29+ messages in thread
From: Brian Foster @ 2019-05-22 18:05 UTC (permalink / raw)
To: linux-xfs
xfs_alloc_ag_vextent_small() is kind of a mess. Clean it up in
preparation for future changes. No functional changes.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_alloc.c | 133 +++++++++++++++++---------------------
1 file changed, 61 insertions(+), 72 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a9ff3cf82cce..9751531d3000 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1583,92 +1583,81 @@ xfs_alloc_ag_vextent_size(
}
/*
- * Deal with the case where only small freespaces remain.
- * Either return the contents of the last freespace record,
- * or allocate space from the freelist if there is nothing in the tree.
+ * Deal with the case where only small freespaces remain. Either return the
+ * contents of the last freespace record, or allocate space from the freelist if
+ * there is nothing in the tree.
*/
STATIC int /* error */
xfs_alloc_ag_vextent_small(
- xfs_alloc_arg_t *args, /* allocation argument structure */
- xfs_btree_cur_t *ccur, /* by-size cursor */
- xfs_agblock_t *fbnop, /* result block number */
- xfs_extlen_t *flenp, /* result length */
- int *stat) /* status: 0-freelist, 1-normal/none */
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ struct xfs_btree_cur *ccur, /* optional by-size cursor */
+ xfs_agblock_t *fbnop, /* result block number */
+ xfs_extlen_t *flenp, /* result length */
+ int *stat) /* status: 0-freelist, 1-normal/none */
{
- int error;
- xfs_agblock_t fbno;
- xfs_extlen_t flen;
- int i;
+ int error = 0;
+ xfs_agblock_t fbno = NULLAGBLOCK;
+ xfs_extlen_t flen = 0;
+ int i;
- if ((error = xfs_btree_decrement(ccur, 0, &i)))
- goto error0;
+ error = xfs_btree_decrement(ccur, 0, &i);
+ if (error)
+ goto error;
if (i) {
- if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- }
- /*
- * Nothing in the btree, try the freelist. Make sure
- * to respect minleft even when pulling from the
- * freelist.
- */
- else if (args->minlen == 1 && args->alignment == 1 &&
- args->resv != XFS_AG_RESV_AGFL &&
- (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
- > args->minleft)) {
- error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+ error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
if (error)
- goto error0;
- if (fbno != NULLAGBLOCK) {
- xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
- xfs_alloc_allow_busy_reuse(args->datatype));
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
+ goto out;
+ }
- if (xfs_alloc_is_userdata(args->datatype)) {
- xfs_buf_t *bp;
+ if (args->minlen != 1 || args->alignment != 1 ||
+ args->resv == XFS_AG_RESV_AGFL ||
+ (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
+ args->minleft))
+ goto out;
- bp = xfs_btree_get_bufs(args->mp, args->tp,
- args->agno, fbno, 0);
- if (!bp) {
- error = -EFSCORRUPTED;
- goto error0;
- }
- xfs_trans_binval(args->tp, bp);
- }
- args->len = 1;
- args->agbno = fbno;
- XFS_WANT_CORRUPTED_GOTO(args->mp,
- args->agbno + args->len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
- error0);
- args->wasfromfl = 1;
- trace_xfs_alloc_small_freelist(args);
+ error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+ if (error)
+ goto error;
+ if (fbno == NULLAGBLOCK)
+ goto out;
- /*
- * If we're feeding an AGFL block to something that
- * doesn't live in the free space, we need to clear
- * out the OWN_AG rmap.
- */
- error = xfs_rmap_free(args->tp, args->agbp, args->agno,
- fbno, 1, &XFS_RMAP_OINFO_AG);
- if (error)
- goto error0;
+ xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+ xfs_alloc_allow_busy_reuse(args->datatype));
- *stat = 0;
- return 0;
+ if (xfs_alloc_is_userdata(args->datatype)) {
+ struct xfs_buf *bp;
+
+ bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno,
+ 0);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto error;
}
- /*
- * Nothing in the freelist.
- */
- else
- flen = 0;
+ xfs_trans_binval(args->tp, bp);
}
+ args->len = 1;
+ args->agbno = fbno;
+ XFS_WANT_CORRUPTED_GOTO(args->mp,
+ fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+ error);
+ args->wasfromfl = 1;
+ trace_xfs_alloc_small_freelist(args);
+
/*
- * Can't allocate from the freelist for some reason.
+ * If we're feeding an AGFL block to something that doesn't live in the
+ * free space, we need to clear out the OWN_AG rmap.
*/
- else {
- fbno = NULLAGBLOCK;
- flen = 0;
- }
+ error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
+ &XFS_RMAP_OINFO_AG);
+ if (error)
+ goto error;
+
+ *stat = 0;
+ return 0;
+
+out:
/*
* Can't do the allocation, give up.
*/
@@ -1683,7 +1672,7 @@ xfs_alloc_ag_vextent_small(
trace_xfs_alloc_small_done(args);
return 0;
-error0:
+error:
trace_xfs_alloc_small_error(args);
return error;
}
--
2.17.2
^ permalink raw reply related [flat|nested] 29+ messages in thread
* Re: [PATCH v2 01/11] xfs: clean up small allocation helper
2019-05-22 18:05 ` [PATCH v2 01/11] xfs: clean up small allocation helper Brian Foster
@ 2019-06-21 23:57 ` Darrick J. Wong
0 siblings, 0 replies; 29+ messages in thread
From: Darrick J. Wong @ 2019-06-21 23:57 UTC (permalink / raw)
To: Brian Foster; +Cc: linux-xfs
On Wed, May 22, 2019 at 02:05:36PM -0400, Brian Foster wrote:
> xfs_alloc_ag_vextent_small() is kind of a mess. Clean it up in
> preparation for future changes. No functional changes.
>
> Signed-off-by: Brian Foster <bfoster@redhat.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
Looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
--D
> ---
> fs/xfs/libxfs/xfs_alloc.c | 133 +++++++++++++++++---------------------
> 1 file changed, 61 insertions(+), 72 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index a9ff3cf82cce..9751531d3000 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -1583,92 +1583,81 @@ xfs_alloc_ag_vextent_size(
> }
>
> /*
> - * Deal with the case where only small freespaces remain.
> - * Either return the contents of the last freespace record,
> - * or allocate space from the freelist if there is nothing in the tree.
> + * Deal with the case where only small freespaces remain. Either return the
> + * contents of the last freespace record, or allocate space from the freelist if
> + * there is nothing in the tree.
> */
> STATIC int /* error */
> xfs_alloc_ag_vextent_small(
> - xfs_alloc_arg_t *args, /* allocation argument structure */
> - xfs_btree_cur_t *ccur, /* by-size cursor */
> - xfs_agblock_t *fbnop, /* result block number */
> - xfs_extlen_t *flenp, /* result length */
> - int *stat) /* status: 0-freelist, 1-normal/none */
> + struct xfs_alloc_arg *args, /* allocation argument structure */
> + struct xfs_btree_cur *ccur, /* optional by-size cursor */
> + xfs_agblock_t *fbnop, /* result block number */
> + xfs_extlen_t *flenp, /* result length */
> + int *stat) /* status: 0-freelist, 1-normal/none */
> {
> - int error;
> - xfs_agblock_t fbno;
> - xfs_extlen_t flen;
> - int i;
> + int error = 0;
> + xfs_agblock_t fbno = NULLAGBLOCK;
> + xfs_extlen_t flen = 0;
> + int i;
>
> - if ((error = xfs_btree_decrement(ccur, 0, &i)))
> - goto error0;
> + error = xfs_btree_decrement(ccur, 0, &i);
> + if (error)
> + goto error;
> if (i) {
> - if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
> - goto error0;
> - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
> - }
> - /*
> - * Nothing in the btree, try the freelist. Make sure
> - * to respect minleft even when pulling from the
> - * freelist.
> - */
> - else if (args->minlen == 1 && args->alignment == 1 &&
> - args->resv != XFS_AG_RESV_AGFL &&
> - (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
> - > args->minleft)) {
> - error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
> + error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
> if (error)
> - goto error0;
> - if (fbno != NULLAGBLOCK) {
> - xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
> - xfs_alloc_allow_busy_reuse(args->datatype));
> + goto error;
> + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
> + goto out;
> + }
>
> - if (xfs_alloc_is_userdata(args->datatype)) {
> - xfs_buf_t *bp;
> + if (args->minlen != 1 || args->alignment != 1 ||
> + args->resv == XFS_AG_RESV_AGFL ||
> + (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
> + args->minleft))
> + goto out;
>
> - bp = xfs_btree_get_bufs(args->mp, args->tp,
> - args->agno, fbno, 0);
> - if (!bp) {
> - error = -EFSCORRUPTED;
> - goto error0;
> - }
> - xfs_trans_binval(args->tp, bp);
> - }
> - args->len = 1;
> - args->agbno = fbno;
> - XFS_WANT_CORRUPTED_GOTO(args->mp,
> - args->agbno + args->len <=
> - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
> - error0);
> - args->wasfromfl = 1;
> - trace_xfs_alloc_small_freelist(args);
> + error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
> + if (error)
> + goto error;
> + if (fbno == NULLAGBLOCK)
> + goto out;
>
> - /*
> - * If we're feeding an AGFL block to something that
> - * doesn't live in the free space, we need to clear
> - * out the OWN_AG rmap.
> - */
> - error = xfs_rmap_free(args->tp, args->agbp, args->agno,
> - fbno, 1, &XFS_RMAP_OINFO_AG);
> - if (error)
> - goto error0;
> + xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
> + xfs_alloc_allow_busy_reuse(args->datatype));
>
> - *stat = 0;
> - return 0;
> + if (xfs_alloc_is_userdata(args->datatype)) {
> + struct xfs_buf *bp;
> +
> + bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno,
> + 0);
> + if (!bp) {
> + error = -EFSCORRUPTED;
> + goto error;
> }
> - /*
> - * Nothing in the freelist.
> - */
> - else
> - flen = 0;
> + xfs_trans_binval(args->tp, bp);
> }
> + args->len = 1;
> + args->agbno = fbno;
> + XFS_WANT_CORRUPTED_GOTO(args->mp,
> + fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
> + error);
> + args->wasfromfl = 1;
> + trace_xfs_alloc_small_freelist(args);
> +
> /*
> - * Can't allocate from the freelist for some reason.
> + * If we're feeding an AGFL block to something that doesn't live in the
> + * free space, we need to clear out the OWN_AG rmap.
> */
> - else {
> - fbno = NULLAGBLOCK;
> - flen = 0;
> - }
> + error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
> + &XFS_RMAP_OINFO_AG);
> + if (error)
> + goto error;
> +
> + *stat = 0;
> + return 0;
> +
> +out:
> /*
> * Can't do the allocation, give up.
> */
> @@ -1683,7 +1672,7 @@ xfs_alloc_ag_vextent_small(
> trace_xfs_alloc_small_done(args);
> return 0;
>
> -error0:
> +error:
> trace_xfs_alloc_small_error(args);
> return error;
> }
> --
> 2.17.2
>
^ permalink raw reply [flat|nested] 29+ messages in thread
* [PATCH v2 02/11] xfs: move small allocation helper
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
2019-05-22 18:05 ` [PATCH v2 01/11] xfs: clean up small allocation helper Brian Foster
@ 2019-05-22 18:05 ` Brian Foster
2019-06-21 23:58 ` Darrick J. Wong
2019-05-22 18:05 ` [PATCH v2 03/11] xfs: skip small alloc cntbt logic on NULL cursor Brian Foster
` (10 subsequent siblings)
12 siblings, 1 reply; 29+ messages in thread
From: Brian Foster @ 2019-05-22 18:05 UTC (permalink / raw)
To: linux-xfs
Move the small allocation helper further up in the file to avoid the
need for a function declaration. The remaining declarations will be
removed by followup patches. No functional changes.
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
fs/xfs/libxfs/xfs_alloc.c | 192 +++++++++++++++++++-------------------
1 file changed, 95 insertions(+), 97 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 9751531d3000..b345fe771c54 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -41,8 +41,6 @@ struct workqueue_struct *xfs_alloc_wq;
STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
- xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
/*
* Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in
@@ -699,6 +697,101 @@ xfs_alloc_update_counters(
* Allocation group level functions.
*/
+/*
+ * Deal with the case where only small freespaces remain. Either return the
+ * contents of the last freespace record, or allocate space from the freelist if
+ * there is nothing in the tree.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_small(
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ struct xfs_btree_cur *ccur, /* optional by-size cursor */
+ xfs_agblock_t *fbnop, /* result block number */
+ xfs_extlen_t *flenp, /* result length */
+ int *stat) /* status: 0-freelist, 1-normal/none */
+{
+ int error = 0;
+ xfs_agblock_t fbno = NULLAGBLOCK;
+ xfs_extlen_t flen = 0;
+ int i;
+
+ error = xfs_btree_decrement(ccur, 0, &i);
+ if (error)
+ goto error;
+ if (i) {
+ error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
+ goto out;
+ }
+
+ if (args->minlen != 1 || args->alignment != 1 ||
+ args->resv == XFS_AG_RESV_AGFL ||
+ (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
+ args->minleft))
+ goto out;
+
+ error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+ if (error)
+ goto error;
+ if (fbno == NULLAGBLOCK)
+ goto out;
+
+ xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+ xfs_alloc_allow_busy_reuse(args->datatype));
+
+ if (xfs_alloc_is_userdata(args->datatype)) {
+ struct xfs_buf *bp;
+
+ bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno,
+ 0);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ xfs_trans_binval(args->tp, bp);
+ }
+ args->len = 1;
+ args->agbno = fbno;
+ XFS_WANT_CORRUPTED_GOTO(args->mp,
+ fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+ error);
+ args->wasfromfl = 1;
+ trace_xfs_alloc_small_freelist(args);
+
+ /*
+ * If we're feeding an AGFL block to something that doesn't live in the
+ * free space, we need to clear out the OWN_AG rmap.
+ */
+ error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
+ &XFS_RMAP_OINFO_AG);
+ if (error)
+ goto error;
+
+ *stat = 0;
+ return 0;
+
+out:
+ /*
+ * Can't do the allocation, give up.
+ */
+ if (flen < args->minlen) {
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_small_notenough(args);
+ flen = 0;
+ }
+ *fbnop = fbno;
+ *flenp = flen;
+ *stat = 1;
+ trace_xfs_alloc_small_done(args);
+ return 0;
+
+error:
+ trace_xfs_alloc_small_error(args);
+ return error;
+}
+
/*
* Allocate a variable extent in the allocation group agno.
* Type and bno are used to determine where in the allocation group the
@@ -1582,101 +1675,6 @@ xfs_alloc_ag_vextent_size(
return 0;
}
-/*
- * Deal with the case where only small freespaces remain. Either return the
- * contents of the last freespace record, or allocate space from the freelist if
- * there is nothing in the tree.
- */
-STATIC int /* error */
-xfs_alloc_ag_vextent_small(
- struct xfs_alloc_arg *args, /* allocation argument structure */
- struct xfs_btree_cur *ccur, /* optional by-size cursor */
- xfs_agblock_t *fbnop, /* result block number */
- xfs_extlen_t *flenp, /* result length */
- int *stat) /* status: 0-freelist, 1-normal/none */
-{
- int error = 0;
- xfs_agblock_t fbno = NULLAGBLOCK;
- xfs_extlen_t flen = 0;
- int i;
-
- error = xfs_btree_decrement(ccur, 0, &i);
- if (error)
- goto error;
- if (i) {
- error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
- if (error)
- goto error;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
- goto out;
- }
-
- if (args->minlen != 1 || args->alignment != 1 ||
- args->resv == XFS_AG_RESV_AGFL ||
- (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
- args->minleft))
- goto out;
-
- error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
- if (error)
- goto error;
- if (fbno == NULLAGBLOCK)
- goto out;
-
- xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
- xfs_alloc_allow_busy_reuse(args->datatype));
-
- if (xfs_alloc_is_userdata(args->datatype)) {
- struct xfs_buf *bp;
-
- bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno,
- 0);
- if (!bp) {
- error = -EFSCORRUPTED;
- goto error;
- }
- xfs_trans_binval(args->tp, bp);
- }
- args->len = 1;
- args->agbno = fbno;
- XFS_WANT_CORRUPTED_GOTO(args->mp,
- fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
- error);
- args->wasfromfl = 1;
- trace_xfs_alloc_small_freelist(args);
-
- /*
- * If we're feeding an AGFL block to something that doesn't live in the
- * free space, we need to clear out the OWN_AG rmap.
- */
- error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
- &XFS_RMAP_OINFO_AG);
- if (error)
- goto error;
-
- *stat = 0;
- return 0;
-
-out:
- /*
- * Can't do the allocation, give up.
- */
- if (flen < args->minlen) {
- args->agbno = NULLAGBLOCK;
- trace_xfs_alloc_small_notenough(args);
- flen = 0;
- }
- *fbnop = fbno;
- *flenp = flen;
- *stat = 1;
- trace_xfs_alloc_small_done(args);
- return 0;
-
-error:
- trace_xfs_alloc_small_error(args);
- return error;
-}
-
/*
* Free the extent starting at agno/bno for length.
*/
--
2.17.2
^ permalink raw reply related [flat|nested] 29+ messages in thread
* Re: [PATCH v2 02/11] xfs: move small allocation helper
2019-05-22 18:05 ` [PATCH v2 02/11] xfs: move " Brian Foster
@ 2019-06-21 23:58 ` Darrick J. Wong
0 siblings, 0 replies; 29+ messages in thread
From: Darrick J. Wong @ 2019-06-21 23:58 UTC (permalink / raw)
To: Brian Foster; +Cc: linux-xfs
On Wed, May 22, 2019 at 02:05:37PM -0400, Brian Foster wrote:
> Move the small allocation helper further up in the file to avoid the
> need for a function declaration. The remaining declarations will be
> removed by followup patches. No functional changes.
>
> Signed-off-by: Brian Foster <bfoster@redhat.com>
Looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
--D
> ---
> fs/xfs/libxfs/xfs_alloc.c | 192 +++++++++++++++++++-------------------
> 1 file changed, 95 insertions(+), 97 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index 9751531d3000..b345fe771c54 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -41,8 +41,6 @@ struct workqueue_struct *xfs_alloc_wq;
> STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
> STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
> STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
> -STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
> - xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
>
> /*
> * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in
> @@ -699,6 +697,101 @@ xfs_alloc_update_counters(
> * Allocation group level functions.
> */
>
> +/*
> + * Deal with the case where only small freespaces remain. Either return the
> + * contents of the last freespace record, or allocate space from the freelist if
> + * there is nothing in the tree.
> + */
> +STATIC int /* error */
> +xfs_alloc_ag_vextent_small(
> + struct xfs_alloc_arg *args, /* allocation argument structure */
> + struct xfs_btree_cur *ccur, /* optional by-size cursor */
> + xfs_agblock_t *fbnop, /* result block number */
> + xfs_extlen_t *flenp, /* result length */
> + int *stat) /* status: 0-freelist, 1-normal/none */
> +{
> + int error = 0;
> + xfs_agblock_t fbno = NULLAGBLOCK;
> + xfs_extlen_t flen = 0;
> + int i;
> +
> + error = xfs_btree_decrement(ccur, 0, &i);
> + if (error)
> + goto error;
> + if (i) {
> + error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
> + if (error)
> + goto error;
> + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
> + goto out;
> + }
> +
> + if (args->minlen != 1 || args->alignment != 1 ||
> + args->resv == XFS_AG_RESV_AGFL ||
> + (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
> + args->minleft))
> + goto out;
> +
> + error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
> + if (error)
> + goto error;
> + if (fbno == NULLAGBLOCK)
> + goto out;
> +
> + xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
> + xfs_alloc_allow_busy_reuse(args->datatype));
> +
> + if (xfs_alloc_is_userdata(args->datatype)) {
> + struct xfs_buf *bp;
> +
> + bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno,
> + 0);
> + if (!bp) {
> + error = -EFSCORRUPTED;
> + goto error;
> + }
> + xfs_trans_binval(args->tp, bp);
> + }
> + args->len = 1;
> + args->agbno = fbno;
> + XFS_WANT_CORRUPTED_GOTO(args->mp,
> + fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
> + error);
> + args->wasfromfl = 1;
> + trace_xfs_alloc_small_freelist(args);
> +
> + /*
> + * If we're feeding an AGFL block to something that doesn't live in the
> + * free space, we need to clear out the OWN_AG rmap.
> + */
> + error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
> + &XFS_RMAP_OINFO_AG);
> + if (error)
> + goto error;
> +
> + *stat = 0;
> + return 0;
> +
> +out:
> + /*
> + * Can't do the allocation, give up.
> + */
> + if (flen < args->minlen) {
> + args->agbno = NULLAGBLOCK;
> + trace_xfs_alloc_small_notenough(args);
> + flen = 0;
> + }
> + *fbnop = fbno;
> + *flenp = flen;
> + *stat = 1;
> + trace_xfs_alloc_small_done(args);
> + return 0;
> +
> +error:
> + trace_xfs_alloc_small_error(args);
> + return error;
> +}
> +
> /*
> * Allocate a variable extent in the allocation group agno.
> * Type and bno are used to determine where in the allocation group the
> @@ -1582,101 +1675,6 @@ xfs_alloc_ag_vextent_size(
> return 0;
> }
>
> -/*
> - * Deal with the case where only small freespaces remain. Either return the
> - * contents of the last freespace record, or allocate space from the freelist if
> - * there is nothing in the tree.
> - */
> -STATIC int /* error */
> -xfs_alloc_ag_vextent_small(
> - struct xfs_alloc_arg *args, /* allocation argument structure */
> - struct xfs_btree_cur *ccur, /* optional by-size cursor */
> - xfs_agblock_t *fbnop, /* result block number */
> - xfs_extlen_t *flenp, /* result length */
> - int *stat) /* status: 0-freelist, 1-normal/none */
> -{
> - int error = 0;
> - xfs_agblock_t fbno = NULLAGBLOCK;
> - xfs_extlen_t flen = 0;
> - int i;
> -
> - error = xfs_btree_decrement(ccur, 0, &i);
> - if (error)
> - goto error;
> - if (i) {
> - error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
> - if (error)
> - goto error;
> - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
> - goto out;
> - }
> -
> - if (args->minlen != 1 || args->alignment != 1 ||
> - args->resv == XFS_AG_RESV_AGFL ||
> - (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
> - args->minleft))
> - goto out;
> -
> - error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
> - if (error)
> - goto error;
> - if (fbno == NULLAGBLOCK)
> - goto out;
> -
> - xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
> - xfs_alloc_allow_busy_reuse(args->datatype));
> -
> - if (xfs_alloc_is_userdata(args->datatype)) {
> - struct xfs_buf *bp;
> -
> - bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno,
> - 0);
> - if (!bp) {
> - error = -EFSCORRUPTED;
> - goto error;
> - }
> - xfs_trans_binval(args->tp, bp);
> - }
> - args->len = 1;
> - args->agbno = fbno;
> - XFS_WANT_CORRUPTED_GOTO(args->mp,
> - fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
> - error);
> - args->wasfromfl = 1;
> - trace_xfs_alloc_small_freelist(args);
> -
> - /*
> - * If we're feeding an AGFL block to something that doesn't live in the
> - * free space, we need to clear out the OWN_AG rmap.
> - */
> - error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
> - &XFS_RMAP_OINFO_AG);
> - if (error)
> - goto error;
> -
> - *stat = 0;
> - return 0;
> -
> -out:
> - /*
> - * Can't do the allocation, give up.
> - */
> - if (flen < args->minlen) {
> - args->agbno = NULLAGBLOCK;
> - trace_xfs_alloc_small_notenough(args);
> - flen = 0;
> - }
> - *fbnop = fbno;
> - *flenp = flen;
> - *stat = 1;
> - trace_xfs_alloc_small_done(args);
> - return 0;
> -
> -error:
> - trace_xfs_alloc_small_error(args);
> - return error;
> -}
> -
> /*
> * Free the extent starting at agno/bno for length.
> */
> --
> 2.17.2
>
^ permalink raw reply [flat|nested] 29+ messages in thread
* [PATCH v2 03/11] xfs: skip small alloc cntbt logic on NULL cursor
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
2019-05-22 18:05 ` [PATCH v2 01/11] xfs: clean up small allocation helper Brian Foster
2019-05-22 18:05 ` [PATCH v2 02/11] xfs: move " Brian Foster
@ 2019-05-22 18:05 ` Brian Foster
2019-06-21 23:58 ` Darrick J. Wong
2019-05-22 18:05 ` [PATCH v2 04/11] xfs: always update params on small allocation Brian Foster
` (9 subsequent siblings)
12 siblings, 1 reply; 29+ messages in thread
From: Brian Foster @ 2019-05-22 18:05 UTC (permalink / raw)
To: linux-xfs
The small allocation helper is implemented in a way that is fairly
tightly integrated to the existing allocation algorithms. It expects
a cntbt cursor beyond the end of the tree, attempts to locate the
last record in the tree and only attempts an AGFL allocation if the
cntbt is empty.
The upcoming generic algorithm doesn't rely on the cntbt processing
of this function. It will only call this function when the cntbt
doesn't have a big enough extent or is empty and thus AGFL
allocation is the only remaining option. Tweak
xfs_alloc_ag_vextent_small() to handle a NULL cntbt cursor and skip
the cntbt logic. This facilitates use by the existing allocation
code and new code that only requires an AGFL allocation attempt.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_alloc.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index b345fe771c54..436f8eb0bc4c 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -713,9 +713,16 @@ xfs_alloc_ag_vextent_small(
int error = 0;
xfs_agblock_t fbno = NULLAGBLOCK;
xfs_extlen_t flen = 0;
- int i;
+ int i = 0;
- error = xfs_btree_decrement(ccur, 0, &i);
+ /*
+ * If a cntbt cursor is provided, try to allocate the largest record in
+ * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
+ * allocation. Make sure to respect minleft even when pulling from the
+ * freelist.
+ */
+ if (ccur)
+ error = xfs_btree_decrement(ccur, 0, &i);
if (error)
goto error;
if (i) {
--
2.17.2
^ permalink raw reply related [flat|nested] 29+ messages in thread
* Re: [PATCH v2 03/11] xfs: skip small alloc cntbt logic on NULL cursor
2019-05-22 18:05 ` [PATCH v2 03/11] xfs: skip small alloc cntbt logic on NULL cursor Brian Foster
@ 2019-06-21 23:58 ` Darrick J. Wong
0 siblings, 0 replies; 29+ messages in thread
From: Darrick J. Wong @ 2019-06-21 23:58 UTC (permalink / raw)
To: Brian Foster; +Cc: linux-xfs
On Wed, May 22, 2019 at 02:05:38PM -0400, Brian Foster wrote:
> The small allocation helper is implemented in a way that is fairly
> tightly integrated to the existing allocation algorithms. It expects
> a cntbt cursor beyond the end of the tree, attempts to locate the
> last record in the tree and only attempts an AGFL allocation if the
> cntbt is empty.
>
> The upcoming generic algorithm doesn't rely on the cntbt processing
> of this function. It will only call this function when the cntbt
> doesn't have a big enough extent or is empty and thus AGFL
> allocation is the only remaining option. Tweak
> xfs_alloc_ag_vextent_small() to handle a NULL cntbt cursor and skip
> the cntbt logic. This facilitates use by the existing allocation
> code and new code that only requires an AGFL allocation attempt.
>
> Signed-off-by: Brian Foster <bfoster@redhat.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
Looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
--D
> ---
> fs/xfs/libxfs/xfs_alloc.c | 11 +++++++++--
> 1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index b345fe771c54..436f8eb0bc4c 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -713,9 +713,16 @@ xfs_alloc_ag_vextent_small(
> int error = 0;
> xfs_agblock_t fbno = NULLAGBLOCK;
> xfs_extlen_t flen = 0;
> - int i;
> + int i = 0;
>
> - error = xfs_btree_decrement(ccur, 0, &i);
> + /*
> + * If a cntbt cursor is provided, try to allocate the largest record in
> + * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
> + * allocation. Make sure to respect minleft even when pulling from the
> + * freelist.
> + */
> + if (ccur)
> + error = xfs_btree_decrement(ccur, 0, &i);
> if (error)
> goto error;
> if (i) {
> --
> 2.17.2
>
^ permalink raw reply [flat|nested] 29+ messages in thread
* [PATCH v2 04/11] xfs: always update params on small allocation
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
` (2 preceding siblings ...)
2019-05-22 18:05 ` [PATCH v2 03/11] xfs: skip small alloc cntbt logic on NULL cursor Brian Foster
@ 2019-05-22 18:05 ` Brian Foster
2019-06-21 23:59 ` Darrick J. Wong
2019-05-22 18:05 ` [PATCH v2 05/11] xfs: track active state of allocation btree cursors Brian Foster
` (8 subsequent siblings)
12 siblings, 1 reply; 29+ messages in thread
From: Brian Foster @ 2019-05-22 18:05 UTC (permalink / raw)
To: linux-xfs
xfs_alloc_ag_vextent_small() doesn't update the output parameters in
the event of an AGFL allocation. Instead, it updates the
xfs_alloc_arg structure directly to complete the allocation.
Update both args and the output params to provide consistent
behavior for future callers.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_alloc.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 436f8eb0bc4c..e2fa58f4d477 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -759,8 +759,8 @@ xfs_alloc_ag_vextent_small(
}
xfs_trans_binval(args->tp, bp);
}
- args->len = 1;
- args->agbno = fbno;
+ *fbnop = args->agbno = fbno;
+ *flenp = args->len = 1;
XFS_WANT_CORRUPTED_GOTO(args->mp,
fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
error);
--
2.17.2
^ permalink raw reply related [flat|nested] 29+ messages in thread
* Re: [PATCH v2 04/11] xfs: always update params on small allocation
2019-05-22 18:05 ` [PATCH v2 04/11] xfs: always update params on small allocation Brian Foster
@ 2019-06-21 23:59 ` Darrick J. Wong
0 siblings, 0 replies; 29+ messages in thread
From: Darrick J. Wong @ 2019-06-21 23:59 UTC (permalink / raw)
To: Brian Foster; +Cc: linux-xfs
On Wed, May 22, 2019 at 02:05:39PM -0400, Brian Foster wrote:
> xfs_alloc_ag_vextent_small() doesn't update the output parameters in
> the event of an AGFL allocation. Instead, it updates the
> xfs_alloc_arg structure directly to complete the allocation.
>
> Update both args and the output params to provide consistent
> behavior for future callers.
>
> Signed-off-by: Brian Foster <bfoster@redhat.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
Looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
--D
> ---
> fs/xfs/libxfs/xfs_alloc.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index 436f8eb0bc4c..e2fa58f4d477 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -759,8 +759,8 @@ xfs_alloc_ag_vextent_small(
> }
> xfs_trans_binval(args->tp, bp);
> }
> - args->len = 1;
> - args->agbno = fbno;
> + *fbnop = args->agbno = fbno;
> + *flenp = args->len = 1;
> XFS_WANT_CORRUPTED_GOTO(args->mp,
> fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
> error);
> --
> 2.17.2
>
^ permalink raw reply [flat|nested] 29+ messages in thread
* [PATCH v2 05/11] xfs: track active state of allocation btree cursors
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
` (3 preceding siblings ...)
2019-05-22 18:05 ` [PATCH v2 04/11] xfs: always update params on small allocation Brian Foster
@ 2019-05-22 18:05 ` Brian Foster
2019-05-22 18:05 ` [PATCH v2 06/11] xfs: use locality optimized cntbt lookups for near mode allocations Brian Foster
` (7 subsequent siblings)
12 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2019-05-22 18:05 UTC (permalink / raw)
To: linux-xfs
The upcoming allocation algorithm update searches multiple
allocation btree cursors concurrently. As such, it requires an
active state to track when a particular cursor should continue
searching. While active state will be modified based on higher level
logic, we can define base functionality based on the result of
allocation btree lookups.
Define an active flag in the private area of the btree cursor.
Update it based on the result of lookups in the existing allocation
btree helpers. Finally, provide a new helper to query the current
state.
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
fs/xfs/libxfs/xfs_alloc.c | 24 +++++++++++++++++++++---
fs/xfs/libxfs/xfs_alloc_btree.c | 1 +
fs/xfs/libxfs/xfs_btree.h | 3 +++
3 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index e2fa58f4d477..11d284989399 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -148,9 +148,13 @@ xfs_alloc_lookup_eq(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
+ int error;
+
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
- return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+ cur->bc_private.a.priv.abt.active = *stat;
+ return error;
}
/*
@@ -164,9 +168,13 @@ xfs_alloc_lookup_ge(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
+ int error;
+
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
- return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+ cur->bc_private.a.priv.abt.active = *stat;
+ return error;
}
/*
@@ -180,9 +188,19 @@ xfs_alloc_lookup_le(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
+ int error;
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
- return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ cur->bc_private.a.priv.abt.active = *stat;
+ return error;
+}
+
+static inline bool
+xfs_alloc_cur_active(
+ struct xfs_btree_cur *cur)
+{
+ return cur && cur->bc_private.a.priv.abt.active;
}
/*
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 9fe949f6055e..cfa28890af27 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -508,6 +508,7 @@ xfs_allocbt_init_cursor(
cur->bc_private.a.agbp = agbp;
cur->bc_private.a.agno = agno;
+ cur->bc_private.a.priv.abt.active = false;
if (xfs_sb_version_hascrc(&mp->m_sb))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index e3b3e9dce5da..3c105b7dffe8 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -183,6 +183,9 @@ union xfs_btree_cur_private {
unsigned long nr_ops; /* # record updates */
int shape_changes; /* # of extent splits */
} refc;
+ struct {
+ bool active; /* allocation cursor state */
+ } abt;
};
/*
--
2.17.2
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v2 06/11] xfs: use locality optimized cntbt lookups for near mode allocations
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
` (4 preceding siblings ...)
2019-05-22 18:05 ` [PATCH v2 05/11] xfs: track active state of allocation btree cursors Brian Foster
@ 2019-05-22 18:05 ` Brian Foster
2019-05-22 18:05 ` [PATCH v2 07/11] xfs: refactor exact extent allocation mode Brian Foster
` (6 subsequent siblings)
12 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2019-05-22 18:05 UTC (permalink / raw)
To: linux-xfs
The extent allocation code in XFS has several allocation modes with
unique implementations. This is slightly unfortunate as the
allocation modes are not all that different from a high level
perspective. The most involved mode is the near allocation mode
which attempts to allocate an optimally sized extent with ideal
locality with respect to a provided agbno.
In the common case, a near mode allocation consists of a conditional
scan of the last cntbt block followed by a concurrent left and right
spanning search of the bnobt starting from the ideal point of
locality in the bnobt. This works reasonably well as filesystems age
via most common allocation patterns. If free space fragments as the
filesystem ages, however, the near algorithm has very poor breakdown
characteristics. If the extent size lookup happens to land outside
(i.e., before) the last cntbt block, the alloc bypasses the cntbt
entirely. If a suitably sized extent lies beyond a large enough
number of unusable extents from the starting point(s) of the bnobt
search, the bnobt search can take a significant amount of time to
locate the target extent. This leads to pathological allocation
latencies in certain workloads.
The near allocation algorithm can be fundamentally improved to take
advantage of a preexisting mechanism: that by-size cntbt record
lookups can incorporate locality. This means that a single cntbt
lookup can return the extent of a particular size with best
locality. A single locality lookup only covers extents of the
requested size, but for larger extent allocations, repeated locality
lookups of increasing sizes can search more efficiently than the
bnobt scan because it isolates the search space to extents of
suitable size. Such a cntbt search may not always find the extent
with absolute best locality, but the tradeoff for good enough
locality for a more efficient scan is worthwhile because more often
than not, extent size and contiguity are more important for
performance than perfect locality for data allocations.
This patch introduces generic allocation infrastructure for cursor
setup/teardown, selected extent allocation and various means of
btree scanning. Based on this infrastructure, it reimplements the
near allocation algorithm to balance between repeated cntbt lookups
and bnobt left/right scans as opposed to effectively choosing one
search algorithm or the other. This provides more predictable
allocation latency under breakdown conditions with good enough
locality in the common case. The algorithm naturally balances
between smaller and larger allocations as smaller allocations are
more likely to be satisfied immediately from the bnobt whereas
larger allocations are more likely satisfied by the cntbt. The
generic infrastructure introduced by this patch will be reused to
reduce code duplication between different, but conceptually similar
allocation modes in subsequent patches.
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
fs/xfs/libxfs/xfs_alloc.c | 1004 +++++++++++++++++++------------------
fs/xfs/xfs_trace.h | 33 +-
2 files changed, 529 insertions(+), 508 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 11d284989399..149309e17095 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -39,7 +39,6 @@ struct workqueue_struct *xfs_alloc_wq;
#define XFSA_FIXUP_CNT_OK 2
STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
/*
@@ -712,8 +711,435 @@ xfs_alloc_update_counters(
}
/*
- * Allocation group level functions.
+ * Block allocation algorithm and data structures.
*/
+struct xfs_alloc_cur {
+ struct xfs_btree_cur *cnt; /* btree cursors */
+ struct xfs_btree_cur *bnolt;
+ struct xfs_btree_cur *bnogt;
+ xfs_extlen_t cur_len;/* current search length */
+ xfs_agblock_t rec_bno;/* extent startblock */
+ xfs_extlen_t rec_len;/* extent length */
+ xfs_agblock_t bno; /* alloc bno */
+ xfs_extlen_t len; /* alloc len */
+ xfs_extlen_t diff; /* diff from search bno */
+ unsigned busy_gen;/* busy state */
+ bool busy;
+};
+
+/*
+ * Set up cursors, etc. in the extent allocation cursor. This function can be
+ * called multiple times to reset an initialized structure without having to
+ * reallocate cursors.
+ */
+static int
+xfs_alloc_cur_setup(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ xfs_agblock_t agbno = 0;
+ int error;
+ int i;
+
+ acur->cur_len = args->maxlen;
+ acur->rec_bno = 0;
+ acur->rec_len = 0;
+ acur->bno = 0;
+ acur->len = 0;
+ acur->diff = -1;
+ acur->busy = false;
+ acur->busy_gen = 0;
+
+ if (args->agbno != NULLAGBLOCK)
+ agbno = args->agbno;
+
+ /*
+ * Initialize the cntbt cursor and determine whether to start the search
+ * at maxlen or minlen. THIS_AG allocation mode expects the cursor at
+ * the first available maxlen extent or at the end of the tree.
+ */
+ if (!acur->cnt)
+ acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_CNT);
+ error = xfs_alloc_lookup_ge(acur->cnt, agbno, acur->cur_len, &i);
+ if (!i) {
+ acur->cur_len = args->minlen;
+ error = xfs_alloc_lookup_ge(acur->cnt, agbno, acur->cur_len,
+ &i);
+ if (error)
+ return error;
+ }
+
+ /* init bnobt left/right search cursors */
+ if (!acur->bnolt)
+ acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_BNO);
+ error = xfs_alloc_lookup_le(acur->bnolt, agbno, args->maxlen, &i);
+ if (error)
+ return error;
+
+ if (!acur->bnogt)
+ acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_BNO);
+ error = xfs_alloc_lookup_ge(acur->bnogt, agbno, args->maxlen, &i);
+
+ return error;
+}
+
+static void
+xfs_alloc_cur_close(
+ struct xfs_alloc_cur *acur,
+ bool error)
+{
+ int cur_error = XFS_BTREE_NOERROR;
+
+ if (error)
+ cur_error = XFS_BTREE_ERROR;
+
+ if (acur->cnt)
+ xfs_btree_del_cursor(acur->cnt, cur_error);
+ if (acur->bnolt)
+ xfs_btree_del_cursor(acur->bnolt, cur_error);
+ if (acur->bnogt)
+ xfs_btree_del_cursor(acur->bnogt, cur_error);
+ acur->cnt = acur->bnolt = acur->bnogt = NULL;
+}
+
+/*
+ * Check an extent for allocation and track the best available candidate in the
+ * allocation structure. The cursor is deactivated if it has entered an out of
+ * range state based on allocation arguments. Optionally return the extent
+ * extent geometry and allocation status if requested by the caller.
+ */
+static int
+xfs_alloc_cur_check(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t *obno,
+ xfs_extlen_t *olen,
+ bool *new)
+{
+ int error, i;
+ xfs_agblock_t bno, bnoa, bnew;
+ xfs_extlen_t len, lena, diff = -1;
+ bool busy;
+ unsigned busy_gen = 0;
+ bool deactivate = false;
+ bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
+
+ if (new)
+ *new = false;
+
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(args->mp, i == 1);
+ if (obno)
+ *obno = bno;
+ if (olen)
+ *olen = len;
+
+ /*
+ * Check against minlen and then compute and check the aligned record.
+ * If a cntbt record is out of size range (i.e., we're walking
+ * backwards) or a bnobt record is out of locality range, deactivate the
+ * cursor.
+ */
+ if (len < args->minlen) {
+ deactivate = !isbnobt;
+ goto fail;
+ }
+
+ busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
+ &busy_gen);
+ acur->busy |= busy;
+ if (busy)
+ acur->busy_gen = busy_gen;
+ if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
+ deactivate = isbnobt;
+ goto fail;
+ }
+ if (lena < args->minlen)
+ goto fail;
+
+ args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
+ xfs_alloc_fix_len(args);
+ ASSERT(args->len >= args->minlen);
+ if (args->len < acur->len)
+ goto fail;
+
+ /*
+ * We have an aligned record that satisfies minlen and beats the current
+ * candidate length. The remaining locality checks are specific to near
+ * allocation mode.
+ */
+ ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
+ diff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, args->datatype,
+ bnoa, lena, &bnew);
+ if (bnew == NULLAGBLOCK)
+ goto fail;
+ if (diff > acur->diff) {
+ /* deactivate bnobt cursor with worse locality */
+ deactivate = isbnobt;
+ goto fail;
+ }
+ if (args->len < acur->len)
+ goto fail;
+
+ /* found a new candidate extent */
+ acur->rec_bno = bno;
+ acur->rec_len = len;
+ acur->bno = bnew;
+ acur->len = args->len;
+ acur->diff = diff;
+ if (new)
+ *new = true;
+ trace_xfs_alloc_cur_new(args->mp, acur->bno, acur->len, acur->diff);
+ return 0;
+
+fail:
+ if (deactivate)
+ cur->bc_private.a.priv.abt.active = false;
+ return 0;
+}
+
+/*
+ * Complete an allocation of a candidate extent. Remove the extent from both
+ * trees and update the args structure.
+ */
+STATIC int
+xfs_alloc_cur_finish(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ int error;
+
+ ASSERT(acur->len);
+ ASSERT(acur->cnt && acur->bnolt);
+
+ error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
+ acur->rec_len, acur->bno, acur->len, 0);
+ if (error)
+ return error;
+
+ args->agbno = acur->bno;
+ args->len = acur->len;
+ args->wasfromfl = 0;
+
+ trace_xfs_alloc_cur(args);
+ return 0;
+}
+
+/*
+ * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
+ * bno optimized lookup to search for extents with ideal size and locality.
+ */
+STATIC int
+xfs_alloc_lookup_iter(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ struct xfs_btree_cur *cur)
+{
+ xfs_agblock_t bno;
+ xfs_extlen_t len, cur_len;
+ int error;
+ int i;
+
+ if (!xfs_alloc_cur_active(cur))
+ return 0;
+
+ /* locality optimized lookup */
+ cur_len = acur->cur_len;
+ error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
+ if (error)
+ return error;
+ if (i == 0)
+ return 0;
+
+ /* check the current record and update search length from it */
+ error = xfs_alloc_cur_check(args, acur, cur, &bno, &len, NULL);
+ if (error)
+ return error;
+ ASSERT(len >= acur->cur_len);
+ acur->cur_len = len;
+
+ /*
+ * We looked up the first record >= [agbno, len] above. The agbno is a
+ * secondary key and so the current record may lie just before or after
+ * agbno. If it is past agbno, check the previous record too so long as
+ * the length matches as it may be closer. Don't check a smaller record
+ * because that could deactivate our cursor.
+ */
+ if (bno > args->agbno) {
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (!error && i) {
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (!error && i && len == acur->cur_len) {
+ error = xfs_alloc_cur_check(args, acur, cur,
+ NULL, NULL, NULL);
+ }
+ }
+ if (error)
+ return error;
+ }
+
+ /*
+ * Increment the search key until we find at least one allocation
+ * candidate or if the extent we found was larger. Otherwise, double the
+ * search key to optimize the search. Efficiency is more important here
+ * than absolute best locality.
+ */
+ cur_len <<= 1;
+ if (!acur->len || acur->cur_len >= cur_len)
+ acur->cur_len++;
+ else
+ acur->cur_len = cur_len;
+
+ return error;
+}
+
+/*
+ * Incremental lookup algorithm. Walk a btree in either direction looking for
+ * candidate extents. This works for either bnobt (locality allocation) or cntbt
+ * (by-size allocation) cursors.
+ */
+STATIC int
+xfs_alloc_walk_iter(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ struct xfs_btree_cur *cur,
+ bool increment,
+ bool findone,
+ int iters,
+ int *stat)
+{
+ int error;
+ int i;
+ bool found = false;
+
+ if (!xfs_alloc_cur_active(cur))
+ return 0;
+
+ *stat = 0;
+ for (; iters > 0; iters--) {
+ error = xfs_alloc_cur_check(args, acur, cur, NULL, NULL,
+ &found);
+ if (error)
+ return error;
+ if (found) {
+ *stat = 1;
+ if (findone)
+ break;
+ }
+ if (!xfs_alloc_cur_active(cur))
+ break;
+
+ if (increment)
+ error = xfs_btree_increment(cur, 0, &i);
+ else
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ return error;
+ if (i == 0) {
+ cur->bc_private.a.priv.abt.active = false;
+ break;
+ }
+ }
+
+ return error;
+}
+
+/*
+ * High level locality allocation algorithm. Search the bnobt (left and right)
+ * in parallel with locality-optimized cntbt lookups to find an extent with
+ * ideal locality.
+ */
+STATIC int
+xfs_alloc_ag_vextent_cur(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ int *stat)
+{
+ int error;
+ int i;
+ unsigned int findbestcount;
+ struct xfs_btree_cur *fbcur = NULL;
+ bool fbinc = false;
+
+ ASSERT(acur->cnt);
+ ASSERT(args->type != XFS_ALLOCTYPE_THIS_AG);
+ findbestcount = args->mp->m_alloc_mxr[0];
+ *stat = 0;
+
+ /* search as long as we have at least one active cursor */
+ while (xfs_alloc_cur_active(acur->cnt) ||
+ xfs_alloc_cur_active(acur->bnolt) ||
+ xfs_alloc_cur_active(acur->bnogt)) {
+ /*
+ * Search the bnobt left and right. If either of these find a
+ * suitable extent, we know we've found ideal locality. Do a
+ * capped search in the opposite direction and we're done.
+ */
+ error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
+ true, 1, &i);
+ if (error)
+ return error;
+ if (i) {
+ fbcur = acur->bnogt;
+ fbinc = true;
+ break;
+ }
+
+ error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true,
+ true, 1, &i);
+ if (error)
+ return error;
+ if (i) {
+ fbcur = acur->bnolt;
+ break;
+ }
+
+ /*
+ * Search the cntbt for a maximum sized extent with ideal
+ * locality. The lookup search terminates on reaching the end of
+ * the cntbt. Break out of the loop if this occurs to throttle
+ * the bnobt scans.
+ */
+ error = xfs_alloc_lookup_iter(args, acur, acur->cnt);
+ if (error)
+ return error;
+ if (!xfs_alloc_cur_active(acur->cnt)) {
+ if (!acur->len) {
+ fbcur = acur->cnt;
+ error = xfs_btree_decrement(fbcur, 0, &i);
+ if (error)
+ return error;
+ if (i)
+ fbcur->bc_private.a.priv.abt.active = true;
+ }
+ break;
+ }
+ }
+
+ /*
+ * Perform a best-effort search in the opposite direction from a bnobt
+ * allocation or backwards from the end of the cntbt if we couldn't find
+ * a maxlen extent.
+ */
+ if (fbcur) {
+ error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true,
+ findbestcount, &i);
+ if (error)
+ return error;
+ }
+
+ if (acur->len)
+ *stat = 1;
+
+ return error;
+}
/*
* Deal with the case where only small freespaces remain. Either return the
@@ -817,6 +1243,81 @@ xfs_alloc_ag_vextent_small(
return error;
}
+/*
+ * Allocate a variable extent near bno in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_type(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ struct xfs_alloc_cur acur = {0,};
+ int error; /* error code */
+ int i; /* result code, temporary */
+ xfs_agblock_t bno; /* start bno of left side entry */
+ xfs_extlen_t len; /* length of left side entry */
+
+ /* handle unitialized agbno range so caller doesn't have to */
+ if (!args->min_agbno && !args->max_agbno)
+ args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
+ ASSERT(args->min_agbno <= args->max_agbno);
+
+ /* clamp agbno to the range if it's outside */
+ if (args->agbno < args->min_agbno)
+ args->agbno = args->min_agbno;
+ if (args->agbno > args->max_agbno)
+ args->agbno = args->max_agbno;
+
+restart:
+ /* set up cursors and allocation tracking structure based on args */
+ error = xfs_alloc_cur_setup(args, &acur);
+ if (error)
+ goto out;
+
+ error = xfs_alloc_ag_vextent_cur(args, &acur, &i);
+ if (error)
+ goto out;
+
+ /*
+ * If we got an extent, finish the allocation. Otherwise check for busy
+ * extents and retry or attempt a small allocation.
+ */
+ if (i) {
+ error = xfs_alloc_cur_finish(args, &acur);
+ if (error)
+ goto out;
+ } else {
+ if (acur.busy) {
+ trace_xfs_alloc_ag_busy(args);
+ xfs_extent_busy_flush(args->mp, args->pag,
+ acur.busy_gen);
+ goto restart;
+ }
+
+ /*
+ * We get here if we can't satisfy minlen or the trees are
+ * empty. We don't pass a cursor so this returns an AGFL block
+ * (i == 0) or nothing.
+ */
+ error = xfs_alloc_ag_vextent_small(args, NULL, &bno, &len, &i);
+ if (error)
+ goto out;
+ ASSERT(i == 0 || (i && len == 0));
+ trace_xfs_alloc_ag_noentry(args);
+
+ args->agbno = bno;
+ args->len = len;
+ }
+
+out:
+ xfs_alloc_cur_close(&acur, error);
+ if (error)
+ trace_xfs_alloc_ag_error(args);
+ return error;
+}
+
/*
* Allocate a variable extent in the allocation group agno.
* Type and bno are used to determine where in the allocation group the
@@ -846,7 +1347,7 @@ xfs_alloc_ag_vextent(
error = xfs_alloc_ag_vextent_size(args);
break;
case XFS_ALLOCTYPE_NEAR_BNO:
- error = xfs_alloc_ag_vextent_near(args);
+ error = xfs_alloc_ag_vextent_type(args);
break;
case XFS_ALLOCTYPE_THIS_BNO:
error = xfs_alloc_ag_vextent_exact(args);
@@ -1004,503 +1505,6 @@ xfs_alloc_ag_vextent_exact(
return error;
}
-/*
- * Search the btree in a given direction via the search cursor and compare
- * the records found against the good extent we've already found.
- */
-STATIC int
-xfs_alloc_find_best_extent(
- struct xfs_alloc_arg *args, /* allocation argument structure */
- struct xfs_btree_cur **gcur, /* good cursor */
- struct xfs_btree_cur **scur, /* searching cursor */
- xfs_agblock_t gdiff, /* difference for search comparison */
- xfs_agblock_t *sbno, /* extent found by search */
- xfs_extlen_t *slen, /* extent length */
- xfs_agblock_t *sbnoa, /* aligned extent found by search */
- xfs_extlen_t *slena, /* aligned extent length */
- int dir) /* 0 = search right, 1 = search left */
-{
- xfs_agblock_t new;
- xfs_agblock_t sdiff;
- int error;
- int i;
- unsigned busy_gen;
-
- /* The good extent is perfect, no need to search. */
- if (!gdiff)
- goto out_use_good;
-
- /*
- * Look until we find a better one, run out of space or run off the end.
- */
- do {
- error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
- if (error)
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- xfs_alloc_compute_aligned(args, *sbno, *slen,
- sbnoa, slena, &busy_gen);
-
- /*
- * The good extent is closer than this one.
- */
- if (!dir) {
- if (*sbnoa > args->max_agbno)
- goto out_use_good;
- if (*sbnoa >= args->agbno + gdiff)
- goto out_use_good;
- } else {
- if (*sbnoa < args->min_agbno)
- goto out_use_good;
- if (*sbnoa <= args->agbno - gdiff)
- goto out_use_good;
- }
-
- /*
- * Same distance, compare length and pick the best.
- */
- if (*slena >= args->minlen) {
- args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
- xfs_alloc_fix_len(args);
-
- sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment,
- args->datatype, *sbnoa,
- *slena, &new);
-
- /*
- * Choose closer size and invalidate other cursor.
- */
- if (sdiff < gdiff)
- goto out_use_search;
- goto out_use_good;
- }
-
- if (!dir)
- error = xfs_btree_increment(*scur, 0, &i);
- else
- error = xfs_btree_decrement(*scur, 0, &i);
- if (error)
- goto error0;
- } while (i);
-
-out_use_good:
- xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
- *scur = NULL;
- return 0;
-
-out_use_search:
- xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
- *gcur = NULL;
- return 0;
-
-error0:
- /* caller invalidates cursors */
- return error;
-}
-
-/*
- * Allocate a variable extent near bno in the allocation group agno.
- * Extent's length (returned in len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
- */
-STATIC int /* error */
-xfs_alloc_ag_vextent_near(
- xfs_alloc_arg_t *args) /* allocation argument structure */
-{
- xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */
- xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */
- xfs_btree_cur_t *cnt_cur; /* cursor for count btree */
- xfs_agblock_t gtbno; /* start bno of right side entry */
- xfs_agblock_t gtbnoa; /* aligned ... */
- xfs_extlen_t gtdiff; /* difference to right side entry */
- xfs_extlen_t gtlen; /* length of right side entry */
- xfs_extlen_t gtlena; /* aligned ... */
- xfs_agblock_t gtnew; /* useful start bno of right side */
- int error; /* error code */
- int i; /* result code, temporary */
- int j; /* result code, temporary */
- xfs_agblock_t ltbno; /* start bno of left side entry */
- xfs_agblock_t ltbnoa; /* aligned ... */
- xfs_extlen_t ltdiff; /* difference to left side entry */
- xfs_extlen_t ltlen; /* length of left side entry */
- xfs_extlen_t ltlena; /* aligned ... */
- xfs_agblock_t ltnew; /* useful start bno of left side */
- xfs_extlen_t rlen; /* length of returned extent */
- bool busy;
- unsigned busy_gen;
-#ifdef DEBUG
- /*
- * Randomly don't execute the first algorithm.
- */
- int dofirst; /* set to do first algorithm */
-
- dofirst = prandom_u32() & 1;
-#endif
-
- /* handle unitialized agbno range so caller doesn't have to */
- if (!args->min_agbno && !args->max_agbno)
- args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
- ASSERT(args->min_agbno <= args->max_agbno);
-
- /* clamp agbno to the range if it's outside */
- if (args->agbno < args->min_agbno)
- args->agbno = args->min_agbno;
- if (args->agbno > args->max_agbno)
- args->agbno = args->max_agbno;
-
-restart:
- bno_cur_lt = NULL;
- bno_cur_gt = NULL;
- ltlen = 0;
- gtlena = 0;
- ltlena = 0;
- busy = false;
-
- /*
- * Get a cursor for the by-size btree.
- */
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT);
-
- /*
- * See if there are any free extents as big as maxlen.
- */
- if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
- goto error0;
- /*
- * If none, then pick up the last entry in the tree unless the
- * tree is empty.
- */
- if (!i) {
- if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
- &ltlen, &i)))
- goto error0;
- if (i == 0 || ltlen == 0) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- trace_xfs_alloc_near_noentry(args);
- return 0;
- }
- ASSERT(i == 1);
- }
- args->wasfromfl = 0;
-
- /*
- * First algorithm.
- * If the requested extent is large wrt the freespaces available
- * in this a.g., then the cursor will be pointing to a btree entry
- * near the right edge of the tree. If it's in the last btree leaf
- * block, then we just examine all the entries in that block
- * that are big enough, and pick the best one.
- * This is written as a while loop so we can break out of it,
- * but we never loop back to the top.
- */
- while (xfs_btree_islastblock(cnt_cur, 0)) {
- xfs_extlen_t bdiff;
- int besti=0;
- xfs_extlen_t blen=0;
- xfs_agblock_t bnew=0;
-
-#ifdef DEBUG
- if (dofirst)
- break;
-#endif
- /*
- * Start from the entry that lookup found, sequence through
- * all larger free blocks. If we're actually pointing at a
- * record smaller than maxlen, go to the start of this block,
- * and skip all those smaller than minlen.
- */
- if (ltlen || args->alignment > 1) {
- cnt_cur->bc_ptrs[0] = 1;
- do {
- if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
- &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- if (ltlen >= args->minlen)
- break;
- if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
- goto error0;
- } while (i);
- ASSERT(ltlen >= args->minlen);
- if (!i)
- break;
- }
- i = cnt_cur->bc_ptrs[0];
- for (j = 1, blen = 0, bdiff = 0;
- !error && j && (blen < args->maxlen || bdiff > 0);
- error = xfs_btree_increment(cnt_cur, 0, &j)) {
- /*
- * For each entry, decide if it's better than
- * the previous best entry.
- */
- if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- busy = xfs_alloc_compute_aligned(args, ltbno, ltlen,
- &ltbnoa, &ltlena, &busy_gen);
- if (ltlena < args->minlen)
- continue;
- if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
- continue;
- args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
- xfs_alloc_fix_len(args);
- ASSERT(args->len >= args->minlen);
- if (args->len < blen)
- continue;
- ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->datatype, ltbnoa,
- ltlena, &ltnew);
- if (ltnew != NULLAGBLOCK &&
- (args->len > blen || ltdiff < bdiff)) {
- bdiff = ltdiff;
- bnew = ltnew;
- blen = args->len;
- besti = cnt_cur->bc_ptrs[0];
- }
- }
- /*
- * It didn't work. We COULD be in a case where
- * there's a good record somewhere, so try again.
- */
- if (blen == 0)
- break;
- /*
- * Point at the best entry, and retrieve it again.
- */
- cnt_cur->bc_ptrs[0] = besti;
- if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
- args->len = blen;
-
- /*
- * We are allocating starting at bnew for blen blocks.
- */
- args->agbno = bnew;
- ASSERT(bnew >= ltbno);
- ASSERT(bnew + blen <= ltbno + ltlen);
- /*
- * Set up a cursor for the by-bno tree.
- */
- bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->agno, XFS_BTNUM_BNO);
- /*
- * Fix up the btree entries.
- */
- if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
- ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
- goto error0;
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-
- trace_xfs_alloc_near_first(args);
- return 0;
- }
- /*
- * Second algorithm.
- * Search in the by-bno tree to the left and to the right
- * simultaneously, until in each case we find a space big enough,
- * or run into the edge of the tree. When we run into the edge,
- * we deallocate that cursor.
- * If both searches succeed, we compare the two spaces and pick
- * the better one.
- * With alignment, it's possible for both to fail; the upper
- * level algorithm that picks allocation groups for allocations
- * is not supposed to do this.
- */
- /*
- * Allocate and initialize the cursor for the leftward search.
- */
- bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO);
- /*
- * Lookup <= bno to find the leftward search's starting point.
- */
- if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
- goto error0;
- if (!i) {
- /*
- * Didn't find anything; use this cursor for the rightward
- * search.
- */
- bno_cur_gt = bno_cur_lt;
- bno_cur_lt = NULL;
- }
- /*
- * Found something. Duplicate the cursor for the rightward search.
- */
- else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
- goto error0;
- /*
- * Increment the cursor, so we will point at the entry just right
- * of the leftward entry if any, or to the leftmost entry.
- */
- if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
- goto error0;
- if (!i) {
- /*
- * It failed, there are no rightward entries.
- */
- xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- }
- /*
- * Loop going left with the leftward cursor, right with the
- * rightward cursor, until either both directions give up or
- * we find an entry at least as big as minlen.
- */
- do {
- if (bno_cur_lt) {
- if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen,
- &ltbnoa, &ltlena, &busy_gen);
- if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
- break;
- if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
- goto error0;
- if (!i || ltbnoa < args->min_agbno) {
- xfs_btree_del_cursor(bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- }
- }
- if (bno_cur_gt) {
- if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen,
- &gtbnoa, &gtlena, &busy_gen);
- if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
- break;
- if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
- goto error0;
- if (!i || gtbnoa > args->max_agbno) {
- xfs_btree_del_cursor(bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- }
- }
- } while (bno_cur_lt || bno_cur_gt);
-
- /*
- * Got both cursors still active, need to find better entry.
- */
- if (bno_cur_lt && bno_cur_gt) {
- if (ltlena >= args->minlen) {
- /*
- * Left side is good, look for a right side entry.
- */
- args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
- xfs_alloc_fix_len(args);
- ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->datatype, ltbnoa,
- ltlena, &ltnew);
-
- error = xfs_alloc_find_best_extent(args,
- &bno_cur_lt, &bno_cur_gt,
- ltdiff, &gtbno, &gtlen,
- &gtbnoa, &gtlena,
- 0 /* search right */);
- } else {
- ASSERT(gtlena >= args->minlen);
-
- /*
- * Right side is good, look for a left side entry.
- */
- args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
- xfs_alloc_fix_len(args);
- gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->datatype, gtbnoa,
- gtlena, &gtnew);
-
- error = xfs_alloc_find_best_extent(args,
- &bno_cur_gt, &bno_cur_lt,
- gtdiff, &ltbno, &ltlen,
- &ltbnoa, &ltlena,
- 1 /* search left */);
- }
-
- if (error)
- goto error0;
- }
-
- /*
- * If we couldn't get anything, give up.
- */
- if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-
- if (busy) {
- trace_xfs_alloc_near_busy(args);
- xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
- goto restart;
- }
- trace_xfs_alloc_size_neither(args);
- args->agbno = NULLAGBLOCK;
- return 0;
- }
-
- /*
- * At this point we have selected a freespace entry, either to the
- * left or to the right. If it's on the right, copy all the
- * useful variables to the "left" set so we only have one
- * copy of this code.
- */
- if (bno_cur_gt) {
- bno_cur_lt = bno_cur_gt;
- bno_cur_gt = NULL;
- ltbno = gtbno;
- ltbnoa = gtbnoa;
- ltlen = gtlen;
- ltlena = gtlena;
- j = 1;
- } else
- j = 0;
-
- /*
- * Fix up the length and compute the useful address.
- */
- args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
- xfs_alloc_fix_len(args);
- rlen = args->len;
- (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
- args->datatype, ltbnoa, ltlena, &ltnew);
- ASSERT(ltnew >= ltbno);
- ASSERT(ltnew + rlen <= ltbnoa + ltlena);
- ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
- ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
- args->agbno = ltnew;
-
- if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
- ltnew, rlen, XFSA_FIXUP_BNO_OK)))
- goto error0;
-
- if (j)
- trace_xfs_alloc_near_greater(args);
- else
- trace_xfs_alloc_near_lesser(args);
-
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
- return 0;
-
- error0:
- trace_xfs_alloc_near_error(args);
- if (cnt_cur != NULL)
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
- if (bno_cur_lt != NULL)
- xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
- if (bno_cur_gt != NULL)
- xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
- return error;
-}
-
/*
* Allocate a variable extent anywhere in the allocation group agno.
* Extent's length (returned in len) will be between minlen and maxlen,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2464ea351f83..c49bbe0e06e3 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1635,14 +1635,10 @@ DEFINE_EVENT(xfs_alloc_class, name, \
DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
-DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
+DEFINE_ALLOC_EVENT(xfs_alloc_ag_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_ag_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_ag_busy);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur);
DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
@@ -1658,6 +1654,27 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
+TRACE_EVENT(xfs_alloc_cur_new,
+ TP_PROTO(struct xfs_mount *mp, xfs_agblock_t bno, xfs_extlen_t len,
+ xfs_extlen_t diff),
+ TP_ARGS(mp, bno, len, diff),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agblock_t, bno)
+ __field(xfs_extlen_t, len)
+ __field(xfs_extlen_t, diff)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->bno = bno;
+ __entry->len = len;
+ __entry->diff = diff;
+ ),
+ TP_printk("dev %d:%d bno 0x%x len 0x%x diff 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->bno, __entry->len, __entry->diff)
+)
+
DECLARE_EVENT_CLASS(xfs_da_class,
TP_PROTO(struct xfs_da_args *args),
TP_ARGS(args),
--
2.17.2
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v2 07/11] xfs: refactor exact extent allocation mode
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
` (5 preceding siblings ...)
2019-05-22 18:05 ` [PATCH v2 06/11] xfs: use locality optimized cntbt lookups for near mode allocations Brian Foster
@ 2019-05-22 18:05 ` Brian Foster
2019-05-22 18:05 ` [PATCH v2 08/11] xfs: refactor by-size " Brian Foster
` (5 subsequent siblings)
12 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2019-05-22 18:05 UTC (permalink / raw)
To: linux-xfs
Exact allocation mode attempts to allocate at a specific block and
otherwise fails. The implementation is straightforward and mostly
contained in a single function. It uses the bnobt to look up the
requested block and succeeds or fails.
An exact allocation is essentially just a near allocation with
slightly more strict requirements. Most of the boilerplate code
associated with an exact allocation is already implemented in the
generic infrastructure. The additional logic that is required is
oneshot behavior for cursor allocation and lookup and the record
examination requirements specific to allocation mode.
Update the generic allocation code to support exact mode allocations
and replace the existing implementation. This essentially provides
the same behavior with improved code reuse and less duplicated code.
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
fs/xfs/libxfs/xfs_alloc.c | 181 +++++++++++---------------------------
fs/xfs/xfs_trace.h | 1 -
2 files changed, 49 insertions(+), 133 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 149309e17095..d180d1940039 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -38,7 +38,6 @@ struct workqueue_struct *xfs_alloc_wq;
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
-STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
/*
@@ -778,6 +777,15 @@ xfs_alloc_cur_setup(
if (error)
return error;
+ /*
+ * Exact allocation mode requires only one bnobt cursor.
+ */
+ if (args->type == XFS_ALLOCTYPE_THIS_BNO) {
+ ASSERT(args->alignment == 1);
+ acur->cnt->bc_private.a.priv.abt.active = false;
+ return 0;
+ }
+
if (!acur->bnogt)
acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
args->agbp, args->agno, XFS_BTNUM_BNO);
@@ -840,6 +848,12 @@ xfs_alloc_cur_check(
if (olen)
*olen = len;
+ /* exact allocs only check one record, mark the cursor inactive */
+ if (args->type == XFS_ALLOCTYPE_THIS_BNO) {
+ ASSERT(isbnobt);
+ cur->bc_private.a.priv.abt.active = false;
+ }
+
/*
* Check against minlen and then compute and check the aligned record.
* If a cntbt record is out of size range (i.e., we're walking
@@ -871,22 +885,39 @@ xfs_alloc_cur_check(
/*
* We have an aligned record that satisfies minlen and beats the current
- * candidate length. The remaining locality checks are specific to near
- * allocation mode.
+ * candidate length. The remaining checks depend on allocation type.
+ * Exact allocation checks one record and either succeeds or fails. Near
+ * allocation computes and checks locality. Near allocation computes
+ * and checks locality.
*/
- ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
- diff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->datatype,
- bnoa, lena, &bnew);
- if (bnew == NULLAGBLOCK)
- goto fail;
- if (diff > acur->diff) {
- /* deactivate bnobt cursor with worse locality */
- deactivate = isbnobt;
- goto fail;
+ if (args->type == XFS_ALLOCTYPE_THIS_BNO) {
+ if ((bnoa > args->agbno) ||
+ (bnoa + lena < args->agbno + args->minlen)) {
+ trace_xfs_alloc_exact_notfound(args);
+ goto fail;
+ }
+
+ bnew = args->agbno;
+ args->len = XFS_AGBLOCK_MIN(bnoa + lena,
+ args->agbno + args->maxlen) -
+ args->agbno;
+ diff = 0;
+ trace_xfs_alloc_exact_done(args);
+ } else {
+ ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
+ diff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, args->datatype,
+ bnoa, lena, &bnew);
+ if (bnew == NULLAGBLOCK)
+ goto fail;
+ if (diff > acur->diff) {
+ /* deactivate bnobt cursor with worse locality */
+ deactivate = isbnobt;
+ goto fail;
+ }
+ if (args->len < acur->len)
+ goto fail;
}
- if (args->len < acur->len)
- goto fail;
/* found a new candidate extent */
acur->rec_bno = bno;
@@ -1086,6 +1117,8 @@ xfs_alloc_ag_vextent_cur(
true, 1, &i);
if (error)
return error;
+ if (args->type == XFS_ALLOCTYPE_THIS_BNO)
+ break;
if (i) {
fbcur = acur->bnogt;
fbinc = true;
@@ -1347,10 +1380,8 @@ xfs_alloc_ag_vextent(
error = xfs_alloc_ag_vextent_size(args);
break;
case XFS_ALLOCTYPE_NEAR_BNO:
- error = xfs_alloc_ag_vextent_type(args);
- break;
case XFS_ALLOCTYPE_THIS_BNO:
- error = xfs_alloc_ag_vextent_exact(args);
+ error = xfs_alloc_ag_vextent_type(args);
break;
default:
ASSERT(0);
@@ -1391,120 +1422,6 @@ xfs_alloc_ag_vextent(
return error;
}
-/*
- * Allocate a variable extent at exactly agno/bno.
- * Extent's length (returned in *len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
- */
-STATIC int /* error */
-xfs_alloc_ag_vextent_exact(
- xfs_alloc_arg_t *args) /* allocation argument structure */
-{
- xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
- xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
- int error;
- xfs_agblock_t fbno; /* start block of found extent */
- xfs_extlen_t flen; /* length of found extent */
- xfs_agblock_t tbno; /* start block of busy extent */
- xfs_extlen_t tlen; /* length of busy extent */
- xfs_agblock_t tend; /* end block of busy extent */
- int i; /* success/failure of operation */
- unsigned busy_gen;
-
- ASSERT(args->alignment == 1);
-
- /*
- * Allocate/initialize a cursor for the by-number freespace btree.
- */
- bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO);
-
- /*
- * Lookup bno and minlen in the btree (minlen is irrelevant, really).
- * Look for the closest free block <= bno, it must contain bno
- * if any free block does.
- */
- error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
- if (error)
- goto error0;
- if (!i)
- goto not_found;
-
- /*
- * Grab the freespace record.
- */
- error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
- if (error)
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- ASSERT(fbno <= args->agbno);
-
- /*
- * Check for overlapping busy extents.
- */
- tbno = fbno;
- tlen = flen;
- xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
-
- /*
- * Give up if the start of the extent is busy, or the freespace isn't
- * long enough for the minimum request.
- */
- if (tbno > args->agbno)
- goto not_found;
- if (tlen < args->minlen)
- goto not_found;
- tend = tbno + tlen;
- if (tend < args->agbno + args->minlen)
- goto not_found;
-
- /*
- * End of extent will be smaller of the freespace end and the
- * maximal requested end.
- *
- * Fix the length according to mod and prod if given.
- */
- args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
- - args->agbno;
- xfs_alloc_fix_len(args);
- ASSERT(args->agbno + args->len <= tend);
-
- /*
- * We are allocating agbno for args->len
- * Allocate/initialize a cursor for the by-size btree.
- */
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT);
- ASSERT(args->agbno + args->len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
- error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
- args->len, XFSA_FIXUP_BNO_OK);
- if (error) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
- goto error0;
- }
-
- xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-
- args->wasfromfl = 0;
- trace_xfs_alloc_exact_done(args);
- return 0;
-
-not_found:
- /* Didn't find it, return null. */
- xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
- args->agbno = NULLAGBLOCK;
- trace_xfs_alloc_exact_notfound(args);
- return 0;
-
-error0:
- xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
- trace_xfs_alloc_exact_error(args);
- return error;
-}
-
/*
* Allocate a variable extent anywhere in the allocation group agno.
* Extent's length (returned in len) will be between minlen and maxlen,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c49bbe0e06e3..a11aac4505ea 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1634,7 +1634,6 @@ DEFINE_EVENT(xfs_alloc_class, name, \
TP_ARGS(args))
DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
-DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
DEFINE_ALLOC_EVENT(xfs_alloc_ag_error);
DEFINE_ALLOC_EVENT(xfs_alloc_ag_noentry);
DEFINE_ALLOC_EVENT(xfs_alloc_ag_busy);
--
2.17.2
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v2 08/11] xfs: refactor by-size extent allocation mode
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
` (6 preceding siblings ...)
2019-05-22 18:05 ` [PATCH v2 07/11] xfs: refactor exact extent allocation mode Brian Foster
@ 2019-05-22 18:05 ` Brian Foster
2019-05-22 18:05 ` [PATCH v2 09/11] xfs: replace small allocation logic with agfl only logic Brian Foster
` (4 subsequent siblings)
12 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2019-05-22 18:05 UTC (permalink / raw)
To: linux-xfs
By-size allocation mode is essentially a near allocation mode
without a locality requirement. The existing code looks up a
suitably sized extent in the cntbt and either succeeds or falls back
to a forward or reverse scan and eventually to the AGFL.
While similar in concept to near allocation mode, the lookup/search
algorithm is far more simple. As such, size allocation mode is still
more cleanly implemented with a mode-specific algorithm function.
However, this function reuses the underlying mechanism used by the bnobt
scan for a near mode allocation to instead walk the cntbt looking
for a suitably sized extent. Much of the setup, finish and AGFL
fallback code is also unnecessarily duplicated in the current
implementation and can be removed.
Implement a by-size allocation mode search algorithm, tweak the
generic infrastructure to handle by-size allocations and replace the
old by-size implementation. As with exact allocation mode, this
essentially provides the same behavior with less duplicate mode
specific code.
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
fs/xfs/libxfs/xfs_alloc.c | 275 +++++++++-----------------------------
fs/xfs/xfs_trace.h | 4 -
2 files changed, 65 insertions(+), 214 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index d180d1940039..6b8bd8f316cb 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -38,8 +38,6 @@ struct workqueue_struct *xfs_alloc_wq;
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
-STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
-
/*
* Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in
* the beginning of the block for a proper header with the location information
@@ -751,6 +749,8 @@ xfs_alloc_cur_setup(
if (args->agbno != NULLAGBLOCK)
agbno = args->agbno;
+ if (args->type == XFS_ALLOCTYPE_THIS_AG)
+ acur->cur_len += args->alignment - 1;
/*
* Initialize the cntbt cursor and determine whether to start the search
@@ -761,7 +761,7 @@ xfs_alloc_cur_setup(
acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
args->agbp, args->agno, XFS_BTNUM_CNT);
error = xfs_alloc_lookup_ge(acur->cnt, agbno, acur->cur_len, &i);
- if (!i) {
+ if (!i && args->type != XFS_ALLOCTYPE_THIS_AG) {
acur->cur_len = args->minlen;
error = xfs_alloc_lookup_ge(acur->cnt, agbno, acur->cur_len,
&i);
@@ -778,13 +778,15 @@ xfs_alloc_cur_setup(
return error;
/*
- * Exact allocation mode requires only one bnobt cursor.
+ * Exact allocation mode uses the bnobt, by-size allocation mode uses
+ * the cntbt, either one requires only one bnobt cursor.
*/
if (args->type == XFS_ALLOCTYPE_THIS_BNO) {
ASSERT(args->alignment == 1);
acur->cnt->bc_private.a.priv.abt.active = false;
return 0;
- }
+ } else if (args->type == XFS_ALLOCTYPE_THIS_AG)
+ return 0;
if (!acur->bnogt)
acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
@@ -886,9 +888,10 @@ xfs_alloc_cur_check(
/*
* We have an aligned record that satisfies minlen and beats the current
* candidate length. The remaining checks depend on allocation type.
- * Exact allocation checks one record and either succeeds or fails. Near
- * allocation computes and checks locality. Near allocation computes
- * and checks locality.
+ * Exact allocation checks one record and either succeeds or fails.
+ * By-size allocation only needs to deactivate the cursor once we've
+ * found a maxlen candidate. Near allocation computes and checks
+ * locality. Near allocation computes and checks locality.
*/
if (args->type == XFS_ALLOCTYPE_THIS_BNO) {
if ((bnoa > args->agbno) ||
@@ -903,6 +906,12 @@ xfs_alloc_cur_check(
args->agbno;
diff = 0;
trace_xfs_alloc_exact_done(args);
+ } else if (args->type == XFS_ALLOCTYPE_THIS_AG) {
+ if (lena >= args->maxlen) {
+ cur->bc_private.a.priv.abt.active = false;
+ trace_xfs_alloc_size_done(args);
+ }
+ bnew = bnoa;
} else {
ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
diff = xfs_alloc_compute_diff(args->agbno, args->len,
@@ -1082,6 +1091,50 @@ xfs_alloc_walk_iter(
return error;
}
+/*
+ * High level size allocation algorithm.
+ */
+STATIC int
+xfs_alloc_ag_vextent_size(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ int *stat)
+{
+ int error;
+ int i;
+ bool increment = true;
+
+ ASSERT(args->type == XFS_ALLOCTYPE_THIS_AG);
+ *stat = 0;
+
+ /*
+ * The cursor either points at the first sufficiently sized extent for
+ * an aligned maxlen allocation or off the edge of the tree. The only
+ * way the former should fail is if the target extents are busy, so
+ * return nothing and let the caller flush and retry. If the latter,
+ * point the cursor at the last valid record and walk backwards from
+ * there. There is still a chance to find a minlen extent.
+ */
+ if (!xfs_alloc_cur_active(acur->cnt)) {
+ increment = false;
+ error = xfs_btree_decrement(acur->cnt, 0, &i);
+ if (error)
+ return error;
+ if (i)
+ acur->cnt->bc_private.a.priv.abt.active = true;
+ }
+
+ error = xfs_alloc_walk_iter(args, acur, acur->cnt, increment, false,
+ INT_MAX, &i);
+ if (error)
+ return error;
+
+ ASSERT(i == 1 || acur->busy || !increment);
+ if (acur->len)
+ *stat = 1;
+ return 0;
+}
+
/*
* High level locality allocation algorithm. Search the bnobt (left and right)
* in parallel with locality-optimized cntbt lookups to find an extent with
@@ -1309,7 +1362,10 @@ xfs_alloc_ag_vextent_type(
if (error)
goto out;
- error = xfs_alloc_ag_vextent_cur(args, &acur, &i);
+ if (args->type == XFS_ALLOCTYPE_THIS_AG)
+ error = xfs_alloc_ag_vextent_size(args, &acur, &i);
+ else
+ error = xfs_alloc_ag_vextent_cur(args, &acur, &i);
if (error)
goto out;
@@ -1377,8 +1433,6 @@ xfs_alloc_ag_vextent(
args->wasfromfl = 0;
switch (args->type) {
case XFS_ALLOCTYPE_THIS_AG:
- error = xfs_alloc_ag_vextent_size(args);
- break;
case XFS_ALLOCTYPE_NEAR_BNO:
case XFS_ALLOCTYPE_THIS_BNO:
error = xfs_alloc_ag_vextent_type(args);
@@ -1422,205 +1476,6 @@ xfs_alloc_ag_vextent(
return error;
}
-/*
- * Allocate a variable extent anywhere in the allocation group agno.
- * Extent's length (returned in len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
- */
-STATIC int /* error */
-xfs_alloc_ag_vextent_size(
- xfs_alloc_arg_t *args) /* allocation argument structure */
-{
- xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
- xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
- int error; /* error result */
- xfs_agblock_t fbno; /* start of found freespace */
- xfs_extlen_t flen; /* length of found freespace */
- int i; /* temp status variable */
- xfs_agblock_t rbno; /* returned block number */
- xfs_extlen_t rlen; /* length of returned extent */
- bool busy;
- unsigned busy_gen;
-
-restart:
- /*
- * Allocate and initialize a cursor for the by-size btree.
- */
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT);
- bno_cur = NULL;
- busy = false;
-
- /*
- * Look for an entry >= maxlen+alignment-1 blocks.
- */
- if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
- args->maxlen + args->alignment - 1, &i)))
- goto error0;
-
- /*
- * If none then we have to settle for a smaller extent. In the case that
- * there are no large extents, this will return the last entry in the
- * tree unless the tree is empty. In the case that there are only busy
- * large extents, this will return the largest small extent unless there
- * are no smaller extents available.
- */
- if (!i) {
- error = xfs_alloc_ag_vextent_small(args, cnt_cur,
- &fbno, &flen, &i);
- if (error)
- goto error0;
- if (i == 0 || flen == 0) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- trace_xfs_alloc_size_noentry(args);
- return 0;
- }
- ASSERT(i == 1);
- busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
- &rlen, &busy_gen);
- } else {
- /*
- * Search for a non-busy extent that is large enough.
- */
- for (;;) {
- error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
- if (error)
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-
- busy = xfs_alloc_compute_aligned(args, fbno, flen,
- &rbno, &rlen, &busy_gen);
-
- if (rlen >= args->maxlen)
- break;
-
- error = xfs_btree_increment(cnt_cur, 0, &i);
- if (error)
- goto error0;
- if (i == 0) {
- /*
- * Our only valid extents must have been busy.
- * Make it unbusy by forcing the log out and
- * retrying.
- */
- xfs_btree_del_cursor(cnt_cur,
- XFS_BTREE_NOERROR);
- trace_xfs_alloc_size_busy(args);
- xfs_extent_busy_flush(args->mp,
- args->pag, busy_gen);
- goto restart;
- }
- }
- }
-
- /*
- * In the first case above, we got the last entry in the
- * by-size btree. Now we check to see if the space hits maxlen
- * once aligned; if not, we search left for something better.
- * This can't happen in the second case above.
- */
- rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
- XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
- (rlen <= flen && rbno + rlen <= fbno + flen), error0);
- if (rlen < args->maxlen) {
- xfs_agblock_t bestfbno;
- xfs_extlen_t bestflen;
- xfs_agblock_t bestrbno;
- xfs_extlen_t bestrlen;
-
- bestrlen = rlen;
- bestrbno = rbno;
- bestflen = flen;
- bestfbno = fbno;
- for (;;) {
- if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
- goto error0;
- if (i == 0)
- break;
- if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
- &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- if (flen < bestrlen)
- break;
- busy = xfs_alloc_compute_aligned(args, fbno, flen,
- &rbno, &rlen, &busy_gen);
- rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
- XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
- (rlen <= flen && rbno + rlen <= fbno + flen),
- error0);
- if (rlen > bestrlen) {
- bestrlen = rlen;
- bestrbno = rbno;
- bestflen = flen;
- bestfbno = fbno;
- if (rlen == args->maxlen)
- break;
- }
- }
- if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
- &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- rlen = bestrlen;
- rbno = bestrbno;
- flen = bestflen;
- fbno = bestfbno;
- }
- args->wasfromfl = 0;
- /*
- * Fix up the length.
- */
- args->len = rlen;
- if (rlen < args->minlen) {
- if (busy) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- trace_xfs_alloc_size_busy(args);
- xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
- goto restart;
- }
- goto out_nominleft;
- }
- xfs_alloc_fix_len(args);
-
- rlen = args->len;
- XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
- /*
- * Allocate and initialize a cursor for the by-block tree.
- */
- bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO);
- if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
- rbno, rlen, XFSA_FIXUP_CNT_OK)))
- goto error0;
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
- cnt_cur = bno_cur = NULL;
- args->len = rlen;
- args->agbno = rbno;
- XFS_WANT_CORRUPTED_GOTO(args->mp,
- args->agbno + args->len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
- error0);
- trace_xfs_alloc_size_done(args);
- return 0;
-
-error0:
- trace_xfs_alloc_size_error(args);
- if (cnt_cur)
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
- if (bno_cur)
- xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
- return error;
-
-out_nominleft:
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- trace_xfs_alloc_size_nominleft(args);
- args->agbno = NULLAGBLOCK;
- return 0;
-}
-
/*
* Free the extent starting at agno/bno for length.
*/
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a11aac4505ea..519bf7d104ba 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1638,11 +1638,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_ag_error);
DEFINE_ALLOC_EVENT(xfs_alloc_ag_noentry);
DEFINE_ALLOC_EVENT(xfs_alloc_ag_busy);
DEFINE_ALLOC_EVENT(xfs_alloc_cur);
-DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
-DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
-DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
-DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
--
2.17.2
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v2 09/11] xfs: replace small allocation logic with agfl only logic
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
` (7 preceding siblings ...)
2019-05-22 18:05 ` [PATCH v2 08/11] xfs: refactor by-size " Brian Foster
@ 2019-05-22 18:05 ` Brian Foster
2019-05-22 18:05 ` [PATCH v2 10/11] xfs: refactor successful AG allocation accounting code Brian Foster
` (3 subsequent siblings)
12 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2019-05-22 18:05 UTC (permalink / raw)
To: linux-xfs
Now that the various extent allocation modes have been reworked,
there are no more users of a large portion of
xfs_alloc_ag_vextent_small(). Remove the unnecessary record handling
logic, refactor and rename this function to a simple AGFL allocation
helper and simplify the interface.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_alloc.c | 80 ++++++++++-----------------------------
fs/xfs/xfs_trace.h | 8 ++--
2 files changed, 24 insertions(+), 64 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 6b8bd8f316cb..24485687e2ae 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1228,46 +1228,28 @@ xfs_alloc_ag_vextent_cur(
}
/*
- * Deal with the case where only small freespaces remain. Either return the
- * contents of the last freespace record, or allocate space from the freelist if
- * there is nothing in the tree.
+ * Attempt to allocate from the AGFL. This is a last resort when no other free
+ * space is available.
*/
-STATIC int /* error */
-xfs_alloc_ag_vextent_small(
- struct xfs_alloc_arg *args, /* allocation argument structure */
- struct xfs_btree_cur *ccur, /* optional by-size cursor */
- xfs_agblock_t *fbnop, /* result block number */
- xfs_extlen_t *flenp, /* result length */
- int *stat) /* status: 0-freelist, 1-normal/none */
+STATIC int
+xfs_alloc_ag_vextent_agfl(
+ struct xfs_alloc_arg *args) /* allocation argument structure */
{
- int error = 0;
+ int error;
xfs_agblock_t fbno = NULLAGBLOCK;
- xfs_extlen_t flen = 0;
- int i = 0;
/*
- * If a cntbt cursor is provided, try to allocate the largest record in
- * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
- * allocation. Make sure to respect minleft even when pulling from the
- * freelist.
+ * The AGFL can only perform unaligned, single block allocations. Also
+ * make sure this isn't an allocation for the AGFL itself and to respect
+ * minleft before we take a block.
*/
- if (ccur)
- error = xfs_btree_decrement(ccur, 0, &i);
- if (error)
- goto error;
- if (i) {
- error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
- if (error)
- goto error;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
- goto out;
- }
-
if (args->minlen != 1 || args->alignment != 1 ||
args->resv == XFS_AG_RESV_AGFL ||
(be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
- args->minleft))
+ args->minleft)) {
+ trace_xfs_alloc_agfl_notenough(args);
goto out;
+ }
error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
if (error)
@@ -1289,13 +1271,9 @@ xfs_alloc_ag_vextent_small(
}
xfs_trans_binval(args->tp, bp);
}
- *fbnop = args->agbno = fbno;
- *flenp = args->len = 1;
XFS_WANT_CORRUPTED_GOTO(args->mp,
fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
error);
- args->wasfromfl = 1;
- trace_xfs_alloc_small_freelist(args);
/*
* If we're feeding an AGFL block to something that doesn't live in the
@@ -1306,26 +1284,18 @@ xfs_alloc_ag_vextent_small(
if (error)
goto error;
- *stat = 0;
- return 0;
-
out:
- /*
- * Can't do the allocation, give up.
- */
- if (flen < args->minlen) {
- args->agbno = NULLAGBLOCK;
- trace_xfs_alloc_small_notenough(args);
- flen = 0;
+ args->agbno = fbno;
+ if (fbno != NULLAGBLOCK) {
+ args->wasfromfl = 1;
+ args->len = 1;
}
- *fbnop = fbno;
- *flenp = flen;
- *stat = 1;
- trace_xfs_alloc_small_done(args);
+
+ trace_xfs_alloc_agfl_done(args);
return 0;
error:
- trace_xfs_alloc_small_error(args);
+ trace_xfs_alloc_agfl_error(args);
return error;
}
@@ -1342,8 +1312,6 @@ xfs_alloc_ag_vextent_type(
struct xfs_alloc_cur acur = {0,};
int error; /* error code */
int i; /* result code, temporary */
- xfs_agblock_t bno; /* start bno of left side entry */
- xfs_extlen_t len; /* length of left side entry */
/* handle unitialized agbno range so caller doesn't have to */
if (!args->min_agbno && !args->max_agbno)
@@ -1387,17 +1355,11 @@ xfs_alloc_ag_vextent_type(
/*
* We get here if we can't satisfy minlen or the trees are
- * empty. We don't pass a cursor so this returns an AGFL block
- * (i == 0) or nothing.
+ * empty.
*/
- error = xfs_alloc_ag_vextent_small(args, NULL, &bno, &len, &i);
+ error = xfs_alloc_ag_vextent_agfl(args);
if (error)
goto out;
- ASSERT(i == 0 || (i && len == 0));
- trace_xfs_alloc_ag_noentry(args);
-
- args->agbno = bno;
- args->len = len;
}
out:
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 519bf7d104ba..b3ff29325b61 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1635,14 +1635,12 @@ DEFINE_EVENT(xfs_alloc_class, name, \
DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
DEFINE_ALLOC_EVENT(xfs_alloc_ag_error);
-DEFINE_ALLOC_EVENT(xfs_alloc_ag_noentry);
DEFINE_ALLOC_EVENT(xfs_alloc_ag_busy);
DEFINE_ALLOC_EVENT(xfs_alloc_cur);
DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
-DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
-DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
-DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
-DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_agfl_notenough);
+DEFINE_ALLOC_EVENT(xfs_alloc_agfl_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_agfl_error);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
--
2.17.2
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v2 10/11] xfs: refactor successful AG allocation accounting code
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
` (8 preceding siblings ...)
2019-05-22 18:05 ` [PATCH v2 09/11] xfs: replace small allocation logic with agfl only logic Brian Foster
@ 2019-05-22 18:05 ` Brian Foster
2019-05-22 18:05 ` [PATCH v2 11/11] xfs: condense high level AG allocation functions Brian Foster
` (2 subsequent siblings)
12 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2019-05-22 18:05 UTC (permalink / raw)
To: linux-xfs
The higher level allocation code is unnecessarily split across
xfs_alloc_ag_vextent() and xfs_alloc_ag_vextent_type(). In
preparation for condensing this code, factor out the AG accounting
bits and move the caller down after the generic allocation structure
and function definitions to pick them up without the need for
declarations. No functional changes.
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
fs/xfs/libxfs/xfs_alloc.c | 75 +++++++++++++++++++++++----------------
1 file changed, 45 insertions(+), 30 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 24485687e2ae..3b0cdb8346c9 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1369,6 +1369,48 @@ xfs_alloc_ag_vextent_type(
return error;
}
+/*
+ * Various AG accounting updates for a successful allocation. This includes
+ * updating the rmapbt, AG free block accounting and AG reservation accounting.
+ */
+STATIC int
+xfs_alloc_ag_vextent_accounting(
+ struct xfs_alloc_arg *args)
+{
+ int error = 0;
+
+ ASSERT(args->agbno != NULLAGBLOCK);
+ ASSERT(args->len >= args->minlen);
+ ASSERT(args->len <= args->maxlen);
+ ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
+ ASSERT(args->agbno % args->alignment == 0);
+
+ /* if not file data, insert new block into the reverse map btree */
+ if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
+ error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
+ args->agbno, args->len, &args->oinfo);
+ if (error)
+ return error;
+ }
+
+ if (!args->wasfromfl) {
+ error = xfs_alloc_update_counters(args->tp, args->pag,
+ args->agbp,
+ -((long)(args->len)));
+ if (error)
+ return error;
+
+ ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
+ args->agbno, args->len));
+ }
+
+ xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
+
+ XFS_STATS_INC(args->mp, xs_allocx);
+ XFS_STATS_ADD(args->mp, xs_allocb, args->len);
+ return error;
+}
+
/*
* Allocate a variable extent in the allocation group agno.
* Type and bno are used to determine where in the allocation group the
@@ -1403,38 +1445,11 @@ xfs_alloc_ag_vextent(
ASSERT(0);
/* NOTREACHED */
}
-
- if (error || args->agbno == NULLAGBLOCK)
+ if (error)
return error;
- ASSERT(args->len >= args->minlen);
- ASSERT(args->len <= args->maxlen);
- ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
- ASSERT(args->agbno % args->alignment == 0);
-
- /* if not file data, insert new block into the reverse map btree */
- if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
- error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
- args->agbno, args->len, &args->oinfo);
- if (error)
- return error;
- }
-
- if (!args->wasfromfl) {
- error = xfs_alloc_update_counters(args->tp, args->pag,
- args->agbp,
- -((long)(args->len)));
- if (error)
- return error;
-
- ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
- args->agbno, args->len));
- }
-
- xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
-
- XFS_STATS_INC(args->mp, xs_allocx);
- XFS_STATS_ADD(args->mp, xs_allocb, args->len);
+ if (args->agbno != NULLAGBLOCK)
+ error = xfs_alloc_ag_vextent_accounting(args);
return error;
}
--
2.17.2
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v2 11/11] xfs: condense high level AG allocation functions
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
` (9 preceding siblings ...)
2019-05-22 18:05 ` [PATCH v2 10/11] xfs: refactor successful AG allocation accounting code Brian Foster
@ 2019-05-22 18:05 ` Brian Foster
2019-05-23 1:56 ` [PATCH v2 00/11] xfs: rework extent allocation Dave Chinner
2019-06-21 15:18 ` Darrick J. Wong
12 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2019-05-22 18:05 UTC (permalink / raw)
To: linux-xfs
The higher level allocation code is unnecessarily split across
xfs_alloc_ag_vextent() and xfs_alloc_ag_vextent_type(). Fold the
latter into the former to avoid the unnecessary level of indirection
and reduce the overall amount of code. No functional changes.
Signed-off-by: Brian Foster <bfoster@redhat.com>
---
fs/xfs/libxfs/xfs_alloc.c | 158 ++++++++++++++++----------------------
1 file changed, 66 insertions(+), 92 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3b0cdb8346c9..434e5e874436 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1299,76 +1299,6 @@ xfs_alloc_ag_vextent_agfl(
return error;
}
-/*
- * Allocate a variable extent near bno in the allocation group agno.
- * Extent's length (returned in len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
- */
-STATIC int /* error */
-xfs_alloc_ag_vextent_type(
- xfs_alloc_arg_t *args) /* allocation argument structure */
-{
- struct xfs_alloc_cur acur = {0,};
- int error; /* error code */
- int i; /* result code, temporary */
-
- /* handle unitialized agbno range so caller doesn't have to */
- if (!args->min_agbno && !args->max_agbno)
- args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
- ASSERT(args->min_agbno <= args->max_agbno);
-
- /* clamp agbno to the range if it's outside */
- if (args->agbno < args->min_agbno)
- args->agbno = args->min_agbno;
- if (args->agbno > args->max_agbno)
- args->agbno = args->max_agbno;
-
-restart:
- /* set up cursors and allocation tracking structure based on args */
- error = xfs_alloc_cur_setup(args, &acur);
- if (error)
- goto out;
-
- if (args->type == XFS_ALLOCTYPE_THIS_AG)
- error = xfs_alloc_ag_vextent_size(args, &acur, &i);
- else
- error = xfs_alloc_ag_vextent_cur(args, &acur, &i);
- if (error)
- goto out;
-
- /*
- * If we got an extent, finish the allocation. Otherwise check for busy
- * extents and retry or attempt a small allocation.
- */
- if (i) {
- error = xfs_alloc_cur_finish(args, &acur);
- if (error)
- goto out;
- } else {
- if (acur.busy) {
- trace_xfs_alloc_ag_busy(args);
- xfs_extent_busy_flush(args->mp, args->pag,
- acur.busy_gen);
- goto restart;
- }
-
- /*
- * We get here if we can't satisfy minlen or the trees are
- * empty.
- */
- error = xfs_alloc_ag_vextent_agfl(args);
- if (error)
- goto out;
- }
-
-out:
- xfs_alloc_cur_close(&acur, error);
- if (error)
- trace_xfs_alloc_ag_error(args);
- return error;
-}
-
/*
* Various AG accounting updates for a successful allocation. This includes
* updating the rmapbt, AG free block accounting and AG reservation accounting.
@@ -1412,44 +1342,88 @@ xfs_alloc_ag_vextent_accounting(
}
/*
- * Allocate a variable extent in the allocation group agno.
- * Type and bno are used to determine where in the allocation group the
- * extent will start.
- * Extent's length (returned in *len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ * Allocate a variable extent in the allocation group agno. Type and bno are
+ * used to determine where in the allocation group the extent will start.
+ * Extent's length (returned in *len) will be between minlen and maxlen, and of
+ * the form k * prod + mod unless there's nothing that large. Return the
+ * starting a.g. block, or NULLAGBLOCK if we can't do it.
*/
-STATIC int /* error */
+STATIC int
xfs_alloc_ag_vextent(
- xfs_alloc_arg_t *args) /* argument structure for allocation */
+ struct xfs_alloc_arg *args) /* argument structure for allocation */
{
- int error=0;
+ struct xfs_alloc_cur acur = {0,};
+ int error;
+ int i;
ASSERT(args->minlen > 0);
ASSERT(args->maxlen > 0);
ASSERT(args->minlen <= args->maxlen);
ASSERT(args->mod < args->prod);
ASSERT(args->alignment > 0);
+ ASSERT(args->type == XFS_ALLOCTYPE_THIS_AG ||
+ args->type == XFS_ALLOCTYPE_NEAR_BNO ||
+ args->type == XFS_ALLOCTYPE_THIS_BNO);
+ ASSERT(args->min_agbno <= args->max_agbno);
+
+ args->wasfromfl = 0;
+
+ /* handle unitialized agbno range so caller doesn't have to */
+ if (!args->min_agbno && !args->max_agbno)
+ args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
+
+ /* clamp agbno to the range if it's outside */
+ if (args->agbno < args->min_agbno)
+ args->agbno = args->min_agbno;
+ if (args->agbno > args->max_agbno)
+ args->agbno = args->max_agbno;
+
+restart:
+ /* set up cursors and allocation tracking structure based on args */
+ error = xfs_alloc_cur_setup(args, &acur);
+ if (error)
+ goto out;
+
+ if (args->type == XFS_ALLOCTYPE_THIS_AG)
+ error = xfs_alloc_ag_vextent_size(args, &acur, &i);
+ else
+ error = xfs_alloc_ag_vextent_cur(args, &acur, &i);
+ if (error)
+ goto out;
/*
- * Branch to correct routine based on the type.
+ * If we got an extent, finish the allocation. Otherwise check for busy
+ * extents and retry or attempt a small allocation.
*/
- args->wasfromfl = 0;
- switch (args->type) {
- case XFS_ALLOCTYPE_THIS_AG:
- case XFS_ALLOCTYPE_NEAR_BNO:
- case XFS_ALLOCTYPE_THIS_BNO:
- error = xfs_alloc_ag_vextent_type(args);
- break;
- default:
- ASSERT(0);
- /* NOTREACHED */
+ if (i) {
+ error = xfs_alloc_cur_finish(args, &acur);
+ if (error)
+ goto out;
+ } else {
+ if (acur.busy) {
+ trace_xfs_alloc_ag_busy(args);
+ xfs_extent_busy_flush(args->mp, args->pag,
+ acur.busy_gen);
+ goto restart;
+ }
+
+ /*
+ * We get here if we can't satisfy minlen or the trees are
+ * empty. We don't pass a cursor so this returns an AGFL block
+ * (i == 0) or nothing.
+ */
+ error = xfs_alloc_ag_vextent_agfl(args);
+ if (error)
+ goto out;
}
- if (error)
- return error;
if (args->agbno != NULLAGBLOCK)
error = xfs_alloc_ag_vextent_accounting(args);
+
+out:
+ xfs_alloc_cur_close(&acur, error);
+ if (error)
+ trace_xfs_alloc_ag_error(args);
return error;
}
--
2.17.2
^ permalink raw reply related [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
` (10 preceding siblings ...)
2019-05-22 18:05 ` [PATCH v2 11/11] xfs: condense high level AG allocation functions Brian Foster
@ 2019-05-23 1:56 ` Dave Chinner
2019-05-23 12:55 ` Brian Foster
2019-06-21 15:18 ` Darrick J. Wong
12 siblings, 1 reply; 29+ messages in thread
From: Dave Chinner @ 2019-05-23 1:56 UTC (permalink / raw)
To: Brian Foster; +Cc: linux-xfs
On Wed, May 22, 2019 at 02:05:35PM -0400, Brian Foster wrote:
> Hi all,
>
> This is v2 of the extent allocation rework series. The changes in this
> version are mostly associated with code factoring, based on feedback to
> v1. The small mode helper refactoring has been isolated and pulled to
> the start of the series. The active flag that necessitated the btree
> cursor container structure has been pushed into the xfs_btree_cur
> private area. The resulting high level allocation code in
> xfs_ag_alloc_vextent() has been cleaned up to remove an unnecessary
> level of abstraction. Finally, there are various minor cleanups and
> fixes.
>
> On the testing front, I've run a couple more filebench oriented tests
> since v1. The first is a high load, large filesystem, parallel file
> write+fsync test to try and determine whether the modified near mode
> allocation algorithm resulted in larger latencies in the common
> (non-fragmented) case. The results show comparable latencies, though the
> updated algorithm has a slightly faster overall runtime for whatever
> reason.
Probably indicative that over so many allocations, saving a few
microseconds of CPU time here and there adds up. That's also a fairly
good indication that the IO behaviour hasn't dramatically changed
between algorithms - we're not adding or removing a huge number of
seeks to the workload....
> The second is another filebench test (but with a smaller fileset against
> a smaller filesystem), but with the purpose of measuring "locality
> effectiveness" of the updated algorithm via post-test analysis of the
> resulting/populated filesystem. I've been thinking a bit about how to
> test for locality since starting on this series and ultimately came up
> with the following, fairly crude heuristic: track and compare the worst
> locality allocation for each regular file inode in the fs.
OK, that's pretty crude :P
> This
> essentially locates the most distant extent for each inode, tracks the
> delta from that extent to the inode location on disk and calculates the
> average worst case delta across the entire set of regular files. The
> results show that the updated algorithm provides a comparable level of
> locality to the existing algorithm.
The problem with this is that worse case locality isn't a
particularly useful measure. In general, when you have allocator
contention it occurs on the AGF locks and so the allocator skips to
the next AG it can lock. That means if we have 32 AGs and 33
allocations in progress at once, the AG that it chosen for
allocation is going to be essentially random. This means worst case
allocation locality is always going to be "ag skip" distances and so
the jumps between AGs are going to largely dominate the measured
locality distances.
In this case, 7TB, 32AGs = ~220GB per AG, so an AG skip will be
around 220 * 2^30 / 2^9 = ~460m sectors and:
> - baseline - min: 8 max: 568752250 median: 434794.5 mean: 11446328
> - test - min: 33 max: 568402234 median: 437405.5 mean: 11752963
> - by-size only - min: 33 max: 568593146 median: 784805 mean: 11912300
max are all >460m sectors and so are AG skip distances.
However, the changes you've made affect locality for allocations
_within_ an AG, not across the filesystem, and so anything that
skips to another AG really needs to be measured differently.
i.e. what we really need to measure here is "how close to target did
we get?" and for extending writes the target is always the AGBNO of
the end of the last extent.
The inode itself is only used as the target for the first extent, so
using it as the only distance comparison ignores the fact we try to
allocate as close to the end of the last extent as possible, not as
close to the inode as possible. Hence once a file has jumped AG, it
will stay in the new AG and not return to the original AG the inode
is in. This means that once the file data changes locality, it tries
to keep that same locality for the next data that is written, not
force another seek back to the original location.
So, AFAICT, the measure of locality we should be using to evaluate
the impact to locality of the new algorithm is the distance between
sequential extents in a file allocated within the same AG, not the
worst case distance from the inode....
Cheers,
Dave.
(*) Which, in reality, we really should reset because once we jump
AG we have no locality target and so should allow the full AG to be
considered. This "didn't reset target" issue is something I suspect
leads to the infamous "backwards allocation for sequential writes"
problems...
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-05-23 1:56 ` [PATCH v2 00/11] xfs: rework extent allocation Dave Chinner
@ 2019-05-23 12:55 ` Brian Foster
2019-05-23 22:15 ` Dave Chinner
0 siblings, 1 reply; 29+ messages in thread
From: Brian Foster @ 2019-05-23 12:55 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Thu, May 23, 2019 at 11:56:59AM +1000, Dave Chinner wrote:
> On Wed, May 22, 2019 at 02:05:35PM -0400, Brian Foster wrote:
> > Hi all,
> >
> > This is v2 of the extent allocation rework series. The changes in this
> > version are mostly associated with code factoring, based on feedback to
> > v1. The small mode helper refactoring has been isolated and pulled to
> > the start of the series. The active flag that necessitated the btree
> > cursor container structure has been pushed into the xfs_btree_cur
> > private area. The resulting high level allocation code in
> > xfs_ag_alloc_vextent() has been cleaned up to remove an unnecessary
> > level of abstraction. Finally, there are various minor cleanups and
> > fixes.
> >
> > On the testing front, I've run a couple more filebench oriented tests
> > since v1. The first is a high load, large filesystem, parallel file
> > write+fsync test to try and determine whether the modified near mode
> > allocation algorithm resulted in larger latencies in the common
> > (non-fragmented) case. The results show comparable latencies, though the
> > updated algorithm has a slightly faster overall runtime for whatever
> > reason.
>
> Probably indicative that over so many allocations, saving a few
> microseconds of CPU time here and there adds up. That's also a fairly
> good indication that the IO behaviour hasn't dramatically changed
> between algorithms - we're not adding or removing a huge number of
> seeks to the workload....
>
Makes sense. The goal here (and purpose for the higher level testing) is
basically to confirm this doesn't break/regress the common
(non-fragmented) allocation scenario. I suppose that's a bonus if we
find some incremental speedups along the way..
> > The second is another filebench test (but with a smaller fileset against
> > a smaller filesystem), but with the purpose of measuring "locality
> > effectiveness" of the updated algorithm via post-test analysis of the
> > resulting/populated filesystem. I've been thinking a bit about how to
> > test for locality since starting on this series and ultimately came up
> > with the following, fairly crude heuristic: track and compare the worst
> > locality allocation for each regular file inode in the fs.
>
> OK, that's pretty crude :P
>
Yeah, this was just a start and I figured it might generate some
feedback... ;)
> > This
> > essentially locates the most distant extent for each inode, tracks the
> > delta from that extent to the inode location on disk and calculates the
> > average worst case delta across the entire set of regular files. The
> > results show that the updated algorithm provides a comparable level of
> > locality to the existing algorithm.
>
> The problem with this is that worse case locality isn't a
> particularly useful measure. In general, when you have allocator
> contention it occurs on the AGF locks and so the allocator skips to
> the next AG it can lock. That means if we have 32 AGs and 33
> allocations in progress at once, the AG that it chosen for
> allocation is going to be essentially random. This means worst case
> allocation locality is always going to be "ag skip" distances and so
> the jumps between AGs are going to largely dominate the measured
> locality distances.
>
Good point. I was thinking about rerunning this with agcount=1 (with a
lighter workload) to isolate the analysis to a single AG, but wanted to
get this on the list for feedback given the time it takes populate the
fs and whatnot.
> In this case, 7TB, 32AGs = ~220GB per AG, so an AG skip will be
> around 220 * 2^30 / 2^9 = ~460m sectors and:
>
> > - baseline - min: 8 max: 568752250 median: 434794.5 mean: 11446328
> > - test - min: 33 max: 568402234 median: 437405.5 mean: 11752963
> > - by-size only - min: 33 max: 568593146 median: 784805 mean: 11912300
>
> max are all >460m sectors and so are AG skip distances.
>
Though it is interesting that the average and median are within that
~460m sector delta. I think that means this is at least catching some
information on intra-AG locality as many of these files might not have
had to jump AGs. Taking a look at the dataset, this could be because the
majority of these files are on the smaller side and consist of one or
two extents. There's also plenty of RAM (64GB) on the box I happened to
use.
For reference, the file sizes in the set are defined by the following
filebench mapping:
{{50, 4k, 1m},
{35, 10m, 100m},
{10, 500m, 1g},
{5, 1g, 8g}
}
... where the first value is the percentage and the next two are a size
range. E.g., 50% of files are 4k-1m, 35% are 10m-100m, etc. I can
obviously tweak this however necessary to provide most useful results or
test different distributions.
But while I don't think the current number is completely bogus, I agree
that it's diluted by those larger files that do happen to jump AGs and
thus it is probably missing critical information about file extension
allocation locality.
> However, the changes you've made affect locality for allocations
> _within_ an AG, not across the filesystem, and so anything that
> skips to another AG really needs to be measured differently.
>
> i.e. what we really need to measure here is "how close to target did
> we get?" and for extending writes the target is always the AGBNO of
> the end of the last extent.
>
> The inode itself is only used as the target for the first extent, so
> using it as the only distance comparison ignores the fact we try to
> allocate as close to the end of the last extent as possible, not as
> close to the inode as possible. Hence once a file has jumped AG, it
> will stay in the new AG and not return to the original AG the inode
> is in. This means that once the file data changes locality, it tries
> to keep that same locality for the next data that is written, not
> force another seek back to the original location.
>
Ok, I was thinking that locality is always based on inode location. I
see that xfs_bmap_btalloc() assigns ap->blkno (which makes its way to
args.fsbno/agbno) based on the inode, but I missed the
xfs_bmap_adjacent() call right after that which overrides ap->blkno
based on the previous extent, if applicable.
So this means that the worst case locality based on inode location is
actually invalid for (sequentially written) multi-extent files because
once we create a discontiguity, we're not measuring against the locality
target that was actually used by the algorithm (even within the same
AG).
> So, AFAICT, the measure of locality we should be using to evaluate
> the impact to locality of the new algorithm is the distance between
> sequential extents in a file allocated within the same AG, not the
> worst case distance from the inode....
>
Yeah, makes sense. I created metadumps of each filesystem created for
this test in anticipation of tweaks to the post-processing heuristic.
I assume we still want to fold this measurement up into some mean/median
locality value for the overall filesystem for comparison purposes, but
how would you prefer to see that tracked on a per-inode basis? I could
still track the worst case stride from one extent to the next within an
inode (provided they sit in the same AG), or we could do something like
track the average stride for each inode, or average stride in general
across all intra-AG extents. Hmmm.. I suppose if I had a script that
just dumped every applicable stride/delta value for an inode, I could
dump all of those numbers into a file and we can process it from there..
I'll play around with this some more. Thanks for the feedback!
> Cheers,
>
> Dave.
>
> (*) Which, in reality, we really should reset because once we jump
> AG we have no locality target and so should allow the full AG to be
> considered. This "didn't reset target" issue is something I suspect
> leads to the infamous "backwards allocation for sequential writes"
> problems...
>
I think this is something that's handled at a higher level. In the
nullfb case at least, we use the XFS_ALLOCTYPE_START_BNO allocation mode
which is what allows us to iterate AGs. We start with a near mode
allocation in the target AG. If that fails, xfs_alloc_vextent() switches
over to XFS_ALLOCTYPE_THIS_AG for the remaining AGs. This lands in
xfs_alloc_ag_vextent_size() which ignores args.agbno even if it's set.
Brian
> --
> Dave Chinner
> david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-05-23 12:55 ` Brian Foster
@ 2019-05-23 22:15 ` Dave Chinner
2019-05-24 12:00 ` Brian Foster
0 siblings, 1 reply; 29+ messages in thread
From: Dave Chinner @ 2019-05-23 22:15 UTC (permalink / raw)
To: Brian Foster; +Cc: linux-xfs
On Thu, May 23, 2019 at 08:55:35AM -0400, Brian Foster wrote:
> On Thu, May 23, 2019 at 11:56:59AM +1000, Dave Chinner wrote:
> > On Wed, May 22, 2019 at 02:05:35PM -0400, Brian Foster wrote:
> > > Hi all,
> > >
> > > This is v2 of the extent allocation rework series. The changes in this
> > > version are mostly associated with code factoring, based on feedback to
> > > v1. The small mode helper refactoring has been isolated and pulled to
> > > the start of the series. The active flag that necessitated the btree
> > > cursor container structure has been pushed into the xfs_btree_cur
> > > private area. The resulting high level allocation code in
> > > xfs_ag_alloc_vextent() has been cleaned up to remove an unnecessary
> > > level of abstraction. Finally, there are various minor cleanups and
> > > fixes.
> > >
> > > On the testing front, I've run a couple more filebench oriented tests
> > > since v1. The first is a high load, large filesystem, parallel file
> > > write+fsync test to try and determine whether the modified near mode
> > > allocation algorithm resulted in larger latencies in the common
> > > (non-fragmented) case. The results show comparable latencies, though the
> > > updated algorithm has a slightly faster overall runtime for whatever
> > > reason.
> >
> > Probably indicative that over so many allocations, saving a few
> > microseconds of CPU time here and there adds up. That's also a fairly
> > good indication that the IO behaviour hasn't dramatically changed
> > between algorithms - we're not adding or removing a huge number of
> > seeks to the workload....
> >
>
> Makes sense. The goal here (and purpose for the higher level testing) is
> basically to confirm this doesn't break/regress the common
> (non-fragmented) allocation scenario. I suppose that's a bonus if we
> find some incremental speedups along the way..
*nod*
[snip]
> > In this case, 7TB, 32AGs = ~220GB per AG, so an AG skip will be
> > around 220 * 2^30 / 2^9 = ~460m sectors and:
> >
> > > - baseline - min: 8 max: 568752250 median: 434794.5 mean: 11446328
> > > - test - min: 33 max: 568402234 median: 437405.5 mean: 11752963
> > > - by-size only - min: 33 max: 568593146 median: 784805 mean: 11912300
> >
> > max are all >460m sectors and so are AG skip distances.
> >
>
> Though it is interesting that the average and median are within that
> ~460m sector delta. I think that means this is at least catching some
> information on intra-AG locality as many of these files might not have
> had to jump AGs.
[....]
> But while I don't think the current number is completely bogus, I agree
> that it's diluted by those larger files that do happen to jump AGs and
> thus it is probably missing critical information about file extension
> allocation locality.
*nod*
[....]
> > So, AFAICT, the measure of locality we should be using to evaluate
> > the impact to locality of the new algorithm is the distance between
> > sequential extents in a file allocated within the same AG, not the
> > worst case distance from the inode....
> >
>
> Yeah, makes sense. I created metadumps of each filesystem created for
> this test in anticipation of tweaks to the post-processing heuristic.
Nice :)
> I assume we still want to fold this measurement up into some mean/median
> locality value for the overall filesystem for comparison purposes, but
> how would you prefer to see that tracked on a per-inode basis? I could
> still track the worst case stride from one extent to the next within an
> inode (provided they sit in the same AG), or we could do something like
> track the average stride for each inode, or average stride in general
> across all intra-AG extents.
Histograms are probably the best way to demonstrate the general
distribution of the data set. Average/median don't really convey
much other than whether the dataset is skewed to one end or another,
while histograms will give us a good indication in the change of
"distance from target" over a wide range of the filesystem. It can
even incorporate "skipped AG" as a bucket to indicate how often
that is occurring and whether the change in algorithms
increases or decreases the occurrence of that.
FWIW, I suspect a histogram that describes "extent size vs locality
of next extent" will give us an idea of whether small or large
allocations have worse locality. I'd also expect a measure like this
to give insight into how badly free space fragmentation is affecting
the filesystem, as it will tend to push the extent size distribution
towards the smaller end of the scale....
> Hmmm.. I suppose if I had a script that
> just dumped every applicable stride/delta value for an inode, I could
> dump all of those numbers into a file and we can process it from there..
See how the freesp commands work in xfs_db - they just generate a
set of {offset, size} tuples that are then bucketted appropriately.
This is probably the best way to do this at the moment - have xfs_db
walk the inode BMBTs outputting something like {extent size,
distance to next extent} tuples and everything else falls out from
how we bucket that information.
> > (*) Which, in reality, we really should reset because once we jump
> > AG we have no locality target and so should allow the full AG to be
> > considered. This "didn't reset target" issue is something I suspect
> > leads to the infamous "backwards allocation for sequential writes"
> > problems...
>
> I think this is something that's handled at a higher level. In the
> nullfb case at least, we use the XFS_ALLOCTYPE_START_BNO allocation mode
> which is what allows us to iterate AGs.
The nullfb case isn't the problem here - I think the problem comes
when we fail the THIS_BNO/NEAR_BNO allocation attempts in
xfs_bmap_btalloc() and then fall back to XFS_ALLOCTYPE_START_BNO
with the same fsbno target.
> We start with a near mode
> allocation in the target AG. If that fails, xfs_alloc_vextent() switches
> over to XFS_ALLOCTYPE_THIS_AG for the remaining AGs. This lands in
> xfs_alloc_ag_vextent_size() which ignores args.agbno even if it's set.
Hmmm. I think you just pointed out a bug in this code. The initial
NEARBNO alloc sets ->min_agbno and ->max_agbno based on the
args->agbno, which xfs_alloc_compute_aligned then uses to determine
if the found free extent can be used. When we immediately fall back
to XFS_ALLOCTYPE_THIS_AG allocation, we don't reset
min_agbno/max_agbno, and so even though we ignore the args->agbno
target that is set when selecting a free extent, we still call
xfs_alloc_compute_aligned() to determine if it is appropriate to
use.
I think the bug here is that xfs_alloc_compute_aligned() applies the
NEARBNO args->min_agbno to the free extent found by the THIS_AG
allocation, and so if the large free extent in the new AG overlaps
min_agbno it selects the higher up part of the free extent that
starts at min_agbno, rather than using the start of the free
extent...
Of course, I could be missing something obvious (still haven't
finished my morning coffee!), so it's worth checking I read the code
properly....
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-05-23 22:15 ` Dave Chinner
@ 2019-05-24 12:00 ` Brian Foster
2019-05-25 22:43 ` Dave Chinner
0 siblings, 1 reply; 29+ messages in thread
From: Brian Foster @ 2019-05-24 12:00 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Fri, May 24, 2019 at 08:15:52AM +1000, Dave Chinner wrote:
> On Thu, May 23, 2019 at 08:55:35AM -0400, Brian Foster wrote:
> > On Thu, May 23, 2019 at 11:56:59AM +1000, Dave Chinner wrote:
> > > On Wed, May 22, 2019 at 02:05:35PM -0400, Brian Foster wrote:
> > > > Hi all,
> > > >
> > > > This is v2 of the extent allocation rework series. The changes in this
> > > > version are mostly associated with code factoring, based on feedback to
> > > > v1. The small mode helper refactoring has been isolated and pulled to
> > > > the start of the series. The active flag that necessitated the btree
> > > > cursor container structure has been pushed into the xfs_btree_cur
> > > > private area. The resulting high level allocation code in
> > > > xfs_ag_alloc_vextent() has been cleaned up to remove an unnecessary
> > > > level of abstraction. Finally, there are various minor cleanups and
> > > > fixes.
> > > >
> > > > On the testing front, I've run a couple more filebench oriented tests
> > > > since v1. The first is a high load, large filesystem, parallel file
> > > > write+fsync test to try and determine whether the modified near mode
> > > > allocation algorithm resulted in larger latencies in the common
> > > > (non-fragmented) case. The results show comparable latencies, though the
> > > > updated algorithm has a slightly faster overall runtime for whatever
> > > > reason.
> > >
> > > Probably indicative that over so many allocations, saving a few
> > > microseconds of CPU time here and there adds up. That's also a fairly
> > > good indication that the IO behaviour hasn't dramatically changed
> > > between algorithms - we're not adding or removing a huge number of
> > > seeks to the workload....
> > >
> >
> > Makes sense. The goal here (and purpose for the higher level testing) is
> > basically to confirm this doesn't break/regress the common
> > (non-fragmented) allocation scenario. I suppose that's a bonus if we
> > find some incremental speedups along the way..
>
> *nod*
>
> [snip]
>
>
> > > In this case, 7TB, 32AGs = ~220GB per AG, so an AG skip will be
> > > around 220 * 2^30 / 2^9 = ~460m sectors and:
> > >
> > > > - baseline - min: 8 max: 568752250 median: 434794.5 mean: 11446328
> > > > - test - min: 33 max: 568402234 median: 437405.5 mean: 11752963
> > > > - by-size only - min: 33 max: 568593146 median: 784805 mean: 11912300
> > >
> > > max are all >460m sectors and so are AG skip distances.
> > >
> >
> > Though it is interesting that the average and median are within that
> > ~460m sector delta. I think that means this is at least catching some
> > information on intra-AG locality as many of these files might not have
> > had to jump AGs.
>
> [....]
>
> > But while I don't think the current number is completely bogus, I agree
> > that it's diluted by those larger files that do happen to jump AGs and
> > thus it is probably missing critical information about file extension
> > allocation locality.
>
> *nod*
>
> [....]
>
> > > So, AFAICT, the measure of locality we should be using to evaluate
> > > the impact to locality of the new algorithm is the distance between
> > > sequential extents in a file allocated within the same AG, not the
> > > worst case distance from the inode....
> > >
> >
> > Yeah, makes sense. I created metadumps of each filesystem created for
> > this test in anticipation of tweaks to the post-processing heuristic.
>
> Nice :)
>
> > I assume we still want to fold this measurement up into some mean/median
> > locality value for the overall filesystem for comparison purposes, but
> > how would you prefer to see that tracked on a per-inode basis? I could
> > still track the worst case stride from one extent to the next within an
> > inode (provided they sit in the same AG), or we could do something like
> > track the average stride for each inode, or average stride in general
> > across all intra-AG extents.
>
> Histograms are probably the best way to demonstrate the general
> distribution of the data set. Average/median don't really convey
> much other than whether the dataset is skewed to one end or another,
> while histograms will give us a good indication in the change of
> "distance from target" over a wide range of the filesystem. It can
> even incorporate "skipped AG" as a bucket to indicate how often
> that is occurring and whether the change in algorithms
> increases/decreases the occurrence of that.
>
> FWIW, I suspect a histogram that describes "extent size vs locality
> of next extent" will give us an idea of whether small or large
> allocations have worse locality. I'd also expect a measure like this
> to give insight into how badly free space fragmentation is affecting
> the filesystem, as it will tend to push the extent size distribution
> towards the smaller end of the scale....
>
Ok. I actually attempted a histogram of the current dataset but the
random CLI utility I found didn't quite spit out what I expected. I was
probably using it wrong, but didn't dig much further into it..
> > Hmmm.. I suppose if I had a script that
> > just dumped every applicable stride/delta value for an inode, I could
> > dump all of those numbers into a file and we can process it from there..
>
> See how the freesp commands work in xfs_db - they just generate a
> set of {offset, size} tuples that are then bucketted appropriately.
> This is probably the best way to do this at the moment - have xfs_db
> walk the inode BMBTs outputting something like {extent size,
> distance to next extent} tuples and everything else falls out from
> how we bucket that information.
>
That sounds plausible. A bit more involved than what I'm currently
working with, but we do already have a blueprint for the scanning
implementation required to collect this data via the frag command.
Perhaps some of this code between the frag/freesp can be generalized and
reused. I'll take a closer look at it.
My only concern is I'd prefer to only go down this path as long as we
plan to land the associated command in xfs_db. So this approach suggests
to me that we add a "locality" command similar to frag/freesp that
presents the locality state of the fs. For now I'm only really concerned
with the data associated with known near mode allocations (i.e., such as
the extent size:distance relationship you've outlined above) so we can
evaluate these algorithmic changes, but this would be for fs devs only
so we could always expand on it down the road if we want to assess
different allocations. Hm?
>
> > > (*) Which, in reality, we really should reset because once we jump
> > > AG we have no locality target and so should allow the full AG to be
> > > considered. This "didn't reset target" issue is something I suspect
> > > leads to the infamous "backwards allocation for sequential writes"
> > > problems...
> >
> > I think this is something that's handled at a higher level. In the
> > nullfb case at least, we use the XFS_ALLOCTYPE_START_BNO allocation mode
> > which is what allows us to iterate AGs.
>
> The nullfb case isn't the problem here - I think the problem comes
> when we fail the THIS_BNO/NEAR_BNO allocation attempts in
> xfs_bmap_btalloc() and then fall back to XFS_ALLOCTYPE_START_BNO
> with the same fsbno target.
>
Ok, that's the retry where we restore minlen from maxlen back to 1. Note
that still depends on nullfb, but that's just context. I think the
majority of the time data allocations come into the allocator as the
first allocation in the transaction.
That aside, I've also seen that this particular retry is hotter than
you'd think just by glancing at the code due to the whole minlen =
maxlen thing in xfs_bmap_select_minlen() combined with potential size of
delalloc extents. See the patches I posted a couple weeks ago in this
area for example:
https://marc.info/?l=linux-xfs&m=155671950608062&w=2
... though I don't think they relate to the issue you're tracking here.
> > We start with a near mode
> > allocation in the target AG. If that fails, xfs_alloc_vextent() switches
> > over to XFS_ALLOCTYPE_THIS_AG for the remaining AGs. This lands in
> > xfs_alloc_ag_vextent_size() which ignores args.agbno even if it's set.
>
> Hmmm. I think you just pointed out a bug in this code. The initial
> NEARBNO alloc sets ->min_agbno and ->max_agbno based on the
> args->agbno, which xfs_alloc_compute_aligned then uses to determine
> if the found free extent can be used. When we immediately fall back
> to XFS_ALLOCTYPE_THIS_AG allocation, we don't reset
> min_agbno/max_agbno, and so even though we ignore the args->agbno
> target that is set when selecting a free extent, we still call
> xfs_alloc_compute_aligned() to determine if it is appropriate to
> use.
>
I don't see where the near mode alloc assigns ->[min|max]_agbno. The
only place I see ->min_agbno used is for sparse inode chunk allocation.
->max_agbno is assigned in the near mode alloc when min == 0, but it's
set to agsize so I'm guessing that this is just an initialization so the
checking logic works with zeroed out values from the caller.
Just to clarify.. what exactly is the reverse allocation problem you're
thinking about here? Logically increasing writes resulting in physically
decreasing allocations? If so, have we confirmed that writes are indeed
sequential when this occurs (obvious I guess, but worth asking).
I'm not really familiar with this problem so I've not thought about it,
but one random thought comes to mind: is there any chance of allocation
interlocking behind extent frees contributing to this behavior? We do
truncate files in reverse order an extent at a time and immediately
finish the dfops (to free the extent and busy it) before we unmap the
next piece. If allocations could interlock with such frees (or busy
clearing), I could see in theory how that might result in backwards
allocations, but I'm just handwaving and not sure that's possible in
practice.
Brian
> I think the bug here is that xfs_alloc_compute_aligned() applies the
> NEARBNO args->min_agbno to the free extent found by the THIS_AG
> allocation, and so if the large free extent in the new AG overlaps
> min_agbno it selects the higher up part of the free extent that
> starts at min_agbno, rather than using the start of the free
> extent...
>
> Of course, I could be missing something obvious (still haven't
> finished my morning coffee!), so it's worth checking I read the code
> properly....
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-05-24 12:00 ` Brian Foster
@ 2019-05-25 22:43 ` Dave Chinner
2019-05-31 17:11 ` Brian Foster
0 siblings, 1 reply; 29+ messages in thread
From: Dave Chinner @ 2019-05-25 22:43 UTC (permalink / raw)
To: Brian Foster; +Cc: linux-xfs
On Fri, May 24, 2019 at 08:00:18AM -0400, Brian Foster wrote:
> On Fri, May 24, 2019 at 08:15:52AM +1000, Dave Chinner wrote:
> > On Thu, May 23, 2019 at 08:55:35AM -0400, Brian Foster wrote:
> > > Hmmm.. I suppose if I had a script that
> > > just dumped every applicable stride/delta value for an inode, I could
> > > dump all of those numbers into a file and we can process it from there..
> >
> > See how the freesp commands work in xfs_db - they just generate a
> > set of {offset, size} tuples that are then bucketted appropriately.
> > This is probably the best way to do this at the moment - have xfs_db
> > walk the inode BMBTs outputting something like {extent size,
> > distance to next extent} tuples and everything else falls out from
> > how we bucket that information.
> >
>
> That sounds plausible. A bit more involved than what I'm currently
> working with, but we do already have a blueprint for the scanning
> implementation required to collect this data via the frag command.
> Perhaps some of this code between the frag/freesp can be generalized and
> reused. I'll take a closer look at it.
>
> My only concern is I'd prefer to only go down this path as long as we
> plan to land the associated command in xfs_db. So this approach suggests
> to me that we add a "locality" command similar to frag/freesp that
> presents the locality state of the fs. For now I'm only really concerned
> with the data associated with known near mode allocations (i.e., such as
> the extent size:distance relationship you've outlined above) so we can
> evaluate these algorithmic changes, but this would be for fs devs only
> so we could always expand on it down the road if we want to assess
> different allocations. Hm?
Yup, I'm needing to do similar analysis myself to determine how
quickly I'm aging the filesystem, so having the tool in xfs_db or
xfs_spaceman would be very useful.
FWIW, the tool I've just started writing will just use fallocate and
truncate to hammer the allocation code as hard and as quickly as
possible - I want to do accelerated aging of the filesystem, and so
being able to run tens to hundreds of thousands of free space
manipulations a second is the goal here....
> > > > (*) Which, in reality, we really should reset because once we jump
> > > > AG we have no locality target and so should allow the full AG to be
> > > > considered. This "didn't reset target" issue is something I suspect
> > > > leads to the infamous "backwards allocation for sequential writes"
> > > > problems...
> > >
> > > I think this is something that's handled at a higher level. In the
> > > nullfb case at least, we use the XFS_ALLOCTYPE_START_BNO allocation mode
> > > which is what allows us to iterate AGs.
> >
> > The nullfb case isn't the problem here - I think the problem comes
> > when we fail the THIS_BNO/NEAR_BNO allocation attempts in
> > xfs_bmap_btalloc() and then fall back to XFS_ALLOCTYPE_START_BNO
> > with the same fsbno target.
> >
>
> Ok, that's the retry where we restore minlen from maxlen back to 1. Note
> that still depends on nullfb, but that's just context. I think the
> majority of the time data allocations come into the allocator as the
> first allocation in the transaction.
>
> That aside, I've also seen that this particular retry is hotter than
> you'd think just by glancing at the code due to the whole minlen =
> maxlen thing in xfs_bmap_select_minlen() combined with potential size of
> delalloc extents. See the patches I posted a couple weeks ago in this
> area for example:
>
> https://marc.info/?l=linux-xfs&m=155671950608062&w=2
>
> ... though I don't think they relate to the issue you're tracking here.
>
> > > We start with a near mode
> > > allocation in the target AG. If that fails, xfs_alloc_vextent() switches
> > > over to XFS_ALLOCTYPE_THIS_AG for the remaining AGs. This lands in
> > > xfs_alloc_ag_vextent_size() which ignores args.agbno even if it's set.
> >
> > Hmmm. I think you just pointed out a bug in this code. The initial
> > NEARBNO alloc sets ->min_agbno and ->max_agbno based on the
> > args->agbno, which xfs_alloc_compute_aligned then uses to determine
> > if the found free extent can be used. When we immediately fall back
> > to XFS_ALLOCTYPE_THIS_AG allocation, we don't reset
> > min_agbno/max_agbno, and so even though we ignore the args->agbno
> > target that is set when selecting a free extent, we still call
> > xfs_alloc_compute_aligned() to determine if it is appropriate to
> > use.
> >
>
> I don't see where the near mode alloc assigns ->[min|max]_agbno. The
> only place I see ->min_agbno used is for sparse inode chunk allocation.
> ->max_agbno is assigned in the near mode alloc when min == 0, but it's
> set to agsize so I'm guessing that this is just an initialization so the
> checking logic works with zeroed out values from the caller.
I did say:
> > Of course, I could be missing something obvious (still haven't
> > finished my morning coffee!), so it's worth checking I read the code
> > properly....
And that's clearly the case here - I read the code that clamps agbno
to min/max backwards....
> Just to clarify.. what exactly is the reverse allocation problem you're
> thinking about here? Logically increasing writes resulting in physically
> decreasing allocations? If so, have we confirmed that writes are indeed
> sequential when this occurs (obvious I guess, but worth asking).
Precisely this. And, yes, I've confirmed many times that it is
sequential writes that display it. It happens when the closest nearest free
extent is below the current target. i.e.:
+FFFFFFFFFFFFF+aaaaaaaT+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb+FFFFF+
We just allocated "a" so the next allocation target is T, but "b" is
already allocated there. The freespace before "a" is larger than
the free space after "b" (which may be less than maxlen), and so the
"nearest best free space" is below the target. If "b" is large
enough, we end up with file allocation that looks like (where n =
relative file offset for the given allocation)
+10+9+8+7+6+5+4+1+2+3+
i.e. allocation starts forward, uses the remaining ascending free
space in the extent, then starts allocating backwards as it takes
the nearest free space and trims off the top of the free extent for
the allocation.
> I'm not really familiar with this problem so I've not thought about it,
> but one random thought comes to mind: is there any chance of allocation
> interlocking behind extent frees contributing to this behavior? We do
> truncate files in reverse order an extent at a time and immediately
> finish the dfops (to free the extent and busy it) before we unmap the
> next piece.
Unlikely, because such extents are not immediately available for
allocation (they are busy). And I've seen it on pure contended
sequential write workloads, too...
> If allocations could interlock with such frees (or busy
> clearing), I could see in theory how that might result in backwards
> allocations, but I'm just handwaving and not sure that's possible in
> practice.
Most of the cases I've seen have had the same symptom - "skip to
next AG, allocate at same high-up-in AGBNO target as the previous AG
wanted, then allocate backwards in the same AG until freespace
extent is exhausted. It then skips to some other freespace extent,
and depending on whether it's a forwards or backwards skip the
problem either goes away or continues. This is not a new behaviour,
I first saw it some 15 years ago, but I've never been able to
provoke it reliably enough with test code to get to the root
cause...
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-05-25 22:43 ` Dave Chinner
@ 2019-05-31 17:11 ` Brian Foster
2019-06-06 15:21 ` Brian Foster
2019-06-06 22:05 ` Dave Chinner
0 siblings, 2 replies; 29+ messages in thread
From: Brian Foster @ 2019-05-31 17:11 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Sun, May 26, 2019 at 08:43:17AM +1000, Dave Chinner wrote:
> On Fri, May 24, 2019 at 08:00:18AM -0400, Brian Foster wrote:
> > On Fri, May 24, 2019 at 08:15:52AM +1000, Dave Chinner wrote:
> > > On Thu, May 23, 2019 at 08:55:35AM -0400, Brian Foster wrote:
> > > > Hmmm.. I suppose if I had a script that
> > > > just dumped every applicable stride/delta value for an inode, I could
> > > > dump all of those numbers into a file and we can process it from there..
> > >
> > > See how the freesp commands work in xfs_db - they just generate a
> > > set of {offset, size} tuples that are then bucketted appropriately.
> > > This is probably the best way to do this at the moment - have xfs_db
> > > walk the inode BMBTs outputting something like {extent size,
> > > distance to next extent} tuples and everything else falls out from
> > > how we bucket that information.
> > >
> >
> > That sounds plausible. A bit more involved than what I'm currently
> > working with, but we do already have a blueprint for the scanning
> > implementation required to collect this data via the frag command.
> > Perhaps some of this code between the frag/freesp can be generalized and
> > reused. I'll take a closer look at it.
> >
> > My only concern is I'd prefer to only go down this path as long as we
> > plan to land the associated command in xfs_db. So this approach suggests
> > to me that we add a "locality" command similar to frag/freesp that
> > presents the locality state of the fs. For now I'm only really concerned
> > with the data associated with known near mode allocations (i.e., such as
> > the extent size:distance relationship you've outlined above) so we can
> > evaluate these algorithmic changes, but this would be for fs devs only
> > so we could always expand on it down the road if we want to assess
> > different allocations. Hm?
>
> Yup, I'm needing to do similar analysis myself to determine how
> quickly I'm aging the filesystem, so having the tool in xfs_db or
> xfs_spaceman would be very useful.
>
> FWIW, the tool I've just started writing will just use fallocate and
> truncate to hammer the allocation code as hard and as quickly as
> possible - I want to do accelerated aging of the filesystem, and so
> being able to run tens to hundreds of thousands of free space
> manipulations a second is the goal here....
>
Ok. FWIW, from playing with this so far (before getting distracted for
much of this week) the most straightforward place to add this kind of
functionality turns out to be the frag command itself. It does 99% of
the work required to process data extents already, including pulling the
on-disk records of each inode in-core for processing. I basically just
had to update that code to include all of the record data and add the
locality tracking logic (I haven't got to actually presenting it yet..).
> > > > > (*) Which, in reality, we really should reset because once we jump
> > > > > AG we have no locality target and so should allow the full AG to be
> > > > > considered. This "didn't reset target" issue is something I suspect
> > > > > leads to the infamous "backwards allocation for sequential writes"
> > > > > problems...
> > > >
> > > > I think this is something that's handled at a higher level. In the
> > > > nullfb case at least, we use the XFS_ALLOCTYPE_START_BNO allocation mode
> > > > which is what allows us to iterate AGs.
> > >
> > > The nullfb case isn't the problem here - I think the problem comes
> > > when we fail the THIS_BNO/NEAR_BNO allocation attempts in
> > > xfs_bmap_btalloc() and then fall back to XFS_ALLOCTYPE_START_BNO
> > > with the same fsbno target.
> > >
> >
> > Ok, that's the retry where we restore minlen from maxlen back to 1. Note
> > that still depends on nullfb, but that's just context. I think the
> > majority of the time data allocations come into the allocator as the
> > first allocation in the transaction.
> >
> > That aside, I've also seen that this particular retry is hotter than
> > you'd think just by glancing at the code due to the whole minlen =
> > maxlen thing in xfs_bmap_select_minlen() combined with potential size of
> > delalloc extents. See the patches I posted a couple weeks ago in this
> > area for example:
> >
> > https://marc.info/?l=linux-xfs&m=155671950608062&w=2
> >
> > ... though I don't think they relate to the issue you're tracking here.
> >
> > > > We start with a near mode
> > > > allocation in the target AG. If that fails, xfs_alloc_vextent() switches
> > > > over to XFS_ALLOCTYPE_THIS_AG for the remaining AGs. This lands in
> > > > xfs_alloc_ag_vextent_size() which ignores args.agbno even if it's set.
> > >
> > > Hmmm. I think you just pointed out a bug in this code. The initial
> > > NEARBNO alloc sets ->min_agbno and ->max_agbno based on the
> > > args->agbno, which xfs_alloc_compute_aligned then uses to determine
> > > if the found free extent can be used. When we immediately fall back
> > > to XFS_ALLOCTYPE_THIS_AG allocation, we don't reset
> > > min_agbno/max_agbno, and so even though we ignore the args->agbno
> > > target that is set when selecting a free extent, we still call
> > > xfs_alloc_compute_aligned() to determine if it is appropriate to
> > > use.
> > >
> >
> > I don't see where the near mode alloc assigns ->[min|max]_agbno. The
> > only place I see ->min_agbno used is for sparse inode chunk allocation.
> > ->max_agbno is assigned in the near mode alloc when min == 0, but it's
> > set to agsize so I'm guessing that this is just an initialization so the
> > checking logic works with zeroed out values from the caller.
>
> I did say:
>
> > > Of course, I could be missing something obvious (still haven't
> > > finished my morning coffee!), so it's worth checking I read the code
> > > properly....
>
> And that's clearly the case here - I read the code that clamps agbno
> to min/max backwards....
>
Ok..
> > Just to clarify.. what exactly is the reverse allocation problem you're
> > thinking about here? Logically increasing writes resulting in physically
> > decreasing allocations? If so, have we confirmed that writes are indeed
> > sequential when this occurs (obvious I guess, but worth asking).
>
> Precisely this. And, yes, I've confirmed many times that it is
> sequential writes that display it. It happens when the closest nearest free
> extent is below the current target. i.e.:
>
>
> +FFFFFFFFFFFFF+aaaaaaaT+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb+FFFFF+
>
> We just allocated "a" so the next allocation target is T, but "b" is
> already allocated there. The freespace before "a" is larger than
> the free space after "b" (which may be less than maxlen), and so the
> "nearest best free space" is below the target. If "b" is large
> enough, we end up with file allocation that looks like (where n =
> relative file offset for the given allocation)
>
> +10+9+8+7+6+5+4+1+2+3+
>
> i.e. allocation starts forward, uses the remaining ascending free
> space in the extent, then starts allocating backwards as it takes
> the nearest free space and trims off the top of the free extent for
> the allocation.
>
Hmm, so are you saying that the allocation is not only physically
reversed, but the (physically out of order) ascending logical extents in
the file are actually physically contiguous as well? IOW, if freed, they
would form a single/contiguous free extent?
If so, any idea how common that pattern is to the instances of this
problem you've seen? I.e., is it always the case, sometimes the case..?
Also, any intuition on the size of these extents relative to the file?
> > I'm not really familiar with this problem so I've not thought about it,
> > but one random thought comes to mind: is there any chance of allocation
> > interlocking behind extent frees contributing to this behavior? We do
> > truncate files in reverse order an extent at a time and immediately
> > finish the dfops (to free the extent and busy it) before we unmap the
> > next piece.
>
> Unlikely, because such extents are not immediately available for
> allocation (they are busy). And I've seen it on pure contended
> sequential write workloads, too...
>
I was thinking that unbusying of extents could be similarly interlocked
across log buffer completions, but your latter point and the additional
context suggests this is more likely influenced by a combination of
allocation timing (i.e., AGF locking to determine when we jump AGs) and
free space state than anything like this.
> > If allocations could interlock with such frees (or busy
> > clearing), I could see in theory how that might result in backwards
> > allocations, but I'm just handwaving and not sure that's possible in
> > practice.
>
> Most of the cases I've seen have had the same symptom - "skip to
> next AG, allocate at same high-up-in AGBNO target as the previous AG
> wanted, then allocate backwards in the same AG until freespace
> extent is exhausted. It then skips to some other freespace extent,
> and depending on whether it's a forwards or backwards skip the
> problem either goes away or continues. This is not a new behaviour,
> I first saw it some 15 years ago, but I've never been able to
> provoke it reliably enough with test code to get to the root
> cause...
>
I guess the biggest question to me is whether we're more looking for a
backwards searching pattern or a pattern where we split up a larger free
extent into smaller chunks (in reverse), or both. I can definitely see
some isolated areas where a backwards search could lead to this
behavior. E.g., my previous experiment to replace near mode allocs with
size mode allocs always allocates in reverse when free space is
sufficiently fragmented. To see this in practice would require repeated
size mode allocations, however, which I think is unlikely because once
we jump AGs and do a size mode alloc, the subsequent allocs should be
near mode within the new AG (unless we jump again and again, which I
don't think is consistent with what you're describing).
Hmm, the next opportunity for this kind of behavior in the near mode
allocator is probably the bnobt left/right span. This would require the
right circumstances to hit. We'd have to bypass the first (cntbt)
algorithm then find a closer extent in the left mode search vs. the
right mode search, and then probably repeat that across however many
allocations it takes to work out of this state.
If instead we're badly breaking up an extent in the wrong order, it
looks like we do have the capability to allocate the right portion of an
extent (in xfs_alloc_compute_diff()) but that is only enabled for non
data allocations. xfs_alloc_compute_aligned() can cause a similar effect
if alignment is set, but I'm not sure that would break up an extent into
more than one usable chunk.
In any event, maybe I'll hack some temporary code in the xfs_db locality
stuff to quickly flag whether I happen to get lucky enough to reproduce
any instances of this during the associated test workloads (and if so,
try and collect more data).
Brian
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-05-31 17:11 ` Brian Foster
@ 2019-06-06 15:21 ` Brian Foster
2019-06-06 22:13 ` Dave Chinner
2019-06-06 22:05 ` Dave Chinner
1 sibling, 1 reply; 29+ messages in thread
From: Brian Foster @ 2019-06-06 15:21 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Fri, May 31, 2019 at 01:11:36PM -0400, Brian Foster wrote:
> On Sun, May 26, 2019 at 08:43:17AM +1000, Dave Chinner wrote:
> > On Fri, May 24, 2019 at 08:00:18AM -0400, Brian Foster wrote:
> > > On Fri, May 24, 2019 at 08:15:52AM +1000, Dave Chinner wrote:
> > > > On Thu, May 23, 2019 at 08:55:35AM -0400, Brian Foster wrote:
> > > > > Hmmm.. I suppose if I had a script that
> > > > > just dumped every applicable stride/delta value for an inode, I could
> > > > > dump all of those numbers into a file and we can process it from there..
> > > >
> > > > See how the freesp commands work in xfs_db - they just generate a
> > > > set of {offset, size} tuples that are then bucketted appropriately.
> > > > This is probably the best way to do this at the moment - have xfs_db
> > > > walk the inode BMBTs outputting something like {extent size,
> > > > distance to next extent} tuples and everything else falls out from
> > > > how we bucket that information.
> > > >
> > >
> > > That sounds plausible. A bit more involved than what I'm currently
> > > working with, but we do already have a blueprint for the scanning
> > > implementation required to collect this data via the frag command.
> > > Perhaps some of this code between the frag/freesp can be generalized and
> > > reused. I'll take a closer look at it.
> > >
> > > My only concern is I'd prefer to only go down this path as long as we
> > > plan to land the associated command in xfs_db. So this approach suggests
> > > to me that we add a "locality" command similar to frag/freesp that
> > > presents the locality state of the fs. For now I'm only really concerned
> > > with the data associated with known near mode allocations (i.e., such as
> > > the extent size:distance relationship you've outlined above) so we can
> > > evaluate these algorithmic changes, but this would be for fs devs only
> > > so we could always expand on it down the road if we want to assess
> > > different allocations. Hm?
> >
> > Yup, I'm needing to do similar analysis myself to determine how
> > quickly I'm aging the filesystem, so having the tool in xfs_db or
> > xfs_spaceman would be very useful.
> >
> > FWIW, the tool I've just started writing will just use fallocate and
> > truncate to hammer the allocation code as hard and as quickly as
> > possible - I want to do accelerated aging of the filesystem, and so
> > being able to run tens to hundreds of thousands of free space
> > manipulations a second is the goal here....
> >
>
> Ok. FWIW, from playing with this so far (before getting distracted for
> much of this week) the most straightforward place to add this kind of
> functionality turns out to be the frag command itself. It does 99% of
> the work required to process data extents already, including pulling the
> on-disk records of each inode in-core for processing. I basically just
> had to update that code to include all of the record data and add the
> locality tracking logic (I haven't got to actually presenting it yet..).
>
I managed to collect some preliminary data based on this strategy. I
regenerated the associated dataset as I wanted to introduce more near
mode allocation events where locality is relevant/measurable. The set is
still generated via filebench, but the workload now runs 8x file
creators in parallel with 16x random size file appenders (with 1% of the
dataset being preallocated to support the appends, and thus without
contention). In total, it creates 10k files that amount to ~5.8TB of
space. The filesystem geometry is as follows:
meta-data=/dev/mapper/fedora_hpe--apollo--cn99xx--19-tmp isize=512 agcount=8, agsize=268435455 blks
= sectsz=4096 attr=2, projid32bit=1
= crc=1 finobt=1, sparse=1, rmapbt=0
= reflink=0
data = bsize=4096 blocks=1941012480, imaxpct=5
= sunit=0 swidth=0 blks
naming =version 2 bsize=4096 ascii-ci=0, ftype=1
log =internal log bsize=4096 blocks=521728, version=2
= sectsz=4096 sunit=1 blks, lazy-count=1
realtime =none extsz=4096 blocks=0, rtextents=0
The locality data is collected via a hacked up variant of 'xfs_db -c
frag ...' that reuses the 'freesp' command histogram to display locality
deltas instead of extent lengths. Each bucket shows how many
extents/allocs fall into that particular delta range and the average
length of the associated extents. Note that the percent column is based
on extent count vs the total (not block or delta counts). Finally, note
that the final row uses a dummy value of agsize to account for AG jumps. As
such, the associated delta values are invalid, but the extent count/size
values are valid. This data is included to provide some insight on how
often we fall back to external AGs (from the locality target) due to
contention.
Bear in mind that so far I've only run this workload once on each kernel
and I believe there is opportunity for variance run-to-run. Current data
and observations follow:
- Baseline kernel (5.2.0-rc1):
Files on this filesystem average 59.81 extents per file
from to extents delta avgsz pct
0 0 1 0 4879 0.00
1 1 18 18 633 0.00
2 3 76 193 630 0.01
4 7 117 644 10801 0.02
8 15 246 2735 701 0.04
16 31 411 9769 873 0.07
32 63 858 40614 877 0.14
64 127 1931 183122 872 0.32
128 255 3658 693079 1423 0.60
256 511 4393 1619094 767 0.73
512 1023 6049 4582491 666 1.00
1024 2047 19564 34684608 810 3.23
2048 4095 21391 62471744 828 3.53
4096 8191 24735 140424459 920 4.08
8192 16383 18383 213465918 1030 3.03
16384 32767 14447 336272956 1195 2.38
32768 65535 12359 580683797 1154 2.04
65536 131071 11943 1138606730 1446 1.97
131072 262143 16825 3279118006 1701 2.78
262144 524287 32133 12725299194 1905 5.30
524288 1048575 58899 45775066024 1845 9.72
1048576 2097151 95302 147197567651 2020 15.73
2097152 4194303 86659 252037848233 2794 14.30
4194304 8388607 67067 397513880288 2876 11.07
8388608 16777215 47940 583161227489 2319 7.91
16777216 33554431 24878 537577890034 3321 4.11
33554432 67108863 5889 269065981311 16106 0.97
67108864 134217727 3022 304867554478 33429 0.50
134217728 268435454 2994 539180795612 35744 0.49
268435455 268435455 23709 6364336202595 4840 3.91
- Control (base w/ unconditional SIZE mode allocs):
Files on this filesystem average 58.60 extents per file
from to extents delta avgsz pct
0 0 1 0 180 0.00
4 7 19 115 11379 0.00
8 15 21 231 15 0.00
16 31 3 58 45 0.00
64 127 3 288 7124 0.00
128 255 4 780 60296 0.00
256 511 3 978 51563 0.00
512 1023 9 7072 105727 0.00
1024 2047 33 50773 4765 0.01
2048 4095 98 306258 15689 0.02
4096 8191 258 1513775 1981 0.04
8192 16383 458 5633481 2537 0.08
16384 32767 934 23078682 3013 0.16
32768 65535 1783 87851701 3109 0.30
65536 131071 3382 332685185 1810 0.57
131072 262143 8904 1784659842 2275 1.50
262144 524287 23878 9433551033 1903 4.02
524288 1048575 54422 42483032893 1894 9.17
1048576 2097151 97884 148883431239 2048 16.49
2097152 4194303 81999 236737184381 2741 13.81
4194304 8388607 86826 510450130696 2639 14.63
8388608 16777215 54870 652250378434 2101 9.24
16777216 33554431 40408 985568011752 1959 6.81
33554432 67108863 46354 2258464180697 2538 7.81
67108864 134217727 59461 5705095317989 3380 10.02
134217728 268435454 16205 2676447855794 4891 2.73
268435455 268435455 15423 4140080022465 5243 2.60
- Test (base + this series):
Files on this filesystem average 59.76 extents per file
from to extents delta avgsz pct
0 0 2 0 419 0.00
1 1 258 258 387 0.04
2 3 81 201 598 0.01
4 7 139 790 13824 0.02
8 15 257 2795 710 0.04
16 31 417 9790 852 0.07
32 63 643 30714 901 0.11
64 127 1158 110148 835 0.19
128 255 1947 370953 822 0.32
256 511 3567 1348313 744 0.59
512 1023 5151 3921794 695 0.85
1024 2047 22895 39640251 924 3.78
2048 4095 34375 100757727 922 5.68
4096 8191 30381 171773183 914 5.02
8192 16383 18977 214896159 1091 3.13
16384 32767 8460 192726268 1212 1.40
32768 65535 6071 286544623 1611 1.00
65536 131071 7803 757765738 1680 1.29
131072 262143 15300 3001300980 1877 2.53
262144 524287 27218 10868169139 1993 4.50
524288 1048575 60423 47321126020 1948 9.98
1048576 2097151 100683 158191884842 2174 16.63
2097152 4194303 92642 274106200889 2508 15.30
4194304 8388607 73987 436219940202 2421 12.22
8388608 16777215 49636 591854981117 2434 8.20
16777216 33554431 15716 353157130237 4772 2.60
33554432 67108863 4948 228004142686 19221 0.82
67108864 134217727 2381 231811967738 35781 0.39
134217728 268435454 2140 385403697868 29221 0.35
268435455 268435455 17746 4763655584430 7603 2.93
Firstly, comparison of the baseline and control data shows that near
mode allocation is effective at improving allocation locality compared
to size mode. In both cases, the 1048576-4194304 buckets hold the
majority of extents. If we look at the sub-1048576 data, however, ~40%
of allocations land in this range on the baseline kernel vs. only ~16%
for the control. Another interesting data point is the noticeable drop
in AG skips (~24k) from the baseline kernel to the control (~15k). I
suspect this is due to the additional overhead of locality based
allocation causing more contention.
Comparison of the baseline and test data shows a generally similar
breakdown between the two. The sub-1048576 range populates the same
buckets and makes up ~41% of the total extents. The per-bucket numbers
differ, but all of the buckets are within a percentage point or so. One
previously unknown advantage of the test algorithm shown by this data is
that the number of AG skips drops down to ~18k, which almost splits the
difference between the baseline and control (and slightly in favor of
the latter). I suspect that is related to the more simple and bounded
near mode algorithm as compared to the current one.
Thoughts on any of this data or presentation? I could dig further into
details or alternatively base the histogram on something like extent
size and show the average delta for each extent size bucket, but I'm not
sure that will tell us anything profound with respect to this patchset.
One thing I noticed while processing this data is that the current
dataset skews heavily towards smaller allocations. I still think it's a
useful comparison because smaller allocations are more likely to stress
either algorithm via a larger locality search space, but I may try to
repeat this test with a workload with fewer files and larger allocations
and see how that changes things.
Brian
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-06-06 15:21 ` Brian Foster
@ 2019-06-06 22:13 ` Dave Chinner
2019-06-07 12:57 ` Brian Foster
0 siblings, 1 reply; 29+ messages in thread
From: Dave Chinner @ 2019-06-06 22:13 UTC (permalink / raw)
To: Brian Foster; +Cc: linux-xfs
On Thu, Jun 06, 2019 at 11:21:04AM -0400, Brian Foster wrote:
> On Fri, May 31, 2019 at 01:11:36PM -0400, Brian Foster wrote:
> > On Sun, May 26, 2019 at 08:43:17AM +1000, Dave Chinner wrote:
> > > On Fri, May 24, 2019 at 08:00:18AM -0400, Brian Foster wrote:
> > > > On Fri, May 24, 2019 at 08:15:52AM +1000, Dave Chinner wrote:
> > > > > On Thu, May 23, 2019 at 08:55:35AM -0400, Brian Foster wrote:
> > > > > > Hmmm.. I suppose if I had a script that
> > > > > > just dumped every applicable stride/delta value for an inode, I could
> > > > > > dump all of those numbers into a file and we can process it from there..
> > > > >
> > > > > See how the freesp commands work in xfs_db - they just generate a
> > > > > set of {offset, size} tuples that are then bucketted appropriately.
> > > > > This is probably the best way to do this at the moment - have xfs_db
> > > > > walk the inode BMBTs outputting something like {extent size,
> > > > > distance to next extent} tuples and everything else falls out from
> > > > > how we bucket that information.
> > > > >
> > > >
> > > > That sounds plausible. A bit more involved than what I'm currently
> > > > working with, but we do already have a blueprint for the scanning
> > > > implementation required to collect this data via the frag command.
> > > > Perhaps some of this code between the frag/freesp can be generalized and
> > > > reused. I'll take a closer look at it.
> > > >
> > > > My only concern is I'd prefer to only go down this path as long as we
> > > > plan to land the associated command in xfs_db. So this approach suggests
> > > > to me that we add a "locality" command similar to frag/freesp that
> > > > presents the locality state of the fs. For now I'm only really concerned
> > > > with the data associated with known near mode allocations (i.e., such as
> > > > the extent size:distance relationship you've outlined above) so we can
> > > > evaluate these algorithmic changes, but this would be for fs devs only
> > > > so we could always expand on it down the road if we want to assess
> > > > different allocations. Hm?
> > >
> > > Yup, I'm needing to do similar analysis myself to determine how
> > > quickly I'm aging the filesystem, so having the tool in xfs_db or
> > > xfs_spaceman would be very useful.
> > >
> > > FWIW, the tool I've just started writing will just use fallocate and
> > > truncate to hammer the allocation code as hard and as quickly as
> > > possible - I want to do accelerated aging of the filesystem, and so
> > > being able to run tens to hundreds of thousands of free space
> > > manipulations a second is the goal here....
> > >
> >
> > Ok. FWIW, from playing with this so far (before getting distracted for
> > much of this week) the most straightforward place to add this kind of
> > functionality turns out to be the frag command itself. It does 99% of
> > the work required to process data extents already, including pulling the
> > on-disk records of each inode in-core for processing. I basically just
> > had to update that code to include all of the record data and add the
> > locality tracking logic (I haven't got to actually presenting it yet..).
> >
>
> I managed to collect some preliminary data based on this strategy. I
....
>
> Comparison of the baseline and test data shows a generally similar
> breakdown between the two.
Which is the result I wanted to see :)
> Thoughts on any of this data or presentation?
I think it's useful for comparing whether an allocator change has
affected the overall locality of allocation. If it's working as we
expect, you should get vastly different results for inode32 vs
inode64 mount options, with inode32 showing much, much higher
distances for most allocations, so it might be worth running a quick
test to confirm that it does, indeed, demonstrate the results we'd
expect from such a change.
> I could dig further into
> details or alternatively base the histogram on something like extent
> size and show the average delta for each extent size bucket, but I'm not
> sure that will tell us anything profound with respect to this patchset.
*nod*
> One thing I noticed while processing this data is that the current
> dataset skews heavily towards smaller allocations. I still think it's a
> useful comparison because smaller allocations are more likely to stress
> either algorithm via a larger locality search space, but I may try to
> repeat this test with a workload with fewer files and larger allocations
> and see how that changes things.
From the testing I've been doing, I think the file count of around
10k isn't sufficient to really cause severe allocation issues.
Directory and inodes metadata are great for fragmenting free space,
so dramatically increasing the number of smaller files might actually
produce worse behaviour....
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-06-06 22:13 ` Dave Chinner
@ 2019-06-07 12:57 ` Brian Foster
0 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2019-06-07 12:57 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Fri, Jun 07, 2019 at 08:13:01AM +1000, Dave Chinner wrote:
> On Thu, Jun 06, 2019 at 11:21:04AM -0400, Brian Foster wrote:
> > On Fri, May 31, 2019 at 01:11:36PM -0400, Brian Foster wrote:
> > > On Sun, May 26, 2019 at 08:43:17AM +1000, Dave Chinner wrote:
> > > > On Fri, May 24, 2019 at 08:00:18AM -0400, Brian Foster wrote:
> > > > > On Fri, May 24, 2019 at 08:15:52AM +1000, Dave Chinner wrote:
> > > > > > On Thu, May 23, 2019 at 08:55:35AM -0400, Brian Foster wrote:
> > > > > > > Hmmm.. I suppose if I had a script that
> > > > > > > just dumped every applicable stride/delta value for an inode, I could
> > > > > > > dump all of those numbers into a file and we can process it from there..
> > > > > >
> > > > > > See how the freesp commands work in xfs_db - they just generate a
> > > > > > set of {offset, size} tuples that are then bucketted appropriately.
> > > > > > This is probably the best way to do this at the moment - have xfs_db
> > > > > > walk the inode BMBTs outputting something like {extent size,
> > > > > > distance to next extent} tuples and everything else falls out from
> > > > > > how we bucket that information.
> > > > > >
> > > > >
> > > > > That sounds plausible. A bit more involved than what I'm currently
> > > > > working with, but we do already have a blueprint for the scanning
> > > > > implementation required to collect this data via the frag command.
> > > > > Perhaps some of this code between the frag/freesp can be generalized and
> > > > > reused. I'll take a closer look at it.
> > > > >
> > > > > My only concern is I'd prefer to only go down this path as long as we
> > > > > plan to land the associated command in xfs_db. So this approach suggests
> > > > > to me that we add a "locality" command similar to frag/freesp that
> > > > > presents the locality state of the fs. For now I'm only really concerned
> > > > > with the data associated with known near mode allocations (i.e., such as
> > > > > the extent size:distance relationship you've outlined above) so we can
> > > > > evaluate these algorithmic changes, but this would be for fs devs only
> > > > > so we could always expand on it down the road if we want to assess
> > > > > different allocations. Hm?
> > > >
> > > > Yup, I'm needing to do similar analysis myself to determine how
> > > > quickly I'm aging the filesystem, so having the tool in xfs_db or
> > > > xfs_spaceman would be very useful.
> > > >
> > > > FWIW, the tool I've just started writing will just use fallocate and
> > > > truncate to hammer the allocation code as hard and as quickly as
> > > > possible - I want to do accelerated aging of the filesystem, and so
> > > > being able to run tens to hundreds of thousands of free space
> > > > manipulations a second is the goal here....
> > > >
> > >
> > > Ok. FWIW, from playing with this so far (before getting distracted for
> > > much of this week) the most straightforward place to add this kind of
> > > functionality turns out to be the frag command itself. It does 99% of
> > > the work required to process data extents already, including pulling the
> > > on-disk records of each inode in-core for processing. I basically just
> > > had to update that code to include all of the record data and add the
> > > locality tracking logic (I haven't got to actually presenting it yet..).
> > >
> >
> > I managed to collect some preliminary data based on this strategy. I
> ....
> >
> > Comparison of the baseline and test data shows a generally similar
> > breakdown between the two.
>
> Which is the result I wanted to see :)
>
Indeed. ;)
> > Thoughts on any of this data or presentation?
>
> I think it's useful for comparing whether an allocator change has
> affected the overall locality of allocation. If it's working as we
> expect, you should get vastly different results for inode32 vs
> inode64 mount options, with inode32 showing much, much higher
> distances for most allocations, so it might be worth running a quick
> test to confirm that it does, indeed, demonstrate the results we'd
> expect from such a change.
>
Ok, I can add inode32 into the mix as a sanity check. I'm guessing this
will result in an increase in AG skips. I also think that once we have
that first data allocation, the existing heuristics may apply similarly
since we're only concerned about contiguity with the previous extent at
that point. Perhaps I'll reserve this for a test with an even higher
inode count so there are more first data extent allocations (where the
inode is the locality target, but sits in a metadata preferred AG) to
measure.
> > I could dig further into
> > details or alternatively base the histogram on something like extent
> > size and show the average delta for each extent size bucket, but I'm not
> > sure that will tell us anything profound with respect to this patchset.
>
> *nod*
>
> > One thing I noticed while processing this data is that the current
> > dataset skews heavily towards smaller allocations. I still think it's a
> > useful comparison because smaller allocations are more likely to stress
> > either algorithm via a larger locality search space, but I may try to
> > repeat this test with a workload with fewer files and larger allocations
> > and see how that changes things.
>
> From the testing I've been doing, I think the file count of around
> 10k isn't sufficient to really cause severe allocation issues.
> Directory and inodes metadata are great for fragmenting free space,
> so dramatically increasing the number of smaller files might actually
> produce worse behaviour....
>
I'd still like to see the results for larger allocations if for nothing
else that I was hoping for more coverage from the most recent test. Once
I get through that, I'll move back in the smaller file direction and
increase the file count as well as the span of the directory tree.
Hmm, random file sizes between 4k and say 96k or so should allow me to
at least create tens of millions of files/dirs (I'd like to get above
64k for some of these allocations to cover post-eof prealloc). That
should stress the allocator as noted with the additional inode/dir
metadata allocations as well as facilitate inode32 experiments.
Thanks for the feedback..
Brian
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-05-31 17:11 ` Brian Foster
2019-06-06 15:21 ` Brian Foster
@ 2019-06-06 22:05 ` Dave Chinner
2019-06-07 12:56 ` Brian Foster
1 sibling, 1 reply; 29+ messages in thread
From: Dave Chinner @ 2019-06-06 22:05 UTC (permalink / raw)
To: Brian Foster; +Cc: linux-xfs
On Fri, May 31, 2019 at 01:11:36PM -0400, Brian Foster wrote:
> On Sun, May 26, 2019 at 08:43:17AM +1000, Dave Chinner wrote:
> > Most of the cases I've seen have had the same symptom - "skip to
> > next AG, allocate at same high-up-in AGBNO target as the previous AG
> > wanted, then allocate backwards in the same AG until freespace
> > extent is exhausted. It then skips to some other freespace extent,
> > and depending on whether it's a forwards or backwards skip the
> > problem either goes away or continues. This is not a new behaviour,
> > I first saw it some 15 years ago, but I've never been able to
> > provoke it reliably enough with test code to get to the root
> > cause...
> >
>
> I guess the biggest question to me is whether we're more looking for a
> backwards searching pattern or a pattern where we split up a larger free
> extent into smaller chunks (in reverse), or both. I can definitely see
> some isolated areas where a backwards search could lead to this
> behavior. E.g., my previous experiment to replace near mode allocs with
> size mode allocs always allocates in reverse when free space is
> sufficiently fragmented. To see this in practice would require repeated
> size mode allocations, however, which I think is unlikely because once
> we jump AGs and do a size mode alloc, the subsequent allocs should be
> near mode within the new AG (unless we jump again and again, which I
> don't think is consistent with what you're describing).
>
> Hmm, the next opportunity for this kind of behavior in the near mode
> allocator is probably the bnobt left/right span. This would require the
> right circumstances to hit. We'd have to bypass the first (cntbt)
> algorithm then find a closer extent in the left mode search vs. the
> right mode search, and then probably repeat that across however many
> allocations it takes to work out of this state.
>
> If instead we're badly breaking up an extent in the wrong order, it
> looks like we do have the capability to allocate the right portion of an
> extent (in xfs_alloc_compute_diff()) but that is only enabled for non
> data allocations. xfs_alloc_compute_aligned() can cause a similar effect
> if alignment is set, but I'm not sure that would break up an extent into
> more than one usable chunk.
This is pretty much matches what I've been able to infer about the
cause, but lacking a way to actually trigger it and be able to
monitor the behaviour in real time is where I've got stuck on this.
I see the result in aged, fragmented filesystems and can infer how
it may have occurred, but can't cause it to occur on demand...
> In any event, maybe I'll hack some temporary code in the xfs_db locality
> stuff to quickly flag whether I happen to get lucky enough to reproduce
> any instances of this during the associated test workloads (and if so,
> try and collect more data).
*nod*
Best we can do, I think, and hope we stumble across an easily
reproducable trigger...
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-06-06 22:05 ` Dave Chinner
@ 2019-06-07 12:56 ` Brian Foster
0 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2019-06-07 12:56 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Fri, Jun 07, 2019 at 08:05:59AM +1000, Dave Chinner wrote:
> On Fri, May 31, 2019 at 01:11:36PM -0400, Brian Foster wrote:
> > On Sun, May 26, 2019 at 08:43:17AM +1000, Dave Chinner wrote:
> > > Most of the cases I've seen have had the same symptom - "skip to
> > > next AG, allocate at same high-up-in AGBNO target as the previous AG
> > > wanted, then allocate backwards in the same AG until freespace
> > > extent is exhausted. It then skips to some other freespace extent,
> > > and depending on whether it's a forwards or backwards skip the
> > > problem either goes away or continues. This is not a new behaviour,
> > > I first saw it some 15 years ago, but I've never been able to
> > > provoke it reliably enough with test code to get to the root
> > > cause...
> > >
> >
> > I guess the biggest question to me is whether we're more looking for a
> > backwards searching pattern or a pattern where we split up a larger free
> > extent into smaller chunks (in reverse), or both. I can definitely see
> > some isolated areas where a backwards search could lead to this
> > behavior. E.g., my previous experiment to replace near mode allocs with
> > size mode allocs always allocates in reverse when free space is
> > sufficiently fragmented. To see this in practice would require repeated
> > size mode allocations, however, which I think is unlikely because once
> > we jump AGs and do a size mode alloc, the subsequent allocs should be
> > near mode within the new AG (unless we jump again and again, which I
> > don't think is consistent with what you're describing).
> >
> > Hmm, the next opportunity for this kind of behavior in the near mode
> > allocator is probably the bnobt left/right span. This would require the
> > right circumstances to hit. We'd have to bypass the first (cntbt)
> > algorithm then find a closer extent in the left mode search vs. the
> > right mode search, and then probably repeat that across however many
> > allocations it takes to work out of this state.
> >
> > If instead we're badly breaking up an extent in the wrong order, it
> > looks like we do have the capability to allocate the right portion of an
> > extent (in xfs_alloc_compute_diff()) but that is only enabled for non
> > data allocations. xfs_alloc_compute_aligned() can cause a similar effect
> > if alignment is set, but I'm not sure that would break up an extent into
> > more than one usable chunk.
>
> This is pretty much matches what I've been able to infer about the
> cause, but lacking a way to actually trigger it and be able to
> monitor the behviour in real time is where I've got stuck on this.
> I see the result in aged, fragmented filesystems and can infer how
> it may have occurred, but can't cause it to occur on demand...
>
Ok.
> > In any event, maybe I'll hack some temporary code in the xfs_db locality
> > stuff to quickly flag whether I happen to get lucky enough to reproduce
> > any instances of this during the associated test workloads (and if so,
> > try and collect more data).
>
> *nod*
>
> Best we can do, I think, and hope we stumble across an easily
> reproducible trigger...
>
Unfortunately I haven't seen any instances of this in the workloads I
ran to generate the most recent datasets. I have a couple more
experiments to try though so I'll keep an eye out.
Brian
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-05-22 18:05 [PATCH v2 00/11] xfs: rework extent allocation Brian Foster
` (11 preceding siblings ...)
2019-05-23 1:56 ` [PATCH v2 00/11] xfs: rework extent allocation Dave Chinner
@ 2019-06-21 15:18 ` Darrick J. Wong
2019-07-01 19:12 ` Brian Foster
12 siblings, 1 reply; 29+ messages in thread
From: Darrick J. Wong @ 2019-06-21 15:18 UTC (permalink / raw)
To: Brian Foster; +Cc: linux-xfs
On Wed, May 22, 2019 at 02:05:35PM -0400, Brian Foster wrote:
> Hi all,
>
> This is v2 of the extent allocation rework series. The changes in this
> version are mostly associated with code factoring, based on feedback to
> v1. The small mode helper refactoring has been isolated and pulled to
> the start of the series. The active flag that necessitated the btree
> cursor container structure has been pushed into the xfs_btree_cur
> private area. The resulting high level allocation code in
> xfs_ag_alloc_vextent() has been cleaned up to remove an unnecessary
> level of abstraction. Finally, there are various minor cleanups and
> fixes.
Hmmm. Just for fun I decided to apply this series to see what would
happen, and on a 1k block filesystem shook out a test failure that seems
like it could be related?
MKFS_OPTIONS='-f -m reflink=1,rmapbt=1 -i sparse=1, -b size=1024, /dev/sdf'
MOUNT_OPTIONS='-o usrquota,grpquota,prjquota /dev/sdf /opt'
--D
--- generic/223.out
+++ generic/223.out.bad
@@ -48,7 +48,7 @@
=== Testing size 1g falloc on 8k stripe ===
SCRATCH_MNT/file-1g-falloc: well-aligned
=== Testing size 1073745920 falloc on 8k stripe ===
-SCRATCH_MNT/file-1073745920-falloc: well-aligned
+SCRATCH_MNT/file-1073745920-falloc: Start block 197673 not multiple of sunit 2
=== mkfs with su 4 blocks x 4 ===
=== Testing size 1*16k on 16k stripe ===
SCRATCH_MNT/file-1-16384-falloc: well-aligned
@@ -98,7 +98,7 @@
=== Testing size 1g falloc on 16k stripe ===
SCRATCH_MNT/file-1g-falloc: well-aligned
=== Testing size 1073745920 falloc on 16k stripe ===
-SCRATCH_MNT/file-1073745920-falloc: well-aligned
+SCRATCH_MNT/file-1073745920-falloc: Start block 197673 not multiple of sunit 4
=== mkfs with su 8 blocks x 4 ===
=== Testing size 1*32k on 32k stripe ===
SCRATCH_MNT/file-1-32768-falloc: well-aligned
@@ -148,7 +148,7 @@
=== Testing size 1g falloc on 32k stripe ===
SCRATCH_MNT/file-1g-falloc: well-aligned
=== Testing size 1073745920 falloc on 32k stripe ===
-SCRATCH_MNT/file-1073745920-falloc: well-aligned
+SCRATCH_MNT/file-1073745920-falloc: Start block 197673 not multiple of sunit 8
=== mkfs with su 16 blocks x 4 ===
=== Testing size 1*64k on 64k stripe ===
SCRATCH_MNT/file-1-65536-falloc: well-aligned
@@ -198,7 +198,7 @@
=== Testing size 1g falloc on 64k stripe ===
SCRATCH_MNT/file-1g-falloc: well-aligned
=== Testing size 1073745920 falloc on 64k stripe ===
-SCRATCH_MNT/file-1073745920-falloc: well-aligned
+SCRATCH_MNT/file-1073745920-falloc: Start block 197665 not multiple of sunit 16
=== mkfs with su 32 blocks x 4 ===
=== Testing size 1*128k on 128k stripe ===
SCRATCH_MNT/file-1-131072-falloc: well-aligned
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v2 00/11] xfs: rework extent allocation
2019-06-21 15:18 ` Darrick J. Wong
@ 2019-07-01 19:12 ` Brian Foster
0 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2019-07-01 19:12 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Fri, Jun 21, 2019 at 08:18:35AM -0700, Darrick J. Wong wrote:
> On Wed, May 22, 2019 at 02:05:35PM -0400, Brian Foster wrote:
> > Hi all,
> >
> > This is v2 of the extent allocation rework series. The changes in this
> > version are mostly associated with code factoring, based on feedback to
> > v1. The small mode helper refactoring has been isolated and pulled to
> > the start of the series. The active flag that necessitated the btree
> > cursor container structure has been pushed into the xfs_btree_cur
> > private area. The resulting high level allocation code in
> > xfs_ag_alloc_vextent() has been cleaned up to remove an unnecessary
> > level of abstraction. Finally, there are various minor cleanups and
> > fixes.
>
> Hmmm. Just for fun I decided to apply this series to see what would
> happen, and on a 1k block filesystem shook out a test failure that seems
> like it could be related?
>
I had reproduced this earlier on and eventually determined it was kind
of circumstantial with respect to this series. I had eliminated some of
the operations from generic/223 for quicker reproduction/RCA and
ultimately reproduced the same behavior on upstream (at the time, which
was probably a month or two ago by now) with a slightly different
workload. That led to the following series:
https://marc.info/?l=linux-xfs&m=155671950608062&w=2
... which IIRC addressed the problem in both scenarios. Thoughts on
those patches?
Brian
> MKFS_OPTIONS='-f -m reflink=1,rmapbt=1 -i sparse=1, -b size=1024, /dev/sdf'
> MOUNT_OPTIONS='-o usrquota,grpquota,prjquota /dev/sdf /opt'
>
> --D
>
> --- generic/223.out
> +++ generic/223.out.bad
> @@ -48,7 +48,7 @@
> === Testing size 1g falloc on 8k stripe ===
> SCRATCH_MNT/file-1g-falloc: well-aligned
> === Testing size 1073745920 falloc on 8k stripe ===
> -SCRATCH_MNT/file-1073745920-falloc: well-aligned
> +SCRATCH_MNT/file-1073745920-falloc: Start block 197673 not multiple of sunit 2
> === mkfs with su 4 blocks x 4 ===
> === Testing size 1*16k on 16k stripe ===
> SCRATCH_MNT/file-1-16384-falloc: well-aligned
> @@ -98,7 +98,7 @@
> === Testing size 1g falloc on 16k stripe ===
> SCRATCH_MNT/file-1g-falloc: well-aligned
> === Testing size 1073745920 falloc on 16k stripe ===
> -SCRATCH_MNT/file-1073745920-falloc: well-aligned
> +SCRATCH_MNT/file-1073745920-falloc: Start block 197673 not multiple of sunit 4
> === mkfs with su 8 blocks x 4 ===
> === Testing size 1*32k on 32k stripe ===
> SCRATCH_MNT/file-1-32768-falloc: well-aligned
> @@ -148,7 +148,7 @@
> === Testing size 1g falloc on 32k stripe ===
> SCRATCH_MNT/file-1g-falloc: well-aligned
> === Testing size 1073745920 falloc on 32k stripe ===
> -SCRATCH_MNT/file-1073745920-falloc: well-aligned
> +SCRATCH_MNT/file-1073745920-falloc: Start block 197673 not multiple of sunit 8
> === mkfs with su 16 blocks x 4 ===
> === Testing size 1*64k on 64k stripe ===
> SCRATCH_MNT/file-1-65536-falloc: well-aligned
> @@ -198,7 +198,7 @@
> === Testing size 1g falloc on 64k stripe ===
> SCRATCH_MNT/file-1g-falloc: well-aligned
> === Testing size 1073745920 falloc on 64k stripe ===
> -SCRATCH_MNT/file-1073745920-falloc: well-aligned
> +SCRATCH_MNT/file-1073745920-falloc: Start block 197665 not multiple of sunit 16
> === mkfs with su 32 blocks x 4 ===
> === Testing size 1*128k on 128k stripe ===
> SCRATCH_MNT/file-1-131072-falloc: well-aligned
^ permalink raw reply [flat|nested] 29+ messages in thread