* [PATCH 01/22] xfs: add helpers to deal with transaction allocation and rolling
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
@ 2018-05-15 22:33 ` Darrick J. Wong
2018-05-16 6:51 ` Dave Chinner
` (2 more replies)
2018-05-15 22:33 ` [PATCH 02/22] xfs: add helpers to allocate and initialize fresh btree roots Darrick J. Wong
` (21 subsequent siblings)
22 siblings, 3 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:33 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
For repairs, we need to reserve at least as many blocks as we think
we're going to need to rebuild the data structure, and we're going to
need some helpers to roll transactions while maintaining locks on the AG
headers so that other threads cannot wander into the middle of a repair.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/bmap.c | 2 -
fs/xfs/scrub/common.c | 21 ++++++-
fs/xfs/scrub/common.h | 2 -
fs/xfs/scrub/inode.c | 4 +
fs/xfs/scrub/repair.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 12 ++++
6 files changed, 186 insertions(+), 7 deletions(-)
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 42a115e83739..eeadb33a701c 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -74,7 +74,7 @@ xfs_scrub_setup_inode_bmap(
}
/* Got the inode, lock it and we're ready to go. */
- error = xfs_scrub_trans_alloc(sc);
+ error = xfs_scrub_trans_alloc(sc, 0);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 518bff2be0c9..f1826b4b7572 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -51,6 +51,7 @@
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/btree.h"
+#include "scrub/repair.h"
/* Common code for the metadata scrubbers. */
@@ -573,11 +574,22 @@ xfs_scrub_ag_init(
/*
* Grab an empty transaction so that we can re-grab locked buffers if
* one of our btrees turns out to be cyclic.
+ *
+ * If we're going to repair something, we need to ask for the largest possible
+ * log reservation so that we can handle the worst case scenario for metadata
+ * updates while rebuilding a metadata item. We also need to reserve as many
+ * blocks in the head transaction as we think we're going to need to rebuild
+ * the metadata object.
*/
int
xfs_scrub_trans_alloc(
- struct xfs_scrub_context *sc)
+ struct xfs_scrub_context *sc,
+ uint resblks)
{
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
+ resblks, 0, 0, &sc->tp);
+
return xfs_trans_alloc_empty(sc->mp, &sc->tp);
}
@@ -587,7 +599,10 @@ xfs_scrub_setup_fs(
struct xfs_scrub_context *sc,
struct xfs_inode *ip)
{
- return xfs_scrub_trans_alloc(sc);
+ uint resblks;
+
+ resblks = xfs_repair_calc_ag_resblks(sc);
+ return xfs_scrub_trans_alloc(sc, resblks);
}
/* Set us up with AG headers and btree cursors. */
@@ -717,7 +732,7 @@ xfs_scrub_setup_inode_contents(
/* Got the inode, lock it and we're ready to go. */
sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
- error = xfs_scrub_trans_alloc(sc);
+ error = xfs_scrub_trans_alloc(sc, resblks);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index a660087b606e..6012049a8617 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -38,7 +38,7 @@ xfs_scrub_should_terminate(
return false;
}
-int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc);
+int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc, uint resblks);
bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int *error);
bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork,
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 550c0cf70a92..0c696f7018de 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -67,7 +67,7 @@ xfs_scrub_setup_inode(
break;
case -EFSCORRUPTED:
case -EFSBADCRC:
- return xfs_scrub_trans_alloc(sc);
+ return xfs_scrub_trans_alloc(sc, 0);
default:
return error;
}
@@ -75,7 +75,7 @@ xfs_scrub_setup_inode(
/* Got the inode, lock it and we're ready to go. */
sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
- error = xfs_scrub_trans_alloc(sc);
+ error = xfs_scrub_trans_alloc(sc, 0);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index be30825c47c6..486e6e319b1f 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -128,3 +128,155 @@ xfs_repair_probe(
return 0;
}
+
+/*
+ * Roll a transaction, keeping the AG headers locked and reinitializing
+ * the btree cursors.
+ */
+int
+xfs_repair_roll_ag_trans(
+ struct xfs_scrub_context *sc)
+{
+ int error;
+
+ /* Keep the AG header buffers locked so we can keep going. */
+ xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+ xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+ xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+
+ /* Roll the transaction. */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ goto out_release;
+
+ /* Join AG headers to the new transaction. */
+ xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
+ xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+ xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
+
+ return 0;
+
+out_release:
+ /*
+ * Rolling failed, so release the hold on the buffers. The
+ * buffers will be released during teardown on our way out
+ * of the kernel.
+ */
+ xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+ xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+ xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
+
+ return error;
+}
+
+/*
+ * Does the given AG have enough space to rebuild a btree? Neither AG
+ * reservation can be critical, and we must have enough space (factoring
+ * in AG reservations) to construct a whole btree.
+ */
+bool
+xfs_repair_ag_has_space(
+ struct xfs_perag *pag,
+ xfs_extlen_t nr_blocks,
+ enum xfs_ag_resv_type type)
+{
+ return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
+ !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
+ pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
+}
+
+/*
+ * Figure out how many blocks to reserve for an AG repair. We calculate the
+ * worst case estimate for the number of blocks we'd need to rebuild one of
+ * any type of per-AG btree.
+ */
+xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_scrub_metadata *sm = sc->sm;
+ struct xfs_perag *pag;
+ struct xfs_buf *bp;
+ xfs_agino_t icount = 0;
+ xfs_extlen_t aglen = 0;
+ xfs_extlen_t usedlen;
+ xfs_extlen_t freelen;
+ xfs_extlen_t bnobt_sz;
+ xfs_extlen_t inobt_sz;
+ xfs_extlen_t rmapbt_sz;
+ xfs_extlen_t refcbt_sz;
+ int error;
+
+ if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ return 0;
+
+ /* Use in-core counters if possible. */
+ pag = xfs_perag_get(mp, sm->sm_agno);
+ if (pag->pagi_init)
+ icount = pag->pagi_count;
+ xfs_perag_put(pag);
+
+ /*
+ * Otherwise try to get the actual counters from disk; if not, make
+ * some worst case assumptions.
+ */
+ if (icount == 0) {
+ error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
+ if (error) {
+ icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
+ } else {
+ pag = xfs_perag_get(mp, sm->sm_agno);
+ icount = pag->pagi_count;
+ xfs_perag_put(pag);
+ xfs_buf_relse(bp);
+ }
+ }
+
+ /* Now grab the block counters from the AGF. */
+ error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
+ if (error) {
+ aglen = mp->m_sb.sb_agblocks;
+ freelen = aglen;
+ usedlen = aglen;
+ } else {
+ pag = xfs_perag_get(mp, sm->sm_agno);
+ aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
+ freelen = pag->pagf_freeblks;
+ usedlen = aglen - freelen;
+ xfs_perag_put(pag);
+ xfs_buf_relse(bp);
+ }
+
+ trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
+ freelen, usedlen);
+
+ /*
+ * Figure out how many blocks we'd need worst case to rebuild
+ * each type of btree. Note that we can only rebuild the
+ * bnobt/cntbt or inobt/finobt as pairs.
+ */
+ bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
+ if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+ inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+ XFS_INODES_PER_HOLEMASK_BIT);
+ else
+ inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+ XFS_INODES_PER_CHUNK);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ inobt_sz *= 2;
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ rmapbt_sz = xfs_rmapbt_calc_size(mp, aglen);
+ refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
+ } else {
+ rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
+ refcbt_sz = 0;
+ }
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ rmapbt_sz = 0;
+
+ trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
+ inobt_sz, rmapbt_sz, refcbt_sz);
+
+ return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 83170dd3388c..8d181dce6171 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -32,6 +32,10 @@ static inline int xfs_repair_notsupported(struct xfs_scrub_context *sc)
int xfs_repair_attempt(struct xfs_inode *ip, struct xfs_scrub_context *sc,
bool *fixed);
void xfs_repair_failure(struct xfs_mount *mp);
+int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc);
+bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
+ enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc);
/* Metadata repairers */
@@ -49,6 +53,14 @@ static inline int xfs_repair_attempt(
static inline void xfs_repair_failure(struct xfs_mount *mp) {}
+static inline xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+ struct xfs_scrub_context *sc)
+{
+ ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR));
+ return 0;
+}
+
#define xfs_repair_probe xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH 01/22] xfs: add helpers to deal with transaction allocation and rolling
2018-05-15 22:33 ` [PATCH 01/22] xfs: add helpers to deal with transaction allocation and rolling Darrick J. Wong
@ 2018-05-16 6:51 ` Dave Chinner
2018-05-16 16:46 ` Darrick J. Wong
2018-05-16 16:48 ` Allison Henderson
2018-05-18 3:49 ` [PATCH v2 " Darrick J. Wong
2 siblings, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-16 6:51 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Tue, May 15, 2018 at 03:33:45PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> For repairs, we need to reserve at least as many blocks as we think
> we're going to need to rebuild the data structure, and we're going to
> need some helpers to roll transactions while maintaining locks on the AG
> headers so that other threads cannot wander into the middle of a repair.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/xfs/scrub/bmap.c | 2 -
> fs/xfs/scrub/common.c | 21 ++++++-
> fs/xfs/scrub/common.h | 2 -
> fs/xfs/scrub/inode.c | 4 +
> fs/xfs/scrub/repair.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 12 ++++
> 6 files changed, 186 insertions(+), 7 deletions(-)
mostly looks good.
[...]
> +/*
> + * Figure out how many blocks to reserve for an AG repair. We calculate the
> + * worst case estimate for the number of blocks we'd need to rebuild one of
> + * any type of per-AG btree.
> + */
> +xfs_extlen_t
> +xfs_repair_calc_ag_resblks(
> + struct xfs_scrub_context *sc)
> +{
> + struct xfs_mount *mp = sc->mp;
> + struct xfs_scrub_metadata *sm = sc->sm;
> + struct xfs_perag *pag;
> + struct xfs_buf *bp;
> + xfs_agino_t icount = 0;
> + xfs_extlen_t aglen = 0;
> + xfs_extlen_t usedlen;
> + xfs_extlen_t freelen;
> + xfs_extlen_t bnobt_sz;
> + xfs_extlen_t inobt_sz;
> + xfs_extlen_t rmapbt_sz;
> + xfs_extlen_t refcbt_sz;
> + int error;
> +
> + if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
> + return 0;
> +
> + /* Use in-core counters if possible. */
> + pag = xfs_perag_get(mp, sm->sm_agno);
> + if (pag->pagi_init)
> + icount = pag->pagi_count;
> + xfs_perag_put(pag);
Don't drop the pag here, do it ....
> +
> + /*
> + * Otherwise try to get the actual counters from disk; if not, make
> + * some worst case assumptions.
> + */
> + if (icount == 0) {
> + error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
> + if (error) {
> + icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
> + } else {
> + pag = xfs_perag_get(mp, sm->sm_agno);
> + icount = pag->pagi_count;
> + xfs_perag_put(pag);
> + xfs_buf_relse(bp);
> + }
> + }
> +
> + /* Now grab the block counters from the AGF. */
> + error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
> + if (error) {
> + aglen = mp->m_sb.sb_agblocks;
> + freelen = aglen;
> + usedlen = aglen;
> + } else {
> + pag = xfs_perag_get(mp, sm->sm_agno);
> + aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
> + freelen = pag->pagf_freeblks;
> + usedlen = aglen - freelen;
> + xfs_perag_put(pag);
> + xfs_buf_relse(bp);
> + }
.... here, so we don't have to do repeated lookups.
> + trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
> + freelen, usedlen);
> +
> + /*
> + * Figure out how many blocks we'd need worst case to rebuild
> + * each type of btree. Note that we can only rebuild the
> + * bnobt/cntbt or inobt/finobt as pairs.
> + */
> + bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
> + if (xfs_sb_version_hassparseinodes(&mp->m_sb))
> + inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> + XFS_INODES_PER_HOLEMASK_BIT);
> + else
> + inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> + XFS_INODES_PER_CHUNK);
> + if (xfs_sb_version_hasfinobt(&mp->m_sb))
> + inobt_sz *= 2;
> + if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> + rmapbt_sz = xfs_rmapbt_calc_size(mp, aglen);
> + refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
> + } else {
> + rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
> + refcbt_sz = 0;
> + }
> + if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> + rmapbt_sz = 0;
This looks kinda whacky. reflink and rmapbt are different features,
maybe a comment to explain why the rmapbt size calc is done all
back to front?
Otherwise looks ok.
Reviewed-by: Dave Chinner <dchinner@redhat.com>
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 01/22] xfs: add helpers to deal with transaction allocation and rolling
2018-05-16 6:51 ` Dave Chinner
@ 2018-05-16 16:46 ` Darrick J. Wong
2018-05-16 21:19 ` Dave Chinner
0 siblings, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-16 16:46 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Wed, May 16, 2018 at 04:51:03PM +1000, Dave Chinner wrote:
> On Tue, May 15, 2018 at 03:33:45PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> >
> > For repairs, we need to reserve at least as many blocks as we think
> > we're going to need to rebuild the data structure, and we're going to
> > need some helpers to roll transactions while maintaining locks on the AG
> > headers so that other threads cannot wander into the middle of a repair.
> >
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> > fs/xfs/scrub/bmap.c | 2 -
> > fs/xfs/scrub/common.c | 21 ++++++-
> > fs/xfs/scrub/common.h | 2 -
> > fs/xfs/scrub/inode.c | 4 +
> > fs/xfs/scrub/repair.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++
> > fs/xfs/scrub/repair.h | 12 ++++
> > 6 files changed, 186 insertions(+), 7 deletions(-)
>
> mostly looks good.
>
> [...]
> > +/*
> > + * Figure out how many blocks to reserve for an AG repair. We calculate the
> > + * worst case estimate for the number of blocks we'd need to rebuild one of
> > + * any type of per-AG btree.
> > + */
> > +xfs_extlen_t
> > +xfs_repair_calc_ag_resblks(
> > + struct xfs_scrub_context *sc)
> > +{
> > + struct xfs_mount *mp = sc->mp;
> > + struct xfs_scrub_metadata *sm = sc->sm;
> > + struct xfs_perag *pag;
> > + struct xfs_buf *bp;
> > + xfs_agino_t icount = 0;
> > + xfs_extlen_t aglen = 0;
> > + xfs_extlen_t usedlen;
> > + xfs_extlen_t freelen;
> > + xfs_extlen_t bnobt_sz;
> > + xfs_extlen_t inobt_sz;
> > + xfs_extlen_t rmapbt_sz;
> > + xfs_extlen_t refcbt_sz;
> > + int error;
> > +
> > + if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
> > + return 0;
> > +
> > + /* Use in-core counters if possible. */
> > + pag = xfs_perag_get(mp, sm->sm_agno);
> > + if (pag->pagi_init)
> > + icount = pag->pagi_count;
> > + xfs_perag_put(pag);
>
> Don't drop the pag here, do it ....
> > +
> > + /*
> > + * Otherwise try to get the actual counters from disk; if not, make
> > + * some worst case assumptions.
> > + */
> > + if (icount == 0) {
> > + error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
> > + if (error) {
> > + icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
> > + } else {
> > + pag = xfs_perag_get(mp, sm->sm_agno);
> > + icount = pag->pagi_count;
> > + xfs_perag_put(pag);
> > + xfs_buf_relse(bp);
> > + }
> > + }
> > +
> > + /* Now grab the block counters from the AGF. */
> > + error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
> > + if (error) {
> > + aglen = mp->m_sb.sb_agblocks;
> > + freelen = aglen;
> > + usedlen = aglen;
> > + } else {
> > + pag = xfs_perag_get(mp, sm->sm_agno);
> > + aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
> > + freelen = pag->pagf_freeblks;
> > + usedlen = aglen - freelen;
> > + xfs_perag_put(pag);
> > + xfs_buf_relse(bp);
> > + }
>
> .... here, so we don't have to do repeated lookups.
Ok, fixed.
> > + trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
> > + freelen, usedlen);
> > +
> > + /*
> > + * Figure out how many blocks we'd need worst case to rebuild
> > + * each type of btree. Note that we can only rebuild the
> > + * bnobt/cntbt or inobt/finobt as pairs.
> > + */
> > + bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
> > + if (xfs_sb_version_hassparseinodes(&mp->m_sb))
> > + inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> > + XFS_INODES_PER_HOLEMASK_BIT);
> > + else
> > + inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> > + XFS_INODES_PER_CHUNK);
> > + if (xfs_sb_version_hasfinobt(&mp->m_sb))
> > + inobt_sz *= 2;
> > + if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> > + rmapbt_sz = xfs_rmapbt_calc_size(mp, aglen);
> > + refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
> > + } else {
> > + rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
> > + refcbt_sz = 0;
> > + }
> > + if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> > + rmapbt_sz = 0;
>
> This looks kinda whacky. reflink and rmapbt are different features,
> maybe a comment to explain why the rmapbt size calc is done all
> back to front?
Yeah. Replace that whole section with:
if (xfs_sb_version_hasreflink(&mp->m_sb))
refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
else
refcbt_sz = 0;
if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
/*
* Guess how many blocks we need to rebuild the rmapbt.
* For non-reflink filesystems we can't have more
* records than used blocks. However, with reflink it's
* possible to have more than one rmap record per AG
* block. We don't know how many rmaps there could be
* in the AG, so we start off with what we hope is a
* generous over-estimation.
*/
if (xfs_sb_version_hasreflink(&mp->m_sb))
rmapbt_sz = xfs_rmapbt_calc_size(mp,
(unsigned long long)aglen * 2);
else
rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
} else {
rmapbt_sz = 0;
}
--D
>
> Otherwise looks ok.
>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 01/22] xfs: add helpers to deal with transaction allocation and rolling
2018-05-16 16:46 ` Darrick J. Wong
@ 2018-05-16 21:19 ` Dave Chinner
0 siblings, 0 replies; 76+ messages in thread
From: Dave Chinner @ 2018-05-16 21:19 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Wed, May 16, 2018 at 09:46:45AM -0700, Darrick J. Wong wrote:
> On Wed, May 16, 2018 at 04:51:03PM +1000, Dave Chinner wrote:
> > On Tue, May 15, 2018 at 03:33:45PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > >
> > > For repairs, we need to reserve at least as many blocks as we think
> > > we're going to need to rebuild the data structure, and we're going to
> > > need some helpers to roll transactions while maintaining locks on the AG
> > > headers so that other threads cannot wander into the middle of a repair.
> > >
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > > fs/xfs/scrub/bmap.c | 2 -
> > > fs/xfs/scrub/common.c | 21 ++++++-
> > > fs/xfs/scrub/common.h | 2 -
> > > fs/xfs/scrub/inode.c | 4 +
> > > fs/xfs/scrub/repair.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > fs/xfs/scrub/repair.h | 12 ++++
> > > 6 files changed, 186 insertions(+), 7 deletions(-)
> >
> > mostly looks good.
> >
> > [...]
> > > + freelen, usedlen);
> > > +
> > > + /*
> > > + * Figure out how many blocks we'd need worst case to rebuild
> > > + * each type of btree. Note that we can only rebuild the
> > > + * bnobt/cntbt or inobt/finobt as pairs.
> > > + */
> > > + bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
> > > + if (xfs_sb_version_hassparseinodes(&mp->m_sb))
> > > + inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> > > + XFS_INODES_PER_HOLEMASK_BIT);
> > > + else
> > > + inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> > > + XFS_INODES_PER_CHUNK);
> > > + if (xfs_sb_version_hasfinobt(&mp->m_sb))
> > > + inobt_sz *= 2;
> > > + if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> > > + rmapbt_sz = xfs_rmapbt_calc_size(mp, aglen);
> > > + refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
> > > + } else {
> > > + rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
> > > + refcbt_sz = 0;
> > > + }
> > > + if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> > > + rmapbt_sz = 0;
> >
> > This looks kinda whacky. reflink and rmapbt are different features,
> > maybe a comment to explain why the rmapbt size calc is done all
> > back to front?
>
> Yeah. Replace that whole section with:
>
> if (xfs_sb_version_hasreflink(&mp->m_sb))
> refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
> else
> refcbt_sz = 0;
> if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
> /*
> * Guess how many blocks we need to rebuild the rmapbt.
> * For non-reflink filesystems we can't have more
> * records than used blocks. However, with reflink it's
> * possible to have more than one rmap record per AG
> * block. We don't know how many rmaps there could be
> * in the AG, so we start off with what we hope is a
> * generous over-estimation.
> */
> if (xfs_sb_version_hasreflink(&mp->m_sb))
> rmapbt_sz = xfs_rmapbt_calc_size(mp,
> (unsigned long long)aglen * 2);
> else
> rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
> } else {
> rmapbt_sz = 0;
> }
Yup, that's much better :)
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 01/22] xfs: add helpers to deal with transaction allocation and rolling
2018-05-15 22:33 ` [PATCH 01/22] xfs: add helpers to deal with transaction allocation and rolling Darrick J. Wong
2018-05-16 6:51 ` Dave Chinner
@ 2018-05-16 16:48 ` Allison Henderson
2018-05-18 3:49 ` [PATCH v2 " Darrick J. Wong
2 siblings, 0 replies; 76+ messages in thread
From: Allison Henderson @ 2018-05-16 16:48 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs, david
Looks good to me, you can add:
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
On 05/15/2018 03:33 PM, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> For repairs, we need to reserve at least as many blocks as we think
> we're going to need to rebuild the data structure, and we're going to
> need some helpers to roll transactions while maintaining locks on the AG
> headers so that other threads cannot wander into the middle of a repair.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/xfs/scrub/bmap.c | 2 -
> fs/xfs/scrub/common.c | 21 ++++++-
> fs/xfs/scrub/common.h | 2 -
> fs/xfs/scrub/inode.c | 4 +
> fs/xfs/scrub/repair.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 12 ++++
> 6 files changed, 186 insertions(+), 7 deletions(-)
>
>
> diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
> index 42a115e83739..eeadb33a701c 100644
> --- a/fs/xfs/scrub/bmap.c
> +++ b/fs/xfs/scrub/bmap.c
> @@ -74,7 +74,7 @@ xfs_scrub_setup_inode_bmap(
> }
>
> /* Got the inode, lock it and we're ready to go. */
> - error = xfs_scrub_trans_alloc(sc);
> + error = xfs_scrub_trans_alloc(sc, 0);
> if (error)
> goto out;
> sc->ilock_flags |= XFS_ILOCK_EXCL;
> diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
> index 518bff2be0c9..f1826b4b7572 100644
> --- a/fs/xfs/scrub/common.c
> +++ b/fs/xfs/scrub/common.c
> @@ -51,6 +51,7 @@
> #include "scrub/common.h"
> #include "scrub/trace.h"
> #include "scrub/btree.h"
> +#include "scrub/repair.h"
>
> /* Common code for the metadata scrubbers. */
>
> @@ -573,11 +574,22 @@ xfs_scrub_ag_init(
> /*
> * Grab an empty transaction so that we can re-grab locked buffers if
> * one of our btrees turns out to be cyclic.
> + *
> + * If we're going to repair something, we need to ask for the largest possible
> + * log reservation so that we can handle the worst case scenario for metadata
> + * updates while rebuilding a metadata item. We also need to reserve as many
> + * blocks in the head transaction as we think we're going to need to rebuild
> + * the metadata object.
> */
> int
> xfs_scrub_trans_alloc(
> - struct xfs_scrub_context *sc)
> + struct xfs_scrub_context *sc,
> + uint resblks)
> {
> + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
> + return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
> + resblks, 0, 0, &sc->tp);
> +
> return xfs_trans_alloc_empty(sc->mp, &sc->tp);
> }
>
> @@ -587,7 +599,10 @@ xfs_scrub_setup_fs(
> struct xfs_scrub_context *sc,
> struct xfs_inode *ip)
> {
> - return xfs_scrub_trans_alloc(sc);
> + uint resblks;
> +
> + resblks = xfs_repair_calc_ag_resblks(sc);
> + return xfs_scrub_trans_alloc(sc, resblks);
> }
>
> /* Set us up with AG headers and btree cursors. */
> @@ -717,7 +732,7 @@ xfs_scrub_setup_inode_contents(
> /* Got the inode, lock it and we're ready to go. */
> sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> xfs_ilock(sc->ip, sc->ilock_flags);
> - error = xfs_scrub_trans_alloc(sc);
> + error = xfs_scrub_trans_alloc(sc, resblks);
> if (error)
> goto out;
> sc->ilock_flags |= XFS_ILOCK_EXCL;
> diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
> index a660087b606e..6012049a8617 100644
> --- a/fs/xfs/scrub/common.h
> +++ b/fs/xfs/scrub/common.h
> @@ -38,7 +38,7 @@ xfs_scrub_should_terminate(
> return false;
> }
>
> -int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc);
> +int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc, uint resblks);
> bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
> xfs_agblock_t bno, int *error);
> bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork,
> diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
> index 550c0cf70a92..0c696f7018de 100644
> --- a/fs/xfs/scrub/inode.c
> +++ b/fs/xfs/scrub/inode.c
> @@ -67,7 +67,7 @@ xfs_scrub_setup_inode(
> break;
> case -EFSCORRUPTED:
> case -EFSBADCRC:
> - return xfs_scrub_trans_alloc(sc);
> + return xfs_scrub_trans_alloc(sc, 0);
> default:
> return error;
> }
> @@ -75,7 +75,7 @@ xfs_scrub_setup_inode(
> /* Got the inode, lock it and we're ready to go. */
> sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> xfs_ilock(sc->ip, sc->ilock_flags);
> - error = xfs_scrub_trans_alloc(sc);
> + error = xfs_scrub_trans_alloc(sc, 0);
> if (error)
> goto out;
> sc->ilock_flags |= XFS_ILOCK_EXCL;
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index be30825c47c6..486e6e319b1f 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -128,3 +128,155 @@ xfs_repair_probe(
>
> return 0;
> }
> +
> +/*
> + * Roll a transaction, keeping the AG headers locked and reinitializing
> + * the btree cursors.
> + */
> +int
> +xfs_repair_roll_ag_trans(
> + struct xfs_scrub_context *sc)
> +{
> + int error;
> +
> + /* Keep the AG header buffers locked so we can keep going. */
> + xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> + xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> + xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> +
> + /* Roll the transaction. */
> + error = xfs_trans_roll(&sc->tp);
> + if (error)
> + goto out_release;
> +
> + /* Join AG headers to the new transaction. */
> + xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> + xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> +
> + return 0;
> +
> +out_release:
> + /*
> + * Rolling failed, so release the hold on the buffers. The
> + * buffers will be released during teardown on our way out
> + * of the kernel.
> + */
> + xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> + xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> + xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
> +
> + return error;
> +}
> +
> +/*
> + * Does the given AG have enough space to rebuild a btree? Neither AG
> + * reservation can be critical, and we must have enough space (factoring
> + * in AG reservations) to construct a whole btree.
> + */
> +bool
> +xfs_repair_ag_has_space(
> + struct xfs_perag *pag,
> + xfs_extlen_t nr_blocks,
> + enum xfs_ag_resv_type type)
> +{
> + return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
> + !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
> + pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
> +}
> +
> +/*
> + * Figure out how many blocks to reserve for an AG repair. We calculate the
> + * worst case estimate for the number of blocks we'd need to rebuild one of
> + * any type of per-AG btree.
> + */
> +xfs_extlen_t
> +xfs_repair_calc_ag_resblks(
> + struct xfs_scrub_context *sc)
> +{
> + struct xfs_mount *mp = sc->mp;
> + struct xfs_scrub_metadata *sm = sc->sm;
> + struct xfs_perag *pag;
> + struct xfs_buf *bp;
> + xfs_agino_t icount = 0;
> + xfs_extlen_t aglen = 0;
> + xfs_extlen_t usedlen;
> + xfs_extlen_t freelen;
> + xfs_extlen_t bnobt_sz;
> + xfs_extlen_t inobt_sz;
> + xfs_extlen_t rmapbt_sz;
> + xfs_extlen_t refcbt_sz;
> + int error;
> +
> + if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
> + return 0;
> +
> + /* Use in-core counters if possible. */
> + pag = xfs_perag_get(mp, sm->sm_agno);
> + if (pag->pagi_init)
> + icount = pag->pagi_count;
> + xfs_perag_put(pag);
> +
> + /*
> + * Otherwise try to get the actual counters from disk; if not, make
> + * some worst case assumptions.
> + */
> + if (icount == 0) {
> + error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
> + if (error) {
> + icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
> + } else {
> + pag = xfs_perag_get(mp, sm->sm_agno);
> + icount = pag->pagi_count;
> + xfs_perag_put(pag);
> + xfs_buf_relse(bp);
> + }
> + }
> +
> + /* Now grab the block counters from the AGF. */
> + error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
> + if (error) {
> + aglen = mp->m_sb.sb_agblocks;
> + freelen = aglen;
> + usedlen = aglen;
> + } else {
> + pag = xfs_perag_get(mp, sm->sm_agno);
> + aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
> + freelen = pag->pagf_freeblks;
> + usedlen = aglen - freelen;
> + xfs_perag_put(pag);
> + xfs_buf_relse(bp);
> + }
> +
> + trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
> + freelen, usedlen);
> +
> + /*
> + * Figure out how many blocks we'd need worst case to rebuild
> + * each type of btree. Note that we can only rebuild the
> + * bnobt/cntbt or inobt/finobt as pairs.
> + */
> + bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
> + if (xfs_sb_version_hassparseinodes(&mp->m_sb))
> + inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> + XFS_INODES_PER_HOLEMASK_BIT);
> + else
> + inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> + XFS_INODES_PER_CHUNK);
> + if (xfs_sb_version_hasfinobt(&mp->m_sb))
> + inobt_sz *= 2;
> + if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> + rmapbt_sz = xfs_rmapbt_calc_size(mp, aglen);
> + refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
> + } else {
> + rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
> + refcbt_sz = 0;
> + }
> + if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> + rmapbt_sz = 0;
> +
> + trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
> + inobt_sz, rmapbt_sz, refcbt_sz);
> +
> + return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
> +}
> diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> index 83170dd3388c..8d181dce6171 100644
> --- a/fs/xfs/scrub/repair.h
> +++ b/fs/xfs/scrub/repair.h
> @@ -32,6 +32,10 @@ static inline int xfs_repair_notsupported(struct xfs_scrub_context *sc)
> int xfs_repair_attempt(struct xfs_inode *ip, struct xfs_scrub_context *sc,
> bool *fixed);
> void xfs_repair_failure(struct xfs_mount *mp);
> +int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc);
> +bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
> + enum xfs_ag_resv_type type);
> +xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc);
>
> /* Metadata repairers */
>
> @@ -49,6 +53,14 @@ static inline int xfs_repair_attempt(
>
> static inline void xfs_repair_failure(struct xfs_mount *mp) {}
>
> +static inline xfs_extlen_t
> +xfs_repair_calc_ag_resblks(
> + struct xfs_scrub_context *sc)
> +{
> + ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR));
> + return 0;
> +}
> +
> #define xfs_repair_probe xfs_repair_notsupported
>
> #endif /* CONFIG_XFS_ONLINE_REPAIR */
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v2 01/22] xfs: add helpers to deal with transaction allocation and rolling
2018-05-15 22:33 ` [PATCH 01/22] xfs: add helpers to deal with transaction allocation and rolling Darrick J. Wong
2018-05-16 6:51 ` Dave Chinner
2018-05-16 16:48 ` Allison Henderson
@ 2018-05-18 3:49 ` Darrick J. Wong
2 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-18 3:49 UTC (permalink / raw)
To: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
For repairs, we need to reserve at least as many blocks as we think
we're going to need to rebuild the data structure, and we're going to
need some helpers to roll transactions while maintaining locks on the AG
headers so that other threads cannot wander into the middle of a repair.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
---
v2: fix whacky rmap reservation code, better docs
---
fs/xfs/scrub/bmap.c | 2 -
fs/xfs/scrub/common.c | 21 ++++++
fs/xfs/scrub/common.h | 2 -
fs/xfs/scrub/inode.c | 4 +
fs/xfs/scrub/repair.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 12 ++++
6 files changed, 194 insertions(+), 7 deletions(-)
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 42a115e83739..eeadb33a701c 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -74,7 +74,7 @@ xfs_scrub_setup_inode_bmap(
}
/* Got the inode, lock it and we're ready to go. */
- error = xfs_scrub_trans_alloc(sc);
+ error = xfs_scrub_trans_alloc(sc, 0);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index d3e5adc96411..41198a5f872c 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -51,6 +51,7 @@
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/btree.h"
+#include "scrub/repair.h"
/* Common code for the metadata scrubbers. */
@@ -590,11 +591,22 @@ xfs_scrub_perag_get(
/*
* Grab an empty transaction so that we can re-grab locked buffers if
* one of our btrees turns out to be cyclic.
+ *
+ * If we're going to repair something, we need to ask for the largest possible
+ * log reservation so that we can handle the worst case scenario for metadata
+ * updates while rebuilding a metadata item. We also need to reserve as many
+ * blocks in the head transaction as we think we're going to need to rebuild
+ * the metadata object.
*/
int
xfs_scrub_trans_alloc(
- struct xfs_scrub_context *sc)
+ struct xfs_scrub_context *sc,
+ uint resblks)
{
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
+ resblks, 0, 0, &sc->tp);
+
return xfs_trans_alloc_empty(sc->mp, &sc->tp);
}
@@ -604,7 +616,10 @@ xfs_scrub_setup_fs(
struct xfs_scrub_context *sc,
struct xfs_inode *ip)
{
- return xfs_scrub_trans_alloc(sc);
+ uint resblks;
+
+ resblks = xfs_repair_calc_ag_resblks(sc);
+ return xfs_scrub_trans_alloc(sc, resblks);
}
/* Set us up with AG headers and btree cursors. */
@@ -734,7 +749,7 @@ xfs_scrub_setup_inode_contents(
/* Got the inode, lock it and we're ready to go. */
sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
- error = xfs_scrub_trans_alloc(sc);
+ error = xfs_scrub_trans_alloc(sc, resblks);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index fbb91a7144fd..76bb2d1d808c 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -38,7 +38,7 @@ xfs_scrub_should_terminate(
return false;
}
-int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc);
+int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc, uint resblks);
bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int *error);
bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork,
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 550c0cf70a92..0c696f7018de 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -67,7 +67,7 @@ xfs_scrub_setup_inode(
break;
case -EFSCORRUPTED:
case -EFSBADCRC:
- return xfs_scrub_trans_alloc(sc);
+ return xfs_scrub_trans_alloc(sc, 0);
default:
return error;
}
@@ -75,7 +75,7 @@ xfs_scrub_setup_inode(
/* Got the inode, lock it and we're ready to go. */
sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
- error = xfs_scrub_trans_alloc(sc);
+ error = xfs_scrub_trans_alloc(sc, 0);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index be30825c47c6..d86f8731a78f 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -128,3 +128,163 @@ xfs_repair_probe(
return 0;
}
+
+/*
+ * Roll a transaction, keeping the AG headers locked and reinitializing
+ * the btree cursors.
+ */
+int
+xfs_repair_roll_ag_trans(
+ struct xfs_scrub_context *sc)
+{
+ int error;
+
+ /* Keep the AG header buffers locked so we can keep going. */
+ xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+ xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+ xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+
+ /* Roll the transaction. */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ goto out_release;
+
+ /* Join AG headers to the new transaction. */
+ xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
+ xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+ xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
+
+ return 0;
+
+out_release:
+ /*
+ * Rolling failed, so release the hold on the buffers. The
+ * buffers will be released during teardown on our way out
+ * of the kernel.
+ */
+ xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+ xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+ xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
+
+ return error;
+}
+
+/*
+ * Does the given AG have enough space to rebuild a btree? Neither AG
+ * reservation can be critical, and we must have enough space (factoring
+ * in AG reservations) to construct a whole btree.
+ */
+bool
+xfs_repair_ag_has_space(
+ struct xfs_perag *pag,
+ xfs_extlen_t nr_blocks,
+ enum xfs_ag_resv_type type)
+{
+ return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
+ !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
+ pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
+}
+
+/*
+ * Figure out how many blocks to reserve for an AG repair. We calculate the
+ * worst case estimate for the number of blocks we'd need to rebuild one of
+ * any type of per-AG btree.
+ */
+xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_scrub_metadata *sm = sc->sm;
+ struct xfs_perag *pag;
+ struct xfs_buf *bp;
+ xfs_agino_t icount = 0;
+ xfs_extlen_t aglen = 0;
+ xfs_extlen_t usedlen;
+ xfs_extlen_t freelen;
+ xfs_extlen_t bnobt_sz;
+ xfs_extlen_t inobt_sz;
+ xfs_extlen_t rmapbt_sz;
+ xfs_extlen_t refcbt_sz;
+ int error;
+
+ if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ return 0;
+
+ /* Use in-core counters if possible. */
+ pag = xfs_perag_get(mp, sm->sm_agno);
+ if (pag->pagi_init)
+ icount = pag->pagi_count;
+
+ /*
+ * Otherwise try to get the actual counters from disk; if not, make
+ * some worst case assumptions.
+ */
+ if (icount == 0) {
+ error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
+ if (error) {
+ icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
+ } else {
+ icount = pag->pagi_count;
+ xfs_buf_relse(bp);
+ }
+ }
+
+ /* Now grab the block counters from the AGF. */
+ error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
+ if (error) {
+ aglen = mp->m_sb.sb_agblocks;
+ freelen = aglen;
+ usedlen = aglen;
+ } else {
+ aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
+ freelen = pag->pagf_freeblks;
+ usedlen = aglen - freelen;
+ xfs_buf_relse(bp);
+ }
+ xfs_perag_put(pag);
+
+ trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
+ freelen, usedlen);
+
+ /*
+ * Figure out how many blocks we'd need worst case to rebuild
+ * each type of btree. Note that we can only rebuild the
+ * bnobt/cntbt or inobt/finobt as pairs.
+ */
+ bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
+ if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+ inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+ XFS_INODES_PER_HOLEMASK_BIT);
+ else
+ inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+ XFS_INODES_PER_CHUNK);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ inobt_sz *= 2;
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
+ else
+ refcbt_sz = 0;
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ /*
+ * Guess how many blocks we need to rebuild the rmapbt.
+ * For non-reflink filesystems we can't have more records than
+ * used blocks. However, with reflink it's possible to have
+ * more than one rmap record per AG block. We don't know how
+ * many rmaps there could be in the AG, so we start off with
+ * what we hope is a generous over-estimation.
+ */
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ rmapbt_sz = xfs_rmapbt_calc_size(mp,
+ (unsigned long long)aglen * 2);
+ else
+ rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
+ } else {
+ rmapbt_sz = 0;
+ }
+
+ trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
+ inobt_sz, rmapbt_sz, refcbt_sz);
+
+ return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 83170dd3388c..8d181dce6171 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -32,6 +32,10 @@ static inline int xfs_repair_notsupported(struct xfs_scrub_context *sc)
int xfs_repair_attempt(struct xfs_inode *ip, struct xfs_scrub_context *sc,
bool *fixed);
void xfs_repair_failure(struct xfs_mount *mp);
+int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc);
+bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
+ enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc);
/* Metadata repairers */
@@ -49,6 +53,14 @@ static inline int xfs_repair_attempt(
static inline void xfs_repair_failure(struct xfs_mount *mp) {}
+static inline xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+ struct xfs_scrub_context *sc)
+{
+ ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR));
+ return 0;
+}
+
#define xfs_repair_probe xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 02/22] xfs: add helpers to allocate and initialize fresh btree roots
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
2018-05-15 22:33 ` [PATCH 01/22] xfs: add helpers to deal with transaction allocation and rolling Darrick J. Wong
@ 2018-05-15 22:33 ` Darrick J. Wong
2018-05-16 7:07 ` Dave Chinner
2018-05-16 17:00 ` Allison Henderson
2018-05-15 22:33 ` [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair Darrick J. Wong
` (20 subsequent siblings)
22 siblings, 2 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:33 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Add a pair of helper functions to allocate and initialize fresh btree
roots. The repair functions will use these as part of recreating
corrupted metadata.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/repair.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 6 ++++
2 files changed, 87 insertions(+)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 486e6e319b1f..72f04a717150 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -280,3 +280,84 @@ xfs_repair_calc_ag_resblks(
return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}
+
+/* Allocate a block in an AG. */
+int
+xfs_repair_alloc_ag_block(
+ struct xfs_scrub_context *sc,
+ struct xfs_owner_info *oinfo,
+ xfs_fsblock_t *fsbno,
+ enum xfs_ag_resv_type resv)
+{
+ struct xfs_alloc_arg args = {0};
+ xfs_agblock_t bno;
+ int error;
+
+ switch (resv) {
+ case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_RMAPBT:
+ error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
+ if (error)
+ return error;
+ if (bno == NULLAGBLOCK)
+ return -ENOSPC;
+ xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
+ 1, false);
+ *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
+ if (resv == XFS_AG_RESV_RMAPBT)
+ xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
+ return 0;
+ default:
+ break;
+ }
+
+ args.tp = sc->tp;
+ args.mp = sc->mp;
+ args.oinfo = *oinfo;
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
+ args.minlen = 1;
+ args.maxlen = 1;
+ args.prod = 1;
+ args.type = XFS_ALLOCTYPE_THIS_AG;
+ args.resv = resv;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+ ASSERT(args.len == 1);
+ *fsbno = args.fsbno;
+
+ return 0;
+}
+
+/* Initialize a new AG btree root block with zero entries. */
+int
+xfs_repair_init_btblock(
+ struct xfs_scrub_context *sc,
+ xfs_fsblock_t fsb,
+ struct xfs_buf **bpp,
+ xfs_btnum_t btnum,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+
+ trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
+ XFS_FSB_TO_AGBNO(mp, fsb), btnum);
+
+ ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
+ XFS_FSB_TO_BB(mp, 1), 0);
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno,
+ XFS_BTREE_CRC_BLOCKS);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(tp, bp, 0, bp->b_length);
+ bp->b_ops = ops;
+ *bpp = bp;
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 8d181dce6171..40990fa5f381 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -36,6 +36,12 @@ int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc);
bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
enum xfs_ag_resv_type type);
xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc);
+int xfs_repair_alloc_ag_block(struct xfs_scrub_context *sc,
+ struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
+ enum xfs_ag_resv_type resv);
+int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb,
+ struct xfs_buf **bpp, xfs_btnum_t btnum,
+ const struct xfs_buf_ops *ops);
/* Metadata repairers */
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH 02/22] xfs: add helpers to allocate and initialize fresh btree roots
2018-05-15 22:33 ` [PATCH 02/22] xfs: add helpers to allocate and initialize fresh btree roots Darrick J. Wong
@ 2018-05-16 7:07 ` Dave Chinner
2018-05-16 17:15 ` Darrick J. Wong
2018-05-16 17:00 ` Allison Henderson
1 sibling, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-16 7:07 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Tue, May 15, 2018 at 03:33:51PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Add a pair of helper functions to allocate and initialize fresh btree
> roots. The repair functions will use these as part of recreating
> corrupted metadata.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/xfs/scrub/repair.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 6 ++++
> 2 files changed, 87 insertions(+)
Looks good, but....
> +/* Initialize a new AG btree root block with zero entries. */
> +int
> +xfs_repair_init_btblock(
> + struct xfs_scrub_context *sc,
> + xfs_fsblock_t fsb,
> + struct xfs_buf **bpp,
> + xfs_btnum_t btnum,
> + const struct xfs_buf_ops *ops)
> +{
> + struct xfs_trans *tp = sc->tp;
> + struct xfs_mount *mp = sc->mp;
> + struct xfs_buf *bp;
> +
> + trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
> + XFS_FSB_TO_AGBNO(mp, fsb), btnum);
> +
> + ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
> + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
> + XFS_FSB_TO_BB(mp, 1), 0);
> + xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
> + xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno,
> + XFS_BTREE_CRC_BLOCKS);
This flag does nothing in xfs_btree_init_block(). Any reason for
setting it?
With that fixed, though,
Reviewed-by: Dave Chinner <dchinner@redhat.com>
> + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
> + xfs_trans_log_buf(tp, bp, 0, bp->b_length);
> + bp->b_ops = ops;
> + *bpp = bp;
> +
> + return 0;
> +}
For followup patches, I think there's some overlap with the new
libxfs/xfs_ag.c functions for initialising new btree root blocks for
growfs? e.g. Make the struct xfs_aghdr_grow_data aghdr_data[] array
a static global, and then we can do something like:
bp = xfs_ag_init_btroot(mp, tp, agno, agbno, btnum);
xfs_trans_log_buf(tp, bp, 0, bp->b_length);
and all the details of reading the btree block and setting it up go
away from this code....
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 02/22] xfs: add helpers to allocate and initialize fresh btree roots
2018-05-16 7:07 ` Dave Chinner
@ 2018-05-16 17:15 ` Darrick J. Wong
0 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-16 17:15 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Wed, May 16, 2018 at 05:07:46PM +1000, Dave Chinner wrote:
> On Tue, May 15, 2018 at 03:33:51PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> >
> > Add a pair of helper functions to allocate and initialize fresh btree
> > roots. The repair functions will use these as part of recreating
> > corrupted metadata.
> >
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> > fs/xfs/scrub/repair.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
> > fs/xfs/scrub/repair.h | 6 ++++
> > 2 files changed, 87 insertions(+)
>
> Looks good, but....
>
> > +/* Initialize a new AG btree root block with zero entries. */
> > +int
> > +xfs_repair_init_btblock(
> > + struct xfs_scrub_context *sc,
> > + xfs_fsblock_t fsb,
> > + struct xfs_buf **bpp,
> > + xfs_btnum_t btnum,
> > + const struct xfs_buf_ops *ops)
> > +{
> > + struct xfs_trans *tp = sc->tp;
> > + struct xfs_mount *mp = sc->mp;
> > + struct xfs_buf *bp;
> > +
> > + trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
> > + XFS_FSB_TO_AGBNO(mp, fsb), btnum);
> > +
> > + ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
> > + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
> > + XFS_FSB_TO_BB(mp, 1), 0);
> > + xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
> > + xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno,
> > + XFS_BTREE_CRC_BLOCKS);
>
> This flag does nothing in xfs_btree_init_block(). Any reason for
> setting it?
Nope. Will fix.
> With that fixed, though,
>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
>
> > + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
> > + xfs_trans_log_buf(tp, bp, 0, bp->b_length);
> > + bp->b_ops = ops;
> > + *bpp = bp;
> > +
> > + return 0;
> > +}
>
> For followup patches, I think there's some overlap with the new
> libxfs/xfs_ag.c functions for initialising new btree root blocks for
> growfs? e.g. Make the struct xfs_aghdr_grow_data aghdr_data[] array
> a static global, and then we can do something like:
>
> bp = xfs_ag_init_btroot(mp, tp, agno, agbno, btnum);
> xfs_trans_log_buf(tp, bp, 0, bp->b_length);
>
> and all the details of reading the btree block and setting it up go
> away from this code....
I'm not sure aghdr_data can be made static global since parts of it
depend on the struct xfs_mount, but at the very least we should try to
refactor this (in the followup series) to simplify the repair code.
There may be complications relating to the xfs_*root_init functions
initializing default records that are appropriate for a freshly created
AG that are totally wrong for regenerating a broken AG, and in any case
repair really requires totally empty root blocks since it has already
generated the list of records it's going to put into the new btree.
--D
> -Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 02/22] xfs: add helpers to allocate and initialize fresh btree roots
2018-05-15 22:33 ` [PATCH 02/22] xfs: add helpers to allocate and initialize fresh btree roots Darrick J. Wong
2018-05-16 7:07 ` Dave Chinner
@ 2018-05-16 17:00 ` Allison Henderson
1 sibling, 0 replies; 76+ messages in thread
From: Allison Henderson @ 2018-05-16 17:00 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs, david
Alrighty, you can add my review:
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Thx!
On 05/15/2018 03:33 PM, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Add a pair of helper functions to allocate and initialize fresh btree
> roots. The repair functions will use these as part of recreating
> corrupted metadata.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/xfs/scrub/repair.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 6 ++++
> 2 files changed, 87 insertions(+)
>
>
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index 486e6e319b1f..72f04a717150 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -280,3 +280,84 @@ xfs_repair_calc_ag_resblks(
>
> return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
> }
> +
> +/* Allocate a block in an AG. */
> +int
> +xfs_repair_alloc_ag_block(
> + struct xfs_scrub_context *sc,
> + struct xfs_owner_info *oinfo,
> + xfs_fsblock_t *fsbno,
> + enum xfs_ag_resv_type resv)
> +{
> + struct xfs_alloc_arg args = {0};
> + xfs_agblock_t bno;
> + int error;
> +
> + switch (resv) {
> + case XFS_AG_RESV_AGFL:
> + case XFS_AG_RESV_RMAPBT:
> + error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
> + if (error)
> + return error;
> + if (bno == NULLAGBLOCK)
> + return -ENOSPC;
> + xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
> + 1, false);
> + *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
> + if (resv == XFS_AG_RESV_RMAPBT)
> + xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
> + return 0;
> + default:
> + break;
> + }
> +
> + args.tp = sc->tp;
> + args.mp = sc->mp;
> + args.oinfo = *oinfo;
> + args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
> + args.minlen = 1;
> + args.maxlen = 1;
> + args.prod = 1;
> + args.type = XFS_ALLOCTYPE_THIS_AG;
> + args.resv = resv;
> +
> + error = xfs_alloc_vextent(&args);
> + if (error)
> + return error;
> + if (args.fsbno == NULLFSBLOCK)
> + return -ENOSPC;
> + ASSERT(args.len == 1);
> + *fsbno = args.fsbno;
> +
> + return 0;
> +}
> +
> +/* Initialize a new AG btree root block with zero entries. */
> +int
> +xfs_repair_init_btblock(
> + struct xfs_scrub_context *sc,
> + xfs_fsblock_t fsb,
> + struct xfs_buf **bpp,
> + xfs_btnum_t btnum,
> + const struct xfs_buf_ops *ops)
> +{
> + struct xfs_trans *tp = sc->tp;
> + struct xfs_mount *mp = sc->mp;
> + struct xfs_buf *bp;
> +
> + trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
> + XFS_FSB_TO_AGBNO(mp, fsb), btnum);
> +
> + ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
> + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
> + XFS_FSB_TO_BB(mp, 1), 0);
> + xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
> + xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno,
> + XFS_BTREE_CRC_BLOCKS);
> + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
> + xfs_trans_log_buf(tp, bp, 0, bp->b_length);
> + bp->b_ops = ops;
> + *bpp = bp;
> +
> + return 0;
> +}
> diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> index 8d181dce6171..40990fa5f381 100644
> --- a/fs/xfs/scrub/repair.h
> +++ b/fs/xfs/scrub/repair.h
> @@ -36,6 +36,12 @@ int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc);
> bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
> enum xfs_ag_resv_type type);
> xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc);
> +int xfs_repair_alloc_ag_block(struct xfs_scrub_context *sc,
> + struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
> + enum xfs_ag_resv_type resv);
> +int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb,
> + struct xfs_buf **bpp, xfs_btnum_t btnum,
> + const struct xfs_buf_ops *ops);
>
> /* Metadata repairers */
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at https://urldefense.proofpoint.com/v2/url?u=http-3A__vger.kernel.org_majordomo-2Dinfo.html&d=DwICaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=LHZQ8fHvy6wDKXGTWcm97burZH5sQKHRDMaY1UthQxc&m=4QHdmNFIP7trSueA8XyY8TjPFBlv0bJ8UpmheOu6zJA&s=nvq_2KNHpSbVaDCf2vQai6hS8Pkmm8mnLSLrA8erFCk&e=
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
2018-05-15 22:33 ` [PATCH 01/22] xfs: add helpers to deal with transaction allocation and rolling Darrick J. Wong
2018-05-15 22:33 ` [PATCH 02/22] xfs: add helpers to allocate and initialize fresh btree roots Darrick J. Wong
@ 2018-05-15 22:33 ` Darrick J. Wong
2018-05-16 7:56 ` Dave Chinner
2018-05-18 3:51 ` [PATCH v2 " Darrick J. Wong
2018-05-15 22:34 ` [PATCH 04/22] xfs: add helpers to dispose of old btree blocks after a repair Darrick J. Wong
` (19 subsequent siblings)
22 siblings, 2 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:33 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Add some helpers to assemble a list of fs block extents. Generally,
repair functions will iterate the rmapbt to make a list (1) of all
extents owned by the nominal owner of the metadata structure; then they
will iterate all other structures with the same rmap owner to make a
list (2) of active blocks; and finally we have a subtraction function to
subtract all the blocks in (2) from (1), with the result that (1) is now
a list of blocks that were owned by the old btree and must be disposed.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/repair.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 31 +++++++
2 files changed, 238 insertions(+)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 72f04a717150..8e8ecddd7537 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -361,3 +361,210 @@ xfs_repair_init_btblock(
return 0;
}
+
+/* Collect a dead btree extent for later disposal. */
+int
+xfs_repair_collect_btree_extent(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t len)
+{
+ struct xfs_repair_extent *rex;
+
+ trace_xfs_repair_collect_btree_extent(sc->mp,
+ XFS_FSB_TO_AGNO(sc->mp, fsbno),
+ XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
+
+ rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
+ if (!rex)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&rex->list);
+ rex->fsbno = fsbno;
+ rex->len = len;
+ list_add_tail(&rex->list, &exlist->list);
+
+ return 0;
+}
+
+/*
+ * Dispose of btree blocks from the old btree so that we can start using the
+ * new btree once the transaction commits.
+ */
+int
+xfs_repair_reap_btree_extents(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ struct xfs_repair_extent *rex;
+ struct xfs_repair_extent *n;
+ int error = 0;
+
+ for_each_xfs_repair_extent_safe(rex, n, exlist) {
+ /* TODO: free the extent */
+ list_del(&rex->list);
+ kmem_free(rex);
+ }
+
+ return error;
+}
+
+/*
+ * An error happened during the rebuild so the transaction will be cancelled.
+ * The fs will shut down, and the administrator has to unmount and run repair.
+ * Therefore, free all the memory associated with the list so we can die.
+ */
+void
+xfs_repair_cancel_btree_extents(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist)
+{
+ struct xfs_repair_extent *rex;
+ struct xfs_repair_extent *n;
+
+ for_each_xfs_repair_extent_safe(rex, n, exlist) {
+ list_del(&rex->list);
+ kmem_free(rex);
+ }
+}
+
+/* Compare two btree extents. */
+static int
+xfs_repair_btree_extent_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_repair_extent *ap;
+ struct xfs_repair_extent *bp;
+
+ ap = container_of(a, struct xfs_repair_extent, list);
+ bp = container_of(b, struct xfs_repair_extent, list);
+
+ if (ap->fsbno > bp->fsbno)
+ return 1;
+ else if (ap->fsbno < bp->fsbno)
+ return -1;
+ return 0;
+}
+
+/*
+ * Remove all the blocks mentioned in sublist from the extents in exlist.
+ *
+ * The intent is that callers will iterate the rmapbt for all of its records
+ * for a given owner to generate exlist; and iterate all the blocks of the
+ * metadata structures that are not being rebuilt and have the same rmapbt
+ * owner to generate sublist. This routine subtracts all the extents
+ * mentioned in sublist from all the extents linked in exlist, which leaves
+ * exlist as the list of blocks that are not accounted for, which we assume
+ * are the dead blocks of the old metadata structure. The blocks mentioned in
+ * exlist can be reaped.
+ */
+#define XFS_REPAIR_EXT_LEFT_CONTIG (1 << 0)
+#define XFS_REPAIR_EXT_RIGHT_CONTIG (1 << 1)
+int
+xfs_repair_subtract_extents(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_repair_extent_list *sublist)
+{
+ struct list_head *lp;
+ struct xfs_repair_extent *ex;
+ struct xfs_repair_extent *newex;
+ struct xfs_repair_extent *subex;
+ xfs_fsblock_t sub_fsb;
+ xfs_extlen_t sub_len;
+ int state;
+ int error = 0;
+
+ if (list_empty(&exlist->list) || list_empty(&sublist->list))
+ return 0;
+ ASSERT(!list_empty(&sublist->list));
+
+ list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
+ list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
+
+ subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
+ list);
+ lp = exlist->list.next;
+ while (lp != &exlist->list) {
+ ex = list_entry(lp, struct xfs_repair_extent, list);
+
+ /*
+ * Advance subex and/or ex until we find a pair that
+ * intersect or we run out of extents.
+ */
+ while (subex->fsbno + subex->len <= ex->fsbno) {
+ if (list_is_last(&subex->list, &sublist->list))
+ goto out;
+ subex = list_next_entry(subex, list);
+ }
+ if (subex->fsbno >= ex->fsbno + ex->len) {
+ lp = lp->next;
+ continue;
+ }
+
+ /* trim subex to fit the extent we have */
+ sub_fsb = subex->fsbno;
+ sub_len = subex->len;
+ if (subex->fsbno < ex->fsbno) {
+ sub_len -= ex->fsbno - subex->fsbno;
+ sub_fsb = ex->fsbno;
+ }
+ if (sub_len > ex->len)
+ sub_len = ex->len;
+
+ state = 0;
+ if (sub_fsb == ex->fsbno)
+ state |= XFS_REPAIR_EXT_LEFT_CONTIG;
+ if (sub_fsb + sub_len == ex->fsbno + ex->len)
+ state |= XFS_REPAIR_EXT_RIGHT_CONTIG;
+ switch (state) {
+ case XFS_REPAIR_EXT_LEFT_CONTIG:
+ /* Coincides with only the left. */
+ ex->fsbno += sub_len;
+ ex->len -= sub_len;
+ break;
+ case XFS_REPAIR_EXT_RIGHT_CONTIG:
+ /* Coincides with only the right. */
+ ex->len -= sub_len;
+ lp = lp->next;
+ break;
+ case XFS_REPAIR_EXT_LEFT_CONTIG | XFS_REPAIR_EXT_RIGHT_CONTIG:
+ /* Total overlap, just delete ex. */
+ lp = lp->next;
+ list_del(&ex->list);
+ kmem_free(ex);
+ break;
+ case 0:
+ /*
+ * Deleting from the middle: add the new right extent
+ * and then shrink the left extent.
+ */
+ newex = kmem_alloc(sizeof(struct xfs_repair_extent),
+ KM_MAYFAIL);
+ if (!newex) {
+ error = -ENOMEM;
+ goto out;
+ }
+ INIT_LIST_HEAD(&newex->list);
+ newex->fsbno = sub_fsb + sub_len;
+ newex->len = ex->len - (sub_fsb - ex->fsbno) - sub_len;
+ list_add(&newex->list, &ex->list);
+ ex->len = sub_fsb - ex->fsbno;
+ lp = lp->next;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ }
+
+out:
+ return error;
+}
+#undef XFS_REPAIR_EXT_LEFT_CONTIG
+#undef XFS_REPAIR_EXT_RIGHT_CONTIG
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 40990fa5f381..b288201030f8 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -43,6 +43,37 @@ int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb,
struct xfs_buf **bpp, xfs_btnum_t btnum,
const struct xfs_buf_ops *ops);
+struct xfs_repair_extent {
+ struct list_head list;
+ xfs_fsblock_t fsbno;
+ xfs_extlen_t len;
+};
+
+struct xfs_repair_extent_list {
+ struct list_head list;
+};
+
+static inline void
+xfs_repair_init_extent_list(
+ struct xfs_repair_extent_list *exlist)
+{
+ INIT_LIST_HEAD(&exlist->list);
+}
+
+#define for_each_xfs_repair_extent_safe(rbe, n, exlist) \
+ list_for_each_entry_safe((rbe), (n), &(exlist)->list, list)
+int xfs_repair_collect_btree_extent(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist, xfs_fsblock_t fsbno,
+ xfs_extlen_t len);
+int xfs_repair_reap_btree_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist,
+ struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist);
+int xfs_repair_subtract_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_repair_extent_list *sublist);
+
/* Metadata repairers */
int xfs_repair_probe(struct xfs_scrub_context *sc);
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-15 22:33 ` [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair Darrick J. Wong
@ 2018-05-16 7:56 ` Dave Chinner
2018-05-16 17:34 ` Allison Henderson
2018-05-16 18:01 ` Darrick J. Wong
2018-05-18 3:51 ` [PATCH v2 " Darrick J. Wong
1 sibling, 2 replies; 76+ messages in thread
From: Dave Chinner @ 2018-05-16 7:56 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Tue, May 15, 2018 at 03:33:58PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Add some helpers to assemble a list of fs block extents. Generally,
> repair functions will iterate the rmapbt to make a list (1) of all
> extents owned by the nominal owner of the metadata structure; then they
> will iterate all other structures with the same rmap owner to make a
> list (2) of active blocks; and finally we have a subtraction function to
> subtract all the blocks in (2) from (1), with the result that (1) is now
> a list of blocks that were owned by the old btree and must be disposed.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/xfs/scrub/repair.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 31 +++++++
> 2 files changed, 238 insertions(+)
>
>
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index 72f04a717150..8e8ecddd7537 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -361,3 +361,210 @@ xfs_repair_init_btblock(
>
> return 0;
> }
> +
> +/* Collect a dead btree extent for later disposal. */
> +int
> +xfs_repair_collect_btree_extent(
> + struct xfs_scrub_context *sc,
> + struct xfs_repair_extent_list *exlist,
> + xfs_fsblock_t fsbno,
> + xfs_extlen_t len)
> +{
> + struct xfs_repair_extent *rex;
> +
> + trace_xfs_repair_collect_btree_extent(sc->mp,
> + XFS_FSB_TO_AGNO(sc->mp, fsbno),
> + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> +
> + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
> + if (!rex)
> + return -ENOMEM;
Is this in transaction context? Regardless, I think we need to run
the entirety of scrub/repair in a memalloc_nofs_save() context so we
don't have memory reclaim recursion issues...
[...]
> +/* Compare two btree extents. */
> +static int
> +xfs_repair_btree_extent_cmp(
> + void *priv,
> + struct list_head *a,
> + struct list_head *b)
> +{
> + struct xfs_repair_extent *ap;
> + struct xfs_repair_extent *bp;
> +
> + ap = container_of(a, struct xfs_repair_extent, list);
> + bp = container_of(b, struct xfs_repair_extent, list);
> +
> + if (ap->fsbno > bp->fsbno)
> + return 1;
> + else if (ap->fsbno < bp->fsbno)
> + return -1;
No need for the else there.
> + return 0;
> +}
> +
> +/*
> + * Remove all the blocks mentioned in sublist from the extents in exlist.
> + *
> + * The intent is that callers will iterate the rmapbt for all of its records
> + * for a given owner to generate exlist; and iterate all the blocks of the
generate @exlist
> + * metadata structures that are not being rebuilt and have the same rmapbt
> + * owner to generate sublist. This routine subtracts all the extents
generate @sublist.
> + * mentioned in sublist from all the extents linked in exlist, which leaves
> + * exlist as the list of blocks that are not accounted for, which we assume
> + * are the dead blocks of the old metadata structure. The blocks mentioned in
> + * exlist can be reaped.
> + */
> +#define XFS_REPAIR_EXT_LEFT_CONTIG (1 << 0)
> +#define XFS_REPAIR_EXT_RIGHT_CONTIG (1 << 1)
> +int
> +xfs_repair_subtract_extents(
> + struct xfs_scrub_context *sc,
> + struct xfs_repair_extent_list *exlist,
> + struct xfs_repair_extent_list *sublist)
> +{
> + struct list_head *lp;
> + struct xfs_repair_extent *ex;
> + struct xfs_repair_extent *newex;
> + struct xfs_repair_extent *subex;
> + xfs_fsblock_t sub_fsb;
> + xfs_extlen_t sub_len;
> + int state;
> + int error = 0;
> +
> + if (list_empty(&exlist->list) || list_empty(&sublist->list))
> + return 0;
> + ASSERT(!list_empty(&sublist->list));
> +
> + list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
> + list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
> +
> + subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
> + list);
> + lp = exlist->list.next;
> + while (lp != &exlist->list) {
> + ex = list_entry(lp, struct xfs_repair_extent, list);
> +
> + /*
> + * Advance subex and/or ex until we find a pair that
> + * intersect or we run out of extents.
> + */
> + while (subex->fsbno + subex->len <= ex->fsbno) {
> + if (list_is_last(&subex->list, &sublist->list))
> + goto out;
> + subex = list_next_entry(subex, list);
> + }
So this is an O(n^2) algorithm, right? How does it scale with large
extent lists? Given that these extents are dynamically allocated,
and we're already burning 16 bytes for a list head on each extent,
would it be better to use a smarter structure better suited for
exact lookups? e.g. an rbtree only takes an extra 8 bytes per
extent, and we get O(log N) searches on the inner loop here...
I guess this isn't necessary to fix right now, but I think it's
going to be an issue, so maybe mark this down as "needing to be
fixed before removing EXPERIMENTAL tags"?
> + if (subex->fsbno >= ex->fsbno + ex->len) {
> + lp = lp->next;
> + continue;
> + }
> +
> + /* trim subex to fit the extent we have */
> + sub_fsb = subex->fsbno;
> + sub_len = subex->len;
> + if (subex->fsbno < ex->fsbno) {
> + sub_len -= ex->fsbno - subex->fsbno;
> + sub_fsb = ex->fsbno;
> + }
> + if (sub_len > ex->len)
> + sub_len = ex->len;
> +
> + state = 0;
> + if (sub_fsb == ex->fsbno)
> + state |= XFS_REPAIR_EXT_LEFT_CONTIG;
> + if (sub_fsb + sub_len == ex->fsbno + ex->len)
> + state |= XFS_REPAIR_EXT_RIGHT_CONTIG;
Ok, I think "CONTIG" is not the right word to use here. In the BMAP
btrees, the merge state flags were to tell us whether the edge of
the new extent is contiguous with the left and right extents in
the tree, not whether the new extents overlapped to the left/right
edges.
i.e. we're checking whether extent start/end overlaps are aligned
here, not whether they are contiguous with some other extent. So in
this case, I'd just name the variables "LEFT_ALIGNED" and
"RIGHT_ALIGNED" and drop all the namespace bits from them.
> + switch (state) {
> + case XFS_REPAIR_EXT_LEFT_CONTIG:
> + /* Coincides with only the left. */
And by calling them aligned, the comments become redundant:
case LEFT_ALIGNED:
> + ex->fsbno += sub_len;
> + ex->len -= sub_len;
> + break;
> + case XFS_REPAIR_EXT_RIGHT_CONTIG:
> + /* Coincides with only the right. */
> + ex->len -= sub_len;
> + lp = lp->next;
> + break;
> + case XFS_REPAIR_EXT_LEFT_CONTIG | XFS_REPAIR_EXT_RIGHT_CONTIG:
> + /* Total overlap, just delete ex. */
> + lp = lp->next;
> + list_del(&ex->list);
> + kmem_free(ex);
> + break;
> + case 0:
> + /*
> + * Deleting from the middle: add the new right extent
> + * and then shrink the left extent.
> + */
> + newex = kmem_alloc(sizeof(struct xfs_repair_extent),
> + KM_MAYFAIL);
> + if (!newex) {
> + error = -ENOMEM;
> + goto out;
> + }
> + INIT_LIST_HEAD(&newex->list);
> + newex->fsbno = sub_fsb + sub_len;
> + newex->len = ex->len - (sub_fsb - ex->fsbno) - sub_len;
so: new len = old len - (length of first extent) - length of overlap.
I think this is more obvious as "new len = old end - new start", i.e.:
newex->len = ex->fsbno + ex->len - newex->fsbno;
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-16 7:56 ` Dave Chinner
@ 2018-05-16 17:34 ` Allison Henderson
2018-05-16 18:06 ` Darrick J. Wong
2018-05-16 18:01 ` Darrick J. Wong
1 sibling, 1 reply; 76+ messages in thread
From: Allison Henderson @ 2018-05-16 17:34 UTC (permalink / raw)
To: Dave Chinner, Darrick J. Wong; +Cc: linux-xfs
Ok, with the points Dave made:
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
On 05/16/2018 12:56 AM, Dave Chinner wrote:
> On Tue, May 15, 2018 at 03:33:58PM -0700, Darrick J. Wong wrote:
>> From: Darrick J. Wong <darrick.wong@oracle.com>
>>
>> Add some helpers to assemble a list of fs block extents. Generally,
>> repair functions will iterate the rmapbt to make a list (1) of all
>> extents owned by the nominal owner of the metadata structure; then they
>> will iterate all other structures with the same rmap owner to make a
>> list (2) of active blocks; and finally we have a subtraction function to
>> subtract all the blocks in (2) from (1), with the result that (1) is now
>> a list of blocks that were owned by the old btree and must be disposed.
>>
>> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
>> ---
>> fs/xfs/scrub/repair.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++
>> fs/xfs/scrub/repair.h | 31 +++++++
>> 2 files changed, 238 insertions(+)
>>
>>
>> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
>> index 72f04a717150..8e8ecddd7537 100644
>> --- a/fs/xfs/scrub/repair.c
>> +++ b/fs/xfs/scrub/repair.c
>> @@ -361,3 +361,210 @@ xfs_repair_init_btblock(
>>
>> return 0;
>> }
>> +
>> +/* Collect a dead btree extent for later disposal. */
>> +int
>> +xfs_repair_collect_btree_extent(
>> + struct xfs_scrub_context *sc,
>> + struct xfs_repair_extent_list *exlist,
>> + xfs_fsblock_t fsbno,
>> + xfs_extlen_t len)
>> +{
>> + struct xfs_repair_extent *rex;
>> +
>> + trace_xfs_repair_collect_btree_extent(sc->mp,
>> + XFS_FSB_TO_AGNO(sc->mp, fsbno),
>> + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
>> +
>> + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
>> + if (!rex)
>> + return -ENOMEM;
> Is this in transaction context? Regardless, I think we need to run
> the entire of scrub/repair in a memalloc_nofs_save() context so we
> don't have memory reclaim recursion issues...
>
> [...]
>
>> +/* Compare two btree extents. */
>> +static int
>> +xfs_repair_btree_extent_cmp(
>> + void *priv,
>> + struct list_head *a,
>> + struct list_head *b)
>> +{
>> + struct xfs_repair_extent *ap;
>> + struct xfs_repair_extent *bp;
>> +
>> + ap = container_of(a, struct xfs_repair_extent, list);
>> + bp = container_of(b, struct xfs_repair_extent, list);
>> +
>> + if (ap->fsbno > bp->fsbno)
>> + return 1;
>> + else if (ap->fsbno < bp->fsbno)
>> + return -1;
> No need for the else there.
Well, I think he meant to return 0 in the case of ap->fsbno ==
bp->fsbno? Am I reading that right? The caller expects 1 for greater
than, -1 for less than, and 0 on equivalence?
>> + return 0;
>> +}
>> +
>> +/*
>> + * Remove all the blocks mentioned in sublist from the extents in exlist.
>> + *
>> + * The intent is that callers will iterate the rmapbt for all of its records
>> + * for a given owner to generate exlist; and iterate all the blocks of the
> generate @exlist
>
>> + * metadata structures that are not being rebuilt and have the same rmapbt
>> + * owner to generate sublist. This routine subtracts all the extents
> generate @sublist.
>
>> + * mentioned in sublist from all the extents linked in exlist, which leaves
>> + * exlist as the list of blocks that are not accounted for, which we assume
>> + * are the dead blocks of the old metadata structure. The blocks mentioned in
>> + * exlist can be reaped.
>> + */
>> +#define XFS_REPAIR_EXT_LEFT_CONTIG (1 << 0)
>> +#define XFS_REPAIR_EXT_RIGHT_CONTIG (1 << 1)
>> +int
>> +xfs_repair_subtract_extents(
>> + struct xfs_scrub_context *sc,
>> + struct xfs_repair_extent_list *exlist,
>> + struct xfs_repair_extent_list *sublist)
>> +{
>> + struct list_head *lp;
>> + struct xfs_repair_extent *ex;
>> + struct xfs_repair_extent *newex;
>> + struct xfs_repair_extent *subex;
>> + xfs_fsblock_t sub_fsb;
>> + xfs_extlen_t sub_len;
>> + int state;
>> + int error = 0;
>> +
>> + if (list_empty(&exlist->list) || list_empty(&sublist->list))
>> + return 0;
>> + ASSERT(!list_empty(&sublist->list));
>> +
>> + list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
>> + list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
>> +
>> + subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
>> + list);
>> + lp = exlist->list.next;
>> + while (lp != &exlist->list) {
>> + ex = list_entry(lp, struct xfs_repair_extent, list);
>> +
>> + /*
>> + * Advance subex and/or ex until we find a pair that
>> + * intersect or we run out of extents.
>> + */
>> + while (subex->fsbno + subex->len <= ex->fsbno) {
>> + if (list_is_last(&subex->list, &sublist->list))
>> + goto out;
>> + subex = list_next_entry(subex, list);
>> + }
> So this is a O(n^2) algorithm, right? How does it scale with large
> extent lists? Given that these extents are dynamically allocated,
> and we're already burning 16 bytes for a list head on each extent,
> would it be better to use a smarter structure better suited for
> exact lookups? e.g. an rbtree only takes an extra 8 bytes per
> extent, and we get O(log N) searches on the inner loop here...
>
> I guess this isn't necessary to fix right now, but I think it's
> going to be an issue for maybe mark this down as "needing to be
> fixed before removing EXPERIMENTAL tags"?
>
>> + if (subex->fsbno >= ex->fsbno + ex->len) {
>> + lp = lp->next;
>> + continue;
>> + }
>> +
>> + /* trim subex to fit the extent we have */
>> + sub_fsb = subex->fsbno;
>> + sub_len = subex->len;
>> + if (subex->fsbno < ex->fsbno) {
>> + sub_len -= ex->fsbno - subex->fsbno;
>> + sub_fsb = ex->fsbno;
>> + }
>> + if (sub_len > ex->len)
>> + sub_len = ex->len;
>> +
>> + state = 0;
>> + if (sub_fsb == ex->fsbno)
>> + state |= XFS_REPAIR_EXT_LEFT_CONTIG;
>> + if (sub_fsb + sub_len == ex->fsbno + ex->len)
>> + state |= XFS_REPAIR_EXT_RIGHT_CONTIG;
> Ok, I think "CONTIG" is not the right word to use here. In the BMAP
> btrees, the merge state flags were to tell us whether the edge of
> the new extent is contiguous with the left and right extents in
> the tree, not whether the new extents overlapped to the left/right
> edges.
>
> i.e. we're checking whether extent start/end overlaps are aligned
> here, not whether they are contiguous with some other extent. So in
> this case, I'd just name the variables "LEFT_ALIGNED" and
> "RIGHT_ALIGNED" and drop all the namespace bits from them.
>
>> + switch (state) {
>> + case XFS_REPAIR_EXT_LEFT_CONTIG:
>> + /* Coincides with only the left. */
> And by calling them aligned, the comments become redundant:
>
> case LEFT_ALIGNED:
>> + ex->fsbno += sub_len;
>> + ex->len -= sub_len;
>> + break;
>> + case XFS_REPAIR_EXT_RIGHT_CONTIG:
>> + /* Coincides with only the right. */
>> + ex->len -= sub_len;
>> + lp = lp->next;
>> + break;
>> + case XFS_REPAIR_EXT_LEFT_CONTIG | XFS_REPAIR_EXT_RIGHT_CONTIG:
>> + /* Total overlap, just delete ex. */
>> + lp = lp->next;
>> + list_del(&ex->list);
>> + kmem_free(ex);
>> + break;
>> + case 0:
>> + /*
>> + * Deleting from the middle: add the new right extent
>> + * and then shrink the left extent.
>> + */
>> + newex = kmem_alloc(sizeof(struct xfs_repair_extent),
>> + KM_MAYFAIL);
>> + if (!newex) {
>> + error = -ENOMEM;
>> + goto out;
>> + }
>> + INIT_LIST_HEAD(&newex->list);
>> + newex->fsbno = sub_fsb + sub_len;
>> + newex->len = ex->len - (sub_fsb - ex->fsbno) - sub_len;
> so: new len = old len - (length of first extent) - length of overlap.
>
> I think this is more obvious as "new len = old end - new start", i.e.:
>
> newex->len = ex->fsbno + ex->len - newex->fsbno;
>
> Cheers,
>
> Dave.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-16 17:34 ` Allison Henderson
@ 2018-05-16 18:06 ` Darrick J. Wong
2018-05-16 21:23 ` Dave Chinner
0 siblings, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-16 18:06 UTC (permalink / raw)
To: Allison Henderson; +Cc: Dave Chinner, linux-xfs
On Wed, May 16, 2018 at 10:34:36AM -0700, Allison Henderson wrote:
> Ok, with the points Dave made:
> Reviewed by: Allison Henderson <allison.henderson@oracle.com>
>
> On 05/16/2018 12:56 AM, Dave Chinner wrote:
> > On Tue, May 15, 2018 at 03:33:58PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > >
> > > Add some helpers to assemble a list of fs block extents. Generally,
> > > repair functions will iterate the rmapbt to make a list (1) of all
> > > extents owned by the nominal owner of the metadata structure; then they
> > > will iterate all other structures with the same rmap owner to make a
> > > list (2) of active blocks; and finally we have a subtraction function to
> > > subtract all the blocks in (2) from (1), with the result that (1) is now
> > > a list of blocks that were owned by the old btree and must be disposed.
> > >
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > > fs/xfs/scrub/repair.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > fs/xfs/scrub/repair.h | 31 +++++++
> > > 2 files changed, 238 insertions(+)
> > >
> > >
> > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > index 72f04a717150..8e8ecddd7537 100644
> > > --- a/fs/xfs/scrub/repair.c
> > > +++ b/fs/xfs/scrub/repair.c
> > > @@ -361,3 +361,210 @@ xfs_repair_init_btblock(
> > > return 0;
> > > }
> > > +
> > > +/* Collect a dead btree extent for later disposal. */
> > > +int
> > > +xfs_repair_collect_btree_extent(
> > > + struct xfs_scrub_context *sc,
> > > + struct xfs_repair_extent_list *exlist,
> > > + xfs_fsblock_t fsbno,
> > > + xfs_extlen_t len)
> > > +{
> > > + struct xfs_repair_extent *rex;
> > > +
> > > + trace_xfs_repair_collect_btree_extent(sc->mp,
> > > + XFS_FSB_TO_AGNO(sc->mp, fsbno),
> > > + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> > > +
> > > + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
> > > + if (!rex)
> > > + return -ENOMEM;
> > Is this in transaction context? Regardless, I think we need to run
> > the entire of scrub/repair in a memalloc_nofs_save() context so we
> > don't have memory reclaim recursion issues...
> >
> > [...]
> >
> > > +/* Compare two btree extents. */
> > > +static int
> > > +xfs_repair_btree_extent_cmp(
> > > + void *priv,
> > > + struct list_head *a,
> > > + struct list_head *b)
> > > +{
> > > + struct xfs_repair_extent *ap;
> > > + struct xfs_repair_extent *bp;
> > > +
> > > + ap = container_of(a, struct xfs_repair_extent, list);
> > > + bp = container_of(b, struct xfs_repair_extent, list);
> > > +
> > > + if (ap->fsbno > bp->fsbno)
> > > + return 1;
> > > + else if (ap->fsbno < bp->fsbno)
> > > + return -1;
> > No need for the else there.
> Well, I think he meant to return 0 in the case of ap->fsbno == bp->fsbno?
> Am i reading that right? caller expects 1 for greater than, -1 for less
> than and 0 on equivalence?
Correct. I think Dave was pointing out that else-after-return is
unnecessary, since the following behaves equivalently:
if (a > b)
return 1;
if (a < b)
return -1;
return 0;
Note that gcc (7.3, anyway) generates the same asm for either version so
I assume this is mostly stylistic cleanup to make the comparisons line
up? I don't have a preference either way. :)
--D
> > > + return 0;
> > > +}
> > > +
> > > +/*
> > > + * Remove all the blocks mentioned in sublist from the extents in exlist.
> > > + *
> > > + * The intent is that callers will iterate the rmapbt for all of its records
> > > + * for a given owner to generate exlist; and iterate all the blocks of the
> > generate @exlist
> >
> > > + * metadata structures that are not being rebuilt and have the same rmapbt
> > > + * owner to generate sublist. This routine subtracts all the extents
> > generate @sublist.
> >
> > > + * mentioned in sublist from all the extents linked in exlist, which leaves
> > > + * exlist as the list of blocks that are not accounted for, which we assume
> > > + * are the dead blocks of the old metadata structure. The blocks mentioned in
> > > + * exlist can be reaped.
> > > + */
> > > +#define XFS_REPAIR_EXT_LEFT_CONTIG (1 << 0)
> > > +#define XFS_REPAIR_EXT_RIGHT_CONTIG (1 << 1)
> > > +int
> > > +xfs_repair_subtract_extents(
> > > + struct xfs_scrub_context *sc,
> > > + struct xfs_repair_extent_list *exlist,
> > > + struct xfs_repair_extent_list *sublist)
> > > +{
> > > + struct list_head *lp;
> > > + struct xfs_repair_extent *ex;
> > > + struct xfs_repair_extent *newex;
> > > + struct xfs_repair_extent *subex;
> > > + xfs_fsblock_t sub_fsb;
> > > + xfs_extlen_t sub_len;
> > > + int state;
> > > + int error = 0;
> > > +
> > > + if (list_empty(&exlist->list) || list_empty(&sublist->list))
> > > + return 0;
> > > + ASSERT(!list_empty(&sublist->list));
> > > +
> > > + list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
> > > + list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
> > > +
> > > + subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
> > > + list);
> > > + lp = exlist->list.next;
> > > + while (lp != &exlist->list) {
> > > + ex = list_entry(lp, struct xfs_repair_extent, list);
> > > +
> > > + /*
> > > + * Advance subex and/or ex until we find a pair that
> > > + * intersect or we run out of extents.
> > > + */
> > > + while (subex->fsbno + subex->len <= ex->fsbno) {
> > > + if (list_is_last(&subex->list, &sublist->list))
> > > + goto out;
> > > + subex = list_next_entry(subex, list);
> > > + }
> > So this is a O(n^2) algorithm, right? How does it scale with large
> > extent lists? Given that these extents are dynamically allocated,
> > and we're already burning 16 bytes for a list head on each extent,
> > would it be better to use a smarter structure better suited for
> > exact lookups? e.g. an rbtree only takes an extra 8 bytes per
> > extent, and we get O(log N) searches on the inner loop here...
> >
> > I guess this isn't necessary to fix right now, but I think it's
> > going to be an issue for maybe mark this down as "needing to be
> > fixed before removing EXPERIMENTAL tags"?
> >
> > > + if (subex->fsbno >= ex->fsbno + ex->len) {
> > > + lp = lp->next;
> > > + continue;
> > > + }
> > > +
> > > + /* trim subex to fit the extent we have */
> > > + sub_fsb = subex->fsbno;
> > > + sub_len = subex->len;
> > > + if (subex->fsbno < ex->fsbno) {
> > > + sub_len -= ex->fsbno - subex->fsbno;
> > > + sub_fsb = ex->fsbno;
> > > + }
> > > + if (sub_len > ex->len)
> > > + sub_len = ex->len;
> > > +
> > > + state = 0;
> > > + if (sub_fsb == ex->fsbno)
> > > + state |= XFS_REPAIR_EXT_LEFT_CONTIG;
> > > + if (sub_fsb + sub_len == ex->fsbno + ex->len)
> > > + state |= XFS_REPAIR_EXT_RIGHT_CONTIG;
> > Ok, I think "CONTIG" is not the right word to use here. In the BMAP
> > btrees, the merge state flags were to tell us whether the edge of
> > the new extent is contiguous with the left and right extents in
> > the tree, not whether the new extents overlapped to the left/right
> > edges.
> >
> > i.e. we're checking whether extent start/end overlaps are aligned
> > here, not whether they are contiguous with some other extent. So in
> > this case, I'd just name the variables "LEFT_ALIGNED" and
> > "RIGHT_ALIGNED" and drop all the namespace bits from them.
> >
> > > + switch (state) {
> > > + case XFS_REPAIR_EXT_LEFT_CONTIG:
> > > + /* Coincides with only the left. */
> > And by calling them aligned, the comments become redundant:
> >
> > case LEFT_ALIGNED:
> > > + ex->fsbno += sub_len;
> > > + ex->len -= sub_len;
> > > + break;
> > > + case XFS_REPAIR_EXT_RIGHT_CONTIG:
> > > + /* Coincides with only the right. */
> > > + ex->len -= sub_len;
> > > + lp = lp->next;
> > > + break;
> > > + case XFS_REPAIR_EXT_LEFT_CONTIG | XFS_REPAIR_EXT_RIGHT_CONTIG:
> > > + /* Total overlap, just delete ex. */
> > > + lp = lp->next;
> > > + list_del(&ex->list);
> > > + kmem_free(ex);
> > > + break;
> > > + case 0:
> > > + /*
> > > + * Deleting from the middle: add the new right extent
> > > + * and then shrink the left extent.
> > > + */
> > > + newex = kmem_alloc(sizeof(struct xfs_repair_extent),
> > > + KM_MAYFAIL);
> > > + if (!newex) {
> > > + error = -ENOMEM;
> > > + goto out;
> > > + }
> > > + INIT_LIST_HEAD(&newex->list);
> > > + newex->fsbno = sub_fsb + sub_len;
> > > + newex->len = ex->len - (sub_fsb - ex->fsbno) - sub_len;
> > so: new len = old len - (length of first extent) - length of overlap.
> >
> > I think this is more obvious as "new len = old end - new start", i.e.:
> >
> > newex->len = ex->fsbno + ex->len - newex->fsbno;
> >
> > Cheers,
> >
> > Dave.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-16 18:06 ` Darrick J. Wong
@ 2018-05-16 21:23 ` Dave Chinner
2018-05-16 21:33 ` Allison Henderson
0 siblings, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-16 21:23 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: Allison Henderson, linux-xfs
On Wed, May 16, 2018 at 11:06:15AM -0700, Darrick J. Wong wrote:
> On Wed, May 16, 2018 at 10:34:36AM -0700, Allison Henderson wrote:
> > Ok, with the points Dave made:
> > Reviewed by: Allison Henderson <allison.henderson@oracle.com>
> >
> > On 05/16/2018 12:56 AM, Dave Chinner wrote:
> > > On Tue, May 15, 2018 at 03:33:58PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > >
> > > > Add some helpers to assemble a list of fs block extents. Generally,
> > > > repair functions will iterate the rmapbt to make a list (1) of all
> > > > extents owned by the nominal owner of the metadata structure; then they
> > > > will iterate all other structures with the same rmap owner to make a
> > > > list (2) of active blocks; and finally we have a subtraction function to
> > > > subtract all the blocks in (2) from (1), with the result that (1) is now
> > > > a list of blocks that were owned by the old btree and must be disposed.
> > > >
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > > fs/xfs/scrub/repair.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > > fs/xfs/scrub/repair.h | 31 +++++++
> > > > 2 files changed, 238 insertions(+)
> > > >
> > > >
> > > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > > index 72f04a717150..8e8ecddd7537 100644
> > > > --- a/fs/xfs/scrub/repair.c
> > > > +++ b/fs/xfs/scrub/repair.c
> > > > @@ -361,3 +361,210 @@ xfs_repair_init_btblock(
> > > > return 0;
> > > > }
> > > > +
> > > > +/* Collect a dead btree extent for later disposal. */
> > > > +int
> > > > +xfs_repair_collect_btree_extent(
> > > > + struct xfs_scrub_context *sc,
> > > > + struct xfs_repair_extent_list *exlist,
> > > > + xfs_fsblock_t fsbno,
> > > > + xfs_extlen_t len)
> > > > +{
> > > > + struct xfs_repair_extent *rex;
> > > > +
> > > > + trace_xfs_repair_collect_btree_extent(sc->mp,
> > > > + XFS_FSB_TO_AGNO(sc->mp, fsbno),
> > > > + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> > > > +
> > > > + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
> > > > + if (!rex)
> > > > + return -ENOMEM;
> > > Is this in transaction context? Regardless, I think we need to run
> > > the entire of scrub/repair in a memalloc_nofs_save() context so we
> > > don't have memory reclaim recursion issues...
> > >
> > > [...]
> > >
> > > > +/* Compare two btree extents. */
> > > > +static int
> > > > +xfs_repair_btree_extent_cmp(
> > > > + void *priv,
> > > > + struct list_head *a,
> > > > + struct list_head *b)
> > > > +{
> > > > + struct xfs_repair_extent *ap;
> > > > + struct xfs_repair_extent *bp;
> > > > +
> > > > + ap = container_of(a, struct xfs_repair_extent, list);
> > > > + bp = container_of(b, struct xfs_repair_extent, list);
> > > > +
> > > > + if (ap->fsbno > bp->fsbno)
> > > > + return 1;
> > > > + else if (ap->fsbno < bp->fsbno)
> > > > + return -1;
> > > No need for the else there.
> > Well, I think he meant to return 0 in the case of ap->fsbno == bp->fsbno?
> > Am i reading that right? caller expects 1 for greater than, -1 for less
> > than and 0 on equivalence?
>
> Correct. I think Dave was pointing out that else-after-return is
> unnecessary, since the following behaves equivalently:
>
> if (a > b)
> return 1;
> if (a < b)
> return -1;
> return 0;
>
> Note that gcc (7.3, anyway) generates the same asm for either version so
> I assume this is mostly stylistic cleanup to make the comparisons line
> up? I don't have a preference either way. :)
It's a style and maintenance thing. The else is redundant and could
lead to future confusion, so IMO it is best to leave it out....
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-16 21:23 ` Dave Chinner
@ 2018-05-16 21:33 ` Allison Henderson
0 siblings, 0 replies; 76+ messages in thread
From: Allison Henderson @ 2018-05-16 21:33 UTC (permalink / raw)
To: Dave Chinner, Darrick J. Wong; +Cc: linux-xfs
On 05/16/2018 02:23 PM, Dave Chinner wrote:
> On Wed, May 16, 2018 at 11:06:15AM -0700, Darrick J. Wong wrote:
>> On Wed, May 16, 2018 at 10:34:36AM -0700, Allison Henderson wrote:
>>> Ok, with the points Dave made:
>>> Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
>>>
>>> On 05/16/2018 12:56 AM, Dave Chinner wrote:
>>>> On Tue, May 15, 2018 at 03:33:58PM -0700, Darrick J. Wong wrote:
>>>>> From: Darrick J. Wong <darrick.wong@oracle.com>
>>>>>
>>>>> Add some helpers to assemble a list of fs block extents. Generally,
>>>>> repair functions will iterate the rmapbt to make a list (1) of all
>>>>> extents owned by the nominal owner of the metadata structure; then they
>>>>> will iterate all other structures with the same rmap owner to make a
>>>>> list (2) of active blocks; and finally we have a subtraction function to
>>>>> subtract all the blocks in (2) from (1), with the result that (1) is now
>>>>> a list of blocks that were owned by the old btree and must be disposed.
>>>>>
>>>>> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
>>>>> ---
>>>>> fs/xfs/scrub/repair.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>> fs/xfs/scrub/repair.h | 31 +++++++
>>>>> 2 files changed, 238 insertions(+)
>>>>>
>>>>>
>>>>> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
>>>>> index 72f04a717150..8e8ecddd7537 100644
>>>>> --- a/fs/xfs/scrub/repair.c
>>>>> +++ b/fs/xfs/scrub/repair.c
>>>>> @@ -361,3 +361,210 @@ xfs_repair_init_btblock(
>>>>> return 0;
>>>>> }
>>>>> +
>>>>> +/* Collect a dead btree extent for later disposal. */
>>>>> +int
>>>>> +xfs_repair_collect_btree_extent(
>>>>> + struct xfs_scrub_context *sc,
>>>>> + struct xfs_repair_extent_list *exlist,
>>>>> + xfs_fsblock_t fsbno,
>>>>> + xfs_extlen_t len)
>>>>> +{
>>>>> + struct xfs_repair_extent *rex;
>>>>> +
>>>>> + trace_xfs_repair_collect_btree_extent(sc->mp,
>>>>> + XFS_FSB_TO_AGNO(sc->mp, fsbno),
>>>>> + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
>>>>> +
>>>>> + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
>>>>> + if (!rex)
>>>>> + return -ENOMEM;
>>>> Is this in transaction context? Regardless, I think we need to run
>>>> the entire of scrub/repair in a memalloc_nofs_save() context so we
>>>> don't have memory reclaim recursion issues...
>>>>
>>>> [...]
>>>>
>>>>> +/* Compare two btree extents. */
>>>>> +static int
>>>>> +xfs_repair_btree_extent_cmp(
>>>>> + void *priv,
>>>>> + struct list_head *a,
>>>>> + struct list_head *b)
>>>>> +{
>>>>> + struct xfs_repair_extent *ap;
>>>>> + struct xfs_repair_extent *bp;
>>>>> +
>>>>> + ap = container_of(a, struct xfs_repair_extent, list);
>>>>> + bp = container_of(b, struct xfs_repair_extent, list);
>>>>> +
>>>>> + if (ap->fsbno > bp->fsbno)
>>>>> + return 1;
>>>>> + else if (ap->fsbno < bp->fsbno)
>>>>> + return -1;
>>>> No need for the else there.
>>> Well, I think he meant to return 0 in the case of ap->fsbno == bp->fsbno?
>>> Am I reading that right? Caller expects 1 for greater than, -1 for less
>>> than and 0 on equivalence?
>>
>> Correct. I think Dave was pointing out that else-after-return is
>> unnecessary, since the following behaves equivalently:
>>
>> if (a > b)
>> return 1;
>> if (a < b)
>> return -1;
>> return 0;
>>
>> Note that gcc (7.3, anyway) generates the same asm for either version so
>> I assume this is mostly stylistic cleanup to make the comparisons line
>> up? I don't have a preference either way. :)
>
> It's a style and maintenance thing. The else is redundant and could
> lead to future confusion, so IMO it is best to leave it out....
>
> Cheers,
>
> Dave.
>
Alrighty then, thx for the clarification! :-)
Allison
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-16 7:56 ` Dave Chinner
2018-05-16 17:34 ` Allison Henderson
@ 2018-05-16 18:01 ` Darrick J. Wong
2018-05-16 21:32 ` Dave Chinner
1 sibling, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-16 18:01 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Wed, May 16, 2018 at 05:56:52PM +1000, Dave Chinner wrote:
> On Tue, May 15, 2018 at 03:33:58PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> >
> > Add some helpers to assemble a list of fs block extents. Generally,
> > repair functions will iterate the rmapbt to make a list (1) of all
> > extents owned by the nominal owner of the metadata structure; then they
> > will iterate all other structures with the same rmap owner to make a
> > list (2) of active blocks; and finally we have a subtraction function to
> > subtract all the blocks in (2) from (1), with the result that (1) is now
> > a list of blocks that were owned by the old btree and must be disposed.
> >
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> > fs/xfs/scrub/repair.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++
> > fs/xfs/scrub/repair.h | 31 +++++++
> > 2 files changed, 238 insertions(+)
> >
> >
> > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > index 72f04a717150..8e8ecddd7537 100644
> > --- a/fs/xfs/scrub/repair.c
> > +++ b/fs/xfs/scrub/repair.c
> > @@ -361,3 +361,210 @@ xfs_repair_init_btblock(
> >
> > return 0;
> > }
> > +
> > +/* Collect a dead btree extent for later disposal. */
> > +int
> > +xfs_repair_collect_btree_extent(
> > + struct xfs_scrub_context *sc,
> > + struct xfs_repair_extent_list *exlist,
> > + xfs_fsblock_t fsbno,
> > + xfs_extlen_t len)
> > +{
> > + struct xfs_repair_extent *rex;
> > +
> > + trace_xfs_repair_collect_btree_extent(sc->mp,
> > + XFS_FSB_TO_AGNO(sc->mp, fsbno),
> > + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> > +
> > + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
> > + if (!rex)
> > + return -ENOMEM;
>
> Is this in transaction context?
Yes. After the setup function finishes we're required to own a
transaction and hold a lock on whatever resource we're playing with.
> Regardless, I think we need to run the entire of scrub/repair in a
> memalloc_nofs_save() context so we don't have memory reclaim recursion
> issues...
xfs_trans_reserve should take care of this, right? So we don't have to
feed KM_NOFS to kmem_*_alloc because this is already taken care of. The
MAYFAIL exists because we prefer ENOMEM'ing out to pushing on reclaim.
>
> [...]
>
> > +/* Compare two btree extents. */
> > +static int
> > +xfs_repair_btree_extent_cmp(
> > + void *priv,
> > + struct list_head *a,
> > + struct list_head *b)
> > +{
> > + struct xfs_repair_extent *ap;
> > + struct xfs_repair_extent *bp;
> > +
> > + ap = container_of(a, struct xfs_repair_extent, list);
> > + bp = container_of(b, struct xfs_repair_extent, list);
> > +
> > + if (ap->fsbno > bp->fsbno)
> > + return 1;
> > + else if (ap->fsbno < bp->fsbno)
> > + return -1;
>
> No need for the else there.
Fixed.
> > + return 0;
> > +}
> > +
> > +/*
> > + * Remove all the blocks mentioned in sublist from the extents in exlist.
> > + *
> > + * The intent is that callers will iterate the rmapbt for all of its records
> > + * for a given owner to generate exlist; and iterate all the blocks of the
>
> generate @exlist
>
> > + * metadata structures that are not being rebuilt and have the same rmapbt
> > + * owner to generate sublist. This routine subtracts all the extents
>
> generate @sublist.
Fixed both.
> > + * mentioned in sublist from all the extents linked in exlist, which leaves
> > + * exlist as the list of blocks that are not accounted for, which we assume
> > + * are the dead blocks of the old metadata structure. The blocks mentioned in
> > + * exlist can be reaped.
> > + */
> > +#define XFS_REPAIR_EXT_LEFT_CONTIG (1 << 0)
> > +#define XFS_REPAIR_EXT_RIGHT_CONTIG (1 << 1)
> > +int
> > +xfs_repair_subtract_extents(
> > + struct xfs_scrub_context *sc,
> > + struct xfs_repair_extent_list *exlist,
> > + struct xfs_repair_extent_list *sublist)
> > +{
> > + struct list_head *lp;
> > + struct xfs_repair_extent *ex;
> > + struct xfs_repair_extent *newex;
> > + struct xfs_repair_extent *subex;
> > + xfs_fsblock_t sub_fsb;
> > + xfs_extlen_t sub_len;
> > + int state;
> > + int error = 0;
> > +
> > + if (list_empty(&exlist->list) || list_empty(&sublist->list))
> > + return 0;
> > + ASSERT(!list_empty(&sublist->list));
> > +
> > + list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
> > + list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
> > +
> > + subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
> > + list);
> > + lp = exlist->list.next;
> > + while (lp != &exlist->list) {
> > + ex = list_entry(lp, struct xfs_repair_extent, list);
> > +
> > + /*
> > + * Advance subex and/or ex until we find a pair that
> > + * intersect or we run out of extents.
> > + */
> > + while (subex->fsbno + subex->len <= ex->fsbno) {
> > + if (list_is_last(&subex->list, &sublist->list))
> > + goto out;
> > + subex = list_next_entry(subex, list);
> > + }
>
> So this is a O(n^2) algorithm, right? How does it scale with large
> extent lists?
I don't think this is O(n^2) -- each list sort is O(n log n). Then we
iterate exlist once, rolling forward through sublist as necessary. We
never reset lp to the head of exlist nor do we reset subex to the head
of sublist. We're not performing random lookups on the sublist extents
at all.
So far I haven't noticed /much/ heat from this routine even with
deliberately aged filesystems, but that's probably due to the slab
allocator eating way more time. :(
> Given that these extents are dynamically allocated, and we're already
> burning 16 bytes for a list head on each extent, would it be better to
> use a smarter structure better suited for exact lookups? e.g. an
> rbtree only takes an extra 8 bytes per extent, and we get O(log N)
> searches on the inner loop here...
>
> I guess this isn't necessary to fix right now, but I think it's
> going to be an issue, so maybe mark this down as "needing to be
> fixed before removing EXPERIMENTAL tags"?
I've thought about converting this to an rbtree or something, since
these are basically glorified bitmap operations. Set all the bits
corresponding to this rmap owner; set all the bits corresponding to
blocks with the same rmap owner but owned by the other data structures
sharing the rmap owner; then free anything in (ex & ~sub) because those
are the old btree blocks. It's on my list for optimization, but first
I want to get this thing working correctly.
TBH the other thing that irks me about the current orepair design is its
heavy reliance on creating potentially huge linked lists of the records
that need to be put into the new structure. I'd really like a data
structure where I can do fast unsorted appends without list overhead,
sort the data structure once, and then insert in sorted order. The slab
thing I put into xfs_repair provides this, but we can't easily allocate
128M of space in the kernel. I also want to do a bulk load of an empty
btree leaf so that we log the leaf block once and update the node
pointers once, kind of like what xfs_repair does during phase 5.
...all this optimization can come after merging.
> > + if (subex->fsbno >= ex->fsbno + ex->len) {
> > + lp = lp->next;
> > + continue;
> > + }
> > +
> > + /* trim subex to fit the extent we have */
> > + sub_fsb = subex->fsbno;
> > + sub_len = subex->len;
> > + if (subex->fsbno < ex->fsbno) {
> > + sub_len -= ex->fsbno - subex->fsbno;
> > + sub_fsb = ex->fsbno;
> > + }
> > + if (sub_len > ex->len)
> > + sub_len = ex->len;
> > +
> > + state = 0;
> > + if (sub_fsb == ex->fsbno)
> > + state |= XFS_REPAIR_EXT_LEFT_CONTIG;
> > + if (sub_fsb + sub_len == ex->fsbno + ex->len)
> > + state |= XFS_REPAIR_EXT_RIGHT_CONTIG;
>
> Ok, I think "CONTIG" is not the right word to use here. In the BMAP
> btrees, the merge state flags were to tell us whether the edge of
> the new extent is contiguous with the left and right extents in
> the tree, not whether the new extents overlapped to the left/right
> edges.
>
> i.e. we're checking whether extent start/end overlaps are aligned
> here, not whether they are contiguous with some other extent. So in
> this case, I'd just name the variables "LEFT_ALIGNED" and
> "RIGHT_ALIGNED" and drop all the namespace bits from them.
>
> > + switch (state) {
> > + case XFS_REPAIR_EXT_LEFT_CONTIG:
> > + /* Coincides with only the left. */
>
> And by calling them aligned, the comments become redundant:
Fixed.
> case LEFT_ALIGNED:
> > + ex->fsbno += sub_len;
> > + ex->len -= sub_len;
> > + break;
> > + case XFS_REPAIR_EXT_RIGHT_CONTIG:
> > + /* Coincides with only the right. */
> > + ex->len -= sub_len;
> > + lp = lp->next;
> > + break;
> > + case XFS_REPAIR_EXT_LEFT_CONTIG | XFS_REPAIR_EXT_RIGHT_CONTIG:
> > + /* Total overlap, just delete ex. */
> > + lp = lp->next;
> > + list_del(&ex->list);
> > + kmem_free(ex);
> > + break;
> > + case 0:
> > + /*
> > + * Deleting from the middle: add the new right extent
> > + * and then shrink the left extent.
> > + */
> > + newex = kmem_alloc(sizeof(struct xfs_repair_extent),
> > + KM_MAYFAIL);
> > + if (!newex) {
> > + error = -ENOMEM;
> > + goto out;
> > + }
> > + INIT_LIST_HEAD(&newex->list);
> > + newex->fsbno = sub_fsb + sub_len;
> > + newex->len = ex->len - (sub_fsb - ex->fsbno) - sub_len;
>
> so: new len = old len - (length of first extent) - length of overlap.
>
> I think this is more obvious as "new len = old end - new start", i.e.:
>
> newex->len = ex->fsbno + ex->len - newex->fsbno;
Yep. Fixed.
--D
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-16 18:01 ` Darrick J. Wong
@ 2018-05-16 21:32 ` Dave Chinner
2018-05-16 22:05 ` Darrick J. Wong
0 siblings, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-16 21:32 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Wed, May 16, 2018 at 11:01:27AM -0700, Darrick J. Wong wrote:
> On Wed, May 16, 2018 at 05:56:52PM +1000, Dave Chinner wrote:
> > On Tue, May 15, 2018 at 03:33:58PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > >
> > > Add some helpers to assemble a list of fs block extents. Generally,
> > > repair functions will iterate the rmapbt to make a list (1) of all
> > > extents owned by the nominal owner of the metadata structure; then they
> > > will iterate all other structures with the same rmap owner to make a
> > > list (2) of active blocks; and finally we have a subtraction function to
> > > subtract all the blocks in (2) from (1), with the result that (1) is now
> > > a list of blocks that were owned by the old btree and must be disposed.
> > >
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > > fs/xfs/scrub/repair.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > fs/xfs/scrub/repair.h | 31 +++++++
> > > 2 files changed, 238 insertions(+)
> > >
> > >
> > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > index 72f04a717150..8e8ecddd7537 100644
> > > --- a/fs/xfs/scrub/repair.c
> > > +++ b/fs/xfs/scrub/repair.c
> > > @@ -361,3 +361,210 @@ xfs_repair_init_btblock(
> > >
> > > return 0;
> > > }
> > > +
> > > +/* Collect a dead btree extent for later disposal. */
> > > +int
> > > +xfs_repair_collect_btree_extent(
> > > + struct xfs_scrub_context *sc,
> > > + struct xfs_repair_extent_list *exlist,
> > > + xfs_fsblock_t fsbno,
> > > + xfs_extlen_t len)
> > > +{
> > > + struct xfs_repair_extent *rex;
> > > +
> > > + trace_xfs_repair_collect_btree_extent(sc->mp,
> > > + XFS_FSB_TO_AGNO(sc->mp, fsbno),
> > > + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> > > +
> > > + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
> > > + if (!rex)
> > > + return -ENOMEM;
> >
> > Is this in transaction context?
>
> Yes. After the setup function finishes we're required to own a
> transaction and hold a lock on whatever resource we're playing with.
>
> > Regardless, I think we need to run the entire of scrub/repair in a
> > memalloc_nofs_save() context so we don't have memory reclaim recursion
> > issues...
>
> xfs_trans_reserve should take care of this, right? So we don't have to
> feed KM_NOFS to kmem_*_alloc because this is already taken care of. The
> MAYFAIL exists because we prefer ENOMEM'ing out to pushing on reclaim.
Right, if we have an active transaction we are under NOFS allocation
conditions. I'm just not sure how much of scrub/repair is covered by
the transaction context (too early in the morning to go code
spelunking!).
w.r.t reclaim, NOFS allocations will still push on reclaim - NOFS
just means it won't push on any dirty file pages or scan/reclaim
filesystem caches.
> > > + while (lp != &exlist->list) {
> > > + ex = list_entry(lp, struct xfs_repair_extent, list);
> > > +
> > > + /*
> > > + * Advance subex and/or ex until we find a pair that
> > > + * intersect or we run out of extents.
> > > + */
> > > + while (subex->fsbno + subex->len <= ex->fsbno) {
> > > + if (list_is_last(&subex->list, &sublist->list))
> > > + goto out;
> > > + subex = list_next_entry(subex, list);
> > > + }
> >
> > So this is a O(n^2) algorithm, right? How does it scale with large
> > extent lists?
>
> I don't think this is O(n^2) -- each list sort is O(n log n).
I'm not worried about the list sort. :)
> Then we
> iterate exlist once, rolling forward through sublist as necessary. We
> never reset lp to the head of exlist nor do we reset subex to the head
> of sublist. We're not performing random lookups on the sublist extents
> at all.
Ah, I missed the fact the loop doesn't reset the subex list to the
start for each ex entry. Perhaps a better comment explaining the way
the algorithm steps through both lists?
> So far I haven't noticed /much/ heat from this routine even with
> deliberately aged filesystems, but that's probably due to the slab
> allocator eating way more time. :(
Perhaps it is worth looking at using named slab caches for some of
these objects to take some heat off of the heap slabs?
> > Given that these extents are dynamically allocated, and we're already
> > burning 16 bytes for a list head on each extent, would it be better to
> > use a smarter structure better suited for exact lookups? e.g. an
> > rbtree only takes an extra 8 bytes per extent, and we get O(log N)
> > searches on the inner loop here...
> >
> > I guess this isn't necessary to fix right now, but I think it's
> > going to be an issue, so maybe mark this down as "needing to be
> > fixed before removing EXPERIMENTAL tags"?
>
> I've thought about converting this to an rbtree or something, since
> these are basically glorified bitmap operations.
You can probably ignore that because I was thinking it was a full
subex list search for each ex, which is not the case...
>
> TBH the other thing that irks me about the current orepair design is its
> heavy reliance on creating potentially huge linked lists of the records
> that need to be put into the new structure. I'd really like a data
> structure where I can do fast unsorted appends without list overhead,
> sort the data structure once, and then insert in sorted order. The slab
> thing I put into xfs_repair provides this, but we can't easily allocate
> 128M of space in the kernel. I also want to do a bulk load of an empty
> btree leaf so that we log the leaf block once and update the node
> pointers once, kind of like what xfs_repair does during phase 5.
>
> ...all this optimization can come after merging.
*nod*.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-16 21:32 ` Dave Chinner
@ 2018-05-16 22:05 ` Darrick J. Wong
2018-05-17 0:41 ` Dave Chinner
0 siblings, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-16 22:05 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Thu, May 17, 2018 at 07:32:37AM +1000, Dave Chinner wrote:
> On Wed, May 16, 2018 at 11:01:27AM -0700, Darrick J. Wong wrote:
> > On Wed, May 16, 2018 at 05:56:52PM +1000, Dave Chinner wrote:
> > > On Tue, May 15, 2018 at 03:33:58PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > >
> > > > Add some helpers to assemble a list of fs block extents. Generally,
> > > > repair functions will iterate the rmapbt to make a list (1) of all
> > > > extents owned by the nominal owner of the metadata structure; then they
> > > > will iterate all other structures with the same rmap owner to make a
> > > > list (2) of active blocks; and finally we have a subtraction function to
> > > > subtract all the blocks in (2) from (1), with the result that (1) is now
> > > > a list of blocks that were owned by the old btree and must be disposed.
> > > >
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > > fs/xfs/scrub/repair.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > > fs/xfs/scrub/repair.h | 31 +++++++
> > > > 2 files changed, 238 insertions(+)
> > > >
> > > >
> > > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > > index 72f04a717150..8e8ecddd7537 100644
> > > > --- a/fs/xfs/scrub/repair.c
> > > > +++ b/fs/xfs/scrub/repair.c
> > > > @@ -361,3 +361,210 @@ xfs_repair_init_btblock(
> > > >
> > > > return 0;
> > > > }
> > > > +
> > > > +/* Collect a dead btree extent for later disposal. */
> > > > +int
> > > > +xfs_repair_collect_btree_extent(
> > > > + struct xfs_scrub_context *sc,
> > > > + struct xfs_repair_extent_list *exlist,
> > > > + xfs_fsblock_t fsbno,
> > > > + xfs_extlen_t len)
> > > > +{
> > > > + struct xfs_repair_extent *rex;
> > > > +
> > > > + trace_xfs_repair_collect_btree_extent(sc->mp,
> > > > + XFS_FSB_TO_AGNO(sc->mp, fsbno),
> > > > + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> > > > +
> > > > + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
> > > > + if (!rex)
> > > > + return -ENOMEM;
> > >
> > > Is this in transaction context?
> >
> > Yes. After the setup function finishes we're required to own a
> > transaction and hold a lock on whatever resource we're playing with.
> >
> > > Regardless, I think we need to run the entire of scrub/repair in a
> > > memalloc_nofs_save() context so we don't have memory reclaim recursion
> > > issues...
> >
> > xfs_trans_reserve should take care of this, right? So we don't have to
> > feed KM_NOFS to kmem_*_alloc because this is already taken care of. The
> > MAYFAIL exists because we prefer ENOMEM'ing out to pushing on reclaim.
>
> Right, if we have an active transaction we are under NOFS allocation
> conditions. I'm just not sure how much of scrub/repair is covered by
> the transaction context (too early in the morning to go code
> spelunking!).
Everything that runs between ->setup and xfs_scrub_teardown is covered
by transaction context.
> w.r.t reclaim, NOFS allocations will still push on reclaim - NOFS
> just means it won't push on any dirty file pages or scan/reclaim
> filesystem caches.
Noted. I was assuming that NOFS|NOIO meant it would only try reclaim in
other parts of the kernel, it sounds like we're ok here.
> > > > + while (lp != &exlist->list) {
> > > > + ex = list_entry(lp, struct xfs_repair_extent, list);
> > > > +
> > > > + /*
> > > > + * Advance subex and/or ex until we find a pair that
> > > > + * intersect or we run out of extents.
> > > > + */
> > > > + while (subex->fsbno + subex->len <= ex->fsbno) {
> > > > + if (list_is_last(&subex->list, &sublist->list))
> > > > + goto out;
> > > > + subex = list_next_entry(subex, list);
> > > > + }
> > >
> > > So this is a O(n^2) algorithm, right? How does it scale with large
> > > extent lists?
> >
> > I don't think this is O(n^2) -- each list sort is O(n log n).
>
> I'm not worried about the list sort. :)
>
> > Then we
> > iterate exlist once, rolling forward through sublist as necessary. We
> > never reset lp to the head of exlist nor do we reset subex to the head
> > of sublist. We're not performing random lookups on the sublist extents
> > at all.
>
> Ah, I missed the fact the loop doesn't reset the subex list to the
> start for each ex entry. Perhaps a better comment explaining the way
> the algorithm steps through both lists?
Ok:
/*
* Now that we've sorted both lists, we iterate exlist once, rolling
* forward through sublist and/or exlist as necessary until we find an
* overlap or reach the end of either list. We do not reset lp to the
* head of exlist nor do we reset subex to the head of sublist. The
* list traversal is similar to merge sort, but we're deleting instead.
* In this manner we avoid O(n^2) operations.
*/
> > So far I haven't noticed /much/ heat from this routine even with
> > deliberately aged filesystems, but that's probably due to the slab
> > allocator eating way more time. :(
>
> Perhaps it is worth looking at using named slab caches for some of
> these objects to take some heat off of the heap slabs?
Early versions of repair actually did that to try to avoid fragmenting
the power-of-2 slabs by using named slab caches, but slub emits udev
events whenever a named slab is created, and the resulting horrible
cacophony of udev rule processing ate an entire CPU core and uncovered
deadlocks in the parts of the slab management code that manage
/sys/kernel/slab. That code /really/ does not like you creating and
removing slabs concurrently.
As part of re-evaluating how orepair stores in-core records I will take
another pass at reducing the efficiency problems.
--D
> > > Given that these extents are dynamically allocated, and we're already
> > > burning 16 bytes for a list head on each extent, would it be better to
> > > use a smarter structure better suited for exact lookups? e.g. an
> > > rbtree only takes an extra 8 bytes per extent, and we get O(log N)
> > > searches on the inner loop here...
> > >
> > > I guess this isn't necessary to fix right now, but I think it's
> > > going to be an issue, so maybe mark this down as "needing to be
> > > fixed before removing EXPERIMENTAL tags"?
> >
> > I've thought about converting this to an rbtree or something, since
> > these are basically glorified bitmap operations.
>
> You can probably ignore that because I was thinking it was a full
> subex list search for each ex, which is not the case...
> >
> > TBH the other thing that irks me about the current orepair design is its
> > heavy reliance on creating potentially huge linked lists of the records
> > that need to be put into the new structure. I'd really like a data
> > structure where I can do fast unsorted appends without list overhead,
> > sort the data structure once, and then insert in sorted order. The slab
> > thing I put into xfs_repair provides this, but we can't easily allocate
> > 128M of space in the kernel. I also want to do a bulk load of an empty
> > btree leaf so that we log the leaf block once and update the node
> > pointers once, kind of like what xfs_repair does during phase 5.
> >
> > ...all this optimization can come after merging.
>
> *nod*.
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-16 22:05 ` Darrick J. Wong
@ 2018-05-17 0:41 ` Dave Chinner
2018-05-17 5:05 ` Darrick J. Wong
0 siblings, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-17 0:41 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Wed, May 16, 2018 at 03:05:37PM -0700, Darrick J. Wong wrote:
> On Thu, May 17, 2018 at 07:32:37AM +1000, Dave Chinner wrote:
> > On Wed, May 16, 2018 at 11:01:27AM -0700, Darrick J. Wong wrote:
> > > So far I haven't noticed /much/ heat from this routine even with
> > > deliberately aged filesystems, but that's probably due to the slab
> > > allocator eating way more time. :(
> >
> > Perhaps it is worth looking at using named slab caches for some of
> > these objects to take some heat off of the heap slabs?
>
> Early versions of repair actually did that to try to avoid fragmenting
> the power-of-2 slabs by using named slab caches, but slub emits udev
> events whenever a named slab is created, and the resulting horrible
Really? That's so gross!
Anyway, WTF are udev events on internal kernel slab cache creation
needed for?
> cacophony of udev rule processing ate an entire CPU core and uncovered
> deadlocks in the parts of the slab management code that manage
> /sys/kernel/slab. That code /really/ does not like you creating and
> removing slabs concurrently.
I wasn't suggesting dynamically creating slab caches during a repair
operation, just have one for the structure type created when we
first initialise the XFS kernel module like we do for xfs_buf,
xfs_inode, etc.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-17 0:41 ` Dave Chinner
@ 2018-05-17 5:05 ` Darrick J. Wong
0 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-17 5:05 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Thu, May 17, 2018 at 10:41:19AM +1000, Dave Chinner wrote:
> On Wed, May 16, 2018 at 03:05:37PM -0700, Darrick J. Wong wrote:
> > On Thu, May 17, 2018 at 07:32:37AM +1000, Dave Chinner wrote:
> > > On Wed, May 16, 2018 at 11:01:27AM -0700, Darrick J. Wong wrote:
> > > > So far I haven't noticed /much/ heat from this routine even with
> > > > deliberately aged filesystems, but that's probably due to the slab
> > > > allocator eating way more time. :(
> > >
> > > Perhaps it is worth looking at using named slab caches for some of
> > > these objects to take some heat off of the heap slabs?
> >
> > Early versions of repair actually did that to try to avoid fragmenting
> > the power-of-2 slabs by using named slab caches, but slub emits udev
> > events whenever a named slab is created, and the resulting horrible
>
> Really? That's so gross!
>
> Anyway, WTF are udev events on internal kernel slab cache creation
> needed for?
No idea. slab and slob don't do it, just slub.
> > cacophony of udev rule processing ate an entire CPU core and uncovered
> > deadlocks in the parts of the slab management code that manage
> > /sys/kernel/slab. That code /really/ does not like you creating and
> > removing slabs concurrently.
>
> I wasn't suggesting dynamically creating slab caches during a repair
> operation, just have one for the structure type created when we
> first initialise the XFS kernel module like we do for xfs_buf,
> xfs_inode, etc.
I know, I was abusing slab caches so that I could just dump everything
at the end by deleting the slab. :P
--D
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v2 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-15 22:33 ` [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair Darrick J. Wong
2018-05-16 7:56 ` Dave Chinner
@ 2018-05-18 3:51 ` Darrick J. Wong
2018-05-29 3:10 ` Dave Chinner
1 sibling, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-18 3:51 UTC (permalink / raw)
To: linux-xfs, david; +Cc: Allison Henderson
From: Darrick J. Wong <darrick.wong@oracle.com>
Add some helpers to assemble a list of fs block extents. Generally,
repair functions will iterate the rmapbt to make a list (1) of all
extents owned by the nominal owner of the metadata structure; then they
will iterate all other structures with the same rmap owner to make a
list (2) of active blocks; and finally we have a subtraction function to
subtract all the blocks in (2) from (1), with the result that (1) is now
a list of blocks that were owned by the old btree and must be disposed.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
v2: document the design of how repair functions are supposed to work,
implement review comments
---
fs/xfs/scrub/repair.c | 217 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 28 ++++++
2 files changed, 245 insertions(+)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index be21a2984001..fca8e3c7887d 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -368,3 +368,220 @@ xfs_repair_init_btblock(
return 0;
}
+
+/*
+ * Reconstructing per-AG Btrees
+ *
+ * When a space btree is corrupt, we don't bother trying to fix it. Instead,
+ * we scan secondary space metadata to derive the records that should be in
+ * the damaged btree, initialize a fresh btree root, and insert the records.
+ * Note that for rebuilding the rmapbt we scan all the primary data to
+ * generate the new records.
+ *
+ * However, that leaves the matter of removing all the metadata describing the
+ * old broken structure. For primary metadata we use the rmap data to collect
+ * every extent with a matching rmap owner (exlist); we then iterate all other
+ * metadata structures with the same rmap owner to collect the extents that
+ * cannot be removed (sublist). We then subtract sublist from exlist to
+ * derive the blocks that were used by the old btree. These blocks can be
+ * reaped.
+ *
+ * For rmapbt reconstructions we must use different tactics for extent
+ * collection. First we iterate all primary metadata (this excludes the old
+ * rmapbt, obviously) to generate new rmap records. The gaps in the rmap
+ * records are collected as exlist. The bnobt records are collected as
+ * sublist. As with the other btrees we subtract sublist from exlist, and the
+ * result (since the rmapbt lives in the free space) are the blocks from the
+ * old rmapbt.
+ */
+
+/* Collect a dead btree extent for later disposal. */
+int
+xfs_repair_collect_btree_extent(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t len)
+{
+ struct xfs_repair_extent *rex;
+
+ trace_xfs_repair_collect_btree_extent(sc->mp,
+ XFS_FSB_TO_AGNO(sc->mp, fsbno),
+ XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
+
+ rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
+ if (!rex)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&rex->list);
+ rex->fsbno = fsbno;
+ rex->len = len;
+ list_add_tail(&rex->list, &exlist->list);
+
+ return 0;
+}
+
+/*
+ * An error happened during the rebuild so the transaction will be cancelled.
+ * The fs will shut down, and the administrator has to unmount and run repair.
+ * Therefore, free all the memory associated with the list so we can die.
+ */
+void
+xfs_repair_cancel_btree_extents(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist)
+{
+ struct xfs_repair_extent *rex;
+ struct xfs_repair_extent *n;
+
+ for_each_xfs_repair_extent_safe(rex, n, exlist) {
+ list_del(&rex->list);
+ kmem_free(rex);
+ }
+}
+
+/* Compare two btree extents. */
+static int
+xfs_repair_btree_extent_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_repair_extent *ap;
+ struct xfs_repair_extent *bp;
+
+ ap = container_of(a, struct xfs_repair_extent, list);
+ bp = container_of(b, struct xfs_repair_extent, list);
+
+ if (ap->fsbno > bp->fsbno)
+ return 1;
+ if (ap->fsbno < bp->fsbno)
+ return -1;
+ return 0;
+}
+
+/*
+ * Remove all the blocks mentioned in @sublist from the extents in @exlist.
+ *
+ * The intent is that callers will iterate the rmapbt for all of its records
+ * for a given owner to generate @exlist; and iterate all the blocks of the
+ * metadata structures that are not being rebuilt and have the same rmapbt
+ * owner to generate @sublist. This routine subtracts all the extents
+ * mentioned in sublist from all the extents linked in @exlist, which leaves
+ * @exlist as the list of blocks that are not accounted for, which we assume
+ * are the dead blocks of the old metadata structure. The blocks mentioned in
+ * @exlist can be reaped.
+ */
+#define LEFT_ALIGNED (1 << 0)
+#define RIGHT_ALIGNED (1 << 1)
+int
+xfs_repair_subtract_extents(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_repair_extent_list *sublist)
+{
+ struct list_head *lp;
+ struct xfs_repair_extent *ex;
+ struct xfs_repair_extent *newex;
+ struct xfs_repair_extent *subex;
+ xfs_fsblock_t sub_fsb;
+ xfs_extlen_t sub_len;
+ int state;
+ int error = 0;
+
+ if (list_empty(&exlist->list) || list_empty(&sublist->list))
+ return 0;
+ ASSERT(!list_empty(&sublist->list));
+
+ list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
+ list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
+
+ /*
+ * Now that we've sorted both lists, we iterate exlist once, rolling
+ * forward through sublist and/or exlist as necessary until we find an
+ * overlap or reach the end of either list. We do not reset lp to the
+ * head of exlist nor do we reset subex to the head of sublist. The
+ * list traversal is similar to merge sort, but we're deleting
+ * instead. In this manner we avoid O(n^2) operations.
+ */
+ subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
+ list);
+ lp = exlist->list.next;
+ while (lp != &exlist->list) {
+ ex = list_entry(lp, struct xfs_repair_extent, list);
+
+ /*
+ * Advance subex and/or ex until we find a pair that
+ * intersect or we run out of extents.
+ */
+ while (subex->fsbno + subex->len <= ex->fsbno) {
+ if (list_is_last(&subex->list, &sublist->list))
+ goto out;
+ subex = list_next_entry(subex, list);
+ }
+ if (subex->fsbno >= ex->fsbno + ex->len) {
+ lp = lp->next;
+ continue;
+ }
+
+ /* trim subex to fit the extent we have */
+ sub_fsb = subex->fsbno;
+ sub_len = subex->len;
+ if (subex->fsbno < ex->fsbno) {
+ sub_len -= ex->fsbno - subex->fsbno;
+ sub_fsb = ex->fsbno;
+ }
+ if (sub_len > ex->len)
+ sub_len = ex->len;
+
+ state = 0;
+ if (sub_fsb == ex->fsbno)
+ state |= LEFT_ALIGNED;
+ if (sub_fsb + sub_len == ex->fsbno + ex->len)
+ state |= RIGHT_ALIGNED;
+ switch (state) {
+ case LEFT_ALIGNED:
+ /* Coincides with only the left. */
+ ex->fsbno += sub_len;
+ ex->len -= sub_len;
+ break;
+ case RIGHT_ALIGNED:
+ /* Coincides with only the right. */
+ ex->len -= sub_len;
+ lp = lp->next;
+ break;
+ case LEFT_ALIGNED | RIGHT_ALIGNED:
+ /* Total overlap, just delete ex. */
+ lp = lp->next;
+ list_del(&ex->list);
+ kmem_free(ex);
+ break;
+ case 0:
+ /*
+ * Deleting from the middle: add the new right extent
+ * and then shrink the left extent.
+ */
+ newex = kmem_alloc(sizeof(struct xfs_repair_extent),
+ KM_MAYFAIL);
+ if (!newex) {
+ error = -ENOMEM;
+ goto out;
+ }
+ INIT_LIST_HEAD(&newex->list);
+ newex->fsbno = sub_fsb + sub_len;
+ newex->len = ex->fsbno + ex->len - newex->fsbno;
+ list_add(&newex->list, &ex->list);
+ ex->len = sub_fsb - ex->fsbno;
+ lp = lp->next;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ }
+
+out:
+ return error;
+}
+#undef LEFT_ALIGNED
+#undef RIGHT_ALIGNED
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 40990fa5f381..ba1fdd7b9a79 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -43,6 +43,34 @@ int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb,
struct xfs_buf **bpp, xfs_btnum_t btnum,
const struct xfs_buf_ops *ops);
+struct xfs_repair_extent {
+ struct list_head list;
+ xfs_fsblock_t fsbno;
+ xfs_extlen_t len;
+};
+
+struct xfs_repair_extent_list {
+ struct list_head list;
+};
+
+static inline void
+xfs_repair_init_extent_list(
+ struct xfs_repair_extent_list *exlist)
+{
+ INIT_LIST_HEAD(&exlist->list);
+}
+
+#define for_each_xfs_repair_extent_safe(rbe, n, exlist) \
+ list_for_each_entry_safe((rbe), (n), &(exlist)->list, list)
+int xfs_repair_collect_btree_extent(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist, xfs_fsblock_t fsbno,
+ xfs_extlen_t len);
+void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist);
+int xfs_repair_subtract_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_repair_extent_list *sublist);
+
/* Metadata repairers */
int xfs_repair_probe(struct xfs_scrub_context *sc);
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH v2 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-18 3:51 ` [PATCH v2 " Darrick J. Wong
@ 2018-05-29 3:10 ` Dave Chinner
2018-05-29 15:28 ` Darrick J. Wong
0 siblings, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-29 3:10 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs, Allison Henderson
On Thu, May 17, 2018 at 08:51:55PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Add some helpers to assemble a list of fs block extents. Generally,
> repair functions will iterate the rmapbt to make a list (1) of all
> extents owned by the nominal owner of the metadata structure; then they
> will iterate all other structures with the same rmap owner to make a
> list (2) of active blocks; and finally we have a subtraction function to
> subtract all the blocks in (2) from (1), with the result that (1) is now
> a list of blocks that were owned by the old btree and must be disposed.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> v2: document the design of how repair functions are supposed to work,
> implement review comments
> ---
Much easier to understand now. Thanks!
Reviewed-by: Dave Chinner <dchinner@redhat.com>
One thing for a followup patch, though.
> +/* Collect a dead btree extent for later disposal. */
> +int
> +xfs_repair_collect_btree_extent(
> + struct xfs_scrub_context *sc,
> + struct xfs_repair_extent_list *exlist,
> + xfs_fsblock_t fsbno,
> + xfs_extlen_t len)
> +{
> + struct xfs_repair_extent *rex;
> +
> + trace_xfs_repair_collect_btree_extent(sc->mp,
> + XFS_FSB_TO_AGNO(sc->mp, fsbno),
> + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> +
> + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
> + if (!rex)
> + return -ENOMEM;
> +
> + INIT_LIST_HEAD(&rex->list);
> + rex->fsbno = fsbno;
> + rex->len = len;
This allocation is repeated a couple of times. A single function to
encapsulate alloc+setup would be nice - something like rex =
alloc_rex(fsbno, len), which returns NULL if it fails.
> + case LEFT_ALIGNED | RIGHT_ALIGNED:
> + /* Total overlap, just delete ex. */
> + lp = lp->next;
> + list_del(&ex->list);
> + kmem_free(ex);
> + break;
> + case 0:
> + /*
> + * Deleting from the middle: add the new right extent
> + * and then shrink the left extent.
> + */
> + newex = kmem_alloc(sizeof(struct xfs_repair_extent),
> + KM_MAYFAIL);
> + if (!newex) {
> + error = -ENOMEM;
> + goto out;
> + }
> + INIT_LIST_HEAD(&newex->list);
> + newex->fsbno = sub_fsb + sub_len;
> + newex->len = ex->fsbno + ex->len - newex->fsbno;
This is the other place it is called...
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v2 03/22] xfs: add helpers to collect and sift btree block pointers during repair
2018-05-29 3:10 ` Dave Chinner
@ 2018-05-29 15:28 ` Darrick J. Wong
0 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-29 15:28 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs, Allison Henderson
On Tue, May 29, 2018 at 01:10:01PM +1000, Dave Chinner wrote:
> On Thu, May 17, 2018 at 08:51:55PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> >
> > Add some helpers to assemble a list of fs block extents. Generally,
> > repair functions will iterate the rmapbt to make a list (1) of all
> > extents owned by the nominal owner of the metadata structure; then they
> > will iterate all other structures with the same rmap owner to make a
> > list (2) of active blocks; and finally we have a subtraction function to
> > subtract all the blocks in (2) from (1), with the result that (1) is now
> > a list of blocks that were owned by the old btree and must be disposed.
> >
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> > v2: document the design of how repair functions are supposed to work,
> > implement review comments
> > ---
>
> Much easier to understand now. Thanks!
>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
>
> One thing for a followup patch, though.
>
> > +/* Collect a dead btree extent for later disposal. */
> > +int
> > +xfs_repair_collect_btree_extent(
> > + struct xfs_scrub_context *sc,
> > + struct xfs_repair_extent_list *exlist,
> > + xfs_fsblock_t fsbno,
> > + xfs_extlen_t len)
> > +{
> > + struct xfs_repair_extent *rex;
> > +
> > + trace_xfs_repair_collect_btree_extent(sc->mp,
> > + XFS_FSB_TO_AGNO(sc->mp, fsbno),
> > + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> > +
> > + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
> > + if (!rex)
> > + return -ENOMEM;
> > +
> > + INIT_LIST_HEAD(&rex->list);
> > + rex->fsbno = fsbno;
> > + rex->len = len;
>
> This allocation is repeated a couple of times. Single function to do
> it would be nice to encapsulate alloc+setup - something like rex =
> alloc_rex(fsbno, len) which returns NULL if it fails.
>
> > + case LEFT_ALIGNED | RIGHT_ALIGNED:
> > + /* Total overlap, just delete ex. */
> > + lp = lp->next;
> > + list_del(&ex->list);
> > + kmem_free(ex);
> > + break;
> > + case 0:
> > + /*
> > + * Deleting from the middle: add the new right extent
> > + * and then shrink the left extent.
> > + */
> > + newex = kmem_alloc(sizeof(struct xfs_repair_extent),
> > + KM_MAYFAIL);
> > + if (!newex) {
> > + error = -ENOMEM;
> > + goto out;
> > + }
> > + INIT_LIST_HEAD(&newex->list);
> > + newex->fsbno = sub_fsb + sub_len;
> > + newex->len = ex->fsbno + ex->len - newex->fsbno;
>
> This is the other place it is called...
<nod> I'll add this to the pile of enhancements to come after
upstreaming; maybe this will turn into a more generic per-AG bitmap
structure.
I'd also like to replace the list usage in the later repair patches with
some kind of sort-once array structure and an xfs_btree_bulkload to
reduce overhead and memory consumption.
--D
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH 04/22] xfs: add helpers to dispose of old btree blocks after a repair
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (2 preceding siblings ...)
2018-05-15 22:33 ` [PATCH 03/22] xfs: add helpers to collect and sift btree block pointers during repair Darrick J. Wong
@ 2018-05-15 22:34 ` Darrick J. Wong
2018-05-16 8:32 ` Dave Chinner
2018-05-18 3:53 ` [PATCH v2 " Darrick J. Wong
2018-05-15 22:34 ` [PATCH 05/22] xfs: recover AG btree roots from rmap data Darrick J. Wong
` (18 subsequent siblings)
22 siblings, 2 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:34 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Now that we've plumbed in the ability to construct a list of dead btree
blocks following a repair, add more helpers to dispose of them. This is
done by examining the rmapbt -- if the btree was the only owner we can
free the block, otherwise it's crosslinked and we can only remove the
rmapbt record.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/repair.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 3 +
2 files changed, 202 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 8e8ecddd7537..d820e01d1146 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -362,6 +362,173 @@ xfs_repair_init_btblock(
return 0;
}
+/* Ensure the freelist is the correct size. */
+int
+xfs_repair_fix_freelist(
+ struct xfs_scrub_context *sc,
+ bool can_shrink)
+{
+ struct xfs_alloc_arg args = {0};
+ int error;
+
+ args.mp = sc->mp;
+ args.tp = sc->tp;
+ args.agno = sc->sa.agno;
+ args.alignment = 1;
+ args.pag = xfs_perag_get(args.mp, sc->sa.agno);
+
+ error = xfs_alloc_fix_freelist(&args,
+ can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+ xfs_perag_put(args.pag);
+
+ return error;
+}
+
+/*
+ * Put a block back on the AGFL.
+ */
+STATIC int
+xfs_repair_put_freelist(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_perag *pag;
+ int error;
+
+ /* Make sure there's space on the freelist. */
+ error = xfs_repair_fix_freelist(sc, true);
+ if (error)
+ return error;
+ pag = xfs_perag_get(sc->mp, sc->sa.agno);
+ if (pag->pagf_flcount == 0) {
+ xfs_perag_put(pag);
+ return -EFSCORRUPTED;
+ }
+ xfs_perag_put(pag);
+
+ /*
+ * Since we're "freeing" a lost block onto the AGFL, we have to
+ * create an rmap for the block prior to merging it or else other
+ * parts will break.
+ */
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
+ &oinfo);
+ if (error)
+ return error;
+
+ /* Put the block on the AGFL. */
+ error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
+ agbno, 0);
+ if (error)
+ return error;
+ xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+
+ return 0;
+}
+
+/* Dispose of a single metadata block. */
+STATIC int
+xfs_repair_dispose_btree_block(
+ struct xfs_scrub_context *sc,
+ xfs_fsblock_t fsbno,
+ struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type resv)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf_bp = NULL;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ bool has_other_rmap;
+ int error;
+
+ agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+
+ if (sc->ip) {
+ /* Repairing per-inode metadata, read in the AGF. */
+ error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
+ if (error)
+ return error;
+ if (!agf_bp)
+ return -ENOMEM;
+ } else {
+ /* Repairing per-AG btree, reuse existing AGF buffer. */
+ agf_bp = sc->sa.agf_bp;
+ }
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
+
+ /* Can we find any other rmappings? */
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
+ if (error)
+ goto out_cur;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ /*
+ * If there are other rmappings, this block is cross linked and must
+ * not be freed. Remove the reverse mapping and move on. Otherwise,
+ * we were the only owner of the block, so free the extent, which will
+ * also remove the rmap.
+ */
+ if (has_other_rmap)
+ error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
+ else if (resv == XFS_AG_RESV_AGFL)
+ error = xfs_repair_put_freelist(sc, agbno);
+ else
+ error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
+ if (agf_bp != sc->sa.agf_bp)
+ xfs_trans_brelse(sc->tp, agf_bp);
+ if (error)
+ return error;
+
+ if (sc->ip)
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+ return xfs_repair_roll_ag_trans(sc);
+
+out_cur:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ if (agf_bp != sc->sa.agf_bp)
+ xfs_trans_brelse(sc->tp, agf_bp);
+ return error;
+}
+
+/*
+ * Dispose of a given metadata extent.
+ *
+ * If the rmapbt says the extent has multiple owners, we simply remove the
+ * rmap associated with this owner hoping that we'll eventually disentangle
+ * the crosslinked metadata. Otherwise, there's one owner, so call the
+ * regular free code to remove the rmap and free the extent. Any existing
+ * buffers for the blocks in the extent must have been _binval'd previously.
+ */
+STATIC int
+xfs_repair_dispose_btree_extent(
+ struct xfs_scrub_context *sc,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type resv)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ ASSERT(xfs_sb_version_hasrmapbt(&mp->m_sb));
+ ASSERT(sc->ip != NULL || XFS_FSB_TO_AGNO(mp, fsbno) == sc->sa.agno);
+
+ trace_xfs_repair_dispose_btree_extent(mp, XFS_FSB_TO_AGNO(mp, fsbno),
+ XFS_FSB_TO_AGBNO(mp, fsbno), len);
+
+ for (; len > 0; len--, fsbno++) {
+ error = xfs_repair_dispose_btree_block(sc, fsbno, oinfo, resv);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
/* Collect a dead btree extent for later disposal. */
int
xfs_repair_collect_btree_extent(
@@ -404,7 +571,9 @@ xfs_repair_reap_btree_extents(
int error = 0;
for_each_xfs_repair_extent_safe(rex, n, exlist) {
- /* TODO: free the extent */
+ if (!error)
+ error = xfs_repair_dispose_btree_extent(sc, rex->fsbno,
+ rex->len, oinfo, type);
list_del(&rex->list);
kmem_free(rex);
}
@@ -568,3 +737,32 @@ xfs_repair_subtract_extents(
}
#undef XFS_REPAIR_EXT_LEFT_CONTIG
#undef XFS_REPAIR_EXT_RIGHT_CONTIG
+
+/*
+ * Invalidate buffers for per-AG btree blocks we're dumping. We assume that
+ * exlist points only to metadata blocks.
+ */
+int
+xfs_repair_invalidate_blocks(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist)
+{
+ struct xfs_repair_extent *rex;
+ struct xfs_repair_extent *n;
+ struct xfs_buf *bp;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_agblock_t i;
+
+ for_each_xfs_repair_extent_safe(rex, n, exlist) {
+ agno = XFS_FSB_TO_AGNO(sc->mp, rex->fsbno);
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno);
+ for (i = 0; i < rex->len; i++) {
+ bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
+ agbno + i, 0);
+ xfs_trans_binval(sc->tp, bp);
+ }
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index b288201030f8..e0e7f86d509c 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -73,6 +73,9 @@ void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc,
int xfs_repair_subtract_extents(struct xfs_scrub_context *sc,
struct xfs_repair_extent_list *exlist,
struct xfs_repair_extent_list *sublist);
+int xfs_repair_fix_freelist(struct xfs_scrub_context *sc, bool can_shrink);
+int xfs_repair_invalidate_blocks(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist);
/* Metadata repairers */
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH 04/22] xfs: add helpers to dispose of old btree blocks after a repair
2018-05-15 22:34 ` [PATCH 04/22] xfs: add helpers to dispose of old btree blocks after a repair Darrick J. Wong
@ 2018-05-16 8:32 ` Dave Chinner
2018-05-16 18:02 ` Allison Henderson
2018-05-16 19:34 ` Darrick J. Wong
2018-05-18 3:53 ` [PATCH v2 " Darrick J. Wong
1 sibling, 2 replies; 76+ messages in thread
From: Dave Chinner @ 2018-05-16 8:32 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Tue, May 15, 2018 at 03:34:04PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Now that we've plumbed in the ability to construct a list of dead btree
> blocks following a repair, add more helpers to dispose of them. This is
> done by examining the rmapbt -- if the btree was the only owner we can
> free the block, otherwise it's crosslinked and we can only remove the
> rmapbt record.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/xfs/scrub/repair.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 3 +
> 2 files changed, 202 insertions(+), 1 deletion(-)
>
>
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index 8e8ecddd7537..d820e01d1146 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -362,6 +362,173 @@ xfs_repair_init_btblock(
> return 0;
> }
>
> +/* Ensure the freelist is the correct size. */
> +int
> +xfs_repair_fix_freelist(
> + struct xfs_scrub_context *sc,
> + bool can_shrink)
> +{
> + struct xfs_alloc_arg args = {0};
> + int error;
> +
> + args.mp = sc->mp;
> + args.tp = sc->tp;
> + args.agno = sc->sa.agno;
> + args.alignment = 1;
> + args.pag = xfs_perag_get(args.mp, sc->sa.agno);
> +
> + error = xfs_alloc_fix_freelist(&args,
> + can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
> + xfs_perag_put(args.pag);
With all these pag lookups, I'm starting to wonder if you should
just add a lookup and store the pag in the scrub context? That'd
save a lot of lookups - the pag structures never go away and I never
really intended them to be used like this in single, very short use
contexts. Grab once per operation, hold the reference across the
entire operation context, then release it....
> + return error;
> +}
> +
> +/*
> + * Put a block back on the AGFL.
> + */
> +STATIC int
> +xfs_repair_put_freelist(
> + struct xfs_scrub_context *sc,
> + xfs_agblock_t agbno)
> +{
> + struct xfs_owner_info oinfo;
> + struct xfs_perag *pag;
> + int error;
> +
> + /* Make sure there's space on the freelist. */
> + error = xfs_repair_fix_freelist(sc, true);
> + if (error)
> + return error;
> + pag = xfs_perag_get(sc->mp, sc->sa.agno);
Because this is how it quickly gets to silly numbers of
lookups. That's two now in this function.
> + if (pag->pagf_flcount == 0) {
> + xfs_perag_put(pag);
> + return -EFSCORRUPTED;
Why is having an empty freelist a problem here? It's an AG that is
completely out of space, but it isn't corruption? And I don't see
why an empty freelist prevents us from adding a block back onto the
AGFL?
> + }
> + xfs_perag_put(pag);
> +
> + /*
> + * Since we're "freeing" a lost block onto the AGFL, we have to
> + * create an rmap for the block prior to merging it or else other
> + * parts will break.
> + */
> + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
> + &oinfo);
> + if (error)
> + return error;
> +
> + /* Put the block on the AGFL. */
> + error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
> + agbno, 0);
There's another pag lookup in here.
> + if (error)
> + return error;
> + xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
> + XFS_EXTENT_BUSY_SKIP_DISCARD);
And another in here, so 4 perag lookups for the same structure in
one simple operation. The code here in the function is fine, but I
really think we need to rethink how we use the perag in our
allocation code...
> +
> + return 0;
> +}
> +
> +/* Dispose of a single metadata block. */
> +STATIC int
> +xfs_repair_dispose_btree_block(
> + struct xfs_scrub_context *sc,
> + xfs_fsblock_t fsbno,
> + struct xfs_owner_info *oinfo,
> + enum xfs_ag_resv_type resv)
> +{
> + struct xfs_btree_cur *cur;
> + struct xfs_buf *agf_bp = NULL;
> + xfs_agnumber_t agno;
> + xfs_agblock_t agbno;
> + bool has_other_rmap;
> + int error;
> +
> + agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
> + agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
> +
> + if (sc->ip) {
> + /* Repairing per-inode metadata, read in the AGF. */
This would be better with a comment above saying:
/*
* If we are repairing per-inode metadata, we need to read
* in the AGF buffer. Otherwise we can re-use the existing
* AGF buffer we set up for repairing the per-AG btrees.
*/
> + error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
> + if (error)
> + return error;
> + if (!agf_bp)
> + return -ENOMEM;
> + } else {
> + /* Repairing per-AG btree, reuse existing AGF buffer. */
> + agf_bp = sc->sa.agf_bp;
> + }
> + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
> +
> + /* Can we find any other rmappings? */
> + error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
> + if (error)
> + goto out_cur;
> + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
> +
> + /*
> + * If there are other rmappings, this block is cross linked and must
> + * not be freed. Remove the reverse mapping and move on. Otherwise,
Why do we just remove the reverse mapping if the block cannot be
freed? I have my suspicions that this is removing cross-links one by
one until there's only one reference left to the extent, but then I
ask "how do we know which one is the correct mapping"?
i.e. this comment raised more questions about the algorithm for
dealing with cross-linked blocks - which doesn't appear to be
explained anywhere - than it answers....
> + * we were the only owner of the block, so free the extent, which will
> + * also remove the rmap.
> + */
> + if (has_other_rmap)
> + error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
> + else if (resv == XFS_AG_RESV_AGFL)
> + error = xfs_repair_put_freelist(sc, agbno);
> + else
> + error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
> + if (agf_bp != sc->sa.agf_bp)
> + xfs_trans_brelse(sc->tp, agf_bp);
> + if (error)
> + return error;
> +
> + if (sc->ip)
> + return xfs_trans_roll_inode(&sc->tp, sc->ip);
> + return xfs_repair_roll_ag_trans(sc);
> +
> +out_cur:
> + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
> + if (agf_bp != sc->sa.agf_bp)
> + xfs_trans_brelse(sc->tp, agf_bp);
> + return error;
> +}
> +
> +/*
> + * Dispose of a given metadata extent.
> + *
> + * If the rmapbt says the extent has multiple owners, we simply remove the
> + * rmap associated with this owner hoping that we'll eventually disentangle
> + * the crosslinked metadata. Otherwise, there's one owner, so call the
> + * regular free code to remove the rmap and free the extent. Any existing
> + * buffers for the blocks in the extent must have been _binval'd previously.
Ok, so there's a little more detail about cross-linked files. Seems
my suspicions of "remove and hope" were close to the mark :P
Perhaps we need a document that describes the various algorithms we
use for resolving these problems, so they can be discussed and
improved without having to troll through the code to understand?
> + */
> +STATIC int
> +xfs_repair_dispose_btree_extent(
> + struct xfs_scrub_context *sc,
> + xfs_fsblock_t fsbno,
> + xfs_extlen_t len,
> + struct xfs_owner_info *oinfo,
> + enum xfs_ag_resv_type resv)
> +{
> + struct xfs_mount *mp = sc->mp;
> + int error = 0;
> +
> + ASSERT(xfs_sb_version_hasrmapbt(&mp->m_sb));
> + ASSERT(sc->ip != NULL || XFS_FSB_TO_AGNO(mp, fsbno) == sc->sa.agno);
> +
> + trace_xfs_repair_dispose_btree_extent(mp, XFS_FSB_TO_AGNO(mp, fsbno),
> + XFS_FSB_TO_AGBNO(mp, fsbno), len);
> +
> + for (; len > 0; len--, fsbno++) {
> + error = xfs_repair_dispose_btree_block(sc, fsbno, oinfo, resv);
> + if (error)
> + return error;
So why do we do this one block at a time, rather than freeing it
as an entire extent in one go? And if there is only a single caller,
why not just open code the loop in the caller?
> +/*
> + * Invalidate buffers for per-AG btree blocks we're dumping. We assume that
> + * exlist points only to metadata blocks.
> + */
> +int
> +xfs_repair_invalidate_blocks(
> + struct xfs_scrub_context *sc,
> + struct xfs_repair_extent_list *exlist)
> +{
> + struct xfs_repair_extent *rex;
> + struct xfs_repair_extent *n;
> + struct xfs_buf *bp;
> + xfs_agnumber_t agno;
> + xfs_agblock_t agbno;
> + xfs_agblock_t i;
> +
> + for_each_xfs_repair_extent_safe(rex, n, exlist) {
> + agno = XFS_FSB_TO_AGNO(sc->mp, rex->fsbno);
> + agbno = XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno);
> + for (i = 0; i < rex->len; i++) {
> + bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
> + agbno + i, 0);
> + xfs_trans_binval(sc->tp, bp);
> + }
Again, this is doing things by single blocks. We do have multi-block
metadata (inodes, directory blocks, remote attrs) that, if it
is already in memory, needs to be treated as multi-block extents. If
we don't do that, we'll cause aliasing problems in the buffer cache
(see _xfs_buf_obj_cmp()) and it's all downhill from there.
That's why I get worried when I see assumptions that we can process
contiguous metadata ranges in single block buffers....
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 04/22] xfs: add helpers to dispose of old btree blocks after a repair
2018-05-16 8:32 ` Dave Chinner
@ 2018-05-16 18:02 ` Allison Henderson
2018-05-16 19:34 ` Darrick J. Wong
1 sibling, 0 replies; 76+ messages in thread
From: Allison Henderson @ 2018-05-16 18:02 UTC (permalink / raw)
To: Dave Chinner, Darrick J. Wong; +Cc: linux-xfs
Alrighty, with Dave's concerns addressed:
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
On 05/16/2018 01:32 AM, Dave Chinner wrote:
> On Tue, May 15, 2018 at 03:34:04PM -0700, Darrick J. Wong wrote:
>> From: Darrick J. Wong <darrick.wong@oracle.com>
>>
>> Now that we've plumbed in the ability to construct a list of dead btree
>> blocks following a repair, add more helpers to dispose of them. This is
>> done by examining the rmapbt -- if the btree was the only owner we can
>> free the block, otherwise it's crosslinked and we can only remove the
>> rmapbt record.
>>
>> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
>> ---
>> fs/xfs/scrub/repair.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++
>> fs/xfs/scrub/repair.h | 3 +
>> 2 files changed, 202 insertions(+), 1 deletion(-)
>>
>>
>> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
>> index 8e8ecddd7537..d820e01d1146 100644
>> --- a/fs/xfs/scrub/repair.c
>> +++ b/fs/xfs/scrub/repair.c
>> @@ -362,6 +362,173 @@ xfs_repair_init_btblock(
>> return 0;
>> }
>>
>> +/* Ensure the freelist is the correct size. */
>> +int
>> +xfs_repair_fix_freelist(
>> + struct xfs_scrub_context *sc,
>> + bool can_shrink)
>> +{
>> + struct xfs_alloc_arg args = {0};
>> + int error;
>> +
>> + args.mp = sc->mp;
>> + args.tp = sc->tp;
>> + args.agno = sc->sa.agno;
>> + args.alignment = 1;
>> + args.pag = xfs_perag_get(args.mp, sc->sa.agno);
>> +
>> + error = xfs_alloc_fix_freelist(&args,
>> + can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
>> + xfs_perag_put(args.pag);
> with all these pag lookups, I'm starting to wonder if you should
> just add a lookup and store the pag in the scrub context? That'd
> save a lot of lookups - the pag structures never go away and i never
> really intended them to be used like this in single, very short use
> contexts. Grab once per operation, hold the reference across the
> entire operation context, then release it....
>
>> + return error;
>> +}
>> +
>> +/*
>> + * Put a block back on the AGFL.
>> + */
>> +STATIC int
>> +xfs_repair_put_freelist(
>> + struct xfs_scrub_context *sc,
>> + xfs_agblock_t agbno)
>> +{
>> + struct xfs_owner_info oinfo;
>> + struct xfs_perag *pag;
>> + int error;
>> +
>> + /* Make sure there's space on the freelist. */
>> + error = xfs_repair_fix_freelist(sc, true);
>> + if (error)
>> + return error;
>> + pag = xfs_perag_get(sc->mp, sc->sa.agno);
> Because this is how it quickly gets it gets to silly numbers of
> lookups. That's two now in this function.
>
>> + if (pag->pagf_flcount == 0) {
>> + xfs_perag_put(pag);
>> + return -EFSCORRUPTED;
> Why is having an empty freelist a problem here? It's an AG thatis
> completely out of space, but it isn't corruption? And I don't see
> why an empty freelist prevents us from adding a backs back onto the
> AGFL?
>
>> + }
>> + xfs_perag_put(pag);
>> +
>> + /*
>> + * Since we're "freeing" a lost block onto the AGFL, we have to
>> + * create an rmap for the block prior to merging it or else other
>> + * parts will break.
>> + */
>> + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
>> + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
>> + &oinfo);
>> + if (error)
>> + return error;
>> +
>> + /* Put the block on the AGFL. */
>> + error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
>> + agbno, 0);
> There's another pag lookup in here.
>
>> + if (error)
>> + return error;
>> + xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
>> + XFS_EXTENT_BUSY_SKIP_DISCARD);
> And another in here, so 4 perag lookups for the same structure in
> one simple operation. The code here in the function is fine, but I
> really think we need to rethink how we use the perag in our
> allocation code...
>
>> +
>> + return 0;
>> +}
>> +
>> +/* Dispose of a single metadata block. */
>> +STATIC int
>> +xfs_repair_dispose_btree_block(
>> + struct xfs_scrub_context *sc,
>> + xfs_fsblock_t fsbno,
>> + struct xfs_owner_info *oinfo,
>> + enum xfs_ag_resv_type resv)
>> +{
>> + struct xfs_btree_cur *cur;
>> + struct xfs_buf *agf_bp = NULL;
>> + xfs_agnumber_t agno;
>> + xfs_agblock_t agbno;
>> + bool has_other_rmap;
>> + int error;
>> +
>> + agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
>> + agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
>> +
>> + if (sc->ip) {
>> + /* Repairing per-inode metadata, read in the AGF. */
> This would be better with a comment above saying:
>
> /*
> * If we are repairing per-inode metadata, we need to read
> * in the AGF buffer. Otherwise we can re-use the existing
> * AGF buffer we set up for repairing the per-AG btrees.
> */
>> + error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
>> + if (error)
>> + return error;
>> + if (!agf_bp)
>> + return -ENOMEM;
>> + } else {
>> + /* Repairing per-AG btree, reuse existing AGF buffer. */
>> + agf_bp = sc->sa.agf_bp;
>> + }
>> + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
>> +
>> + /* Can we find any other rmappings? */
>> + error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
>> + if (error)
>> + goto out_cur;
>> + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
>> +
>> + /*
>> + * If there are other rmappings, this block is cross linked and must
>> + * not be freed. Remove the reverse mapping and move on. Otherwise,
> Why do we just remove the reverse mapping if the block cannot be
> freed? I have my suspicions that this is removing cross-links one by
> one until there's only one reference left to the extent, but then I
> ask "how do we know which one is the correct mapping"?
>
> i.e. this comment raised more questions about the algorithm for
> dealing with cross-linked blocks - which doesn't appear to be
> explained anywhere - than it answers....
>
>> + * we were the only owner of the block, so free the extent, which will
>> + * also remove the rmap.
>> + */
>> + if (has_other_rmap)
>> + error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
>> + else if (resv == XFS_AG_RESV_AGFL)
>> + error = xfs_repair_put_freelist(sc, agbno);
>> + else
>> + error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
>> + if (agf_bp != sc->sa.agf_bp)
>> + xfs_trans_brelse(sc->tp, agf_bp);
>> + if (error)
>> + return error;
>> +
>> + if (sc->ip)
>> + return xfs_trans_roll_inode(&sc->tp, sc->ip);
>> + return xfs_repair_roll_ag_trans(sc);
>> +
>> +out_cur:
>> + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
>> + if (agf_bp != sc->sa.agf_bp)
>> + xfs_trans_brelse(sc->tp, agf_bp);
>> + return error;
>> +}
>> +
>> +/*
>> + * Dispose of a given metadata extent.
>> + *
>> + * If the rmapbt says the extent has multiple owners, we simply remove the
>> + * rmap associated with this owner hoping that we'll eventually disentangle
>> + * the crosslinked metadata. Otherwise, there's one owner, so call the
>> + * regular free code to remove the rmap and free the extent. Any existing
>> + * buffers for the blocks in the extent must have been _binval'd previously.
> Ok, so there's a little more detail about cross-linked files. Seems
> my suspicions of "remove and hope" were close to the mark :P
>
> Perhaps we need a document that describes the various algorithms we
> use for resolving these problems, so they can be discussed and
> improved without having to troll through the code to understand?
>
>> + */
>> +STATIC int
>> +xfs_repair_dispose_btree_extent(
>> + struct xfs_scrub_context *sc,
>> + xfs_fsblock_t fsbno,
>> + xfs_extlen_t len,
>> + struct xfs_owner_info *oinfo,
>> + enum xfs_ag_resv_type resv)
>> +{
>> + struct xfs_mount *mp = sc->mp;
>> + int error = 0;
>> +
>> + ASSERT(xfs_sb_version_hasrmapbt(&mp->m_sb));
>> + ASSERT(sc->ip != NULL || XFS_FSB_TO_AGNO(mp, fsbno) == sc->sa.agno);
>> +
>> + trace_xfs_repair_dispose_btree_extent(mp, XFS_FSB_TO_AGNO(mp, fsbno),
>> + XFS_FSB_TO_AGBNO(mp, fsbno), len);
>> +
>> + for (; len > 0; len--, fsbno++) {
>> + error = xfs_repair_dispose_btree_block(sc, fsbno, oinfo, resv);
>> + if (error)
>> + return error;
> So why do we do this one block at a time, rather than freeing it
> as an entire extent in one go? And if there is only a single caller,
> why not just open code the loop in the caller?
>
>
>> +/*
>> + * Invalidate buffers for per-AG btree blocks we're dumping. We assume that
>> + * exlist points only to metadata blocks.
>> + */
>> +int
>> +xfs_repair_invalidate_blocks(
>> + struct xfs_scrub_context *sc,
>> + struct xfs_repair_extent_list *exlist)
>> +{
>> + struct xfs_repair_extent *rex;
>> + struct xfs_repair_extent *n;
>> + struct xfs_buf *bp;
>> + xfs_agnumber_t agno;
>> + xfs_agblock_t agbno;
>> + xfs_agblock_t i;
>> +
>> + for_each_xfs_repair_extent_safe(rex, n, exlist) {
>> + agno = XFS_FSB_TO_AGNO(sc->mp, rex->fsbno);
>> + agbno = XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno);
>> + for (i = 0; i < rex->len; i++) {
>> + bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
>> + agbno + i, 0);
>> + xfs_trans_binval(sc->tp, bp);
>> + }
> Again, this is doing things by single blocks. We do have multi-block
> metadata (inodes, directory blocks, remote attrs) that, if it
> is already in memory, needs to be treated as multi-block extents. If
> we don't do that, we'll cause aliasing problems in the buffer cache
> (see _xfs_buf_obj_cmp()) and it's all downhill from there.
>
> That's why I get worried when I see assumptions that we can process
> contiguous metadata ranges in single block buffers....
>
> Cheers,
>
> Dave.
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 04/22] xfs: add helpers to dispose of old btree blocks after a repair
2018-05-16 8:32 ` Dave Chinner
2018-05-16 18:02 ` Allison Henderson
@ 2018-05-16 19:34 ` Darrick J. Wong
2018-05-16 22:32 ` Dave Chinner
1 sibling, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-16 19:34 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Wed, May 16, 2018 at 06:32:32PM +1000, Dave Chinner wrote:
> On Tue, May 15, 2018 at 03:34:04PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> >
> > Now that we've plumbed in the ability to construct a list of dead btree
> > blocks following a repair, add more helpers to dispose of them. This is
> > done by examining the rmapbt -- if the btree was the only owner we can
> > free the block, otherwise it's crosslinked and we can only remove the
> > rmapbt record.
> >
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> > fs/xfs/scrub/repair.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++
> > fs/xfs/scrub/repair.h | 3 +
> > 2 files changed, 202 insertions(+), 1 deletion(-)
> >
> >
> > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > index 8e8ecddd7537..d820e01d1146 100644
> > --- a/fs/xfs/scrub/repair.c
> > +++ b/fs/xfs/scrub/repair.c
> > @@ -362,6 +362,173 @@ xfs_repair_init_btblock(
> > return 0;
> > }
> >
> > +/* Ensure the freelist is the correct size. */
> > +int
> > +xfs_repair_fix_freelist(
> > + struct xfs_scrub_context *sc,
> > + bool can_shrink)
> > +{
> > + struct xfs_alloc_arg args = {0};
> > + int error;
> > +
> > + args.mp = sc->mp;
> > + args.tp = sc->tp;
> > + args.agno = sc->sa.agno;
> > + args.alignment = 1;
> > + args.pag = xfs_perag_get(args.mp, sc->sa.agno);
> > +
> > + error = xfs_alloc_fix_freelist(&args,
> > + can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
> > + xfs_perag_put(args.pag);
>
> with all these pag lookups, I'm starting to wonder if you should
> just add a lookup and store the pag in the scrub context? That'd
> save a lot of lookups - the pag structures never go away and i never
> really intended them to be used like this in single, very short use
> contexts. Grab once per operation, hold the reference across the
> entire operation context, then release it....
Ok, I'll change xfs_scrub_ag_{init,free} to get / put the perag
structure so we don't have to do this repeatedly.
> > + return error;
> > +}
> > +
> > +/*
> > + * Put a block back on the AGFL.
> > + */
> > +STATIC int
> > +xfs_repair_put_freelist(
> > + struct xfs_scrub_context *sc,
> > + xfs_agblock_t agbno)
> > +{
> > + struct xfs_owner_info oinfo;
> > + struct xfs_perag *pag;
> > + int error;
> > +
> > + /* Make sure there's space on the freelist. */
> > + error = xfs_repair_fix_freelist(sc, true);
> > + if (error)
> > + return error;
> > + pag = xfs_perag_get(sc->mp, sc->sa.agno);
>
> Because this is how it quickly gets it gets to silly numbers of
> lookups. That's two now in this function.
>
> > + if (pag->pagf_flcount == 0) {
> > + xfs_perag_put(pag);
> > + return -EFSCORRUPTED;
>
> Why is having an empty freelist a problem here? It's an AG thatis
> completely out of space, but it isn't corruption? And I don't see
> why an empty freelist prevents us from adding a backs back onto the
> AGFL?
>
> > + }
> > + xfs_perag_put(pag);
> > +
> > + /*
> > + * Since we're "freeing" a lost block onto the AGFL, we have to
> > + * create an rmap for the block prior to merging it or else other
> > + * parts will break.
> > + */
> > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
> > + &oinfo);
> > + if (error)
> > + return error;
> > +
> > + /* Put the block on the AGFL. */
> > + error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
> > + agbno, 0);
>
> There's another pag lookup in here.
>
> > + if (error)
> > + return error;
> > + xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
> > + XFS_EXTENT_BUSY_SKIP_DISCARD);
>
> And another in here, so 4 perag lookups for the same structure in
> one simple operation. The code here in the function is fine, but I
> really think we need to rethink how we use the perag in our
> allocation code...
Yeah. If we ever introduce the ability to lock an AG for the duration
of a transaction then we might consider linking the perag structures
through the transaction at the same time so that these library functions
can skip the _perag_get.
> > +
> > + return 0;
> > +}
> > +
> > +/* Dispose of a single metadata block. */
> > +STATIC int
> > +xfs_repair_dispose_btree_block(
> > + struct xfs_scrub_context *sc,
> > + xfs_fsblock_t fsbno,
> > + struct xfs_owner_info *oinfo,
> > + enum xfs_ag_resv_type resv)
> > +{
> > + struct xfs_btree_cur *cur;
> > + struct xfs_buf *agf_bp = NULL;
> > + xfs_agnumber_t agno;
> > + xfs_agblock_t agbno;
> > + bool has_other_rmap;
> > + int error;
> > +
> > + agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
> > + agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
> > +
> > + if (sc->ip) {
> > + /* Repairing per-inode metadata, read in the AGF. */
>
> This would be better with a comment above saying:
>
> /*
> * If we are repairing per-inode metadata, we need to read
> * in the AGF buffer. Otherwise we can re-use the existing
> * AGF buffer we set up for repairing the per-AG btrees.
> */
Ok.
> > + error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
> > + if (error)
> > + return error;
> > + if (!agf_bp)
> > + return -ENOMEM;
> > + } else {
> > + /* Repairing per-AG btree, reuse existing AGF buffer. */
> > + agf_bp = sc->sa.agf_bp;
> > + }
> > + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
> > +
> > + /* Can we find any other rmappings? */
> > + error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
> > + if (error)
> > + goto out_cur;
> > + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
> > +
> > + /*
> > + * If there are other rmappings, this block is cross linked and must
> > + * not be freed. Remove the reverse mapping and move on. Otherwise,
>
> Why do we just remove the reverse mapping if the block cannot be
> freed? I have my suspicions that this is removing cross-links one by
> one until there's only one reference left to the extent, but then I
> ask "how do we know which one is the correct mapping"?
Right. Prior to calling this function we built a totally new btree with
blocks from the freespace, so now we need to remove the rmaps that
covered the old btree and/or free the block. The goal is to rebuild
/all/ the trees that think they own this block so that we can free the
block and not have to care which one is correct.
> i.e. this comment raised more questions about the algorithm for
> dealing with cross-linked blocks - which doesn't appear to be
> explained anywhere - than it answers....
>
> > + * we were the only owner of the block, so free the extent, which will
> > + * also remove the rmap.
> > + */
> > + if (has_other_rmap)
> > + error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
> > + else if (resv == XFS_AG_RESV_AGFL)
> > + error = xfs_repair_put_freelist(sc, agbno);
> > + else
> > + error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
> > + if (agf_bp != sc->sa.agf_bp)
> > + xfs_trans_brelse(sc->tp, agf_bp);
> > + if (error)
> > + return error;
> > +
> > + if (sc->ip)
> > + return xfs_trans_roll_inode(&sc->tp, sc->ip);
> > + return xfs_repair_roll_ag_trans(sc);
> > +
> > +out_cur:
> > + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
> > + if (agf_bp != sc->sa.agf_bp)
> > + xfs_trans_brelse(sc->tp, agf_bp);
> > + return error;
> > +}
> > +
> > +/*
> > + * Dispose of a given metadata extent.
> > + *
> > + * If the rmapbt says the extent has multiple owners, we simply remove the
> > + * rmap associated with this owner hoping that we'll eventually disentangle
> > + * the crosslinked metadata. Otherwise, there's one owner, so call the
> > + * regular free code to remove the rmap and free the extent. Any existing
> > + * buffers for the blocks in the extent must have been _binval'd previously.
>
> Ok, so there's a little more detail about cross-linked files. Seems
> my suspicions of "remove and hope" were close to the mark :P
>
> Perhaps we need a document that describes the various algorithms we
> use for resolving these problems, so they can be discussed and
> improved without having to troll through the code to understand?
Yeah. I'll work on consolidating the crosslink functions after lunch
and documenting how the tree rebuilder interacts with fixing crosslinks.
> > + */
> > +STATIC int
> > +xfs_repair_dispose_btree_extent(
> > + struct xfs_scrub_context *sc,
> > + xfs_fsblock_t fsbno,
> > + xfs_extlen_t len,
> > + struct xfs_owner_info *oinfo,
> > + enum xfs_ag_resv_type resv)
> > +{
> > + struct xfs_mount *mp = sc->mp;
> > + int error = 0;
> > +
> > + ASSERT(xfs_sb_version_hasrmapbt(&mp->m_sb));
> > + ASSERT(sc->ip != NULL || XFS_FSB_TO_AGNO(mp, fsbno) == sc->sa.agno);
> > +
> > + trace_xfs_repair_dispose_btree_extent(mp, XFS_FSB_TO_AGNO(mp, fsbno),
> > + XFS_FSB_TO_AGBNO(mp, fsbno), len);
> > +
> > + for (; len > 0; len--, fsbno++) {
> > + error = xfs_repair_dispose_btree_block(sc, fsbno, oinfo, resv);
> > + if (error)
> > + return error;
>
> So why do we do this one block at a time, rather than freeing it
> as an entire extent in one go?
At the moment the xfs_rmap_has_other_keys helper can only tell you if
there are multiple rmap owners for any part of a given extent. For
example, if the rmap records were:
(start = 35, len = 3, owner = rmap)
(start = 35, len = 1, owner = refcount)
(start = 37, len = 1, owner = inobt)
Notice how blocks 35 and 37 are crosslinked, but 36 isn't. A call to
xfs_rmap_has_other_keys(35, 3) will say "yes" but doesn't have a way to
signal back that the yes applies to 35 but that the caller should try
again with block 36. Doing so would require _has_other_keys to maintain
a refcount and to return to the caller any time the refcount changed,
and the caller would still have to loop the extent. It's easier to have
a dumb loop for the initial implementation and optimize it if we start
taking more heat than we'd like on crosslinked filesystems.
> And if there is only a single caller, why not just open code the loop
> in the caller?
I certainly could do that.
>
> > +/*
> > + * Invalidate buffers for per-AG btree blocks we're dumping. We assume that
> > + * exlist points only to metadata blocks.
> > + */
> > +int
> > +xfs_repair_invalidate_blocks(
> > + struct xfs_scrub_context *sc,
> > + struct xfs_repair_extent_list *exlist)
> > +{
> > + struct xfs_repair_extent *rex;
> > + struct xfs_repair_extent *n;
> > + struct xfs_buf *bp;
> > + xfs_agnumber_t agno;
> > + xfs_agblock_t agbno;
> > + xfs_agblock_t i;
> > +
> > + for_each_xfs_repair_extent_safe(rex, n, exlist) {
> > + agno = XFS_FSB_TO_AGNO(sc->mp, rex->fsbno);
> > + agbno = XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno);
> > + for (i = 0; i < rex->len; i++) {
> > + bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
> > + agbno + i, 0);
> > + xfs_trans_binval(sc->tp, bp);
> > + }
>
> Again, this is doing things by single blocks. We do have multi-block
> metadata (inodes, directory blocks, remote attrs) that, if it
> is already in memory, needs to be treated as multi-block extents. If
> we don't do that, we'll cause aliasing problems in the buffer cache
> (see _xfs_buf_obj_cmp()) and it's all downhill from there.
I only recently started testing with filesystems containing multiblock
dir/rmt metadata, and this is an unsolved problem. :(
I /think/ the solution is that we need to query the buffer cache to see
if it has a buffer for the given disk blocks, and if it matches the
btree we're discarding (correct magic/uuid/b_length) then we invalidate
it, otherwise we assume that something else owns it and move on. I
think that we have to do this on a block-by-block basis since we have
no idea what else could be crosslinked with the old btree.
for_each_xfs_repair_extent_safe(rex, n, exlist) {
agno = ...;
agbno = ...;
for (i = 0; i < rex->len; i++) {
/* don't touch ag headers or post-eofs blocks */
if (!xfs_verify_agbno(sc->mp, agno, agbno))
continue;
bp = xfs_buf_get_anything_caching_this_block(...);
if (!bp)
continue;
if (bp does not belong to this tree) {
xfs_trans_brelse(sc->tp, bp);
continue;
}
xfs_trans_binval(sc->tp, bp);
}
}
> That's why I get worried when I see assumptions that we can process
> contiguous metadata ranges in single block buffers....
:/
--D
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 04/22] xfs: add helpers to dispose of old btree blocks after a repair
2018-05-16 19:34 ` Darrick J. Wong
@ 2018-05-16 22:32 ` Dave Chinner
2018-05-16 23:18 ` Darrick J. Wong
0 siblings, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-16 22:32 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Wed, May 16, 2018 at 12:34:25PM -0700, Darrick J. Wong wrote:
> On Wed, May 16, 2018 at 06:32:32PM +1000, Dave Chinner wrote:
> > On Tue, May 15, 2018 at 03:34:04PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > >
> > > Now that we've plumbed in the ability to construct a list of dead btree
> > > blocks following a repair, add more helpers to dispose of them. This is
> > > done by examining the rmapbt -- if the btree was the only owner we can
> > > free the block, otherwise it's crosslinked and we can only remove the
> > > rmapbt record.
> > >
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
[...]
> > > + struct xfs_owner_info oinfo;
> > > + struct xfs_perag *pag;
> > > + int error;
> > > +
> > > + /* Make sure there's space on the freelist. */
> > > + error = xfs_repair_fix_freelist(sc, true);
> > > + if (error)
> > > + return error;
> > > + pag = xfs_perag_get(sc->mp, sc->sa.agno);
> >
> > Because this is how it quickly gets it gets to silly numbers of
> > lookups. That's two now in this function.
> >
> > > + if (pag->pagf_flcount == 0) {
> > > + xfs_perag_put(pag);
> > > + return -EFSCORRUPTED;
> >
> > Why is having an empty freelist a problem here? It's an AG thatis
> > completely out of space, but it isn't corruption? And I don't see
> > why an empty freelist prevents us from adding a backs back onto the
> > AGFL?
I think you missed a question :P
> > > + /* Can we find any other rmappings? */
> > > + error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
> > > + if (error)
> > > + goto out_cur;
> > > + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
> > > +
> > > + /*
> > > + * If there are other rmappings, this block is cross linked and must
> > > + * not be freed. Remove the reverse mapping and move on. Otherwise,
> >
> > Why do we just remove the reverse mapping if the block cannot be
> > freed? I have my suspicions that this is removing cross-links one by
> > one until there's only one reference left to the extent, but then I
> > ask "how do we know which one is the correct mapping"?
>
> Right. Prior to calling this function we built a totally new btree with
> blocks from the freespace, so now we need to remove the rmaps that
> covered the old btree and/or free the block. The goal is to rebuild
> /all/ the trees that think they own this block so that we can free the
> block and not have to care which one is correct.
Ok, so we've already rebuilt the new btree, and this is removing
stale references to cross-linked blocks that have owners different
to the one we are currently scanning.
What happens if the cross-linked block is cross-linked within the
same owner context?
> > > + struct xfs_scrub_context *sc,
> > > + xfs_fsblock_t fsbno,
> > > + xfs_extlen_t len,
> > > + struct xfs_owner_info *oinfo,
> > > + enum xfs_ag_resv_type resv)
> > > +{
> > > + struct xfs_mount *mp = sc->mp;
> > > + int error = 0;
> > > +
> > > + ASSERT(xfs_sb_version_hasrmapbt(&mp->m_sb));
> > > + ASSERT(sc->ip != NULL || XFS_FSB_TO_AGNO(mp, fsbno) == sc->sa.agno);
> > > +
> > > + trace_xfs_repair_dispose_btree_extent(mp, XFS_FSB_TO_AGNO(mp, fsbno),
> > > + XFS_FSB_TO_AGBNO(mp, fsbno), len);
> > > +
> > > + for (; len > 0; len--, fsbno++) {
> > > + error = xfs_repair_dispose_btree_block(sc, fsbno, oinfo, resv);
> > > + if (error)
> > > + return error;
> >
> > So why do we do this one block at a time, rather than freeing it
> > as an entire extent in one go?
>
> At the moment the xfs_rmap_has_other_keys helper can only tell you if
> there are multiple rmap owners for any part of a given extent. For
> example, if the rmap records were:
>
> (start = 35, len = 3, owner = rmap)
> (start = 35, len = 1, owner = refcount)
> (start = 37, len = 1, owner = inobt)
>
> Notice how block 35 and 37 are crosslinked, but 36 isn't. A call to
> xfs_rmap_has_other_keys(35, 3) will say "yes" but doesn't have a way to
> signal back that the yes applies to 35 but that the caller should try
> again with block 36. Doing so would require _has_other_keys to maintain
> a refcount and to return to the caller any time the refcount changed,
> and the caller would still have to loop the extent. It's easier to have
> a dumb loop for the initial implementation and optimize it if we start
> taking more heat than we'd like on crosslinked filesystems.
Well, I can see why you are doing this now, but the problems with
multi-block metadata make me think that we really need to know more
detail of the owner in the rmap. e.g. that it's directory or
attribute data, not user file data and hence we can infer things
about expected block sizes, do the correct sort of buffer lookups
for invalidation, etc.
I'm tending towards "this needs a design doc to explain all
this stuff" right now. Code is great, but I'm struggling to understand
(reverse engineer!) all the algorithms and decisions that have been
made from the code...
> > > +/*
> > > + * Invalidate buffers for per-AG btree blocks we're dumping. We assume that
> > > + * exlist points only to metadata blocks.
> > > + */
> > > +int
> > > +xfs_repair_invalidate_blocks(
> > > + struct xfs_scrub_context *sc,
> > > + struct xfs_repair_extent_list *exlist)
> > > +{
> > > + struct xfs_repair_extent *rex;
> > > + struct xfs_repair_extent *n;
> > > + struct xfs_buf *bp;
> > > + xfs_agnumber_t agno;
> > > + xfs_agblock_t agbno;
> > > + xfs_agblock_t i;
> > > +
> > > + for_each_xfs_repair_extent_safe(rex, n, exlist) {
> > > + agno = XFS_FSB_TO_AGNO(sc->mp, rex->fsbno);
> > > + agbno = XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno);
> > > + for (i = 0; i < rex->len; i++) {
> > > + bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
> > > + agbno + i, 0);
> > > + xfs_trans_binval(sc->tp, bp);
> > > + }
> >
> > Again, this is doing things by single blocks. We do have multi-block
> > metadata (inodes, directory blocks, remote attrs) that, if it
> > is already in memory, needs to be treated as multi-block extents. If
> > we don't do that, we'll cause aliasing problems in the buffer cache
> > (see _xfs_buf_obj_cmp()) and it's all downhill from there.
>
> I only recently started testing with filesystems containing multiblock
> dir/rmt metadata, and this is an unsolved problem. :(
That needs documenting, too. Perhaps explicitly, by rejecting repair
requests on filesystems or types that have multi-block constructs
until we solve these problems.
> I /think/ the solution is that we need to query the buffer cache to see
> if it has a buffer for the given disk blocks, and if it matches the
> btree we're discarding (correct magic/uuid/b_length) then we invalidate
> it,
I don't think that provides any guarantees. Even ignoring all the
problems with invalidation while the buffer is dirty and tracked in
the AIL, there's nothing stopping the other code from attempting to
re-instantiate the buffer due to some other access. And then we
have aliasing problems again....
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 04/22] xfs: add helpers to dispose of old btree blocks after a repair
2018-05-16 22:32 ` Dave Chinner
@ 2018-05-16 23:18 ` Darrick J. Wong
2018-05-17 5:58 ` Darrick J. Wong
0 siblings, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-16 23:18 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Thu, May 17, 2018 at 08:32:25AM +1000, Dave Chinner wrote:
> On Wed, May 16, 2018 at 12:34:25PM -0700, Darrick J. Wong wrote:
> > On Wed, May 16, 2018 at 06:32:32PM +1000, Dave Chinner wrote:
> > > On Tue, May 15, 2018 at 03:34:04PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > >
> > > > Now that we've plumbed in the ability to construct a list of dead btree
> > > > blocks following a repair, add more helpers to dispose of them. This is
> > > > done by examining the rmapbt -- if the btree was the only owner we can
> > > > free the block, otherwise it's crosslinked and we can only remove the
> > > > rmapbt record.
> > > >
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
>
> [...]
>
> > > > + struct xfs_owner_info oinfo;
> > > > + struct xfs_perag *pag;
> > > > + int error;
> > > > +
> > > > + /* Make sure there's space on the freelist. */
> > > > + error = xfs_repair_fix_freelist(sc, true);
> > > > + if (error)
> > > > + return error;
> > > > + pag = xfs_perag_get(sc->mp, sc->sa.agno);
> > >
> > > Because this is how it quickly gets it gets to silly numbers of
> > > lookups. That's two now in this function.
> > >
> > > > + if (pag->pagf_flcount == 0) {
> > > > + xfs_perag_put(pag);
> > > > + return -EFSCORRUPTED;
> > >
> > > Why is having an empty freelist a problem here? It's an AG thatis
> > > completely out of space, but it isn't corruption? And I don't see
> > > why an empty freelist prevents us from adding a backs back onto the
> > > AGFL?
>
> I think you missed a question :P
Doh, sorry. I don't remember exactly why I put that in there; judging
from my notes I think the idea was that if the AG is completely full
we'd rather shut down with a corruption signal hoping that the admin
will run xfs_repair.
I also don't see why it's necessary now, I'll see what happens if I
remove it.
> > > > + /* Can we find any other rmappings? */
> > > > + error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
> > > > + if (error)
> > > > + goto out_cur;
> > > > + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
> > > > +
> > > > + /*
> > > > + * If there are other rmappings, this block is cross linked and must
> > > > + * not be freed. Remove the reverse mapping and move on. Otherwise,
> > >
> > > Why do we just remove the reverse mapping if the block cannot be
> > > freed? I have my suspicions that this is removing cross-links one by
> > > one until there's only one reference left to the extent, but then I
> > > ask "how do we know which one is the correct mapping"?
> >
> > Right. Prior to calling this function we built a totally new btree with
> > blocks from the freespace, so now we need to remove the rmaps that
> > covered the old btree and/or free the block. The goal is to rebuild
> > /all/ the trees that think they own this block so that we can free the
> > block and not have to care which one is correct.
>
> Ok, so we've already rebuilt the new btree, and this is removing
> stale references to cross-linked blocks that have owners different
> to the one we are currently scanning.
>
> What happens if the cross-linked block is cross-linked within the
> same owner context?
It won't end up on the reap list in the first place, because we scan every
block of every object with the same rmap owner to construct sublist.
Then we subtract sublist from exlist (which we got from rmap) and only
reap the difference.
> > > > + struct xfs_scrub_context *sc,
> > > > + xfs_fsblock_t fsbno,
> > > > + xfs_extlen_t len,
> > > > + struct xfs_owner_info *oinfo,
> > > > + enum xfs_ag_resv_type resv)
> > > > +{
> > > > + struct xfs_mount *mp = sc->mp;
> > > > + int error = 0;
> > > > +
> > > > + ASSERT(xfs_sb_version_hasrmapbt(&mp->m_sb));
> > > > + ASSERT(sc->ip != NULL || XFS_FSB_TO_AGNO(mp, fsbno) == sc->sa.agno);
> > > > +
> > > > + trace_xfs_repair_dispose_btree_extent(mp, XFS_FSB_TO_AGNO(mp, fsbno),
> > > > + XFS_FSB_TO_AGBNO(mp, fsbno), len);
> > > > +
> > > > + for (; len > 0; len--, fsbno++) {
> > > > + error = xfs_repair_dispose_btree_block(sc, fsbno, oinfo, resv);
> > > > + if (error)
> > > > + return error;
> > >
> > > So why do we do this one block at a time, rather than freeing it
> > > as an entire extent in one go?
> >
> > At the moment the xfs_rmap_has_other_keys helper can only tell you if
> > there are multiple rmap owners for any part of a given extent. For
> > example, if the rmap records were:
> >
> > (start = 35, len = 3, owner = rmap)
> > (start = 35, len = 1, owner = refcount)
> > (start = 37, len = 1, owner = inobt)
> >
> > Notice how block 35 and 37 are crosslinked, but 36 isn't. A call to
> > xfs_rmap_has_other_keys(35, 3) will say "yes" but doesn't have a way to
> > signal back that the yes applies to 35 but that the caller should try
> > again with block 36. Doing so would require _has_other_keys to maintain
> > a refcount and to return to the caller any time the refcount changed,
> > and the caller would still have to loop the extent. It's easier to have
> > a dumb loop for the initial implementation and optimize it if we start
> > taking more heat than we'd like on crosslinked filesystems.
>
> Well, I can see why you are doing this now, but the problems with
> multi-block metadata makes me think that we really need to know more
> detail of the owner in the rmap. e.g. that it's directory or
> attribute data, not user file data and hence we can infer things
> about expected block sizes, do the correct sort of buffer lookups
> for invalidation, etc.
I'm not sure we can do that without causing a deadlocking problem, since
we lock all the AG headers to rebuild a btree and in general we can't
_iget an inode to find out if it's a dir or not. But I have more to say
on this in a few paragraphs...
> I'm tending towards "this needs a design doc to explain all
> this stuff" right now. Code is great, but I'm struggling understand
> (reverse engineer!) all the algorithms and decisions that have been
> made from the code...
Working on it.
> > > > +/*
> > > > + * Invalidate buffers for per-AG btree blocks we're dumping. We assume that
> > > > + * exlist points only to metadata blocks.
> > > > + */
> > > > +int
> > > > +xfs_repair_invalidate_blocks(
> > > > + struct xfs_scrub_context *sc,
> > > > + struct xfs_repair_extent_list *exlist)
> > > > +{
> > > > + struct xfs_repair_extent *rex;
> > > > + struct xfs_repair_extent *n;
> > > > + struct xfs_buf *bp;
> > > > + xfs_agnumber_t agno;
> > > > + xfs_agblock_t agbno;
> > > > + xfs_agblock_t i;
> > > > +
> > > > + for_each_xfs_repair_extent_safe(rex, n, exlist) {
> > > > + agno = XFS_FSB_TO_AGNO(sc->mp, rex->fsbno);
> > > > + agbno = XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno);
> > > > + for (i = 0; i < rex->len; i++) {
> > > > + bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
> > > > + agbno + i, 0);
> > > > + xfs_trans_binval(sc->tp, bp);
> > > > + }
> > >
> > > Again, this is doing things by single blocks. We do have multi-block
> > > metadata (inodes, directory blocks, remote attrs) that, if it
> > > is already in memory, needs to be treated as multi-block extents. If
> > > we don't do that, we'll cause aliasing problems in the buffer cache
> > > (see _xfs_buf_obj_cmp()) and it's all downhill from there.
> >
> > I only recently started testing with filesystems containing multiblock
> > dir/rmt metadata, and this is an unsolved problem. :(
>
> That needs documenting, too. Perhaps explicitly, by rejecting repair
> requests on filesystems or types that have multi-block constructs
> until we solve these problems.
Trouble is, remote attr values can have an xfs_buf that spans however
many blocks you need to store a full 64k value, and what happens if the
rmapbt collides with that? It sorta implies that we can't do
invalidation on /any/ filesystem, which is unfortunate....
...unless we have an easy way of finding /any/ buffer that points to a
given block? Probably not, since iirc they're indexed by the first disk
block number. Hm. I suppose we could use the rmap data to look for
anything within 64k of the logical offset of an attr/data rmap
overlapping the same block...
...but on second thought we only care about invalidating the buffer if
the block belonged to the ag btree we've just killed, right? If there's
a multi-block buffer because it's part of a directory or an rmt block
then the buffer is clearly owned by someone else and we don't even have
to look for that. Likewise, if it's a single-block buffer but the
block has some other magic then we don't own it and we should leave it
alone.
> > I /think/ the solution is that we need to query the buffer cache to see
> > if it has a buffer for the given disk blocks, and if it matches the
> > btree we're discarding (correct magic/uuid/b_length) then we invalidate
> > it,
>
> I don't think that provides any guarantees. Even ignoring all the
> problems with invalidation while the buffer is dirty and tracked in
> the AIL, there's nothing stopping the other code from attempting to
> re-instantiate the buffer due to some other access. And then we
> have aliasing problems again....
Well, we /could/ just freeze the fs while we do repairs on any ag btree.
--D
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 04/22] xfs: add helpers to dispose of old btree blocks after a repair
2018-05-16 23:18 ` Darrick J. Wong
@ 2018-05-17 5:58 ` Darrick J. Wong
0 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-17 5:58 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Wed, May 16, 2018 at 04:18:20PM -0700, Darrick J. Wong wrote:
> On Thu, May 17, 2018 at 08:32:25AM +1000, Dave Chinner wrote:
> > On Wed, May 16, 2018 at 12:34:25PM -0700, Darrick J. Wong wrote:
> > > On Wed, May 16, 2018 at 06:32:32PM +1000, Dave Chinner wrote:
> > > > On Tue, May 15, 2018 at 03:34:04PM -0700, Darrick J. Wong wrote:
> > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > >
> > > > > Now that we've plumbed in the ability to construct a list of dead btree
> > > > > blocks following a repair, add more helpers to dispose of them. This is
> > > > > done by examining the rmapbt -- if the btree was the only owner we can
> > > > > free the block, otherwise it's crosslinked and we can only remove the
> > > > > rmapbt record.
> > > > >
> > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > ---
> >
> > [...]
> >
> > > > > + struct xfs_owner_info oinfo;
> > > > > + struct xfs_perag *pag;
> > > > > + int error;
> > > > > +
> > > > > + /* Make sure there's space on the freelist. */
> > > > > + error = xfs_repair_fix_freelist(sc, true);
> > > > > + if (error)
> > > > > + return error;
> > > > > + pag = xfs_perag_get(sc->mp, sc->sa.agno);
> > > >
> > > > Because this is how it quickly gets it gets to silly numbers of
> > > > lookups. That's two now in this function.
> > > >
> > > > > + if (pag->pagf_flcount == 0) {
> > > > > + xfs_perag_put(pag);
> > > > > + return -EFSCORRUPTED;
> > > >
> > > > Why is having an empty freelist a problem here? It's an AG thatis
> > > > completely out of space, but it isn't corruption? And I don't see
> > > > why an empty freelist prevents us from adding a backs back onto the
> > > > AGFL?
> >
> > I think you missed a question :P
>
> Doh, sorry. I don't remember exactly why I put that in there; judging
> from my notes I think the idea was that if the AG is completely full
> we'd rather shut down with a corruption signal hoping that the admin
> will run xfs_repair.
>
> I also don't see why it's necessary now, I'll see what happens if I
> remove it.
>
> > > > > + /* Can we find any other rmappings? */
> > > > > + error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
> > > > > + if (error)
> > > > > + goto out_cur;
> > > > > + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
> > > > > +
> > > > > + /*
> > > > > + * If there are other rmappings, this block is cross linked and must
> > > > > + * not be freed. Remove the reverse mapping and move on. Otherwise,
> > > >
> > > > Why do we just remove the reverse mapping if the block cannot be
> > > > freed? I have my suspicions that this is removing cross-links one by
> > > > one until there's only one reference left to the extent, but then I
> > > > ask "how do we know which one is the correct mapping"?
> > >
> > > Right. Prior to calling this function we built a totally new btree with
> > > blocks from the freespace, so now we need to remove the rmaps that
> > > covered the old btree and/or free the block. The goal is to rebuild
> > > /all/ the trees that think they own this block so that we can free the
> > > block and not have to care which one is correct.
> >
> > Ok, so we've already rebuilt the new btree, and this is removing
> > stale references to cross-linked blocks that have owners different
> > to the one we are currently scanning.
> >
> > What happens if the cross-linked block is cross-linked within the
> > same owner context?
>
> It won't end up on the reap list in first place, because we scan every
> block of every object with the same rmap owner to construct sublist.
> Then we subtract sublist from exlist (which we got from rmap) and only
> reap the difference.
>
> > > > > + struct xfs_scrub_context *sc,
> > > > > + xfs_fsblock_t fsbno,
> > > > > + xfs_extlen_t len,
> > > > > + struct xfs_owner_info *oinfo,
> > > > > + enum xfs_ag_resv_type resv)
> > > > > +{
> > > > > + struct xfs_mount *mp = sc->mp;
> > > > > + int error = 0;
> > > > > +
> > > > > + ASSERT(xfs_sb_version_hasrmapbt(&mp->m_sb));
> > > > > + ASSERT(sc->ip != NULL || XFS_FSB_TO_AGNO(mp, fsbno) == sc->sa.agno);
> > > > > +
> > > > > + trace_xfs_repair_dispose_btree_extent(mp, XFS_FSB_TO_AGNO(mp, fsbno),
> > > > > + XFS_FSB_TO_AGBNO(mp, fsbno), len);
> > > > > +
> > > > > + for (; len > 0; len--, fsbno++) {
> > > > > + error = xfs_repair_dispose_btree_block(sc, fsbno, oinfo, resv);
> > > > > + if (error)
> > > > > + return error;
> > > >
> > > > So why do we do this one block at a time, rather than freeing it
> > > > as an entire extent in one go?
> > >
> > > At the moment the xfs_rmap_has_other_keys helper can only tell you if
> > > there are multiple rmap owners for any part of a given extent. For
> > > example, if the rmap records were:
> > >
> > > (start = 35, len = 3, owner = rmap)
> > > (start = 35, len = 1, owner = refcount)
> > > (start = 37, len = 1, owner = inobt)
> > >
> > > Notice how block 35 and 37 are crosslinked, but 36 isn't. A call to
> > > xfs_rmap_has_other_keys(35, 3) will say "yes" but doesn't have a way to
> > > signal back that the yes applies to 35 but that the caller should try
> > > again with block 36. Doing so would require _has_other_keys to maintain
> > > a refcount and to return to the caller any time the refcount changed,
> > > and the caller would still have to loop the extent. It's easier to have
> > > a dumb loop for the initial implementation and optimize it if we start
> > > taking more heat than we'd like on crosslinked filesystems.
> >
> > Well, I can see why you are doing this now, but the problems with
> > multi-block metadata makes me think that we really need to know more
> > detail of the owner in the rmap. e.g. that it's directory or
> > attribute data, not user file data and hence we can infer things
> > about expected block sizes, do the correct sort of buffer lookups
> > for invalidation, etc.
>
> I'm not sure we can do that without causing a deadlocking problem, since
> we lock all the AG headers to rebuild a btree and in general we can't
> _iget an inode to find out if it's a dir or not. But I have more to say
> on this in a few paragraphs...
>
> > I'm tending towards "this needs a design doc to explain all
> > this stuff" right now. Code is great, but I'm struggling understand
> > (reverse engineer!) all the algorithms and decisions that have been
> > made from the code...
>
> Working on it.
Nearly my bedtime, so here's the current draft:
/*
* Reconstructing per-AG Btrees
*
* When a space btree is corrupt, we don't bother trying to fix it.
* Instead, we scan secondary space metadata to derive the records that
* should be in the damaged btree, initialize a fresh btree root, and
* insert the records. Note that for rebuilding the rmapbt we scan all
* the primary data.
*
* However, that leaves the matter of removing all the metadata
* describing the old broken structure. For primary metadata we use the
* rmap data to construct a first bitmap of every extent with a matching
* rmap owner; we then iterate all other metadata structures with the
* same rmap owner to construct a second bitmap of rmaps that cannot be
* removed. We then subtract the second bitmap from the first bitmap
* (first & ~second) to derive the blocks that were used by the old
* btree. These blocks can be reaped.
*
* For rmapbt reconstructions we must use different tactics. First we
* iterate all primary metadata (this excludes the old rmapbt,
* obviously) to generate new rmap records. Then we iterate the new
* rmap records to find the gaps, which should encompass the free
* space and the old rmapbt blocks. That corresponds to the 'first
* bitmap' of the previous section. The bnobt is iterated to generate
* the second bitmap of the previous section. We then reap the blocks
* corresponding to the difference just like we do for primary data.
*
* The comment for xfs_repair_reap_btree_extents will describe the block
* disposal process in more detail.
*/
And later, down by xfs_repair_reap_btree_extents,
/*
* Dispose of btree blocks from the old per-AG btree.
*
* Now that we've constructed a new btree to replace the damaged one, we
* want to dispose of the blocks that (we think) the old btree was
* using. Previously, we used the rmapbt to construct a list of extents
* (@exlist) with the rmap owner corresponding to the tree we rebuilt,
* then subtracted out any other blocks with the same rmap owner that
* are owned by another data structure. In theory the extents in
* @exlist are the old btree's blocks.
*
* Unfortunately, it's possible that the btree was crosslinked with
* other blocks on disk. The rmap data can tell us if there are
* multiple owners, so if the rmapbt says there is an owner of this
* block other than @oinfo, then the block is crosslinked. Remove the
* reverse mapping and continue.
*
* If there is one rmap record, we can free the block, which removes the
* reverse mapping but doesn't add the block to the free space. Our
* repair strategy is to hope the other metadata objects crosslinked on
* this block will be rebuilt (atop different blocks), thereby removing
* all the cross links.
*
* If there are no rmap records at all, we also free the block. If the
* btree being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then
* there isn't supposed to be a rmap record and everything is ok. For
* other btrees there had to have been an rmap entry for the block to
* have ended up on @exlist, so if it's gone now there's something wrong
* and the fs will shut down.
*
* The caller is responsible for locking the AG headers for the entire
* rebuild operation so that nothing else can sneak in and change the AG
* state while we're not looking. We also assume that the caller
* already invalidated any buffers associated with @exlist.
*/
Later, for the function that finds AG btree roots for agf/agi
reconstruction:
/*
* Find the roots of the per-AG btrees described in btree_info.
*
* The caller provides information about the btrees to look for by
* passing in an array (@btree_info) of xfs_repair_find_ag_btree with
* the (rmap owner, buf_ops, magic) fields set. The last element of the
* array should have a NULL buf_ops, and the (root, height) fields will
* be set on return if anything is found.
*
* For every rmapbt record matching any of the rmap owners in
* @btree_info, read each block referenced by the rmap record. If the
* block is a btree block from this filesystem matching any of the magic
* numbers and has a level higher than what we've already seen, remember
* the block and the height of the tree required to have such a block.
* When the call completes, we return the highest block we've found for
* each btree description; those should be the roots.
*
* The caller must lock the applicable per-AG header buffers (AGF, AGI)
* to prevent other threads from changing the shape of the btrees that
* we are looking for. It must maintain those locks until it's safe for
* other threads to change the btrees' shapes.
*/
--D
>
> > > > > +/*
> > > > > + * Invalidate buffers for per-AG btree blocks we're dumping. We assume that
> > > > > + * exlist points only to metadata blocks.
> > > > > + */
> > > > > +int
> > > > > +xfs_repair_invalidate_blocks(
> > > > > + struct xfs_scrub_context *sc,
> > > > > + struct xfs_repair_extent_list *exlist)
> > > > > +{
> > > > > + struct xfs_repair_extent *rex;
> > > > > + struct xfs_repair_extent *n;
> > > > > + struct xfs_buf *bp;
> > > > > + xfs_agnumber_t agno;
> > > > > + xfs_agblock_t agbno;
> > > > > + xfs_agblock_t i;
> > > > > +
> > > > > + for_each_xfs_repair_extent_safe(rex, n, exlist) {
> > > > > + agno = XFS_FSB_TO_AGNO(sc->mp, rex->fsbno);
> > > > > + agbno = XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno);
> > > > > + for (i = 0; i < rex->len; i++) {
> > > > > + bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
> > > > > + agbno + i, 0);
> > > > > + xfs_trans_binval(sc->tp, bp);
> > > > > + }
> > > >
> > > > Again, this is doing things by single blocks. We do have multi-block
> > > > metadata (inodes, directory blocks, remote attrs) that, if it
> > > > is already in memory, needs to be treated as multi-block extents. If
> > > > we don't do that, we'll cause aliasing problems in the buffer cache
> > > > (see _xfs_buf_obj_cmp()) and it's all downhill from there.
> > >
> > > I only recently started testing with filesystems containing multiblock
> > > dir/rmt metadata, and this is an unsolved problem. :(
> >
> > That needs documenting, too. Perhaps explicitly, by rejecting repair
> > requests on filesystems or types that have multi-block constructs
> > until we solve these problems.
>
> Trouble is, remote attr values can have an xfs_buf that spans however
> many blocks you need to store a full 64k value, and what happens if the
> rmapbt collides with that? It sorta implies that we can't do
> invalidation on /any/ filesystem, which is unfortunate....
>
> ...unless we have an easy way of finding /any/ buffer that points to a
> given block? Probably not, since iirc they're indexed by the first disk
> block number. Hm. I suppose we could use the rmap data to look for
> anything within 64k of the logical offset of an attr/data rmap
> overlapping the same block...
>
> ...but on second thought we only care about invalidating the buffer if
> the block belonged to the ag btree we've just killed, right? If there's
> a multi-block buffer because it's part of a directory or an rmt block
> then the buffer is clearly owned by someone else and we don't even have
> to look for that. Likewise, if it's a single-block buffer but the
> block has some other magic then we don't own it and we should leave it
> alone.
>
> > > I /think/ the solution is that we need to query the buffer cache to see
> > > if it has a buffer for the given disk blocks, and if it matches the
> > > btree we're discarding (correct magic/uuid/b_length) then we invalidate
> > > it,
> >
> > I don't think that provides any guarantees. Even ignoring all the
> > problems with invalidation while the buffer is dirty and tracked in
> > the AIL, there's nothing stopping the other code from attempting to
> > re-instantiate the buffer due to some other access. And then we
> > have aliasing problems again....
>
> Well, we /could/ just freeze the fs while we do repairs on any ag btree.
>
> --D
>
> >
> > Cheers,
> >
> > Dave.
> > --
> > Dave Chinner
> > david@fromorbit.com
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v2 04/22] xfs: add helpers to dispose of old btree blocks after a repair
2018-05-15 22:34 ` [PATCH 04/22] xfs: add helpers to dispose of old btree blocks after a repair Darrick J. Wong
2018-05-16 8:32 ` Dave Chinner
@ 2018-05-18 3:53 ` Darrick J. Wong
2018-05-29 3:14 ` Dave Chinner
1 sibling, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-18 3:53 UTC (permalink / raw)
To: linux-xfs, david; +Cc: Allison Henderson
From: Darrick J. Wong <darrick.wong@oracle.com>
Now that we've plumbed in the ability to construct a list of dead btree
blocks following a repair, add more helpers to dispose of them. This is
done by examining the rmapbt -- if the btree was the only owner we can
free the block, otherwise it's crosslinked and we can only remove the
rmapbt record.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
v2: document the design of how old btree block disposal is supposed to
work and document some of the limitations of the buffer cache that
we can't fix here, and reduce perag_get/put traffic
---
fs/xfs/scrub/repair.c | 251 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 6 +
2 files changed, 257 insertions(+)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index fca8e3c7887d..7daf0120d1bf 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -585,3 +585,254 @@ xfs_repair_subtract_extents(
}
#undef LEFT_ALIGNED
#undef RIGHT_ALIGNED
+
+/*
+ * Disposal of Blocks from Old per-AG Btrees
+ *
+ * Now that we've constructed a new btree to replace the damaged one, we want
+ * to dispose of the blocks that (we think) the old btree was using.
+ * Previously, we used the rmapbt to collect the extents (exlist) with the
+ * rmap owner corresponding to the tree we rebuilt, collected extents for any
+ * blocks with the same rmap owner that are owned by another data structure
+ * (sublist), and subtracted sublist from exlist. In theory the extents
+ * remaining in exlist are the old btree's blocks.
+ *
+ * Unfortunately, it's possible that the btree was crosslinked with other
+ * blocks on disk. The rmap data can tell us if there are multiple owners, so
+ * if the rmapbt says there is an owner of this block other than @oinfo, then
+ * the block is crosslinked. Remove the reverse mapping and continue.
+ *
+ * If there is one rmap record, we can free the block, which removes the
+ * reverse mapping but doesn't add the block to the free space. Our repair
+ * strategy is to hope the other metadata objects crosslinked on this block
+ * will be rebuilt (atop different blocks), thereby removing all the cross
+ * links.
+ *
+ * If there are no rmap records at all, we also free the block. If the btree
+ * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
+ * supposed to be a rmap record and everything is ok. For other btrees there
+ * had to have been an rmap entry for the block to have ended up on @exlist,
+ * so if it's gone now there's something wrong and the fs will shut down.
+ *
+ * Note: If there are multiple rmap records with only the same rmap owner as
+ * the btree we're trying to rebuild and the block is indeed owned by another
+ * data structure with the same rmap owner, then the block will be in sublist
+ * and therefore doesn't need disposal. If there are multiple rmap records
+ * with only the same rmap owner but the block is not owned by something with
+ * the same rmap owner, the block will be freed.
+ *
+ * The caller is responsible for locking the AG headers for the entire rebuild
+ * operation so that nothing else can sneak in and change the AG state while
+ * we're not looking. We also assume that the caller already invalidated any
+ * buffers associated with @exlist.
+ */
+
+/*
+ * Invalidate buffers for per-AG btree blocks we're dumping. This function
+ * is not intended for use with file data repairs; we have bunmapi for that.
+ */
+int
+xfs_repair_invalidate_blocks(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist)
+{
+ struct xfs_repair_extent *rex;
+ struct xfs_repair_extent *n;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t i;
+
+ /*
+ * For each block in each extent, see if there's an incore buffer for
+ * exactly that block; if so, invalidate it. The buffer cache only
+ * lets us look for one buffer at a time, so we have to look one block
+ * at a time. Avoid invalidating AG headers and post-EOFS blocks
+ * because we never own those; and if we can't TRYLOCK the buffer we
+ * assume it's owned by someone else.
+ */
+ for_each_xfs_repair_extent_safe(rex, n, exlist) {
+ for (fsbno = rex->fsbno, i = rex->len; i > 0; fsbno++, i--) {
+ /* Skip AG headers and post-EOFS blocks */
+ if (!xfs_verify_fsbno(sc->mp, fsbno))
+ continue;
+ bp = xfs_buf_incore(sc->mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(sc->mp, fsbno),
+ XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
+ if (bp) {
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/* Ensure the freelist is the correct size. */
+int
+xfs_repair_fix_freelist(
+ struct xfs_scrub_context *sc,
+ bool can_shrink)
+{
+ struct xfs_alloc_arg args = {0};
+
+ args.mp = sc->mp;
+ args.tp = sc->tp;
+ args.agno = sc->sa.agno;
+ args.alignment = 1;
+ args.pag = sc->sa.pag;
+
+ return xfs_alloc_fix_freelist(&args,
+ can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+}
+
+/*
+ * Put a block back on the AGFL.
+ */
+STATIC int
+xfs_repair_put_freelist(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno)
+{
+ struct xfs_owner_info oinfo;
+ int error;
+
+ /* Make sure there's space on the freelist. */
+ error = xfs_repair_fix_freelist(sc, true);
+ if (error)
+ return error;
+
+ /*
+ * Since we're "freeing" a lost block onto the AGFL, we have to
+ * create an rmap for the block prior to merging it or else other
+ * parts will break.
+ */
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
+ &oinfo);
+ if (error)
+ return error;
+
+ /* Put the block on the AGFL. */
+ error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
+ agbno, 0);
+ if (error)
+ return error;
+ xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+
+ return 0;
+}
+
+/* Dispose of a single metadata block. */
+STATIC int
+xfs_repair_dispose_btree_block(
+ struct xfs_scrub_context *sc,
+ xfs_fsblock_t fsbno,
+ struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type resv)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf_bp = NULL;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ bool has_other_rmap;
+ int error;
+
+ agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+
+ /*
+ * If we are repairing per-inode metadata, we need to read in the AGF
+ * buffer. Otherwise, we're repairing a per-AG structure, so reuse
+ * the AGF buffer that the setup functions already grabbed.
+ */
+ if (sc->ip) {
+ error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
+ if (error)
+ return error;
+ if (!agf_bp)
+ return -ENOMEM;
+ } else {
+ agf_bp = sc->sa.agf_bp;
+ }
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
+
+ /* Can we find any other rmappings? */
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
+ if (error)
+ goto out_cur;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ /*
+ * If there are other rmappings, this block is cross linked and must
+ * not be freed. Remove the reverse mapping and move on. Otherwise,
+ * we were the only owner of the block, so free the extent, which will
+ * also remove the rmap.
+ *
+ * XXX: XFS doesn't support detecting the case where a single block
+ * metadata structure is crosslinked with a multi-block structure
+ * because the buffer cache doesn't detect aliasing problems, so we
+ * can't fix 100% of crosslinking problems (yet). The verifiers will
+ * blow on writeout, the filesystem will shut down, and the admin gets
+ * to run xfs_repair.
+ */
+ if (has_other_rmap)
+ error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
+ else if (resv == XFS_AG_RESV_AGFL)
+ error = xfs_repair_put_freelist(sc, agbno);
+ else
+ error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
+ if (agf_bp != sc->sa.agf_bp)
+ xfs_trans_brelse(sc->tp, agf_bp);
+ if (error)
+ return error;
+
+ if (sc->ip)
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+ return xfs_repair_roll_ag_trans(sc);
+
+out_cur:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ if (agf_bp != sc->sa.agf_bp)
+ xfs_trans_brelse(sc->tp, agf_bp);
+ return error;
+}
+
+/* Dispose of btree blocks from an old per-AG btree. */
+int
+xfs_repair_reap_btree_extents(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ struct xfs_repair_extent *rex;
+ struct xfs_repair_extent *n;
+ int error = 0;
+
+ ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
+
+ /* Dispose of every block from the old btree. */
+ for_each_xfs_repair_extent_safe(rex, n, exlist) {
+ ASSERT(sc->ip != NULL ||
+ XFS_FSB_TO_AGNO(sc->mp, rex->fsbno) == sc->sa.agno);
+
+ trace_xfs_repair_dispose_btree_extent(sc->mp,
+ XFS_FSB_TO_AGNO(sc->mp, rex->fsbno),
+ XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno), rex->len);
+
+ for (; rex->len > 0; rex->len--, rex->fsbno++) {
+ error = xfs_repair_dispose_btree_block(sc, rex->fsbno,
+ oinfo, type);
+ if (error)
+ goto out;
+ }
+ list_del(&rex->list);
+ kmem_free(rex);
+ }
+
+out:
+ xfs_repair_cancel_btree_extents(sc, exlist);
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index ba1fdd7b9a79..f14aaab7df9e 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -70,6 +70,12 @@ void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc,
int xfs_repair_subtract_extents(struct xfs_scrub_context *sc,
struct xfs_repair_extent_list *exlist,
struct xfs_repair_extent_list *sublist);
+int xfs_repair_fix_freelist(struct xfs_scrub_context *sc, bool can_shrink);
+int xfs_repair_invalidate_blocks(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist);
+int xfs_repair_reap_btree_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
/* Metadata repairers */
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH v2 04/22] xfs: add helpers to dispose of old btree blocks after a repair
2018-05-18 3:53 ` [PATCH v2 " Darrick J. Wong
@ 2018-05-29 3:14 ` Dave Chinner
2018-05-29 18:01 ` Darrick J. Wong
0 siblings, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-29 3:14 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs, Allison Henderson
On Thu, May 17, 2018 at 08:53:47PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Now that we've plumbed in the ability to construct a list of dead btree
> blocks following a repair, add more helpers to dispose of them. This is
> done by examining the rmapbt -- if the btree was the only owner we can
> free the block, otherwise it's crosslinked and we can only remove the
> rmapbt record.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> v2: document the design of how old btree block disposal is supposed to
> work and document some of the limitations of the buffer cache that
> we can't fix here, and reduce perag_get/put traffic
Yup, I can accept those limitations for the initial implementation.
Gotta leave something for you to work on once it's been merged,
right?
Reviewed-by: Dave Chinner <dchinner@redhat.com>
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v2 04/22] xfs: add helpers to dispose of old btree blocks after a repair
2018-05-29 3:14 ` Dave Chinner
@ 2018-05-29 18:01 ` Darrick J. Wong
0 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-29 18:01 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs, Allison Henderson
On Tue, May 29, 2018 at 01:14:22PM +1000, Dave Chinner wrote:
> On Thu, May 17, 2018 at 08:53:47PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> >
> > Now that we've plumbed in the ability to construct a list of dead btree
> > blocks following a repair, add more helpers to dispose of them. This is
> > done by examining the rmapbt -- if the btree was the only owner we can
> > free the block, otherwise it's crosslinked and we can only remove the
> > rmapbt record.
> >
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> > v2: document the design of how old btree block disposal is supposed to
> > work and document some of the limitations of the buffer cache that
> > we can't fix here, and reduce perag_get/put traffic
>
> Yup, I can accept those limitations for the initial implementation.
> Gotta leave something for you to work on once it's been merged,
> right?
Yeah. I'd spec'd out adding a bitmap to each AG's xfs_buf cache to
track non-stale cached buffers and scream if someone tries to add an
overlapping buffer, but then started talking to Dan last week about
connecting XFS to pmem poison notifications.
If the metadata block using the poisoned pmem is cached in dram then
we'd want to find the associated xfs_buf and write it back out. For
that we'd need full block -> buf mapping abilities (and not just a
bitmap), which will likely bloat the radix tree significantly. This
became a mess of ink in my notebook so I gave up, for now.
--D
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
>
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH 05/22] xfs: recover AG btree roots from rmap data
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (3 preceding siblings ...)
2018-05-15 22:34 ` [PATCH 04/22] xfs: add helpers to dispose of old btree blocks after a repair Darrick J. Wong
@ 2018-05-15 22:34 ` Darrick J. Wong
2018-05-16 8:51 ` Dave Chinner
2018-05-18 3:54 ` [PATCH v2 " Darrick J. Wong
2018-05-15 22:34 ` [PATCH 06/22] xfs: add a repair helper to reset superblock counters Darrick J. Wong
` (17 subsequent siblings)
22 siblings, 2 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:34 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Add a helper function to help us recover btree roots from the rmap data.
Callers pass in a list of rmap owner codes, buffer ops, and magic
numbers. We iterate the rmap records looking for owner matches, and
then read the matching blocks to see if the magic number & uuid match.
If so, we then read-verify the block, and if that passes then we retain
a pointer to the block with the highest level, assuming that by the end
of the call we will have found the root. This will be used to reset the
AGF/AGI btree root fields during their rebuild procedures.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/repair.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 20 ++++++
2 files changed, 198 insertions(+)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index d820e01d1146..06c84f76d7ff 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -766,3 +766,181 @@ xfs_repair_invalidate_blocks(
return 0;
}
+
+/* See if our block is in the AGFL. */
+STATIC int
+xfs_repair_findroot_agfl_walk(
+ struct xfs_mount *mp,
+ xfs_agblock_t bno,
+ void *priv)
+{
+ xfs_agblock_t *agbno = priv;
+
+ return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
+}
+
+struct xfs_repair_findroot {
+ struct xfs_scrub_context *sc;
+ struct xfs_buf *agfl_bp;
+ struct xfs_agf *agf;
+ struct xfs_repair_find_ag_btree *btree_info;
+};
+
+/* Does this block match the btree information passed in? */
+STATIC int
+xfs_repair_findroot_block(
+ struct xfs_repair_findroot *ri,
+ struct xfs_repair_find_ag_btree *fab,
+ uint64_t owner,
+ xfs_agblock_t agbno,
+ bool *found_it)
+{
+ struct xfs_mount *mp = ri->sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_btree_block *btblock;
+ xfs_daddr_t daddr;
+ int error;
+
+ /* rmap owner match? */
+ if (owner != fab->rmap_owner)
+ return 0;
+
+ daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
+
+ /*
+ * Blocks in the AGFL have stale contents that might just happen to
+ * have a matching magic and uuid. We don't want to pull these blocks
+ * in as part of a tree root, so we have to filter out the AGFL stuff
+ * here. If the AGFL looks insane we'll just refuse to repair.
+ */
+ if (owner == XFS_RMAP_OWN_AG) {
+ error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
+ xfs_repair_findroot_agfl_walk, &agbno);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ return 0;
+ if (error)
+ return error;
+ }
+
+ error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
+ mp->m_bsize, 0, &bp, NULL);
+ if (error)
+ return error;
+
+ /*
+ * Does this look like a block matching our fs and higher than any
+ * other block we've found so far? If so, reattach buffer verifiers
+ * so the AIL won't complain if the buffer is also dirty.
+ */
+ btblock = XFS_BUF_TO_BLOCK(bp);
+ if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+ goto out;
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ goto out;
+ bp->b_ops = fab->buf_ops;
+
+ /* Ignore this block if it's lower in the tree than we've seen. */
+ if (fab->root != NULLAGBLOCK &&
+ xfs_btree_get_level(btblock) < fab->height)
+ goto out;
+
+ /* Make sure we pass the verifiers. */
+ bp->b_ops->verify_read(bp);
+ if (bp->b_error)
+ goto out;
+ fab->root = agbno;
+ fab->height = xfs_btree_get_level(btblock) + 1;
+ *found_it = true;
+
+ trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
+ be32_to_cpu(btblock->bb_magic), fab->height - 1);
+out:
+ xfs_trans_brelse(ri->sc->tp, bp);
+ return error;
+}
+
+/*
+ * Do any of the blocks in this rmap record match one of the btrees we're
+ * looking for?
+ */
+STATIC int
+xfs_repair_findroot_rmap(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_repair_findroot *ri = priv;
+ struct xfs_repair_find_ag_btree *fab;
+ xfs_agblock_t b;
+ bool found_it;
+ int error = 0;
+
+ /* Ignore anything that isn't AG metadata. */
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+ return 0;
+
+ /* Otherwise scan each block + btree type. */
+ for (b = 0; b < rec->rm_blockcount; b++) {
+ found_it = false;
+ for (fab = ri->btree_info; fab->buf_ops; fab++) {
+ error = xfs_repair_findroot_block(ri, fab,
+ rec->rm_owner, rec->rm_startblock + b,
+ &found_it);
+ if (error)
+ return error;
+ if (found_it)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Find the roots of the per-AG btrees described in btree_info.
+ *
+ * The caller provides information about the btrees to look for by passing in
+ * an array (@btree_info) of xfs_repair_find_ag_btree with the (rmap owner,
+ * buf_ops, magic) fields set. The last element of the array should have a
+ * NULL buf_ops, and the (root, height) fields will be set on return if
+ * anything is found.
+ *
+ * For every rmapbt record matching any of the rmap owners in @btree_info,
+ * read each block referenced by the rmap record. If the block is a btree
+ * block from this filesystem matching any of the magic numbers and has a
+ * level higher than what we've already seen, remember the block and the
+ * height of the tree required to have such a block. When the call completes,
+ * we return the highest block we've found for each btree description; those
+ * should be the roots.
+ */
+int
+xfs_repair_find_ag_btree_roots(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp,
+ struct xfs_repair_find_ag_btree *btree_info,
+ struct xfs_buf *agfl_bp)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_repair_findroot ri;
+ struct xfs_repair_find_ag_btree *fab;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ri.sc = sc;
+ ri.btree_info = btree_info;
+ ri.agf = XFS_BUF_TO_AGF(agf_bp);
+ ri.agfl_bp = agfl_bp;
+ for (fab = btree_info; fab->buf_ops; fab++) {
+ ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
+ ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
+ fab->root = NULLAGBLOCK;
+ fab->height = 0;
+ }
+
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ error = xfs_rmap_query_all(cur, xfs_repair_findroot_rmap, &ri);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index e0e7f86d509c..55441774e8e5 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -77,6 +77,26 @@ int xfs_repair_fix_freelist(struct xfs_scrub_context *sc, bool can_shrink);
int xfs_repair_invalidate_blocks(struct xfs_scrub_context *sc,
struct xfs_repair_extent_list *btlist);
+struct xfs_repair_find_ag_btree {
+ /* in: rmap owner of the btree we're looking for */
+ uint64_t rmap_owner;
+
+ /* in: buffer ops */
+ const struct xfs_buf_ops *buf_ops;
+
+ /* in: magic number of the btree */
+ uint32_t magic;
+
+ /* out: the highest btree block found and the tree height */
+ xfs_agblock_t root;
+ unsigned int height;
+};
+
+int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp,
+ struct xfs_repair_find_ag_btree *btree_info,
+ struct xfs_buf *agfl_bp);
+
/* Metadata repairers */
int xfs_repair_probe(struct xfs_scrub_context *sc);
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH 05/22] xfs: recover AG btree roots from rmap data
2018-05-15 22:34 ` [PATCH 05/22] xfs: recover AG btree roots from rmap data Darrick J. Wong
@ 2018-05-16 8:51 ` Dave Chinner
2018-05-16 18:37 ` Darrick J. Wong
2018-05-18 3:54 ` [PATCH v2 " Darrick J. Wong
1 sibling, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-16 8:51 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Tue, May 15, 2018 at 03:34:10PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Add a helper function to help us recover btree roots from the rmap data.
> Callers pass in a list of rmap owner codes, buffer ops, and magic
> numbers. We iterate the rmap records looking for owner matches, and
> then read the matching blocks to see if the magic number & uuid match.
> If so, we then read-verify the block, and if that passes then we retain
> a pointer to the block with the highest level, assuming that by the end
> of the call we will have found the root. This will be used to reset the
> AGF/AGI btree root fields during their rebuild procedures.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/xfs/scrub/repair.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 20 ++++++
> 2 files changed, 198 insertions(+)
>
>
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index d820e01d1146..06c84f76d7ff 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -766,3 +766,181 @@ xfs_repair_invalidate_blocks(
>
> return 0;
> }
> +
> +/* See if our block is in the AGFL. */
> +STATIC int
> +xfs_repair_findroot_agfl_walk(
> + struct xfs_mount *mp,
> + xfs_agblock_t bno,
> + void *priv)
> +{
> + xfs_agblock_t *agbno = priv;
> +
> + return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
> +}
> +
> +struct xfs_repair_findroot {
> + struct xfs_scrub_context *sc;
> + struct xfs_buf *agfl_bp;
> + struct xfs_agf *agf;
> + struct xfs_repair_find_ag_btree *btree_info;
> +};
> +
> +/* Does this block match the btree information passed in? */
> +STATIC int
> +xfs_repair_findroot_block(
> + struct xfs_repair_findroot *ri,
> + struct xfs_repair_find_ag_btree *fab,
> + uint64_t owner,
> + xfs_agblock_t agbno,
> + bool *found_it)
> +{
> + struct xfs_mount *mp = ri->sc->mp;
> + struct xfs_buf *bp;
> + struct xfs_btree_block *btblock;
> + xfs_daddr_t daddr;
> + int error;
> +
> + /* rmap owner match? */
> + if (owner != fab->rmap_owner)
> + return 0;
I'd put that in the caller - it's iterating the fab array and it
knows the owner it is looking for, so I think it makes more sense to
go there....
> +
> + daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
> +
> + /*
> + * Blocks in the AGFL have stale contents that might just happen to
> + * have a matching magic and uuid. We don't want to pull these blocks
> + * in as part of a tree root, so we have to filter out the AGFL stuff
> + * here. If the AGFL looks insane we'll just refuse to repair.
> + */
> + if (owner == XFS_RMAP_OWN_AG) {
> + error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
> + xfs_repair_findroot_agfl_walk, &agbno);
> + if (error == XFS_BTREE_QUERY_RANGE_ABORT)
> + return 0;
> + if (error)
> + return error;
> + }
> +
> + error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
> + mp->m_bsize, 0, &bp, NULL);
> + if (error)
> + return error;
> +
> + /*
> + * Does this look like a block matching our fs and higher than any
> + * other block we've found so far? If so, reattach buffer verifiers
> + * so the AIL won't complain if the buffer is also dirty.
> + */
> + btblock = XFS_BUF_TO_BLOCK(bp);
> + if (be32_to_cpu(btblock->bb_magic) != fab->magic)
> + goto out;
> + if (xfs_sb_version_hascrc(&mp->m_sb) &&
> + !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
> + goto out;
> + bp->b_ops = fab->buf_ops;
> +
> + /* Ignore this block if it's lower in the tree than we've seen. */
> + if (fab->root != NULLAGBLOCK &&
> + xfs_btree_get_level(btblock) < fab->height)
> + goto out;
> +
> + /* Make sure we pass the verifiers. */
> + bp->b_ops->verify_read(bp);
> + if (bp->b_error)
> + goto out;
> + fab->root = agbno;
> + fab->height = xfs_btree_get_level(btblock) + 1;
> + *found_it = true;
> +
> + trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
> + be32_to_cpu(btblock->bb_magic), fab->height - 1);
> +out:
> + xfs_trans_brelse(ri->sc->tp, bp);
So we release the buffer once we've found it, which also unlocks it.
That means when we come back to it later, it may have been accessed
and changed by something else and no longer be the block we are
looking for. How do you protect against this sort of race given we
are unlocking the buffer? Perhaps it should be held on the fab
structure, and released when a better candidate is found?
> + return error;
> +}
> +
> +/*
> + * Do any of the blocks in this rmap record match one of the btrees we're
> + * looking for?
> + */
> +STATIC int
> +xfs_repair_findroot_rmap(
> + struct xfs_btree_cur *cur,
> + struct xfs_rmap_irec *rec,
> + void *priv)
> +{
> + struct xfs_repair_findroot *ri = priv;
> + struct xfs_repair_find_ag_btree *fab;
> + xfs_agblock_t b;
> + bool found_it;
> + int error = 0;
> +
> + /* Ignore anything that isn't AG metadata. */
> + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
> + return 0;
> +
> + /* Otherwise scan each block + btree type. */
> + for (b = 0; b < rec->rm_blockcount; b++) {
> + found_it = false;
> + for (fab = ri->btree_info; fab->buf_ops; fab++) {
> + error = xfs_repair_findroot_block(ri, fab,
> + rec->rm_owner, rec->rm_startblock + b,
> + &found_it);
This loop is where I think the fab->owner/rec->rm_owner check
should go....
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 05/22] xfs: recover AG btree roots from rmap data
2018-05-16 8:51 ` Dave Chinner
@ 2018-05-16 18:37 ` Darrick J. Wong
2018-05-16 19:18 ` Allison Henderson
2018-05-16 22:36 ` Dave Chinner
0 siblings, 2 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-16 18:37 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Wed, May 16, 2018 at 06:51:52PM +1000, Dave Chinner wrote:
> On Tue, May 15, 2018 at 03:34:10PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> >
> > Add a helper function to help us recover btree roots from the rmap data.
> > Callers pass in a list of rmap owner codes, buffer ops, and magic
> > numbers. We iterate the rmap records looking for owner matches, and
> > then read the matching blocks to see if the magic number & uuid match.
> > If so, we then read-verify the block, and if that passes then we retain
> > a pointer to the block with the highest level, assuming that by the end
> > of the call we will have found the root. This will be used to reset the
> > AGF/AGI btree root fields during their rebuild procedures.
> >
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> > fs/xfs/scrub/repair.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++++
> > fs/xfs/scrub/repair.h | 20 ++++++
> > 2 files changed, 198 insertions(+)
> >
> >
> > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > index d820e01d1146..06c84f76d7ff 100644
> > --- a/fs/xfs/scrub/repair.c
> > +++ b/fs/xfs/scrub/repair.c
> > @@ -766,3 +766,181 @@ xfs_repair_invalidate_blocks(
> >
> > return 0;
> > }
> > +
> > +/* See if our block is in the AGFL. */
> > +STATIC int
> > +xfs_repair_findroot_agfl_walk(
> > + struct xfs_mount *mp,
> > + xfs_agblock_t bno,
> > + void *priv)
> > +{
> > + xfs_agblock_t *agbno = priv;
> > +
> > + return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
> > +}
> > +
> > +struct xfs_repair_findroot {
> > + struct xfs_scrub_context *sc;
> > + struct xfs_buf *agfl_bp;
> > + struct xfs_agf *agf;
> > + struct xfs_repair_find_ag_btree *btree_info;
> > +};
> > +
> > +/* Does this block match the btree information passed in? */
> > +STATIC int
> > +xfs_repair_findroot_block(
> > + struct xfs_repair_findroot *ri,
> > + struct xfs_repair_find_ag_btree *fab,
> > + uint64_t owner,
> > + xfs_agblock_t agbno,
> > + bool *found_it)
> > +{
> > + struct xfs_mount *mp = ri->sc->mp;
> > + struct xfs_buf *bp;
> > + struct xfs_btree_block *btblock;
> > + xfs_daddr_t daddr;
> > + int error;
> > +
> > + /* rmap owner match? */
> > + if (owner != fab->rmap_owner)
> > + return 0;
>
> I'd put that in the caller - it's iterating the fab array and it
> knows the owner it is looking for, so I think it makes more sense to
> go there....
Ok.
> > +
> > + daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
> > +
> > + /*
> > + * Blocks in the AGFL have stale contents that might just happen to
> > + * have a matching magic and uuid. We don't want to pull these blocks
> > + * in as part of a tree root, so we have to filter out the AGFL stuff
> > + * here. If the AGFL looks insane we'll just refuse to repair.
> > + */
> > + if (owner == XFS_RMAP_OWN_AG) {
> > + error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
> > + xfs_repair_findroot_agfl_walk, &agbno);
> > + if (error == XFS_BTREE_QUERY_RANGE_ABORT)
> > + return 0;
> > + if (error)
> > + return error;
> > + }
> > +
> > + error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
> > + mp->m_bsize, 0, &bp, NULL);
> > + if (error)
> > + return error;
> > +
> > + /*
> > + * Does this look like a block matching our fs and higher than any
> > + * other block we've found so far? If so, reattach buffer verifiers
> > + * so the AIL won't complain if the buffer is also dirty.
> > + */
> > + btblock = XFS_BUF_TO_BLOCK(bp);
> > + if (be32_to_cpu(btblock->bb_magic) != fab->magic)
> > + goto out;
> > + if (xfs_sb_version_hascrc(&mp->m_sb) &&
> > + !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
> > + goto out;
> > + bp->b_ops = fab->buf_ops;
> > +
> > + /* Ignore this block if it's lower in the tree than we've seen. */
> > + if (fab->root != NULLAGBLOCK &&
> > + xfs_btree_get_level(btblock) < fab->height)
> > + goto out;
> > +
> > + /* Make sure we pass the verifiers. */
> > + bp->b_ops->verify_read(bp);
> > + if (bp->b_error)
> > + goto out;
> > + fab->root = agbno;
> > + fab->height = xfs_btree_get_level(btblock) + 1;
> > + *found_it = true;
> > +
> > + trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
> > + be32_to_cpu(btblock->bb_magic), fab->height - 1);
> > +out:
> > + xfs_trans_brelse(ri->sc->tp, bp);
>
> So we release the buffer once we've found it, which also unlocks it.
> That means when we come back to it later, it may have been accessed
> and changed by something else and no longer be the block we are
> looking for. How do you protect against this sort of race given we
> are unlocking the buffer? Perhaps it should be held on the fab
> structure, and released when a better candidate is found?
The two callers of this function are the AGF and AGI repair functions.
AGF repair holds the locked AGF buffer, and AGI repair holds the locked
AGF & AGI buffers, which should be enough to prevent anyone else from
accessing the AG btrees. They keep all the AG header buffers locked
until they're completely finished with rebuilding the headers (i.e.
xfs_scrub_teardown) and it's safe for the shape to change.
How about I add to the comment for this function:
/*
* The caller must lock the applicable per-AG header buffers (AGF, AGI)
* to prevent other threads from changing the shape of the btrees that
* we are looking for. It must maintain those locks until it's safe for
* other threads to change the btrees' shapes.
*/
--D
> > + return error;
> > +}
> > +
> > +/*
> > + * Do any of the blocks in this rmap record match one of the btrees we're
> > + * looking for?
> > + */
> > +STATIC int
> > +xfs_repair_findroot_rmap(
> > + struct xfs_btree_cur *cur,
> > + struct xfs_rmap_irec *rec,
> > + void *priv)
> > +{
> > + struct xfs_repair_findroot *ri = priv;
> > + struct xfs_repair_find_ag_btree *fab;
> > + xfs_agblock_t b;
> > + bool found_it;
> > + int error = 0;
> > +
> > + /* Ignore anything that isn't AG metadata. */
> > + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
> > + return 0;
> > +
> > + /* Otherwise scan each block + btree type. */
> > + for (b = 0; b < rec->rm_blockcount; b++) {
> > + found_it = false;
> > + for (fab = ri->btree_info; fab->buf_ops; fab++) {
> > + error = xfs_repair_findroot_block(ri, fab,
> > + rec->rm_owner, rec->rm_startblock + b,
> > + &found_it);
>
> This loop is where I think the fab->owner/rec->rm_owner check
> should go....
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 05/22] xfs: recover AG btree roots from rmap data
2018-05-16 18:37 ` Darrick J. Wong
@ 2018-05-16 19:18 ` Allison Henderson
2018-05-16 22:36 ` Dave Chinner
1 sibling, 0 replies; 76+ messages in thread
From: Allison Henderson @ 2018-05-16 19:18 UTC (permalink / raw)
To: Darrick J. Wong, Dave Chinner; +Cc: linux-xfs
Ok, you can add my review here as well:
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Thx!
On 05/16/2018 11:37 AM, Darrick J. Wong wrote:
> On Wed, May 16, 2018 at 06:51:52PM +1000, Dave Chinner wrote:
>> On Tue, May 15, 2018 at 03:34:10PM -0700, Darrick J. Wong wrote:
>>> From: Darrick J. Wong <darrick.wong@oracle.com>
>>>
>>> Add a helper function to help us recover btree roots from the rmap data.
>>> Callers pass in a list of rmap owner codes, buffer ops, and magic
>>> numbers. We iterate the rmap records looking for owner matches, and
>>> then read the matching blocks to see if the magic number & uuid match.
>>> If so, we then read-verify the block, and if that passes then we retain
>>> a pointer to the block with the highest level, assuming that by the end
>>> of the call we will have found the root. This will be used to reset the
>>> AGF/AGI btree root fields during their rebuild procedures.
>>>
>>> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
>>> ---
>>> fs/xfs/scrub/repair.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++++
>>> fs/xfs/scrub/repair.h | 20 ++++++
>>> 2 files changed, 198 insertions(+)
>>>
>>>
>>> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
>>> index d820e01d1146..06c84f76d7ff 100644
>>> --- a/fs/xfs/scrub/repair.c
>>> +++ b/fs/xfs/scrub/repair.c
>>> @@ -766,3 +766,181 @@ xfs_repair_invalidate_blocks(
>>>
>>> return 0;
>>> }
>>> +
>>> +/* See if our block is in the AGFL. */
>>> +STATIC int
>>> +xfs_repair_findroot_agfl_walk(
>>> + struct xfs_mount *mp,
>>> + xfs_agblock_t bno,
>>> + void *priv)
>>> +{
>>> + xfs_agblock_t *agbno = priv;
>>> +
>>> + return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
>>> +}
>>> +
>>> +struct xfs_repair_findroot {
>>> + struct xfs_scrub_context *sc;
>>> + struct xfs_buf *agfl_bp;
>>> + struct xfs_agf *agf;
>>> + struct xfs_repair_find_ag_btree *btree_info;
>>> +};
>>> +
>>> +/* Does this block match the btree information passed in? */
>>> +STATIC int
>>> +xfs_repair_findroot_block(
>>> + struct xfs_repair_findroot *ri,
>>> + struct xfs_repair_find_ag_btree *fab,
>>> + uint64_t owner,
>>> + xfs_agblock_t agbno,
>>> + bool *found_it)
>>> +{
>>> + struct xfs_mount *mp = ri->sc->mp;
>>> + struct xfs_buf *bp;
>>> + struct xfs_btree_block *btblock;
>>> + xfs_daddr_t daddr;
>>> + int error;
>>> +
>>> + /* rmap owner match? */
>>> + if (owner != fab->rmap_owner)
>>> + return 0;
>>
>> I'd put that in the caller - it's iterating the fab array and it
>> knows the owner it is looking for, so I think it makes more sense to
>> go there....
>
> Ok.
>
>>> +
>>> + daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
>>> +
>>> + /*
>>> + * Blocks in the AGFL have stale contents that might just happen to
>>> + * have a matching magic and uuid. We don't want to pull these blocks
>>> + * in as part of a tree root, so we have to filter out the AGFL stuff
>>> + * here. If the AGFL looks insane we'll just refuse to repair.
>>> + */
>>> + if (owner == XFS_RMAP_OWN_AG) {
>>> + error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
>>> + xfs_repair_findroot_agfl_walk, &agbno);
>>> + if (error == XFS_BTREE_QUERY_RANGE_ABORT)
>>> + return 0;
>>> + if (error)
>>> + return error;
>>> + }
>>> +
>>> + error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
>>> + mp->m_bsize, 0, &bp, NULL);
>>> + if (error)
>>> + return error;
>>> +
>>> + /*
>>> + * Does this look like a block matching our fs and higher than any
>>> + * other block we've found so far? If so, reattach buffer verifiers
>>> + * so the AIL won't complain if the buffer is also dirty.
>>> + */
>>> + btblock = XFS_BUF_TO_BLOCK(bp);
>>> + if (be32_to_cpu(btblock->bb_magic) != fab->magic)
>>> + goto out;
>>> + if (xfs_sb_version_hascrc(&mp->m_sb) &&
>>> + !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
>>> + goto out;
>>> + bp->b_ops = fab->buf_ops;
>>> +
>>> + /* Ignore this block if it's lower in the tree than we've seen. */
>>> + if (fab->root != NULLAGBLOCK &&
>>> + xfs_btree_get_level(btblock) < fab->height)
>>> + goto out;
>>> +
>>> + /* Make sure we pass the verifiers. */
>>> + bp->b_ops->verify_read(bp);
>>> + if (bp->b_error)
>>> + goto out;
>>> + fab->root = agbno;
>>> + fab->height = xfs_btree_get_level(btblock) + 1;
>>> + *found_it = true;
>>> +
>>> + trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
>>> + be32_to_cpu(btblock->bb_magic), fab->height - 1);
>>> +out:
>>> + xfs_trans_brelse(ri->sc->tp, bp);
>>
>> So we release the buffer once we've found it, which also unlocks it.
>> That means when we come back to it later, it may have been accessed
>> and changed by something else and no longer be the block we are
>> looking for. How do you protect against this sort of race given we
>> are unlocking the buffer? Perhaps it should be held on the fab
>> structure, and released when a better candidate is found?
>
> The two callers of this function are the AGF and AGI repair functions.
> AGF repair holds the locked AGF buffer, and AGI repair holds the locked
> AGF & AGI buffers, which should be enough to prevent anyone else from
> accessing the AG btrees. They keep the all the AG header buffers locked
> until they're completely finished with rebuilding the headers (i.e.
> xfs_scrub_teardown) and it's safe for the shape to change.
>
> How about I add to the comment for this function:
>
> /*
> * The caller must lock the applicable per-AG header buffers (AGF, AGI)
> * to prevent other threads from changing the shape of the btrees that
> * we are looking for. It must maintain those locks until it's safe for
> * other threads to change the btrees' shapes.
> */
>
> --D
>
>>> + return error;
>>> +}
>>> +
>>> +/*
>>> + * Do any of the blocks in this rmap record match one of the btrees we're
>>> + * looking for?
>>> + */
>>> +STATIC int
>>> +xfs_repair_findroot_rmap(
>>> + struct xfs_btree_cur *cur,
>>> + struct xfs_rmap_irec *rec,
>>> + void *priv)
>>> +{
>>> + struct xfs_repair_findroot *ri = priv;
>>> + struct xfs_repair_find_ag_btree *fab;
>>> + xfs_agblock_t b;
>>> + bool found_it;
>>> + int error = 0;
>>> +
>>> + /* Ignore anything that isn't AG metadata. */
>>> + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
>>> + return 0;
>>> +
>>> + /* Otherwise scan each block + btree type. */
>>> + for (b = 0; b < rec->rm_blockcount; b++) {
>>> + found_it = false;
>>> + for (fab = ri->btree_info; fab->buf_ops; fab++) {
>>> + error = xfs_repair_findroot_block(ri, fab,
>>> + rec->rm_owner, rec->rm_startblock + b,
>>> + &found_it);
>>
>> This loop is where I think the fab->owner/rec->rm_owner check
>> should go....
>>
>> Cheers,
>>
>> Dave.
>> --
>> Dave Chinner
>> david@fromorbit.com
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at https://urldefense.proofpoint.com/v2/url?u=http-3A__vger.kernel.org_majordomo-2Dinfo.html&d=DwIBAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=LHZQ8fHvy6wDKXGTWcm97burZH5sQKHRDMaY1UthQxc&m=rBPwCYkOfcA0Ljv7VWlTAZN8EPqKEvw5_1z9fzrl1S4&s=9iowSgTX47PO1FTR1YOuG2QOVgfa2kVqnVNbwqcqsI8&e=
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at https://urldefense.proofpoint.com/v2/url?u=http-3A__vger.kernel.org_majordomo-2Dinfo.html&d=DwIBAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=LHZQ8fHvy6wDKXGTWcm97burZH5sQKHRDMaY1UthQxc&m=rBPwCYkOfcA0Ljv7VWlTAZN8EPqKEvw5_1z9fzrl1S4&s=9iowSgTX47PO1FTR1YOuG2QOVgfa2kVqnVNbwqcqsI8&e=
>
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 05/22] xfs: recover AG btree roots from rmap data
2018-05-16 18:37 ` Darrick J. Wong
2018-05-16 19:18 ` Allison Henderson
@ 2018-05-16 22:36 ` Dave Chinner
2018-05-17 5:53 ` Darrick J. Wong
1 sibling, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-16 22:36 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Wed, May 16, 2018 at 11:37:29AM -0700, Darrick J. Wong wrote:
> On Wed, May 16, 2018 at 06:51:52PM +1000, Dave Chinner wrote:
> > On Tue, May 15, 2018 at 03:34:10PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > >
> > > Add a helper function to help us recover btree roots from the rmap data.
> > > Callers pass in a list of rmap owner codes, buffer ops, and magic
> > > numbers. We iterate the rmap records looking for owner matches, and
> > > then read the matching blocks to see if the magic number & uuid match.
> > > If so, we then read-verify the block, and if that passes then we retain
> > > a pointer to the block with the highest level, assuming that by the end
> > > of the call we will have found the root. This will be used to reset the
> > > AGF/AGI btree root fields during their rebuild procedures.
> > >
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
.....
> > > + /* Ignore this block if it's lower in the tree than we've seen. */
> > > + if (fab->root != NULLAGBLOCK &&
> > > + xfs_btree_get_level(btblock) < fab->height)
> > > + goto out;
> > > +
> > > + /* Make sure we pass the verifiers. */
> > > + bp->b_ops->verify_read(bp);
> > > + if (bp->b_error)
> > > + goto out;
> > > + fab->root = agbno;
> > > + fab->height = xfs_btree_get_level(btblock) + 1;
> > > + *found_it = true;
> > > +
> > > + trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
> > > + be32_to_cpu(btblock->bb_magic), fab->height - 1);
> > > +out:
> > > + xfs_trans_brelse(ri->sc->tp, bp);
> >
> > So we release the buffer once we've found it, which also unlocks it.
> > That means when we come back to it later, it may have been accessed
> > and changed by something else and no longer be the block we are
> > looking for. How do you protect against this sort of race given we
> > are unlocking the buffer? Perhaps it should be held on the fab
> > structure, and released when a better candidate is found?
>
> The two callers of this function are the AGF and AGI repair functions.
> AGF repair holds the locked AGF buffer, and AGI repair holds the locked
> AGF & AGI buffers, which should be enough to prevent anyone else from
> accessing the AG btrees. They keep all the AG header buffers locked
> until they're completely finished with rebuilding the headers (i.e.
> xfs_scrub_teardown) and it's safe for the shape to change.
>
> How about I add to the comment for this function:
>
> /*
> * The caller must lock the applicable per-AG header buffers (AGF, AGI)
> * to prevent other threads from changing the shape of the btrees that
> * we are looking for. It must maintain those locks until it's safe for
> * other threads to change the btrees' shapes.
> */
That's helpful. :) Can you sprinkle some checks like
ASSERT(xfs_buf_islocked(agbp)) to remind readers of the
leaf/callback functions that they expect the AGF/AGI to be locked on
entry?
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 05/22] xfs: recover AG btree roots from rmap data
2018-05-16 22:36 ` Dave Chinner
@ 2018-05-17 5:53 ` Darrick J. Wong
0 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-17 5:53 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Thu, May 17, 2018 at 08:36:22AM +1000, Dave Chinner wrote:
> On Wed, May 16, 2018 at 11:37:29AM -0700, Darrick J. Wong wrote:
> > On Wed, May 16, 2018 at 06:51:52PM +1000, Dave Chinner wrote:
> > > On Tue, May 15, 2018 at 03:34:10PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > >
> > > > Add a helper function to help us recover btree roots from the rmap data.
> > > > Callers pass in a list of rmap owner codes, buffer ops, and magic
> > > > numbers. We iterate the rmap records looking for owner matches, and
> > > > then read the matching blocks to see if the magic number & uuid match.
> > > > If so, we then read-verify the block, and if that passes then we retain
> > > > a pointer to the block with the highest level, assuming that by the end
> > > > of the call we will have found the root. This will be used to reset the
> > > > AGF/AGI btree root fields during their rebuild procedures.
> > > >
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> .....
> > > > + /* Ignore this block if it's lower in the tree than we've seen. */
> > > > + if (fab->root != NULLAGBLOCK &&
> > > > + xfs_btree_get_level(btblock) < fab->height)
> > > > + goto out;
> > > > +
> > > > + /* Make sure we pass the verifiers. */
> > > > + bp->b_ops->verify_read(bp);
> > > > + if (bp->b_error)
> > > > + goto out;
> > > > + fab->root = agbno;
> > > > + fab->height = xfs_btree_get_level(btblock) + 1;
> > > > + *found_it = true;
> > > > +
> > > > + trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
> > > > + be32_to_cpu(btblock->bb_magic), fab->height - 1);
> > > > +out:
> > > > + xfs_trans_brelse(ri->sc->tp, bp);
> > >
> > > So we release the buffer once we've found it, which also unlocks it.
> > > That means when we come back to it later, it may have been accessed
> > > and changed by something else and no longer be the block we are
> > > looking for. How do you protect against this sort of race given we
> > > are unlocking the buffer? Perhaps it should be held on the fab
> > > structure, and released when a better candidate is found?
> >
> > The two callers of this function are the AGF and AGI repair functions.
> > AGF repair holds the locked AGF buffer, and AGI repair holds the locked
> > AGF & AGI buffers, which should be enough to prevent anyone else from
> > accessing the AG btrees. They keep all the AG header buffers locked
> > until they're completely finished with rebuilding the headers (i.e.
> > xfs_scrub_teardown) and it's safe for the shape to change.
> >
> > How about I add to the comment for this function:
> >
> > /*
> > * The caller must lock the applicable per-AG header buffers (AGF, AGI)
> > * to prevent other threads from changing the shape of the btrees that
> > * we are looking for. It must maintain those locks until it's safe for
> > * other threads to change the btrees' shapes.
> > */
>
> That's helpful. :) Can you sprinkle some checks like
> ASSERT(xfs_buf_islocked(agbp)) to remind readers of the
> leaf/callback functions that they expect the AGF/AGI to be locked on
> entry?
Ok, will do.
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v2 05/22] xfs: recover AG btree roots from rmap data
2018-05-15 22:34 ` [PATCH 05/22] xfs: recover AG btree roots from rmap data Darrick J. Wong
2018-05-16 8:51 ` Dave Chinner
@ 2018-05-18 3:54 ` Darrick J. Wong
2018-05-29 3:16 ` Dave Chinner
1 sibling, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-18 3:54 UTC (permalink / raw)
To: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Add a helper function to help us recover btree roots from the rmap data.
Callers pass in a list of rmap owner codes, buffer ops, and magic
numbers. We iterate the rmap records looking for owner matches, and
then read the matching blocks to see if the magic number & uuid match.
If so, we then read-verify the block, and if that passes then we retain
a pointer to the block with the highest level, assuming that by the end
of the call we will have found the root. This will be used to reset the
AGF/AGI btree root fields during their rebuild procedures.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
v2: document how we probe for btree roots and document the locking
requirements for the callers
---
fs/xfs/scrub/repair.c | 190 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 20 +++++
2 files changed, 210 insertions(+)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 7daf0120d1bf..877488ce4bc8 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -836,3 +836,193 @@ xfs_repair_reap_btree_extents(
xfs_repair_cancel_btree_extents(sc, exlist);
return error;
}
+
+/*
+ * Finding per-AG Btree Roots for AGF/AGI Reconstruction
+ *
+ * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
+ * the AG headers by using the rmap data to rummage through the AG looking for
+ * btree roots. This is not guaranteed to work if the AG is heavily damaged
+ * or the rmap data are corrupt.
+ *
+ * Callers of xfs_repair_find_ag_btree_roots must lock the AGF and AGFL
+ * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
+ * AGI is being rebuilt. They must maintain these locks until it's safe for
+ * other threads to change the btrees' shapes. The caller provides
+ * information about the btrees to look for by passing in an array of
+ * xfs_repair_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
+ * The (root, height) fields will be set on return if anything is found. The
+ * last element of the array should have a NULL buf_ops to mark the end of the
+ * array.
+ *
+ * For every rmapbt record matching any of the rmap owners in btree_info,
+ * read each block referenced by the rmap record. If the block is a btree
+ * block from this filesystem matching any of the magic numbers and has a
+ * level higher than what we've already seen, remember the block and the
+ * height of the tree required to have such a block. When the call completes,
+ * we return the highest block we've found for each btree description; those
+ * should be the roots.
+ */
+
+struct xfs_repair_findroot {
+ struct xfs_scrub_context *sc;
+ struct xfs_buf *agfl_bp;
+ struct xfs_agf *agf;
+ struct xfs_repair_find_ag_btree *btree_info;
+};
+
+/* See if our block is in the AGFL. */
+STATIC int
+xfs_repair_findroot_agfl_walk(
+ struct xfs_mount *mp,
+ xfs_agblock_t bno,
+ void *priv)
+{
+ xfs_agblock_t *agbno = priv;
+
+ return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
+}
+
+/* Does this block match the btree information passed in? */
+STATIC int
+xfs_repair_findroot_block(
+ struct xfs_repair_findroot *ri,
+ struct xfs_repair_find_ag_btree *fab,
+ uint64_t owner,
+ xfs_agblock_t agbno,
+ bool *found_it)
+{
+ struct xfs_mount *mp = ri->sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_btree_block *btblock;
+ xfs_daddr_t daddr;
+ int error;
+
+ daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
+
+ /*
+ * Blocks in the AGFL have stale contents that might just happen to
+ * have a matching magic and uuid. We don't want to pull these blocks
+ * in as part of a tree root, so we have to filter out the AGFL stuff
+ * here. If the AGFL looks insane we'll just refuse to repair.
+ */
+ if (owner == XFS_RMAP_OWN_AG) {
+ error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
+ xfs_repair_findroot_agfl_walk, &agbno);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ return 0;
+ if (error)
+ return error;
+ }
+
+ error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
+ mp->m_bsize, 0, &bp, NULL);
+ if (error)
+ return error;
+
+ /*
+ * Does this look like a block matching our fs and higher than any
+ * other block we've found so far? If so, reattach buffer verifiers
+ * so the AIL won't complain if the buffer is also dirty.
+ */
+ btblock = XFS_BUF_TO_BLOCK(bp);
+ if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+ goto out;
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ goto out;
+ bp->b_ops = fab->buf_ops;
+
+ /* Ignore this block if it's lower in the tree than we've seen. */
+ if (fab->root != NULLAGBLOCK &&
+ xfs_btree_get_level(btblock) < fab->height)
+ goto out;
+
+ /* Make sure we pass the verifiers. */
+ bp->b_ops->verify_read(bp);
+ if (bp->b_error)
+ goto out;
+ fab->root = agbno;
+ fab->height = xfs_btree_get_level(btblock) + 1;
+ *found_it = true;
+
+ trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
+ be32_to_cpu(btblock->bb_magic), fab->height - 1);
+out:
+ xfs_trans_brelse(ri->sc->tp, bp);
+ return error;
+}
+
+/*
+ * Do any of the blocks in this rmap record match one of the btrees we're
+ * looking for?
+ */
+STATIC int
+xfs_repair_findroot_rmap(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_repair_findroot *ri = priv;
+ struct xfs_repair_find_ag_btree *fab;
+ xfs_agblock_t b;
+ bool found_it;
+ int error = 0;
+
+ /* Ignore anything that isn't AG metadata. */
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+ return 0;
+
+ /* Otherwise scan each block + btree type. */
+ for (b = 0; b < rec->rm_blockcount; b++) {
+ found_it = false;
+ for (fab = ri->btree_info; fab->buf_ops; fab++) {
+ if (rec->rm_owner != fab->rmap_owner)
+ continue;
+ error = xfs_repair_findroot_block(ri, fab,
+ rec->rm_owner, rec->rm_startblock + b,
+ &found_it);
+ if (error)
+ return error;
+ if (found_it)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/* Find the roots of the per-AG btrees described in btree_info. */
+int
+xfs_repair_find_ag_btree_roots(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp,
+ struct xfs_repair_find_ag_btree *btree_info,
+ struct xfs_buf *agfl_bp)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_repair_findroot ri;
+ struct xfs_repair_find_ag_btree *fab;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(xfs_buf_islocked(agf_bp));
+ ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
+
+ ri.sc = sc;
+ ri.btree_info = btree_info;
+ ri.agf = XFS_BUF_TO_AGF(agf_bp);
+ ri.agfl_bp = agfl_bp;
+ for (fab = btree_info; fab->buf_ops; fab++) {
+ ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
+ ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
+ fab->root = NULLAGBLOCK;
+ fab->height = 0;
+ }
+
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ error = xfs_rmap_query_all(cur, xfs_repair_findroot_rmap, &ri);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index f14aaab7df9e..c922ef06b894 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -77,6 +77,26 @@ int xfs_repair_reap_btree_extents(struct xfs_scrub_context *sc,
struct xfs_repair_extent_list *exlist,
struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+struct xfs_repair_find_ag_btree {
+ /* in: rmap owner of the btree we're looking for */
+ uint64_t rmap_owner;
+
+ /* in: buffer ops */
+ const struct xfs_buf_ops *buf_ops;
+
+ /* in: magic number of the btree */
+ uint32_t magic;
+
+ /* out: the highest btree block found and the tree height */
+ xfs_agblock_t root;
+ unsigned int height;
+};
+
+int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp,
+ struct xfs_repair_find_ag_btree *btree_info,
+ struct xfs_buf *agfl_bp);
+
/* Metadata repairers */
int xfs_repair_probe(struct xfs_scrub_context *sc);
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH v2 05/22] xfs: recover AG btree roots from rmap data
2018-05-18 3:54 ` [PATCH v2 " Darrick J. Wong
@ 2018-05-29 3:16 ` Dave Chinner
0 siblings, 0 replies; 76+ messages in thread
From: Dave Chinner @ 2018-05-29 3:16 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Thu, May 17, 2018 at 08:54:49PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Add a helper function to help us recover btree roots from the rmap data.
> Callers pass in a list of rmap owner codes, buffer ops, and magic
> numbers. We iterate the rmap records looking for owner matches, and
> then read the matching blocks to see if the magic number & uuid match.
> If so, we then read-verify the block, and if that passes then we retain
> a pointer to the block with the highest level, assuming that by the end
> of the call we will have found the root. This will be used to reset the
> AGF/AGI btree root fields during their rebuild procedures.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> v2: document how we probe for btree roots and document the locking
> requirements for the callers
Looks good.
Reviewed-by: Dave Chinner <dchinner@redhat.com>
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH 06/22] xfs: add a repair helper to reset superblock counters
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (4 preceding siblings ...)
2018-05-15 22:34 ` [PATCH 05/22] xfs: recover AG btree roots from rmap data Darrick J. Wong
@ 2018-05-15 22:34 ` Darrick J. Wong
2018-05-16 21:29 ` Allison Henderson
2018-05-18 3:56 ` [PATCH v2 " Darrick J. Wong
2018-05-15 22:34 ` [PATCH 07/22] xfs: add helpers to attach quotas to inodes Darrick J. Wong
` (16 subsequent siblings)
22 siblings, 2 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:34 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Add a helper function to reset the superblock inode and block counters.
The AG rebuilding functions will need these to adjust the counts if they
need to change as a part of recovering from corruption.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/repair.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 7 ++++
fs/xfs/scrub/scrub.c | 2 +
fs/xfs/scrub/scrub.h | 1 +
4 files changed, 89 insertions(+)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 06c84f76d7ff..1ca7e124e1a7 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -944,3 +944,82 @@ xfs_repair_find_ag_btree_roots(
return error;
}
+
+/* Reset the superblock counters from the AGF/AGI. */
+int
+xfs_repair_reset_counters(
+ struct xfs_mount *mp)
+{
+ struct xfs_buf *agi_bp;
+ struct xfs_buf *agf_bp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ xfs_agnumber_t agno;
+ xfs_ino_t icount = 0;
+ xfs_ino_t ifree = 0;
+ xfs_filblks_t fdblocks = 0;
+ int64_t delta_icount;
+ int64_t delta_ifree;
+ int64_t delta_fdblocks;
+ int error;
+
+ trace_xfs_repair_reset_counters(mp);
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ /* Count all the inodes... */
+ error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
+ if (error)
+ return error;
+ agi = XFS_BUF_TO_AGI(agi_bp);
+ icount += be32_to_cpu(agi->agi_count);
+ ifree += be32_to_cpu(agi->agi_freecount);
+ xfs_buf_relse(agi_bp);
+
+ /* Add up the free/freelist/bnobt/cntbt blocks... */
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
+ if (error)
+ return error;
+ if (!agf_bp)
+ return -ENOMEM;
+ agf = XFS_BUF_TO_AGF(agf_bp);
+ fdblocks += be32_to_cpu(agf->agf_freeblks);
+ fdblocks += be32_to_cpu(agf->agf_flcount);
+ fdblocks += be32_to_cpu(agf->agf_btreeblks);
+ xfs_buf_relse(agf_bp);
+ }
+
+ /*
+ * Reinitialize the counters. The on-disk and in-core counters differ
+ * by the number of inodes/blocks reserved by the admin, the per-AG
+ * reservation, and any transactions in progress, so we have to
+ * account for that. First we take the sb lock and update its
+ * counters...
+ */
+ spin_lock(&mp->m_sb_lock);
+ delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
+ delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
+ delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
+ mp->m_sb.sb_icount = icount;
+ mp->m_sb.sb_ifree = ifree;
+ mp->m_sb.sb_fdblocks = fdblocks;
+ spin_unlock(&mp->m_sb_lock);
+
+ /* ...and then update the per-cpu counters. */
+ if (delta_icount) {
+ error = xfs_mod_icount(mp, delta_icount);
+ if (error)
+ return error;
+ }
+ if (delta_ifree) {
+ error = xfs_mod_ifree(mp, delta_ifree);
+ if (error)
+ return error;
+ }
+ if (delta_fdblocks) {
+ error = xfs_mod_fdblocks(mp, delta_fdblocks, false);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 55441774e8e5..eea32a26f947 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -96,6 +96,7 @@ int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
struct xfs_buf *agf_bp,
struct xfs_repair_find_ag_btree *btree_info,
struct xfs_buf *agfl_bp);
+int xfs_repair_reset_counters(struct xfs_mount *mp);
/* Metadata repairers */
@@ -121,6 +122,12 @@ xfs_repair_calc_ag_resblks(
return 0;
}
+static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
+{
+ ASSERT(0);
+ return -EIO;
+}
+
#define xfs_repair_probe xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index c5999c28c20c..bf5e8dd66133 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -200,6 +200,8 @@ xfs_scrub_teardown(
kmem_free(sc->buf);
sc->buf = NULL;
}
+ if (sc->reset_counters && !error)
+ error = xfs_repair_reset_counters(sc->mp);
return error;
}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 2f89a84a0e10..1aaea393c2d1 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -77,6 +77,7 @@ struct xfs_scrub_context {
uint ilock_flags;
bool try_harder;
bool has_quotaofflock;
+ bool reset_counters;
/* State tracking for single-AG operations. */
struct xfs_scrub_ag sa;
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH 06/22] xfs: add a repair helper to reset superblock counters
2018-05-15 22:34 ` [PATCH 06/22] xfs: add a repair helper to reset superblock counters Darrick J. Wong
@ 2018-05-16 21:29 ` Allison Henderson
2018-05-18 3:56 ` Darrick J. Wong
2018-05-18 3:56 ` [PATCH v2 " Darrick J. Wong
1 sibling, 1 reply; 76+ messages in thread
From: Allison Henderson @ 2018-05-16 21:29 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs, david
On 05/15/2018 03:34 PM, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Add a helper function to reset the superblock inode and block counters.
> The AG rebuilding functions will need these to adjust the counts if they
> need to change as a part of recovering from corruption.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/xfs/scrub/repair.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 7 ++++
> fs/xfs/scrub/scrub.c | 2 +
> fs/xfs/scrub/scrub.h | 1 +
> 4 files changed, 89 insertions(+)
>
>
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index 06c84f76d7ff..1ca7e124e1a7 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -944,3 +944,82 @@ xfs_repair_find_ag_btree_roots(
>
> return error;
> }
> +
> +/* Reset the superblock counters from the AGF/AGI. */
> +int
> +xfs_repair_reset_counters(
> + struct xfs_mount *mp)
> +{
> + struct xfs_buf *agi_bp;
> + struct xfs_buf *agf_bp;
> + struct xfs_agi *agi;
> + struct xfs_agf *agf;
> + xfs_agnumber_t agno;
> + xfs_ino_t icount = 0;
> + xfs_ino_t ifree = 0;
> + xfs_filblks_t fdblocks = 0;
> + int64_t delta_icount;
> + int64_t delta_ifree;
> + int64_t delta_fdblocks;
> + int error;
> +
> + trace_xfs_repair_reset_counters(mp);
> +
> + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> + /* Count all the inodes... */
> + error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
> + if (error)
> + return error;
> + agi = XFS_BUF_TO_AGI(agi_bp);
> + icount += be32_to_cpu(agi->agi_count);
> + ifree += be32_to_cpu(agi->agi_freecount);
> + xfs_buf_relse(agi_bp);
> +
> + /* Add up the free/freelist/bnobt/cntbt blocks... */
> + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
> + if (error)
> + return error;
> + if (!agf_bp)
> + return -ENOMEM;
> + agf = XFS_BUF_TO_AGF(agf_bp);
> + fdblocks += be32_to_cpu(agf->agf_freeblks);
> + fdblocks += be32_to_cpu(agf->agf_flcount);
> + fdblocks += be32_to_cpu(agf->agf_btreeblks);
> + xfs_buf_relse(agf_bp);
> + }
> +
> + /*
> + * Reinitialize the counters. The on-disk and in-core counters differ
> + * by the number of inodes/blocks reserved by the admin, the per-AG
> + * reservation, and any transactions in progress, so we have to
> + * account for that. First we take the sb lock and update its
> + * counters...
> + */
> + spin_lock(&mp->m_sb_lock);
> + delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> + delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> + delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> + mp->m_sb.sb_icount = icount;
> + mp->m_sb.sb_ifree = ifree;
> + mp->m_sb.sb_fdblocks = fdblocks;
> + spin_unlock(&mp->m_sb_lock);
> +
> + /* ...and then update the per-cpu counters. */
> + if (delta_icount) {
> + error = xfs_mod_icount(mp, delta_icount);
> + if (error)
> + return error;
> + }
> + if (delta_ifree) {
> + error = xfs_mod_ifree(mp, delta_ifree);
> + if (error)
> + return error;
> + }
> + if (delta_fdblocks) {
> + error = xfs_mod_fdblocks(mp, delta_fdblocks, false);
> + if (error)
> + return error;
> + }
> +
> + return 0;
> +}
> diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> index 55441774e8e5..eea32a26f947 100644
> --- a/fs/xfs/scrub/repair.h
> +++ b/fs/xfs/scrub/repair.h
> @@ -96,6 +96,7 @@ int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
> struct xfs_buf *agf_bp,
> struct xfs_repair_find_ag_btree *btree_info,
> struct xfs_buf *agfl_bp);
> +int xfs_repair_reset_counters(struct xfs_mount *mp);
>
> /* Metadata repairers */
>
> @@ -121,6 +122,12 @@ xfs_repair_calc_ag_resblks(
> return 0;
> }
>
> +static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
> +{
> + ASSERT(0);
> + return -EIO;
> +}
> +
> #define xfs_repair_probe xfs_repair_notsupported
>
> #endif /* CONFIG_XFS_ONLINE_REPAIR */
> diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> index c5999c28c20c..bf5e8dd66133 100644
> --- a/fs/xfs/scrub/scrub.c
> +++ b/fs/xfs/scrub/scrub.c
> @@ -200,6 +200,8 @@ xfs_scrub_teardown(
> kmem_free(sc->buf);
> sc->buf = NULL;
> }
> + if (sc->reset_counters && !error)
> + error = xfs_repair_reset_counters(sc->mp);
> return error;
> }
>
> diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
> index 2f89a84a0e10..1aaea393c2d1 100644
> --- a/fs/xfs/scrub/scrub.h
> +++ b/fs/xfs/scrub/scrub.h
> @@ -77,6 +77,7 @@ struct xfs_scrub_context {
> uint ilock_flags;
> bool try_harder;
> bool has_quotaofflock;
> + bool reset_counters;
I noticed this bool is added here, and used in the conditional above,
but I don't see where it gets set to anything? Maybe it gets used later
in the set?
Other than that looks good. Thx!
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
>
> /* State tracking for single-AG operations. */
> struct xfs_scrub_ag sa;
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at https://urldefense.proofpoint.com/v2/url?u=http-3A__vger.kernel.org_majordomo-2Dinfo.html&d=DwICaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=LHZQ8fHvy6wDKXGTWcm97burZH5sQKHRDMaY1UthQxc&m=QJevHm77WCVmLISD9kjEPpN22CjDaymobWOYR-vTC2c&s=LEL8S70qR88zGjqrODTcuvE_OA75SuB5hdZ86g6FD70&e=
>
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 06/22] xfs: add a repair helper to reset superblock counters
2018-05-16 21:29 ` Allison Henderson
@ 2018-05-18 3:56 ` Darrick J. Wong
0 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-18 3:56 UTC (permalink / raw)
To: Allison Henderson; +Cc: linux-xfs, david
On Wed, May 16, 2018 at 02:29:27PM -0700, Allison Henderson wrote:
> On 05/15/2018 03:34 PM, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> >
> > Add a helper function to reset the superblock inode and block counters.
> > The AG rebuilding functions will need these to adjust the counts if they
> > need to change as a part of recovering from corruption.
> >
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> > fs/xfs/scrub/repair.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++
> > fs/xfs/scrub/repair.h | 7 ++++
> > fs/xfs/scrub/scrub.c | 2 +
> > fs/xfs/scrub/scrub.h | 1 +
> > 4 files changed, 89 insertions(+)
> >
> >
> > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > index 06c84f76d7ff..1ca7e124e1a7 100644
> > --- a/fs/xfs/scrub/repair.c
> > +++ b/fs/xfs/scrub/repair.c
> > @@ -944,3 +944,82 @@ xfs_repair_find_ag_btree_roots(
> > return error;
> > }
> > +
> > +/* Reset the superblock counters from the AGF/AGI. */
> > +int
> > +xfs_repair_reset_counters(
> > + struct xfs_mount *mp)
> > +{
> > + struct xfs_buf *agi_bp;
> > + struct xfs_buf *agf_bp;
> > + struct xfs_agi *agi;
> > + struct xfs_agf *agf;
> > + xfs_agnumber_t agno;
> > + xfs_ino_t icount = 0;
> > + xfs_ino_t ifree = 0;
> > + xfs_filblks_t fdblocks = 0;
> > + int64_t delta_icount;
> > + int64_t delta_ifree;
> > + int64_t delta_fdblocks;
> > + int error;
> > +
> > + trace_xfs_repair_reset_counters(mp);
> > +
> > + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> > + /* Count all the inodes... */
> > + error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
> > + if (error)
> > + return error;
> > + agi = XFS_BUF_TO_AGI(agi_bp);
> > + icount += be32_to_cpu(agi->agi_count);
> > + ifree += be32_to_cpu(agi->agi_freecount);
> > + xfs_buf_relse(agi_bp);
> > +
> > + /* Add up the free/freelist/bnobt/cntbt blocks... */
> > + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
> > + if (error)
> > + return error;
> > + if (!agf_bp)
> > + return -ENOMEM;
> > + agf = XFS_BUF_TO_AGF(agf_bp);
> > + fdblocks += be32_to_cpu(agf->agf_freeblks);
> > + fdblocks += be32_to_cpu(agf->agf_flcount);
> > + fdblocks += be32_to_cpu(agf->agf_btreeblks);
> > + xfs_buf_relse(agf_bp);
> > + }
> > +
> > + /*
> > + * Reinitialize the counters. The on-disk and in-core counters differ
> > + * by the number of inodes/blocks reserved by the admin, the per-AG
> > + * reservation, and any transactions in progress, so we have to
> > + * account for that. First we take the sb lock and update its
> > + * counters...
> > + */
> > + spin_lock(&mp->m_sb_lock);
> > + delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > + delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > + delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > + mp->m_sb.sb_icount = icount;
> > + mp->m_sb.sb_ifree = ifree;
> > + mp->m_sb.sb_fdblocks = fdblocks;
> > + spin_unlock(&mp->m_sb_lock);
> > +
> > + /* ...and then update the per-cpu counters. */
> > + if (delta_icount) {
> > + error = xfs_mod_icount(mp, delta_icount);
> > + if (error)
> > + return error;
> > + }
> > + if (delta_ifree) {
> > + error = xfs_mod_ifree(mp, delta_ifree);
> > + if (error)
> > + return error;
> > + }
> > + if (delta_fdblocks) {
> > + error = xfs_mod_fdblocks(mp, delta_fdblocks, false);
> > + if (error)
> > + return error;
> > + }
> > +
> > + return 0;
> > +}
> > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > index 55441774e8e5..eea32a26f947 100644
> > --- a/fs/xfs/scrub/repair.h
> > +++ b/fs/xfs/scrub/repair.h
> > @@ -96,6 +96,7 @@ int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
> > struct xfs_buf *agf_bp,
> > struct xfs_repair_find_ag_btree *btree_info,
> > struct xfs_buf *agfl_bp);
> > +int xfs_repair_reset_counters(struct xfs_mount *mp);
> > /* Metadata repairers */
> > @@ -121,6 +122,12 @@ xfs_repair_calc_ag_resblks(
> > return 0;
> > }
> > +static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
> > +{
> > + ASSERT(0);
> > + return -EIO;
> > +}
> > +
> > #define xfs_repair_probe xfs_repair_notsupported
> > #endif /* CONFIG_XFS_ONLINE_REPAIR */
> > diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> > index c5999c28c20c..bf5e8dd66133 100644
> > --- a/fs/xfs/scrub/scrub.c
> > +++ b/fs/xfs/scrub/scrub.c
> > @@ -200,6 +200,8 @@ xfs_scrub_teardown(
> > kmem_free(sc->buf);
> > sc->buf = NULL;
> > }
> > + if (sc->reset_counters && !error)
> > + error = xfs_repair_reset_counters(sc->mp);
> > return error;
> > }
> > diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
> > index 2f89a84a0e10..1aaea393c2d1 100644
> > --- a/fs/xfs/scrub/scrub.h
> > +++ b/fs/xfs/scrub/scrub.h
> > @@ -77,6 +77,7 @@ struct xfs_scrub_context {
> > uint ilock_flags;
> > bool try_harder;
> > bool has_quotaofflock;
> > + bool reset_counters;
> I noticed this bool is added here, and used in the conditional above,
> but I don't see where it gets set to anything? Maybe it gets used later in
> the set?
Yeah, there are no callers of any of these helper functions until you
get to the part of the series where we actually start repairing
metadata.
--D
> Other than that looks good. Thx!
>
> Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
>
> > /* State tracking for single-AG operations. */
> > struct xfs_scrub_ag sa;
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at https://urldefense.proofpoint.com/v2/url?u=http-3A__vger.kernel.org_majordomo-2Dinfo.html&d=DwICaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=LHZQ8fHvy6wDKXGTWcm97burZH5sQKHRDMaY1UthQxc&m=QJevHm77WCVmLISD9kjEPpN22CjDaymobWOYR-vTC2c&s=LEL8S70qR88zGjqrODTcuvE_OA75SuB5hdZ86g6FD70&e=
> >
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v2 06/22] xfs: add a repair helper to reset superblock counters
2018-05-15 22:34 ` [PATCH 06/22] xfs: add a repair helper to reset superblock counters Darrick J. Wong
2018-05-16 21:29 ` Allison Henderson
@ 2018-05-18 3:56 ` Darrick J. Wong
2018-05-29 3:28 ` Dave Chinner
1 sibling, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-18 3:56 UTC (permalink / raw)
To: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Add a helper function to reset the superblock inode and block counters.
The AG rebuilding functions will need these to adjust the counts if they
need to change as a part of recovering from corruption.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
---
v2: improve documentation
---
fs/xfs/scrub/repair.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 7 ++++
fs/xfs/scrub/scrub.c | 2 +
fs/xfs/scrub/scrub.h | 1 +
4 files changed, 99 insertions(+)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 877488ce4bc8..4b95a15c0bd0 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1026,3 +1026,92 @@ xfs_repair_find_ag_btree_roots(
return error;
}
+
+/*
+ * Reset the superblock counters.
+ *
+ * If a repair function changes the inode or free block counters, it must set
+ * reset_counters to push this function to reset the global counters. Repair
+ * functions are responsible for resetting all other in-core state. This
+ * function runs outside of transaction context after the repair context has
+ * been torn down, so if there's further filesystem corruption we'll error out
+ * to userspace and give userspace a chance to call back to fix the further
+ * errors.
+ */
+int
+xfs_repair_reset_counters(
+ struct xfs_mount *mp)
+{
+ struct xfs_buf *agi_bp;
+ struct xfs_buf *agf_bp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ xfs_agnumber_t agno;
+ xfs_ino_t icount = 0;
+ xfs_ino_t ifree = 0;
+ xfs_filblks_t fdblocks = 0;
+ int64_t delta_icount;
+ int64_t delta_ifree;
+ int64_t delta_fdblocks;
+ int error;
+
+ trace_xfs_repair_reset_counters(mp);
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ /* Count all the inodes... */
+ error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
+ if (error)
+ return error;
+ agi = XFS_BUF_TO_AGI(agi_bp);
+ icount += be32_to_cpu(agi->agi_count);
+ ifree += be32_to_cpu(agi->agi_freecount);
+ xfs_buf_relse(agi_bp);
+
+ /* Add up the free/freelist/bnobt/cntbt blocks... */
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
+ if (error)
+ return error;
+ if (!agf_bp)
+ return -ENOMEM;
+ agf = XFS_BUF_TO_AGF(agf_bp);
+ fdblocks += be32_to_cpu(agf->agf_freeblks);
+ fdblocks += be32_to_cpu(agf->agf_flcount);
+ fdblocks += be32_to_cpu(agf->agf_btreeblks);
+ xfs_buf_relse(agf_bp);
+ }
+
+ /*
+ * Reinitialize the counters. The on-disk and in-core counters differ
+ * by the number of inodes/blocks reserved by the admin, the per-AG
+ * reservation, and any transactions in progress, so we have to
+ * account for that. First we take the sb lock and update its
+ * counters...
+ */
+ spin_lock(&mp->m_sb_lock);
+ delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
+ delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
+ delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
+ mp->m_sb.sb_icount = icount;
+ mp->m_sb.sb_ifree = ifree;
+ mp->m_sb.sb_fdblocks = fdblocks;
+ spin_unlock(&mp->m_sb_lock);
+
+ /* ...and then update the per-cpu counters. */
+ if (delta_icount) {
+ error = xfs_mod_icount(mp, delta_icount);
+ if (error)
+ return error;
+ }
+ if (delta_ifree) {
+ error = xfs_mod_ifree(mp, delta_ifree);
+ if (error)
+ return error;
+ }
+ if (delta_fdblocks) {
+ error = xfs_mod_fdblocks(mp, delta_fdblocks, false);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index c922ef06b894..cc590312550a 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -96,6 +96,7 @@ int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
struct xfs_buf *agf_bp,
struct xfs_repair_find_ag_btree *btree_info,
struct xfs_buf *agfl_bp);
+int xfs_repair_reset_counters(struct xfs_mount *mp);
/* Metadata repairers */
@@ -121,6 +122,12 @@ xfs_repair_calc_ag_resblks(
return 0;
}
+static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
+{
+ ASSERT(0);
+ return -EIO;
+}
+
#define xfs_repair_probe xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index c5999c28c20c..bf5e8dd66133 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -200,6 +200,8 @@ xfs_scrub_teardown(
kmem_free(sc->buf);
sc->buf = NULL;
}
+ if (sc->reset_counters && !error)
+ error = xfs_repair_reset_counters(sc->mp);
return error;
}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 636424d5e2ee..52b2be2df143 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -78,6 +78,7 @@ struct xfs_scrub_context {
uint ilock_flags;
bool try_harder;
bool has_quotaofflock;
+ bool reset_counters;
/* State tracking for single-AG operations. */
struct xfs_scrub_ag sa;
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH v2 06/22] xfs: add a repair helper to reset superblock counters
2018-05-18 3:56 ` [PATCH v2 " Darrick J. Wong
@ 2018-05-29 3:28 ` Dave Chinner
2018-05-29 22:07 ` Darrick J. Wong
0 siblings, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-29 3:28 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Add a helper function to reset the superblock inode and block counters.
> The AG rebuilding functions will need these to adjust the counts if they
> need to change as a part of recovering from corruption.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
> ---
> v2: improve documentation
> ---
> fs/xfs/scrub/repair.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 7 ++++
> fs/xfs/scrub/scrub.c | 2 +
> fs/xfs/scrub/scrub.h | 1 +
> 4 files changed, 99 insertions(+)
>
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index 877488ce4bc8..4b95a15c0bd0 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -1026,3 +1026,92 @@ xfs_repair_find_ag_btree_roots(
>
> return error;
> }
> +
> +/*
> + * Reset the superblock counters.
> + *
> + * If a repair function changes the inode or free block counters, it must set
> + * reset_counters to push this function to reset the global counters. Repair
> + * functions are responsible for resetting all other in-core state. This
> + * function runs outside of transaction context after the repair context has
> + * been torn down, so if there's further filesystem corruption we'll error out
> + * to userspace and give userspace a chance to call back to fix the further
> + * errors.
> + */
> +int
> +xfs_repair_reset_counters(
> + struct xfs_mount *mp)
> +{
> + struct xfs_buf *agi_bp;
> + struct xfs_buf *agf_bp;
> + struct xfs_agi *agi;
> + struct xfs_agf *agf;
> + xfs_agnumber_t agno;
> + xfs_ino_t icount = 0;
> + xfs_ino_t ifree = 0;
> + xfs_filblks_t fdblocks = 0;
> + int64_t delta_icount;
> + int64_t delta_ifree;
> + int64_t delta_fdblocks;
> + int error;
> +
> + trace_xfs_repair_reset_counters(mp);
> +
> + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> + /* Count all the inodes... */
> + error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
> + if (error)
> + return error;
> + agi = XFS_BUF_TO_AGI(agi_bp);
> + icount += be32_to_cpu(agi->agi_count);
> + ifree += be32_to_cpu(agi->agi_freecount);
> + xfs_buf_relse(agi_bp);
> +
> + /* Add up the free/freelist/bnobt/cntbt blocks... */
> + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
> + if (error)
> + return error;
> + if (!agf_bp)
> + return -ENOMEM;
> + agf = XFS_BUF_TO_AGF(agf_bp);
> + fdblocks += be32_to_cpu(agf->agf_freeblks);
> + fdblocks += be32_to_cpu(agf->agf_flcount);
> + fdblocks += be32_to_cpu(agf->agf_btreeblks);
> + xfs_buf_relse(agf_bp);
> + }
> +
> + /*
> + * Reinitialize the counters. The on-disk and in-core counters differ
> + * by the number of inodes/blocks reserved by the admin, the per-AG
> + * reservation, and any transactions in progress, so we have to
> + * account for that. First we take the sb lock and update its
> + * counters...
> + */
> + spin_lock(&mp->m_sb_lock);
> + delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> + delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> + delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> + mp->m_sb.sb_icount = icount;
> + mp->m_sb.sb_ifree = ifree;
> + mp->m_sb.sb_fdblocks = fdblocks;
> + spin_unlock(&mp->m_sb_lock);
This seems racy to me? i.e. the per-ag counters can change while
we are summing them, and once we've summed them then the sb counters
can change while we are waiting for the m_sb_lock. It looks to me
like the summed per-ag counters are not in any way coherent
with the superblock or the in-core per-CPU counters, so I'm
struggling to understand why this is safe?
We can do this sort of summation at mount time (in
xfs_initialize_perag_data()) because the filesystem is running
single threaded while the summation is taking place and so nothing
is changing during the summation. The filesystem is active in this
case, so I don't think we can do the same thing here.
Also, it brought a question to mind because I haven't clearly noted
it happening yet: when do the xfs_perag counters get corrected? And
if they are already correct, why not just iterate the perag
counters?
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v2 06/22] xfs: add a repair helper to reset superblock counters
2018-05-29 3:28 ` Dave Chinner
@ 2018-05-29 22:07 ` Darrick J. Wong
2018-05-29 22:24 ` Dave Chinner
0 siblings, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-29 22:07 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Tue, May 29, 2018 at 01:28:10PM +1000, Dave Chinner wrote:
> On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> >
> > Add a helper function to reset the superblock inode and block counters.
> > The AG rebuilding functions will need these to adjust the counts if they
> > need to change as a part of recovering from corruption.
> >
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
> > ---
> > v2: improve documentation
> > ---
> > fs/xfs/scrub/repair.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
> > fs/xfs/scrub/repair.h | 7 ++++
> > fs/xfs/scrub/scrub.c | 2 +
> > fs/xfs/scrub/scrub.h | 1 +
> > 4 files changed, 99 insertions(+)
> >
> > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > index 877488ce4bc8..4b95a15c0bd0 100644
> > --- a/fs/xfs/scrub/repair.c
> > +++ b/fs/xfs/scrub/repair.c
> > @@ -1026,3 +1026,92 @@ xfs_repair_find_ag_btree_roots(
> >
> > return error;
> > }
> > +
> > +/*
> > + * Reset the superblock counters.
> > + *
> > + * If a repair function changes the inode or free block counters, it must set
> > + * reset_counters to push this function to reset the global counters. Repair
> > + * functions are responsible for resetting all other in-core state. This
> > + * function runs outside of transaction context after the repair context has
> > + * been torn down, so if there's further filesystem corruption we'll error out
> > + * to userspace and give userspace a chance to call back to fix the further
> > + * errors.
> > + */
> > +int
> > +xfs_repair_reset_counters(
> > + struct xfs_mount *mp)
> > +{
> > + struct xfs_buf *agi_bp;
> > + struct xfs_buf *agf_bp;
> > + struct xfs_agi *agi;
> > + struct xfs_agf *agf;
> > + xfs_agnumber_t agno;
> > + xfs_ino_t icount = 0;
> > + xfs_ino_t ifree = 0;
> > + xfs_filblks_t fdblocks = 0;
> > + int64_t delta_icount;
> > + int64_t delta_ifree;
> > + int64_t delta_fdblocks;
> > + int error;
> > +
> > + trace_xfs_repair_reset_counters(mp);
> > +
> > + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> > + /* Count all the inodes... */
> > + error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
> > + if (error)
> > + return error;
> > + agi = XFS_BUF_TO_AGI(agi_bp);
> > + icount += be32_to_cpu(agi->agi_count);
> > + ifree += be32_to_cpu(agi->agi_freecount);
> > + xfs_buf_relse(agi_bp);
> > +
> > + /* Add up the free/freelist/bnobt/cntbt blocks... */
> > + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
> > + if (error)
> > + return error;
> > + if (!agf_bp)
> > + return -ENOMEM;
> > + agf = XFS_BUF_TO_AGF(agf_bp);
> > + fdblocks += be32_to_cpu(agf->agf_freeblks);
> > + fdblocks += be32_to_cpu(agf->agf_flcount);
> > + fdblocks += be32_to_cpu(agf->agf_btreeblks);
> > + xfs_buf_relse(agf_bp);
> > + }
> > +
> > + /*
> > + * Reinitialize the counters. The on-disk and in-core counters differ
> > + * by the number of inodes/blocks reserved by the admin, the per-AG
> > + * reservation, and any transactions in progress, so we have to
> > + * account for that. First we take the sb lock and update its
> > + * counters...
> > + */
> > + spin_lock(&mp->m_sb_lock);
> > + delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > + delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > + delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > + mp->m_sb.sb_icount = icount;
> > + mp->m_sb.sb_ifree = ifree;
> > + mp->m_sb.sb_fdblocks = fdblocks;
> > + spin_unlock(&mp->m_sb_lock);
>
> This seems racy to me ? i.e. the per-ag counters can change while
> we are summing them, and once we've summed them then sb counters
> can change while we are waiting for the m_sb_lock. It's looks to me
> like the summed per-ag counters are not in any way coherent
> wit the superblock or the in-core per-CPU counters, so I'm
> struggling to understand why this is safe?
Hmm, yes, I think this is racy too. The purpose of this code is to
recompute the global counters from the AG counters after any operation
that modifies anything that would affect the icount/ifreecount/fdblocks
counters...
> We can do this sort of summation at mount time (in
> xfs_initialize_perag_data()) because the filesystem is running
> single threaded while the summation is taking place and so nothing
> is changing during th summation. The filesystem is active in this
> case, so I don't think we can do the same thing here.
...however, you're correct to point out that the fs must be quiesced
before we can actually do this. In other words, I think the filesystem
has to be completely frozen before we can do this. Perhaps it's better
to have the per-ag rebuilders fix only the per-ag counters and leave the
global counters alone. Then add a new scrubber that checks the summary
counters and fixes them if necessary.
> Also, it brought a question to mind because I haven't clearly noted
> it happening yet: when do the xfs_perag counters get corrected? And
> if they are already correct, why not just iterate the perag
> counters?
The xfs_perag counters are updated by the AGF/AGI/inobt rebuild code.
--D
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v2 06/22] xfs: add a repair helper to reset superblock counters
2018-05-29 22:07 ` Darrick J. Wong
@ 2018-05-29 22:24 ` Dave Chinner
2018-05-29 22:43 ` Darrick J. Wong
0 siblings, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-29 22:24 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Tue, May 29, 2018 at 03:07:16PM -0700, Darrick J. Wong wrote:
> On Tue, May 29, 2018 at 01:28:10PM +1000, Dave Chinner wrote:
> > On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> > > + /*
> > > + * Reinitialize the counters. The on-disk and in-core counters differ
> > > + * by the number of inodes/blocks reserved by the admin, the per-AG
> > > + * reservation, and any transactions in progress, so we have to
> > > + * account for that. First we take the sb lock and update its
> > > + * counters...
> > > + */
> > > + spin_lock(&mp->m_sb_lock);
> > > + delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > > + delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > > + delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > > + mp->m_sb.sb_icount = icount;
> > > + mp->m_sb.sb_ifree = ifree;
> > > + mp->m_sb.sb_fdblocks = fdblocks;
> > > + spin_unlock(&mp->m_sb_lock);
> >
> > This seems racy to me ? i.e. the per-ag counters can change while
> > we are summing them, and once we've summed them then sb counters
> > can change while we are waiting for the m_sb_lock. It's looks to me
> > like the summed per-ag counters are not in any way coherent
> > wit the superblock or the in-core per-CPU counters, so I'm
> > struggling to understand why this is safe?
>
> Hmm, yes, I think this is racy too. The purpose of this code is to
> recompute the global counters from the AG counters after any operation
> that modifies anything that would affect the icount/ifreecount/fdblocks
> counters...
*nod*
> > We can do this sort of summation at mount time (in
> > xfs_initialize_perag_data()) because the filesystem is running
> > single threaded while the summation is taking place and so nothing
> > is changing during th summation. The filesystem is active in this
> > case, so I don't think we can do the same thing here.
>
> ...however, you're correct to point out that the fs must be quiesced
> before we can actually do this. In other words, I think the filesystem
> has to be completely frozen before we can do this. Perhaps it's better
> to have the per-ag rebuilders fix only the per-ag counters and leave the
> global counters alone. Then add a new scrubber that checks the summary
> counters and fixes them if necessary.
So the question here is whether we actually need to accurately
correct the global superblock counters? We know that if we have a
dirty unmount, the counters will be re-initialised on mount from the
AG header information, so perhaps what we need here is a flag to
tell unmount to dirty the log again after it has written the unmount
record (like we currently do for quiesce).
That way we can do a racy "near enough" update here to get us out of
the worst of the space accounting mismatches, knowing that on the
next mount it will be accurately rebuilt.
Thoughts?
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v2 06/22] xfs: add a repair helper to reset superblock counters
2018-05-29 22:24 ` Dave Chinner
@ 2018-05-29 22:43 ` Darrick J. Wong
2018-05-30 1:23 ` Dave Chinner
0 siblings, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-29 22:43 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Wed, May 30, 2018 at 08:24:28AM +1000, Dave Chinner wrote:
> On Tue, May 29, 2018 at 03:07:16PM -0700, Darrick J. Wong wrote:
> > On Tue, May 29, 2018 at 01:28:10PM +1000, Dave Chinner wrote:
> > > On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> > > > + /*
> > > > + * Reinitialize the counters. The on-disk and in-core counters differ
> > > > + * by the number of inodes/blocks reserved by the admin, the per-AG
> > > > + * reservation, and any transactions in progress, so we have to
> > > > + * account for that. First we take the sb lock and update its
> > > > + * counters...
> > > > + */
> > > > + spin_lock(&mp->m_sb_lock);
> > > > + delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > > > + delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > > > + delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > > > + mp->m_sb.sb_icount = icount;
> > > > + mp->m_sb.sb_ifree = ifree;
> > > > + mp->m_sb.sb_fdblocks = fdblocks;
> > > > + spin_unlock(&mp->m_sb_lock);
> > >
> > > This seems racy to me ? i.e. the per-ag counters can change while
> > > we are summing them, and once we've summed them then sb counters
> > > can change while we are waiting for the m_sb_lock. It's looks to me
> > > like the summed per-ag counters are not in any way coherent
> > > wit the superblock or the in-core per-CPU counters, so I'm
> > > struggling to understand why this is safe?
> >
> > Hmm, yes, I think this is racy too. The purpose of this code is to
> > recompute the global counters from the AG counters after any operation
> > that modifies anything that would affect the icount/ifreecount/fdblocks
> > counters...
>
> *nod*
>
> > > We can do this sort of summation at mount time (in
> > > xfs_initialize_perag_data()) because the filesystem is running
> > > single threaded while the summation is taking place and so nothing
> > > is changing during th summation. The filesystem is active in this
> > > case, so I don't think we can do the same thing here.
> >
> > ...however, you're correct to point out that the fs must be quiesced
> > before we can actually do this. In other words, I think the filesystem
> > has to be completely frozen before we can do this. Perhaps it's better
> > to have the per-ag rebuilders fix only the per-ag counters and leave the
> > global counters alone. Then add a new scrubber that checks the summary
> > counters and fixes them if necessary.
>
> So the question here is whether we actually need to accurately
> correct the global superblock counters?
I think so, because what happens if the superblock counter is
artificially high but the AGs do not actually have the free space?
xfs_trans_reserve won't ENOSPC like it should, so we could end up
blowing out of transactions and shutting down because some allocation
that has to succeed ("because trans_reserve said there was space!")
fails...
> We know that if we have a dirty unmount, the counters will we
> re-initialised on mount from the AG header information, so perhaps
> what we need here is a flag to tell unmount to dirty the log again
> after it has written the unmount record (like we currently do for
> quiesce).
...but now that we've repaired the filesystem, it could potentially run
for a very long time until the next unmount. During that run, we'd be
misleading users about the real amount of free space and risking a hard
shutdown. I prefer that online repair try not to leave any weird state
around after xfs_scrub exits.
> That was we can do a racy "near enough" update here to get us out of
> the worst of the space accounting mismatches, knowing that on the
> next mount it will be accurately rebuilt.
>
> Thoughts?
Well, I think the best solution is to have the AGF/AGI/inobt rebuilders
adjust the global counters by the same amount that they're adjusting the
counters in the AGF/AGI, then add a new scrubber that runs at the end to
freeze the fs and check/repair the global counter state. :)
--D
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v2 06/22] xfs: add a repair helper to reset superblock counters
2018-05-29 22:43 ` Darrick J. Wong
@ 2018-05-30 1:23 ` Dave Chinner
2018-05-30 3:22 ` Darrick J. Wong
0 siblings, 1 reply; 76+ messages in thread
From: Dave Chinner @ 2018-05-30 1:23 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Tue, May 29, 2018 at 03:43:32PM -0700, Darrick J. Wong wrote:
> On Wed, May 30, 2018 at 08:24:28AM +1000, Dave Chinner wrote:
> > On Tue, May 29, 2018 at 03:07:16PM -0700, Darrick J. Wong wrote:
> > > On Tue, May 29, 2018 at 01:28:10PM +1000, Dave Chinner wrote:
> > > > On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> > > > > + /*
> > > > > + * Reinitialize the counters. The on-disk and in-core counters differ
> > > > > + * by the number of inodes/blocks reserved by the admin, the per-AG
> > > > > + * reservation, and any transactions in progress, so we have to
> > > > > + * account for that. First we take the sb lock and update its
> > > > > + * counters...
> > > > > + */
> > > > > + spin_lock(&mp->m_sb_lock);
> > > > > + delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > > > > + delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > > > > + delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > > > > + mp->m_sb.sb_icount = icount;
> > > > > + mp->m_sb.sb_ifree = ifree;
> > > > > + mp->m_sb.sb_fdblocks = fdblocks;
> > > > > + spin_unlock(&mp->m_sb_lock);
> > > >
> > > > This seems racy to me ? i.e. the per-ag counters can change while
> > > > we are summing them, and once we've summed them then sb counters
> > > > can change while we are waiting for the m_sb_lock. It's looks to me
> > > > like the summed per-ag counters are not in any way coherent
> > > > wit the superblock or the in-core per-CPU counters, so I'm
> > > > struggling to understand why this is safe?
> > >
> > > Hmm, yes, I think this is racy too. The purpose of this code is to
> > > recompute the global counters from the AG counters after any operation
> > > that modifies anything that would affect the icount/ifreecount/fdblocks
> > > counters...
> >
> > *nod*
> >
> > > > We can do this sort of summation at mount time (in
> > > > xfs_initialize_perag_data()) because the filesystem is running
> > > > single threaded while the summation is taking place and so nothing
> > > > is changing during th summation. The filesystem is active in this
> > > > case, so I don't think we can do the same thing here.
> > >
> > > ...however, you're correct to point out that the fs must be quiesced
> > > before we can actually do this. In other words, I think the filesystem
> > > has to be completely frozen before we can do this. Perhaps it's better
> > > to have the per-ag rebuilders fix only the per-ag counters and leave the
> > > global counters alone. Then add a new scrubber that checks the summary
> > > counters and fixes them if necessary.
> >
> > So the question here is whether we actually need to accurately
> > correct the global superblock counters?
>
> I think so, because what happens if the superblock counter is
> artificially high but the AGs do not actually have the free space?
> xfs_trans_reserve won't ENOSPC like it should, so we could end up
> blowing out of transactions and shutting down because some allocation
> that has to succeed ("because trans_reserve said there was space!")
> fails...
Yes, but I would have thought the reset will get us close enough
that this wouldn't be an issue for the vast majority of people.
And the other side of it is that if we get close enough to ENOSPC
that it matters, we could freeze/sum/thaw to be fully accurate on
demand in xfs_trans_reserve(), right? We already slow down greatly
at ENOSPC, so at that point the perf overhead of a freeze/thaw cycle
just doesn't matter...
> > We know that if we have a dirty unmount, the counters will we
> > re-initialised on mount from the AG header information, so perhaps
> > what we need here is a flag to tell unmount to dirty the log again
> > after it has written the unmount record (like we currently do for
> > quiesce).
>
> ...but now that we've repaired the filesystem, it could potentially run
> for a very long time until the next unmount. During that run, we'd be
> misleading users about the real amount of free space and risking a hard
> shutdown. I prefer that online repair try not to leave any weird state
> around after xfs_scrub exits.
Sure, but users may not want a freeze/read-all-ag-headers/thaw
cycle as part of repair if it can be avoided. If there are thousands
of AGs, this could take many seconds....
> > That was we can do a racy "near enough" update here to get us out of
> > the worst of the space accounting mismatches, knowing that on the
> > next mount it will be accurately rebuilt.
> >
> > Thoughts?
>
> Well, I think the best solution is to have the AGF/AGI/inobt rebuilders
> adjust the global counters by the same amount that they're adjusting the
> counters in the AGF/AGI, then add a new scrubber that runs at the end to
> freeze the fs and check/repair the global counter state. :)
I'm just not convinced that we can get away with a global freeze to
do this summation without having noticeable impact on applications.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH v2 06/22] xfs: add a repair helper to reset superblock counters
2018-05-30 1:23 ` Dave Chinner
@ 2018-05-30 3:22 ` Darrick J. Wong
0 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-30 3:22 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs
On Wed, May 30, 2018 at 11:23:33AM +1000, Dave Chinner wrote:
> On Tue, May 29, 2018 at 03:43:32PM -0700, Darrick J. Wong wrote:
> > On Wed, May 30, 2018 at 08:24:28AM +1000, Dave Chinner wrote:
> > > On Tue, May 29, 2018 at 03:07:16PM -0700, Darrick J. Wong wrote:
> > > > On Tue, May 29, 2018 at 01:28:10PM +1000, Dave Chinner wrote:
> > > > > On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> > > > > > + /*
> > > > > > + * Reinitialize the counters. The on-disk and in-core counters differ
> > > > > > + * by the number of inodes/blocks reserved by the admin, the per-AG
> > > > > > + * reservation, and any transactions in progress, so we have to
> > > > > > + * account for that. First we take the sb lock and update its
> > > > > > + * counters...
> > > > > > + */
> > > > > > + spin_lock(&mp->m_sb_lock);
> > > > > > + delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > > > > > + delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > > > > > + delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > > > > > + mp->m_sb.sb_icount = icount;
> > > > > > + mp->m_sb.sb_ifree = ifree;
> > > > > > + mp->m_sb.sb_fdblocks = fdblocks;
> > > > > > + spin_unlock(&mp->m_sb_lock);
> > > > >
> > > > > This seems racy to me ? i.e. the per-ag counters can change while
> > > > > we are summing them, and once we've summed them then sb counters
> > > > > can change while we are waiting for the m_sb_lock. It's looks to me
> > > > > like the summed per-ag counters are not in any way coherent
> > > > > wit the superblock or the in-core per-CPU counters, so I'm
> > > > > struggling to understand why this is safe?
> > > >
> > > > Hmm, yes, I think this is racy too. The purpose of this code is to
> > > > recompute the global counters from the AG counters after any operation
> > > > that modifies anything that would affect the icount/ifreecount/fdblocks
> > > > counters...
> > >
> > > *nod*
> > >
> > > > > We can do this sort of summation at mount time (in
> > > > > xfs_initialize_perag_data()) because the filesystem is running
> > > > > single threaded while the summation is taking place and so nothing
> > > > > is changing during th summation. The filesystem is active in this
> > > > > case, so I don't think we can do the same thing here.
> > > >
> > > > ...however, you're correct to point out that the fs must be quiesced
> > > > before we can actually do this. In other words, I think the filesystem
> > > > has to be completely frozen before we can do this. Perhaps it's better
> > > > to have the per-ag rebuilders fix only the per-ag counters and leave the
> > > > global counters alone. Then add a new scrubber that checks the summary
> > > > counters and fixes them if necessary.
> > >
> > > So the question here is whether we actually need to accurately
> > > correct the global superblock counters?
> >
> > I think so, because what happens if the superblock counter is
> > artificially high but the AGs do not actually have the free space?
> > xfs_trans_reserve won't ENOSPC like it should, so we could end up
> > blowing out of transactions and shutting down because some allocation
> > that has to succeed ("because trans_reserve said there was space!")
> > fails...
>
> Yes, but I would have thought the reset will get us close enough
> that this wouldn't be an issue for the vast majority of people.
<nod> I'll adjust the sb counters based on the agf/agi/inobt adjustments
and we'll leave verifying and/or fixing the superblock counters as a
Future Research Topic(tm).
> And the other side of it is that if we get close enough to ENOSPC
> that it matters, we could freeze/sum/thaw to be fully accurate on
> demand in xfs_trans_reserve(), right? We already slow down greatly
> at ENOSPC, so at that point the perf overhead fo a freeze/thaw cycle
> just doesn't matter...
>
> > > We know that if we have a dirty unmount, the counters will we
> > > re-initialised on mount from the AG header information, so perhaps
> > > what we need here is a flag to tell unmount to dirty the log again
> > > after it has written the unmount record (like we currently do for
> > > quiesce).
> >
> > ...but now that we've repaired the filesystem, it could potentially run
> > for a very long time until the next unmount. During that run, we'd be
> > misleading users about the real amount of free space and risking a hard
> > shutdown. I prefer that online repair try not to leave any weird state
> > around after xfs_scrub exits.
>
> Sure, but user's may not want a freeze/read-all-ag-headers/thaw
> cycle as part of repair if it can be avoided. If there are thousands
> of AGs, this could take many seconds....
>
> > > That was we can do a racy "near enough" update here to get us out of
> > > the worst of the space accounting mismatches, knowing that on the
> > > next mount it will be accurately rebuilt.
> > >
> > > Thoughts?
> >
> > Well, I think the best solution is to have the AGF/AGI/inobt rebuilders
> > adjust the global counters by the same amount that they're adjusting the
> > counters in the AGF/AGI, then add a new scrubber that runs at the end to
> > freeze the fs and check/repair the global counter state. :)
>
> I'm just not convinced that we can get away with a global freeze to
> do this summation without having noticable impact on applications.
Admittedly, online repair has a semi-implicit design assumption that
either (a) it's running on fast enough storage that a bunch of random
IOs won't seriously harm performance or (b) whoever runs the client
program will throttle it to avoid starving regular operations, and (c)
repairs will not frequently be required.
Of course, the fsfreeze repairs totally blow (b) out of the water, which
means that in the long run I'm going to have to find a way to shorten the
runtime of those repair types (rmap, quota). If (a) is true then maybe
we can parallelize some of the AG accesses to reduce freeze time. I'm
hoping that will reduce the pain of such things, though. $god help the
users on floppy disks.
--D
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH 07/22] xfs: add helpers to attach quotas to inodes
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (5 preceding siblings ...)
2018-05-15 22:34 ` [PATCH 06/22] xfs: add a repair helper to reset superblock counters Darrick J. Wong
@ 2018-05-15 22:34 ` Darrick J. Wong
2018-05-16 22:21 ` Allison Henderson
2018-05-18 3:58 ` [PATCH v2 " Darrick J. Wong
2018-05-15 22:34 ` [PATCH 08/22] xfs: repair superblocks Darrick J. Wong
` (15 subsequent siblings)
22 siblings, 2 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:34 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Add a helper routine to attach quota information to inodes that are
about to undergo repair. If that fails, we need to schedule a
quotacheck for the next mount but allow the corrupted metadata repair to
continue.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/repair.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 2 ++
fs/xfs/xfs_quota.h | 16 ++++++++++++++
3 files changed, 76 insertions(+)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1ca7e124e1a7..1679fe7cc912 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -42,6 +42,7 @@
#include "xfs_extent_busy.h"
#include "xfs_ag_resv.h"
#include "xfs_trans_space.h"
+#include "xfs_quota.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -1023,3 +1024,60 @@ xfs_repair_reset_counters(
return 0;
}
+
+/* Force a quotacheck the next time we mount. */
+void
+xfs_repair_force_quotacheck(
+ struct xfs_scrub_context *sc,
+ uint dqtype)
+{
+ uint flag;
+
+ flag = xfs_quota_chkd_flag(dqtype);
+ if (!(flag & sc->mp->m_qflags))
+ return;
+
+ sc->mp->m_qflags &= ~flag;
+ spin_lock(&sc->mp->m_sb_lock);
+ sc->mp->m_sb.sb_qflags &= ~flag;
+ spin_unlock(&sc->mp->m_sb_lock);
+ xfs_log_sb(sc->tp);
+}
+
+/*
+ * Ensure that dquots are attached to this inode. We cannot allow the dquot
+ * code to allocate an on-disk dquot block here because we're already in
+ * transaction context with the inode locked. The on-disk dquot should
+ * already exist anyway. If the quota code signals corruption or missing
+ * quota information, schedule quotacheck at next mount.
+ */
+int
+xfs_repair_ino_dqattach(
+ struct xfs_scrub_context *sc)
+{
+ int error;
+
+ error = xfs_qm_dqattach_locked(sc->ip, false);
+ switch (error) {
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ case -ENOENT:
+ xfs_err_ratelimited(sc->mp,
+"inode %llu repair encountered quota error %d, quotacheck forced.",
+ (unsigned long long)sc->ip->i_ino, error);
+ if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
+ xfs_repair_force_quotacheck(sc, XFS_DQ_USER);
+ if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
+ xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP);
+ if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
+ xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ);
+ /* fall through */
+ case -ESRCH:
+ error = 0;
+ break;
+ default:
+ break;
+ }
+
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index eea32a26f947..0cb91dcb9f1e 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -97,6 +97,8 @@ int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
struct xfs_repair_find_ag_btree *btree_info,
struct xfs_buf *agfl_bp);
int xfs_repair_reset_counters(struct xfs_mount *mp);
+void xfs_repair_force_quotacheck(struct xfs_scrub_context *sc, uint dqtype);
+int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
/* Metadata repairers */
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 1c79ebbe5236..3edf52b14919 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -48,6 +48,22 @@ struct xfs_trans;
(XFS_IS_PQUOTA_ON(mp) && \
(mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
+static inline uint
+xfs_quota_chkd_flag(
+ uint dqtype)
+{
+ switch (dqtype) {
+ case XFS_DQ_USER:
+ return XFS_UQUOTA_CHKD;
+ case XFS_DQ_GROUP:
+ return XFS_GQUOTA_CHKD;
+ case XFS_DQ_PROJ:
+ return XFS_PQUOTA_CHKD;
+ default:
+ return 0;
+ }
+}
+
/*
* The structure kept inside the xfs_trans_t keep track of dquot changes
* within a transaction and apply them later.
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH 07/22] xfs: add helpers to attach quotas to inodes
2018-05-15 22:34 ` [PATCH 07/22] xfs: add helpers to attach quotas to inodes Darrick J. Wong
@ 2018-05-16 22:21 ` Allison Henderson
2018-05-18 3:58 ` [PATCH v2 " Darrick J. Wong
1 sibling, 0 replies; 76+ messages in thread
From: Allison Henderson @ 2018-05-16 22:21 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs, david
This one looks good to me. Thx!
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
On 05/15/2018 03:34 PM, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Add a helper routine to attach quota information to inodes that are
> about to undergo repair. If that fails, we need to schedule a
> quotacheck for the next mount but allow the corrupted metadata repair to
> continue.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/xfs/scrub/repair.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 2 ++
> fs/xfs/xfs_quota.h | 16 ++++++++++++++
> 3 files changed, 76 insertions(+)
>
>
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index 1ca7e124e1a7..1679fe7cc912 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -42,6 +42,7 @@
> #include "xfs_extent_busy.h"
> #include "xfs_ag_resv.h"
> #include "xfs_trans_space.h"
> +#include "xfs_quota.h"
> #include "scrub/xfs_scrub.h"
> #include "scrub/scrub.h"
> #include "scrub/common.h"
> @@ -1023,3 +1024,60 @@ xfs_repair_reset_counters(
>
> return 0;
> }
> +
> +/* Force a quotacheck the next time we mount. */
> +void
> +xfs_repair_force_quotacheck(
> + struct xfs_scrub_context *sc,
> + uint dqtype)
> +{
> + uint flag;
> +
> + flag = xfs_quota_chkd_flag(dqtype);
> + if (!(flag & sc->mp->m_qflags))
> + return;
> +
> + sc->mp->m_qflags &= ~flag;
> + spin_lock(&sc->mp->m_sb_lock);
> + sc->mp->m_sb.sb_qflags &= ~flag;
> + spin_unlock(&sc->mp->m_sb_lock);
> + xfs_log_sb(sc->tp);
> +}
> +
> +/*
> + * Ensure that dquots are attached to this inode. We cannot allow the dquot
> + * code to allocate an on-disk dquot block here because we're already in
> + * transaction context with the inode locked. The on-disk dquot should
> + * already exist anyway. If the quota code signals corruption or missing
> + * quota information, schedule quotacheck at next mount.
> + */
> +int
> +xfs_repair_ino_dqattach(
> + struct xfs_scrub_context *sc)
> +{
> + int error;
> +
> + error = xfs_qm_dqattach_locked(sc->ip, false);
> + switch (error) {
> + case -EFSBADCRC:
> + case -EFSCORRUPTED:
> + case -ENOENT:
> + xfs_err_ratelimited(sc->mp,
> +"inode %llu repair encountered quota error %d, quotacheck forced.",
> + (unsigned long long)sc->ip->i_ino, error);
> + if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
> + xfs_repair_force_quotacheck(sc, XFS_DQ_USER);
> + if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
> + xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP);
> + if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
> + xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ);
> + /* fall through */
> + case -ESRCH:
> + error = 0;
> + break;
> + default:
> + break;
> + }
> +
> + return error;
> +}
> diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> index eea32a26f947..0cb91dcb9f1e 100644
> --- a/fs/xfs/scrub/repair.h
> +++ b/fs/xfs/scrub/repair.h
> @@ -97,6 +97,8 @@ int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
> struct xfs_repair_find_ag_btree *btree_info,
> struct xfs_buf *agfl_bp);
> int xfs_repair_reset_counters(struct xfs_mount *mp);
> +void xfs_repair_force_quotacheck(struct xfs_scrub_context *sc, uint dqtype);
> +int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
>
> /* Metadata repairers */
>
> diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
> index 1c79ebbe5236..3edf52b14919 100644
> --- a/fs/xfs/xfs_quota.h
> +++ b/fs/xfs/xfs_quota.h
> @@ -48,6 +48,22 @@ struct xfs_trans;
> (XFS_IS_PQUOTA_ON(mp) && \
> (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
>
> +static inline uint
> +xfs_quota_chkd_flag(
> + uint dqtype)
> +{
> + switch (dqtype) {
> + case XFS_DQ_USER:
> + return XFS_UQUOTA_CHKD;
> + case XFS_DQ_GROUP:
> + return XFS_GQUOTA_CHKD;
> + case XFS_DQ_PROJ:
> + return XFS_PQUOTA_CHKD;
> + default:
> + return 0;
> + }
> +}
> +
> /*
> * The structure kept inside the xfs_trans_t keep track of dquot changes
> * within a transaction and apply them later.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at https://urldefense.proofpoint.com/v2/url?u=http-3A__vger.kernel.org_majordomo-2Dinfo.html&d=DwICaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=LHZQ8fHvy6wDKXGTWcm97burZH5sQKHRDMaY1UthQxc&m=i_MKNIO9laMI_y2D3Tgc19EICKaIyy10IeEKdvc3soc&s=cw8pXW84tup1SVubP_G8I3xvwifkDLcg8scJFxK9nZ4&e=
>
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH v2 07/22] xfs: add helpers to attach quotas to inodes
2018-05-15 22:34 ` [PATCH 07/22] xfs: add helpers to attach quotas to inodes Darrick J. Wong
2018-05-16 22:21 ` Allison Henderson
@ 2018-05-18 3:58 ` Darrick J. Wong
2018-05-29 3:29 ` Dave Chinner
1 sibling, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-18 3:58 UTC (permalink / raw)
To: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Add a helper routine to attach quota information to inodes that are
about to undergo repair. If that fails, we need to schedule a
quotacheck for the next mount but allow the corrupted metadata repair to
continue.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
---
v2: improve documentation
---
fs/xfs/scrub/repair.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 2 ++
fs/xfs/xfs_quota.h | 16 +++++++++++++
3 files changed, 79 insertions(+)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 4b95a15c0bd0..4d16a9503bf6 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -42,6 +42,7 @@
#include "xfs_extent_busy.h"
#include "xfs_ag_resv.h"
#include "xfs_trans_space.h"
+#include "xfs_quota.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -1115,3 +1116,63 @@ xfs_repair_reset_counters(
return 0;
}
+
+/* Force a quotacheck the next time we mount. */
+void
+xfs_repair_force_quotacheck(
+ struct xfs_scrub_context *sc,
+ uint dqtype)
+{
+ uint flag;
+
+ flag = xfs_quota_chkd_flag(dqtype);
+ if (!(flag & sc->mp->m_qflags))
+ return;
+
+ sc->mp->m_qflags &= ~flag;
+ spin_lock(&sc->mp->m_sb_lock);
+ sc->mp->m_sb.sb_qflags &= ~flag;
+ spin_unlock(&sc->mp->m_sb_lock);
+ xfs_log_sb(sc->tp);
+}
+
+/*
+ * Attach dquots to this inode, or schedule quotacheck to fix them.
+ *
+ * This function ensures that the appropriate dquots are attached to an inode.
+ * We cannot allow the dquot code to allocate an on-disk dquot block here
+ * because we're already in transaction context with the inode locked. The
+ * on-disk dquot should already exist anyway. If the quota code signals
+ * corruption or missing quota information, schedule quotacheck, which will
+ * repair corruptions in the quota metadata.
+ */
+int
+xfs_repair_ino_dqattach(
+ struct xfs_scrub_context *sc)
+{
+ int error;
+
+ error = xfs_qm_dqattach_locked(sc->ip, false);
+ switch (error) {
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ case -ENOENT:
+ xfs_err_ratelimited(sc->mp,
+"inode %llu repair encountered quota error %d, quotacheck forced.",
+ (unsigned long long)sc->ip->i_ino, error);
+ if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
+ xfs_repair_force_quotacheck(sc, XFS_DQ_USER);
+ if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
+ xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP);
+ if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
+ xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ);
+ /* fall through */
+ case -ESRCH:
+ error = 0;
+ break;
+ default:
+ break;
+ }
+
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index cc590312550a..fbfc7da7b708 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -97,6 +97,8 @@ int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
struct xfs_repair_find_ag_btree *btree_info,
struct xfs_buf *agfl_bp);
int xfs_repair_reset_counters(struct xfs_mount *mp);
+void xfs_repair_force_quotacheck(struct xfs_scrub_context *sc, uint dqtype);
+int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
/* Metadata repairers */
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 1c79ebbe5236..3edf52b14919 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -48,6 +48,22 @@ struct xfs_trans;
(XFS_IS_PQUOTA_ON(mp) && \
(mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
+static inline uint
+xfs_quota_chkd_flag(
+ uint dqtype)
+{
+ switch (dqtype) {
+ case XFS_DQ_USER:
+ return XFS_UQUOTA_CHKD;
+ case XFS_DQ_GROUP:
+ return XFS_GQUOTA_CHKD;
+ case XFS_DQ_PROJ:
+ return XFS_PQUOTA_CHKD;
+ default:
+ return 0;
+ }
+}
+
/*
* The structure kept inside the xfs_trans_t keep track of dquot changes
* within a transaction and apply them later.
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH v2 07/22] xfs: add helpers to attach quotas to inodes
2018-05-18 3:58 ` [PATCH v2 " Darrick J. Wong
@ 2018-05-29 3:29 ` Dave Chinner
0 siblings, 0 replies; 76+ messages in thread
From: Dave Chinner @ 2018-05-29 3:29 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Thu, May 17, 2018 at 08:58:02PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Add a helper routine to attach quota information to inodes that are
> about to undergo repair. If that fails, we need to schedule a
> quotacheck for the next mount but allow the corrupted metadata repair to
> continue.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
> ---
> v2: improve documentation
> ---
Looks good.
Reviewed-by: Dave Chinner <dchinner@redhat.com>
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH 08/22] xfs: repair superblocks
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (6 preceding siblings ...)
2018-05-15 22:34 ` [PATCH 07/22] xfs: add helpers to attach quotas to inodes Darrick J. Wong
@ 2018-05-15 22:34 ` Darrick J. Wong
2018-05-16 22:55 ` Allison Henderson
2018-05-29 3:42 ` Dave Chinner
2018-05-15 22:34 ` [PATCH 09/22] xfs: repair the AGF and AGFL Darrick J. Wong
` (14 subsequent siblings)
22 siblings, 2 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:34 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
If one of the backup superblocks is found to differ seriously from
superblock 0, write out a fresh copy from the in-core sb.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/Makefile | 1 +
fs/xfs/libxfs/xfs_sb.c | 22 +++++++++++++
fs/xfs/libxfs/xfs_sb.h | 3 ++
fs/xfs/scrub/agheader_repair.c | 70 ++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 2 +
fs/xfs/scrub/scrub.c | 2 +
6 files changed, 99 insertions(+), 1 deletion(-)
create mode 100644 fs/xfs/scrub/agheader_repair.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0e93a099359b..29fe115f29d5 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -174,6 +174,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
# online repair
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
+ agheader_repair.o \
repair.o \
)
endif
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index ea6c85a4c27c..3a3273572a5c 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1097,3 +1097,25 @@ xfs_sb_read_secondary(
*bpp = bp;
return 0;
}
+
+/* Get a secondary superblock buffer. */
+int
+xfs_sb_get_secondary(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+
+ ASSERT(agno != 0 && agno != NULLAGNUMBER);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0);
+ if (!bp)
+ return -ENOMEM;
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_set_ref(bp, XFS_SSB_REF);
+ *bpp = bp;
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 03e0cc6bf3a6..244e0162c49e 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -50,5 +50,8 @@ extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
extern int xfs_sb_read_secondary(struct xfs_mount *mp,
struct xfs_trans *tp, xfs_agnumber_t agno,
struct xfs_buf **bpp);
+extern int xfs_sb_get_secondary(struct xfs_mount *mp,
+ struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf **bpp);
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
new file mode 100644
index 000000000000..8b91e9ebe1e7
--- /dev/null
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Superblock */
+
+/* Repair the superblock. */
+int
+xfs_repair_superblock(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ xfs_agnumber_t agno;
+ int error;
+
+ /* Don't try to repair AG 0's sb; let xfs_repair deal with it. */
+ agno = sc->sm->sm_agno;
+ if (agno == 0)
+ return -EOPNOTSUPP;
+
+ error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp);
+ if (error)
+ return error;
+
+ /* Copy AG 0's superblock to this one. */
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+
+ /* Write this to disk. */
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
+ xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 0cb91dcb9f1e..0090087ded45 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -103,6 +103,7 @@ int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
/* Metadata repairers */
int xfs_repair_probe(struct xfs_scrub_context *sc);
+int xfs_repair_superblock(struct xfs_scrub_context *sc);
#else
@@ -131,6 +132,7 @@ static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
}
#define xfs_repair_probe xfs_repair_notsupported
+#define xfs_repair_superblock xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index bf5e8dd66133..1520703c9504 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -218,7 +218,7 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_superblock,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_superblock,
},
[XFS_SCRUB_TYPE_AGF] = { /* agf */
.type = ST_PERAG,
^ permalink raw reply related [flat|nested] 76+ messages in thread
* Re: [PATCH 08/22] xfs: repair superblocks
2018-05-15 22:34 ` [PATCH 08/22] xfs: repair superblocks Darrick J. Wong
@ 2018-05-16 22:55 ` Allison Henderson
2018-05-29 3:42 ` Dave Chinner
1 sibling, 0 replies; 76+ messages in thread
From: Allison Henderson @ 2018-05-16 22:55 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs, david
Looks ok. Thx!
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
On 05/15/2018 03:34 PM, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> If one of the backup superblocks is found to differ seriously from
> superblock 0, write out a fresh copy from the in-core sb.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/xfs/Makefile | 1 +
> fs/xfs/libxfs/xfs_sb.c | 22 +++++++++++++
> fs/xfs/libxfs/xfs_sb.h | 3 ++
> fs/xfs/scrub/agheader_repair.c | 70 ++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 2 +
> fs/xfs/scrub/scrub.c | 2 +
> 6 files changed, 99 insertions(+), 1 deletion(-)
> create mode 100644 fs/xfs/scrub/agheader_repair.c
>
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 0e93a099359b..29fe115f29d5 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -174,6 +174,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
> # online repair
> ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
> xfs-y += $(addprefix scrub/, \
> + agheader_repair.o \
> repair.o \
> )
> endif
> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> index ea6c85a4c27c..3a3273572a5c 100644
> --- a/fs/xfs/libxfs/xfs_sb.c
> +++ b/fs/xfs/libxfs/xfs_sb.c
> @@ -1097,3 +1097,25 @@ xfs_sb_read_secondary(
> *bpp = bp;
> return 0;
> }
> +
> +/* Get a secondary superblock buffer. */
> +int
> +xfs_sb_get_secondary(
> + struct xfs_mount *mp,
> + struct xfs_trans *tp,
> + xfs_agnumber_t agno,
> + struct xfs_buf **bpp)
> +{
> + struct xfs_buf *bp;
> +
> + ASSERT(agno != 0 && agno != NULLAGNUMBER);
> + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
> + XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
> + XFS_FSS_TO_BB(mp, 1), 0);
> + if (!bp)
> + return -ENOMEM;
> + bp->b_ops = &xfs_sb_buf_ops;
> + xfs_buf_set_ref(bp, XFS_SSB_REF);
> + *bpp = bp;
> + return 0;
> +}
> diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
> index 03e0cc6bf3a6..244e0162c49e 100644
> --- a/fs/xfs/libxfs/xfs_sb.h
> +++ b/fs/xfs/libxfs/xfs_sb.h
> @@ -50,5 +50,8 @@ extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
> extern int xfs_sb_read_secondary(struct xfs_mount *mp,
> struct xfs_trans *tp, xfs_agnumber_t agno,
> struct xfs_buf **bpp);
> +extern int xfs_sb_get_secondary(struct xfs_mount *mp,
> + struct xfs_trans *tp, xfs_agnumber_t agno,
> + struct xfs_buf **bpp);
>
> #endif /* __XFS_SB_H__ */
> diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
> new file mode 100644
> index 000000000000..8b91e9ebe1e7
> --- /dev/null
> +++ b/fs/xfs/scrub/agheader_repair.c
> @@ -0,0 +1,70 @@
> +/*
> + * Copyright (C) 2018 Oracle. All Rights Reserved.
> + *
> + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it would be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write the Free Software Foundation,
> + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
> + */
> +#include "xfs.h"
> +#include "xfs_fs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_defer.h"
> +#include "xfs_btree.h"
> +#include "xfs_bit.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans.h"
> +#include "xfs_sb.h"
> +#include "xfs_inode.h"
> +#include "xfs_alloc.h"
> +#include "xfs_ialloc.h"
> +#include "xfs_rmap.h"
> +#include "scrub/xfs_scrub.h"
> +#include "scrub/scrub.h"
> +#include "scrub/common.h"
> +#include "scrub/trace.h"
> +
> +/* Superblock */
> +
> +/* Repair the superblock. */
> +int
> +xfs_repair_superblock(
> + struct xfs_scrub_context *sc)
> +{
> + struct xfs_mount *mp = sc->mp;
> + struct xfs_buf *bp;
> + xfs_agnumber_t agno;
> + int error;
> +
> + /* Don't try to repair AG 0's sb; let xfs_repair deal with it. */
> + agno = sc->sm->sm_agno;
> + if (agno == 0)
> + return -EOPNOTSUPP;
> +
> + error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp);
> + if (error)
> + return error;
> +
> + /* Copy AG 0's superblock to this one. */
> + xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
> + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
> +
> + /* Write this to disk. */
> + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
> + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
> + return error;
> +}
> diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> index 0cb91dcb9f1e..0090087ded45 100644
> --- a/fs/xfs/scrub/repair.h
> +++ b/fs/xfs/scrub/repair.h
> @@ -103,6 +103,7 @@ int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
> /* Metadata repairers */
>
> int xfs_repair_probe(struct xfs_scrub_context *sc);
> +int xfs_repair_superblock(struct xfs_scrub_context *sc);
>
> #else
>
> @@ -131,6 +132,7 @@ static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
> }
>
> #define xfs_repair_probe xfs_repair_notsupported
> +#define xfs_repair_superblock xfs_repair_notsupported
>
> #endif /* CONFIG_XFS_ONLINE_REPAIR */
>
> diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> index bf5e8dd66133..1520703c9504 100644
> --- a/fs/xfs/scrub/scrub.c
> +++ b/fs/xfs/scrub/scrub.c
> @@ -218,7 +218,7 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
> .type = ST_PERAG,
> .setup = xfs_scrub_setup_fs,
> .scrub = xfs_scrub_superblock,
> - .repair = xfs_repair_notsupported,
> + .repair = xfs_repair_superblock,
> },
> [XFS_SCRUB_TYPE_AGF] = { /* agf */
> .type = ST_PERAG,
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at https://urldefense.proofpoint.com/v2/url?u=http-3A__vger.kernel.org_majordomo-2Dinfo.html&d=DwICaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=LHZQ8fHvy6wDKXGTWcm97burZH5sQKHRDMaY1UthQxc&m=1JFQm3tOfB4lBtH4Ota_fVXDFbuFCNX6jUVpnImv860&s=p9BNrzBGzQZvFfEbGNgnVPBbDjyfRy-ZnwcLyqXcrl8&e=
>
^ permalink raw reply [flat|nested] 76+ messages in thread
* Re: [PATCH 08/22] xfs: repair superblocks
2018-05-15 22:34 ` [PATCH 08/22] xfs: repair superblocks Darrick J. Wong
2018-05-16 22:55 ` Allison Henderson
@ 2018-05-29 3:42 ` Dave Chinner
1 sibling, 0 replies; 76+ messages in thread
From: Dave Chinner @ 2018-05-29 3:42 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs
On Tue, May 15, 2018 at 03:34:30PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> If one of the backup superblocks is found to differ seriously from
> superblock 0, write out a fresh copy from the in-core sb.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Looks good, couple of minor things and one for a followup patch.
Reviewed-by: Dave Chinner <dchinner@redhat.com>
> ---
> fs/xfs/Makefile | 1 +
> fs/xfs/libxfs/xfs_sb.c | 22 +++++++++++++
> fs/xfs/libxfs/xfs_sb.h | 3 ++
> fs/xfs/scrub/agheader_repair.c | 70 ++++++++++++++++++++++++++++++++++++++++
> fs/xfs/scrub/repair.h | 2 +
> fs/xfs/scrub/scrub.c | 2 +
> 6 files changed, 99 insertions(+), 1 deletion(-)
> create mode 100644 fs/xfs/scrub/agheader_repair.c
>
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 0e93a099359b..29fe115f29d5 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -174,6 +174,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
> # online repair
> ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
> xfs-y += $(addprefix scrub/, \
> + agheader_repair.o \
> repair.o \
> )
> endif
> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> index ea6c85a4c27c..3a3273572a5c 100644
> --- a/fs/xfs/libxfs/xfs_sb.c
> +++ b/fs/xfs/libxfs/xfs_sb.c
> @@ -1097,3 +1097,25 @@ xfs_sb_read_secondary(
> *bpp = bp;
> return 0;
> }
> +
> +/* Get a secondary superblock buffer. */
/* get an uninitialised secondary superblock buffer */
> +int
> +xfs_sb_get_secondary(
> + struct xfs_mount *mp,
> + struct xfs_trans *tp,
> + xfs_agnumber_t agno,
> + struct xfs_buf **bpp)
> +{
> + struct xfs_buf *bp;
> +
> + ASSERT(agno != 0 && agno != NULLAGNUMBER);
> + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
> + XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
> + XFS_FSS_TO_BB(mp, 1), 0);
> + if (!bp)
> + return -ENOMEM;
> + bp->b_ops = &xfs_sb_buf_ops;
> + xfs_buf_set_ref(bp, XFS_SSB_REF);
> + *bpp = bp;
This should probably call xfs_buf_oneshot(bp) rather than having
multiple mechanisms for doing the same thing. Same goes for
xfs_sb_read_secondary().
> + return 0;
> +}
> diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
> index 03e0cc6bf3a6..244e0162c49e 100644
> --- a/fs/xfs/libxfs/xfs_sb.h
> +++ b/fs/xfs/libxfs/xfs_sb.h
> @@ -50,5 +50,8 @@ extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
> extern int xfs_sb_read_secondary(struct xfs_mount *mp,
> struct xfs_trans *tp, xfs_agnumber_t agno,
> struct xfs_buf **bpp);
> +extern int xfs_sb_get_secondary(struct xfs_mount *mp,
> + struct xfs_trans *tp, xfs_agnumber_t agno,
> + struct xfs_buf **bpp);
>
> #endif /* __XFS_SB_H__ */
> diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
> new file mode 100644
> index 000000000000..8b91e9ebe1e7
> --- /dev/null
> +++ b/fs/xfs/scrub/agheader_repair.c
> @@ -0,0 +1,70 @@
> +/*
> + * Copyright (C) 2018 Oracle. All Rights Reserved.
> + *
> + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it would be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write the Free Software Foundation,
> + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
> + */
> +#include "xfs.h"
> +#include "xfs_fs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_defer.h"
> +#include "xfs_btree.h"
> +#include "xfs_bit.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans.h"
> +#include "xfs_sb.h"
> +#include "xfs_inode.h"
> +#include "xfs_alloc.h"
> +#include "xfs_ialloc.h"
> +#include "xfs_rmap.h"
> +#include "scrub/xfs_scrub.h"
> +#include "scrub/scrub.h"
> +#include "scrub/common.h"
> +#include "scrub/trace.h"
> +
> +/* Superblock */
> +
> +/* Repair the superblock. */
> +int
> +xfs_repair_superblock(
> + struct xfs_scrub_context *sc)
> +{
> + struct xfs_mount *mp = sc->mp;
> + struct xfs_buf *bp;
> + xfs_agnumber_t agno;
> + int error;
> +
> + /* Don't try to repair AG 0's sb; let xfs_repair deal with it. */
> + agno = sc->sm->sm_agno;
> + if (agno == 0)
> + return -EOPNOTSUPP;
> +
> + error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp);
> + if (error)
> + return error;
> +
> + /* Copy AG 0's superblock to this one. */
> + xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
> + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
> +
> + /* Write this to disk. */
> + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
> + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
> + return error;
> +}
This is largely common with the core of xfs_update_secondary_sbs()
and can go into fs/xfs/libxfs/xfs_sb.c
/*
 * Get the secondary superblock buffer for @agno and fill it with a fresh
 * on-disk-format copy of the in-core superblock (mp->m_sb).
 *
 * If @tp is non-NULL, the buffer is tagged as a superblock buffer and its
 * whole length is logged to that transaction.  If @bpp is non-NULL, the
 * buffer is passed back through it.
 *
 * Returns 0 on success, or the error returned by xfs_sb_get_secondary().
 *
 * NOTE(review): when both @tp and @bpp are NULL nothing visible here hands
 * the buffer to anyone -- confirm callers always supply at least one.
 */
int
xfs_sb_init_secondary(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	struct xfs_buf		**bpp)
{
	struct xfs_buf		*bp;
	int			error;

	error = xfs_sb_get_secondary(mp, tp, agno, &bp);
	if (error)
		return error;

	/* Wipe the buffer, then write out the in-core superblock contents. */
	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);

	/* Only log the buffer when the caller gave us a transaction. */
	if (tp) {
		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
		xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
	}

	if (bpp)
		*bpp = bp;
	return 0;
}
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 76+ messages in thread
* [PATCH 09/22] xfs: repair the AGF and AGFL
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (7 preceding siblings ...)
2018-05-15 22:34 ` [PATCH 08/22] xfs: repair superblocks Darrick J. Wong
@ 2018-05-15 22:34 ` Darrick J. Wong
2018-05-15 22:34 ` [PATCH 10/22] xfs: repair the AGI Darrick J. Wong
` (13 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:34 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Regenerate the AGF and AGFL from the rmap data.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/agheader_repair.c | 459 ++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 4
fs/xfs/scrub/scrub.c | 4
3 files changed, 465 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 8b91e9ebe1e7..b09bb1ee9613 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -31,12 +31,18 @@
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
+#include "scrub/repair.h"
/* Superblock */
@@ -68,3 +74,456 @@ xfs_repair_superblock(
xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
return error;
}
+
+/* AGF */
+
+struct xfs_repair_agf_allocbt {
+ struct xfs_scrub_context *sc;
+ xfs_agblock_t freeblks;
+ xfs_agblock_t longest;
+};
+
+/* Record free space shape information. */
+STATIC int
+xfs_repair_agf_walk_allocbt(
+ struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *rec,
+ void *priv)
+{
+ struct xfs_repair_agf_allocbt *raa = priv;
+ int error = 0;
+
+ if (xfs_scrub_should_terminate(raa->sc, &error))
+ return error;
+
+ raa->freeblks += rec->ar_blockcount;
+ if (rec->ar_blockcount > raa->longest)
+ raa->longest = rec->ar_blockcount;
+ return error;
+}
+
+/* Does this AGFL block look sane? */
+STATIC int
+xfs_repair_agf_check_agfl_block(
+ struct xfs_mount *mp,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xfs_scrub_context *sc = priv;
+
+ if (!xfs_verify_agbno(mp, sc->sa.agno, agbno))
+ return -EFSCORRUPTED;
+ return 0;
+}
+
+/* Repair the AGF. */
+int
+xfs_repair_agf(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_repair_find_ag_btree fab[] = {
+ {
+ .rmap_owner = XFS_RMAP_OWN_AG,
+ .buf_ops = &xfs_allocbt_buf_ops,
+ .magic = XFS_ABTB_CRC_MAGIC,
+ },
+ {
+ .rmap_owner = XFS_RMAP_OWN_AG,
+ .buf_ops = &xfs_allocbt_buf_ops,
+ .magic = XFS_ABTC_CRC_MAGIC,
+ },
+ {
+ .rmap_owner = XFS_RMAP_OWN_AG,
+ .buf_ops = &xfs_rmapbt_buf_ops,
+ .magic = XFS_RMAP_CRC_MAGIC,
+ },
+ {
+ .rmap_owner = XFS_RMAP_OWN_REFC,
+ .buf_ops = &xfs_refcountbt_buf_ops,
+ .magic = XFS_REFC_CRC_MAGIC,
+ },
+ {
+ .buf_ops = NULL,
+ },
+ };
+ struct xfs_repair_agf_allocbt raa;
+ struct xfs_agf old_agf;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agf_bp;
+ struct xfs_buf *agfl_bp;
+ struct xfs_agf *agf;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_perag *pag;
+ xfs_agblock_t blocks;
+ xfs_agblock_t freesp_blocks;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ memset(&raa, 0, sizeof(raa));
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
+ if (error)
+ return error;
+ agf_bp->b_ops = &xfs_agf_buf_ops;
+
+ /*
+ * Load the AGFL so that we can screen out OWN_AG blocks that
+ * are on the AGFL now; these blocks might have once been part
+ * of the bno/cnt/rmap btrees but are not now.
+ */
+ error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp);
+ if (error)
+ return error;
+ error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp,
+ xfs_repair_agf_check_agfl_block, sc);
+ if (error)
+ return error;
+
+ /* Find the btree roots. */
+ error = xfs_repair_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
+ if (error)
+ return error;
+ if (fab[0].root == NULLAGBLOCK || fab[0].height > XFS_BTREE_MAXLEVELS ||
+ fab[1].root == NULLAGBLOCK || fab[1].height > XFS_BTREE_MAXLEVELS ||
+ fab[2].root == NULLAGBLOCK || fab[2].height > XFS_BTREE_MAXLEVELS)
+ return -EFSCORRUPTED;
+ if (xfs_sb_version_hasreflink(&mp->m_sb) &&
+ (fab[3].root == NULLAGBLOCK || fab[3].height > XFS_BTREE_MAXLEVELS))
+ return -EFSCORRUPTED;
+
+ /* Start rewriting the header. */
+ agf = XFS_BUF_TO_AGF(agf_bp);
+ old_agf = *agf;
+ /*
+ * We relied on the rmapbt to reconstruct the AGF. If we get a
+ * different root then something's seriously wrong.
+ */
+ if (be32_to_cpu(old_agf.agf_roots[XFS_BTNUM_RMAPi]) != fab[2].root)
+ return -EFSCORRUPTED;
+ memset(agf, 0, mp->m_sb.sb_sectsize);
+ agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+ agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+ agf->agf_seqno = cpu_to_be32(sc->sa.agno);
+ agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno));
+ agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(fab[0].root);
+ agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(fab[1].root);
+ agf->agf_roots[XFS_BTNUM_RMAPi] = cpu_to_be32(fab[2].root);
+ agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(fab[0].height);
+ agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(fab[1].height);
+ agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(fab[2].height);
+ agf->agf_flfirst = old_agf.agf_flfirst;
+ agf->agf_fllast = old_agf.agf_fllast;
+ agf->agf_flcount = old_agf.agf_flcount;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ agf->agf_refcount_root = cpu_to_be32(fab[3].root);
+ agf->agf_refcount_level = cpu_to_be32(fab[3].height);
+ }
+
+ /* Update the AGF counters from the bnobt. */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
+ XFS_BTNUM_BNO);
+ raa.sc = sc;
+ error = xfs_alloc_query_all(cur, xfs_repair_agf_walk_allocbt, &raa);
+ if (error)
+ goto err;
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ freesp_blocks = blocks - 1;
+ agf->agf_freeblks = cpu_to_be32(raa.freeblks);
+ agf->agf_longest = cpu_to_be32(raa.longest);
+
+ /* Update the AGF counters from the cntbt. */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
+ XFS_BTNUM_CNT);
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ freesp_blocks += blocks - 1;
+
+ /* Update the AGF counters from the rmapbt. */
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ agf->agf_rmap_blocks = cpu_to_be32(blocks);
+ freesp_blocks += blocks - 1;
+
+ /* Update the AGF counters from the refcountbt. */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.agno, NULL);
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ agf->agf_refcount_blocks = cpu_to_be32(blocks);
+ }
+ agf->agf_btreeblks = cpu_to_be32(freesp_blocks);
+ cur = NULL;
+
+ /* Trigger reinitialization of the in-core data. */
+ if (raa.freeblks != be32_to_cpu(old_agf.agf_freeblks) ||
+ freesp_blocks != be32_to_cpu(old_agf.agf_btreeblks) ||
+ raa.longest != be32_to_cpu(old_agf.agf_longest) ||
+ fab[0].height != be32_to_cpu(old_agf.agf_levels[XFS_BTNUM_BNOi]) ||
+ fab[1].height != be32_to_cpu(old_agf.agf_levels[XFS_BTNUM_CNTi]) ||
+ fab[2].height != be32_to_cpu(old_agf.agf_levels[XFS_BTNUM_RMAPi]) ||
+ fab[3].height != be32_to_cpu(old_agf.agf_refcount_level)) {
+ pag = xfs_perag_get(mp, sc->sa.agno);
+ if (pag->pagf_init) {
+ pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+ pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
+ pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
+ pag->pagf_longest = be32_to_cpu(agf->agf_longest);
+ pag->pagf_levels[XFS_BTNUM_BNOi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+ pag->pagf_levels[XFS_BTNUM_CNTi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+ pag->pagf_levels[XFS_BTNUM_RMAPi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_refcount_level =
+ be32_to_cpu(agf->agf_refcount_level);
+ }
+ xfs_perag_put(pag);
+ sc->reset_counters = true;
+ }
+
+ /* Write this to disk. */
+ xfs_trans_buf_set_type(sc->tp, agf_bp, XFS_BLFT_AGF_BUF);
+ xfs_trans_log_buf(sc->tp, agf_bp, 0, mp->m_sb.sb_sectsize - 1);
+ return error;
+
+err:
+ if (cur)
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR);
+ *agf = old_agf;
+ return error;
+}
+
+/* AGFL */
+
+struct xfs_repair_agfl {
+ struct xfs_repair_extent_list freesp_list;
+ struct xfs_repair_extent_list agmeta_list;
+ struct xfs_scrub_context *sc;
+};
+
+/* Record all freespace information. */
+STATIC int
+xfs_repair_agfl_rmap_fn(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_repair_agfl *ra = priv;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsb;
+ int i;
+ int error = 0;
+
+ if (xfs_scrub_should_terminate(ra->sc, &error))
+ return error;
+
+ /* Record all the OWN_AG blocks... */
+ if (rec->rm_owner == XFS_RMAP_OWN_AG) {
+ fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+ rec->rm_startblock);
+ error = xfs_repair_collect_btree_extent(ra->sc,
+ &ra->freesp_list, fsb, rec->rm_blockcount);
+ if (error)
+ return error;
+ }
+
+ /* ...and all the rmapbt blocks... */
+ for (i = 0; i < cur->bc_nlevels && cur->bc_ptrs[i] == 1; i++) {
+ xfs_btree_get_block(cur, i, &bp);
+ if (!bp)
+ continue;
+ fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
+ error = xfs_repair_collect_btree_extent(ra->sc,
+ &ra->agmeta_list, fsb, 1);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Add a btree block to the agmeta list. */
+STATIC int
+xfs_repair_agfl_visit_btblock(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xfs_repair_agfl *ra = priv;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsb;
+ int error = 0;
+
+ if (xfs_scrub_should_terminate(ra->sc, &error))
+ return error;
+
+ xfs_btree_get_block(cur, level, &bp);
+ if (!bp)
+ return 0;
+
+ fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
+ return xfs_repair_collect_btree_extent(ra->sc, &ra->agmeta_list,
+ fsb, 1);
+}
+
+/* Repair the AGFL. */
+int
+xfs_repair_agfl(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_repair_agfl ra;
+ struct xfs_owner_info oinfo;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agf_bp;
+ struct xfs_buf *agfl_bp;
+ struct xfs_agf *agf;
+ struct xfs_agfl *agfl;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_perag *pag;
+ __be32 *agfl_bno;
+ struct xfs_repair_extent *rae;
+ struct xfs_repair_extent *n;
+ xfs_agblock_t flcount;
+ xfs_agblock_t agbno;
+ xfs_agblock_t bno;
+ xfs_agblock_t old_flcount;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ xfs_repair_init_extent_list(&ra.freesp_list);
+ xfs_repair_init_extent_list(&ra.agmeta_list);
+ ra.sc = sc;
+
+ error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp);
+ if (error)
+ return error;
+ if (!agf_bp)
+ return -ENOMEM;
+
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGFL_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL);
+ if (error)
+ return error;
+ agfl_bp->b_ops = &xfs_agfl_buf_ops;
+
+ /* Find all space used by the free space btrees & rmapbt. */
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ error = xfs_rmap_query_all(cur, xfs_repair_agfl_rmap_fn, &ra);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ /* Find all space used by bnobt. */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
+ XFS_BTNUM_BNO);
+ error = xfs_btree_visit_blocks(cur, xfs_repair_agfl_visit_btblock,
+ &ra);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ /* Find all space used by cntbt. */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
+ XFS_BTNUM_CNT);
+ error = xfs_btree_visit_blocks(cur, xfs_repair_agfl_visit_btblock,
+ &ra);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ cur = NULL;
+
+ /*
+ * Drop the freesp meta blocks that are in use by btrees.
+ * The remaining blocks /should/ be AGFL blocks.
+ */
+ error = xfs_repair_subtract_extents(sc, &ra.freesp_list,
+ &ra.agmeta_list);
+ if (error)
+ goto err;
+ xfs_repair_cancel_btree_extents(sc, &ra.agmeta_list);
+
+ /* Start rewriting the header. */
+ agfl = XFS_BUF_TO_AGFL(agfl_bp);
+ memset(agfl, 0xFF, mp->m_sb.sb_sectsize);
+ agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+ agfl->agfl_seqno = cpu_to_be32(sc->sa.agno);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
+
+ /* Fill the AGFL with the remaining blocks. */
+ flcount = 0;
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp);
+ for_each_xfs_repair_extent_safe(rae, n, &ra.freesp_list) {
+ agbno = XFS_FSB_TO_AGBNO(mp, rae->fsbno);
+
+ trace_xfs_repair_agfl_insert(mp, sc->sa.agno, agbno, rae->len);
+
+ for (bno = 0; bno < rae->len; bno++) {
+ if (flcount >= xfs_agfl_size(mp) - 1)
+ break;
+ agfl_bno[flcount + 1] = cpu_to_be32(agbno + bno);
+ flcount++;
+ }
+ rae->fsbno += bno;
+ rae->len -= bno;
+ if (rae->len)
+ break;
+ list_del(&rae->list);
+ kmem_free(rae);
+ }
+
+ /* Update the AGF counters. */
+ agf = XFS_BUF_TO_AGF(agf_bp);
+ old_flcount = be32_to_cpu(agf->agf_flcount);
+ agf->agf_flfirst = cpu_to_be32(1);
+ agf->agf_flcount = cpu_to_be32(flcount);
+ agf->agf_fllast = cpu_to_be32(flcount);
+
+ /* Trigger reinitialization of the in-core data. */
+ if (flcount != old_flcount) {
+ pag = xfs_perag_get(mp, sc->sa.agno);
+ if (pag->pagf_init)
+ pag->pagf_flcount = flcount;
+ xfs_perag_put(pag);
+ sc->reset_counters = true;
+ }
+
+ /* Write AGF and AGFL to disk. */
+ xfs_alloc_log_agf(sc->tp, agf_bp,
+ XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
+ xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
+ xfs_trans_log_buf(sc->tp, agfl_bp, 0, mp->m_sb.sb_sectsize - 1);
+
+ /* Dump any AGFL overflow. */
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ return xfs_repair_reap_btree_extents(sc, &ra.freesp_list, &oinfo,
+ XFS_AG_RESV_AGFL);
+err:
+ xfs_repair_cancel_btree_extents(sc, &ra.agmeta_list);
+ xfs_repair_cancel_btree_extents(sc, &ra.freesp_list);
+ if (cur)
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR);
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 0090087ded45..8c19136ccdbe 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -104,6 +104,8 @@ int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
int xfs_repair_probe(struct xfs_scrub_context *sc);
int xfs_repair_superblock(struct xfs_scrub_context *sc);
+int xfs_repair_agf(struct xfs_scrub_context *sc);
+int xfs_repair_agfl(struct xfs_scrub_context *sc);
#else
@@ -133,6 +135,8 @@ static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
#define xfs_repair_probe xfs_repair_notsupported
#define xfs_repair_superblock xfs_repair_notsupported
+#define xfs_repair_agf xfs_repair_notsupported
+#define xfs_repair_agfl xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 1520703c9504..f129693777c3 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -224,13 +224,13 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_agf,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_agf,
},
[XFS_SCRUB_TYPE_AGFL]= { /* agfl */
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_agfl,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_agfl,
},
[XFS_SCRUB_TYPE_AGI] = { /* agi */
.type = ST_PERAG,
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 10/22] xfs: repair the AGI
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (8 preceding siblings ...)
2018-05-15 22:34 ` [PATCH 09/22] xfs: repair the AGF and AGFL Darrick J. Wong
@ 2018-05-15 22:34 ` Darrick J. Wong
2018-05-15 22:34 ` [PATCH 11/22] xfs: repair free space btrees Darrick J. Wong
` (12 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:34 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Rebuild the AGI header items with some help from the rmapbt.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/agheader_repair.c | 111 ++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 2 +
fs/xfs/scrub/scrub.c | 2 -
3 files changed, 114 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index b09bb1ee9613..fe21e6a87a37 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -527,3 +527,114 @@ xfs_repair_agfl(
XFS_BTREE_NOERROR);
return error;
}
+
+/* AGI */
+
+int
+xfs_repair_agi(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_repair_find_ag_btree fab[] = {
+ {
+ .rmap_owner = XFS_RMAP_OWN_INOBT,
+ .buf_ops = &xfs_inobt_buf_ops,
+ .magic = XFS_IBT_CRC_MAGIC,
+ },
+ {
+ .rmap_owner = XFS_RMAP_OWN_INOBT,
+ .buf_ops = &xfs_inobt_buf_ops,
+ .magic = XFS_FIBT_CRC_MAGIC,
+ },
+ {
+ .buf_ops = NULL
+ },
+ };
+ struct xfs_agi old_agi;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agi_bp;
+ struct xfs_buf *agf_bp;
+ struct xfs_agi *agi;
+ struct xfs_btree_cur *cur;
+ struct xfs_perag *pag;
+ xfs_agino_t old_count;
+ xfs_agino_t old_freecount;
+ xfs_agino_t count;
+ xfs_agino_t freecount;
+ int bucket;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL);
+ if (error)
+ return error;
+ agi_bp->b_ops = &xfs_agi_buf_ops;
+
+ error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp);
+ if (error)
+ return error;
+ if (!agf_bp)
+ return -ENOMEM;
+
+ /* Find the btree roots. */
+ error = xfs_repair_find_ag_btree_roots(sc, agf_bp, fab, NULL);
+ if (error)
+ return error;
+ if (fab[0].root == NULLAGBLOCK || fab[0].height > XFS_BTREE_MAXLEVELS)
+ return -EFSCORRUPTED;
+ if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ (fab[1].root == NULLAGBLOCK || fab[1].height > XFS_BTREE_MAXLEVELS))
+ return -EFSCORRUPTED;
+
+ /* Start rewriting the header. */
+ agi = XFS_BUF_TO_AGI(agi_bp);
+ old_agi = *agi;
+ old_count = be32_to_cpu(old_agi.agi_count);
+ old_freecount = be32_to_cpu(old_agi.agi_freecount);
+ memset(agi, 0, mp->m_sb.sb_sectsize);
+ agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
+ agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
+ agi->agi_seqno = cpu_to_be32(sc->sa.agno);
+ agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno));
+ agi->agi_newino = cpu_to_be32(NULLAGINO);
+ agi->agi_dirino = cpu_to_be32(NULLAGINO);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+ agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+ agi->agi_root = cpu_to_be32(fab[0].root);
+ agi->agi_level = cpu_to_be32(fab[0].height);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ agi->agi_free_root = cpu_to_be32(fab[1].root);
+ agi->agi_free_level = cpu_to_be32(fab[1].height);
+ }
+
+ /* Update the AGI counters. */
+ cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno,
+ XFS_BTNUM_INO);
+ error = xfs_ialloc_count_inodes(cur, &count, &freecount);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ if (error)
+ goto err;
+ agi->agi_count = cpu_to_be32(count);
+ agi->agi_freecount = cpu_to_be32(freecount);
+ if (old_count != count || old_freecount != freecount) {
+ pag = xfs_perag_get(mp, sc->sa.agno);
+ pag->pagi_init = 0;
+ xfs_perag_put(pag);
+ sc->reset_counters = true;
+ }
+
+ /* Write this to disk. */
+ xfs_trans_buf_set_type(sc->tp, agi_bp, XFS_BLFT_AGI_BUF);
+ xfs_trans_log_buf(sc->tp, agi_bp, 0, mp->m_sb.sb_sectsize - 1);
+ return error;
+
+err:
+ *agi = old_agi;
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 8c19136ccdbe..9a22428f970c 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -106,6 +106,7 @@ int xfs_repair_probe(struct xfs_scrub_context *sc);
int xfs_repair_superblock(struct xfs_scrub_context *sc);
int xfs_repair_agf(struct xfs_scrub_context *sc);
int xfs_repair_agfl(struct xfs_scrub_context *sc);
+int xfs_repair_agi(struct xfs_scrub_context *sc);
#else
@@ -137,6 +138,7 @@ static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
#define xfs_repair_superblock xfs_repair_notsupported
#define xfs_repair_agf xfs_repair_notsupported
#define xfs_repair_agfl xfs_repair_notsupported
+#define xfs_repair_agi xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index f129693777c3..af551a2339e4 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -236,7 +236,7 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_agi,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_agi,
},
[XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */
.type = ST_PERAG,
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 11/22] xfs: repair free space btrees
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (9 preceding siblings ...)
2018-05-15 22:34 ` [PATCH 10/22] xfs: repair the AGI Darrick J. Wong
@ 2018-05-15 22:34 ` Darrick J. Wong
2018-05-15 22:34 ` [PATCH 12/22] xfs: repair inode btrees Darrick J. Wong
` (11 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:34 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Rebuild the free space btrees from the gaps in the rmap btree.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/Makefile | 1
fs/xfs/scrub/alloc.c | 1
fs/xfs/scrub/alloc_repair.c | 438 +++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/common.c | 8 +
fs/xfs/scrub/repair.h | 2
fs/xfs/scrub/scrub.c | 4
6 files changed, 450 insertions(+), 4 deletions(-)
create mode 100644 fs/xfs/scrub/alloc_repair.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 29fe115f29d5..abe035ad0aa4 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -175,6 +175,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
+ alloc_repair.o \
repair.o \
)
endif
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 941a0a55224e..fe7e8bdf4a52 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -29,7 +29,6 @@
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
-#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_alloc.h"
#include "scrub/xfs_scrub.h"
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
new file mode 100644
index 000000000000..32a0ba615184
--- /dev/null
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_refcount.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/* Free space btree repair. */
+
+struct xfs_repair_alloc_extent {
+	struct list_head		list;	/* link in xfs_repair_alloc.extlist */
+	xfs_agblock_t			bno;	/* first AG block of the free extent */
+	xfs_extlen_t			len;	/* length of the free extent, in blocks */
+};
+
+struct xfs_repair_alloc {
+	struct list_head		extlist;	/* free extents found so far */
+	struct xfs_repair_extent_list	btlist;	  /* OWN_AG blocks */
+	struct xfs_repair_extent_list	nobtlist; /* rmapbt/agfl blocks */
+	struct xfs_scrub_context	*sc;		/* scrub context of this repair */
+	xfs_agblock_t			next_bno;	/* first AG block past all rmaps seen */
+	uint64_t			nr_records;	/* number of entries on extlist */
+};
+
+/*
+ * Record extents that aren't in use from gaps in the rmap records.
+ *
+ * Callback passed to xfs_rmap_query_all(); @priv is the struct
+ * xfs_repair_alloc that accumulates the results.  For each rmap record
+ * this does three things:
+ *
+ *  - OWN_AG extents are added to ra->btlist;
+ *  - the btree block buffers on the cursor's path are added to
+ *    ra->nobtlist, walking up the levels for as long as bc_ptrs[] is 1;
+ *  - if the record starts beyond ra->next_bno, the gap between the two is
+ *    queued on ra->extlist as a free extent and ra->nr_records is bumped.
+ *
+ * ra->next_bno only ever moves forward.  Returns 0 on success, -ENOMEM if
+ * a free extent record cannot be allocated, or the error returned by
+ * xfs_repair_collect_btree_extent().
+ */
+STATIC int
+xfs_repair_alloc_extent_fn(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*rec,
+	void				*priv)
+{
+	struct xfs_repair_alloc		*ra = priv;
+	struct xfs_repair_alloc_extent	*rae;
+	struct xfs_buf			*bp;
+	xfs_fsblock_t			fsb;
+	int				i;
+	int				error;
+
+	/* Record all the OWN_AG blocks... */
+	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
+		fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+				rec->rm_startblock);
+		error = xfs_repair_collect_btree_extent(ra->sc,
+				&ra->btlist, fsb, rec->rm_blockcount);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * ...and all the rmapbt blocks...
+	 *
+	 * A level is only visited while the cursor sits on the first entry
+	 * of its block (bc_ptrs[i] == 1), presumably so that each btree
+	 * block is collected once rather than once per record.  Levels
+	 * without a buffer are skipped.
+	 */
+	for (i = 0; i < cur->bc_nlevels && cur->bc_ptrs[i] == 1; i++) {
+		xfs_btree_get_block(cur, i, &bp);
+		if (!bp)
+			continue;
+		fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
+		error = xfs_repair_collect_btree_extent(ra->sc,
+				&ra->nobtlist, fsb, 1);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * ...and all the free space.  Anything between the end of the
+	 * mappings seen so far and the start of this one is unmapped.
+	 */
+	if (rec->rm_startblock > ra->next_bno) {
+		trace_xfs_repair_alloc_extent_fn(cur->bc_mp,
+				cur->bc_private.a.agno,
+				ra->next_bno, rec->rm_startblock - ra->next_bno,
+				XFS_RMAP_OWN_NULL, 0, 0);
+
+		rae = kmem_alloc(sizeof(struct xfs_repair_alloc_extent),
+				KM_MAYFAIL);
+		if (!rae)
+			return -ENOMEM;
+		INIT_LIST_HEAD(&rae->list);
+		rae->bno = ra->next_bno;
+		rae->len = rec->rm_startblock - ra->next_bno;
+		list_add_tail(&rae->list, &ra->extlist);
+		ra->nr_records++;
+	}
+	ra->next_bno = max_t(xfs_agblock_t, ra->next_bno,
+			rec->rm_startblock + rec->rm_blockcount);
+	return 0;
+}
+
+/*
+ * Scan the free extent list and return the entry with the greatest length,
+ * or NULL if the list is empty.  On a tie the entry nearest the head of
+ * the list wins.
+ */
+static struct xfs_repair_alloc_extent *
+xfs_repair_allocbt_get_longest(
+	struct xfs_repair_alloc		*ra)
+{
+	struct xfs_repair_alloc_extent	*best = NULL;
+	struct xfs_repair_alloc_extent	*pos;
+
+	list_for_each_entry(pos, &ra->extlist, list) {
+		/* Only a strictly longer extent displaces the current pick. */
+		if (best && pos->len <= best->len)
+			continue;
+		best = pos;
+	}
+	return best;
+}
+
+/*
+ * AGFL walk callback: add the single AGFL block @bno to the list of blocks
+ * that must not be released (ra->nobtlist).  @priv is the struct
+ * xfs_repair_alloc.  Returns whatever xfs_repair_collect_btree_extent()
+ * returns.
+ */
+static int
+xfs_repair_collect_agfl_block(
+	struct xfs_mount		*mp,
+	xfs_agblock_t			bno,
+	void				*priv)
+{
+	struct xfs_repair_alloc		*ra = priv;
+
+	return xfs_repair_collect_btree_extent(ra->sc, &ra->nobtlist,
+			XFS_AGB_TO_FSB(mp, ra->sc->sa.agno, bno), 1);
+}
+
+/*
+ * list_sort() comparator ordering free extents by ascending start block.
+ * Returns -1, 0 or 1; @priv is unused.
+ */
+static int
+xfs_repair_allocbt_extent_cmp(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_repair_alloc_extent	*left;
+	struct xfs_repair_alloc_extent	*right;
+
+	left = container_of(a, struct xfs_repair_alloc_extent, list);
+	right = container_of(b, struct xfs_repair_alloc_extent, list);
+
+	if (left->bno < right->bno)
+		return -1;
+	return left->bno > right->bno;
+}
+
+/*
+ * Put an extent onto the free list.
+ *
+ * Frees @len blocks at @fsbno through xfs_free_extent(), rolls the repair
+ * transaction, and then subtracts @len from the free block counter with
+ * xfs_mod_fdblocks().  NOTE(review): the subtraction presumably cancels
+ * the increase made by freeing space that was already counted as free --
+ * confirm.  Each step runs only if the previous one succeeded; the first
+ * error is returned.
+ */
+STATIC int
+xfs_repair_allocbt_free_extent(
+	struct xfs_scrub_context	*sc,
+	xfs_fsblock_t			fsbno,
+	xfs_extlen_t			len,
+	struct xfs_owner_info		*oinfo)
+{
+	int				error;
+
+	error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0);
+	if (!error)
+		error = xfs_repair_roll_ag_trans(sc);
+	if (!error)
+		error = xfs_mod_fdblocks(sc->mp, -(int64_t)len, false);
+	return error;
+}
+
+/*
+ * Allocate a block from the (cached) longest extent in the AG.
+ *
+ * @longest caches the extent being carved up between calls.  A cached
+ * extent that has shrunk to zero length is unlinked and freed, and a new
+ * longest extent is looked up.  The first block of the extent is handed
+ * out and the extent is trimmed by one block from the front.  Returns the
+ * block as a filesystem block number, or NULLFSBLOCK if no extents remain.
+ */
+STATIC xfs_fsblock_t
+xfs_repair_allocbt_alloc_from_longest(
+	struct xfs_repair_alloc		*ra,
+	struct xfs_repair_alloc_extent	**longest)
+{
+	struct xfs_repair_alloc_extent	*ext = *longest;
+	xfs_agblock_t			agbno;
+
+	/* Discard a cached extent that has been fully consumed. */
+	if (ext && ext->len == 0) {
+		list_del(&ext->list);
+		kmem_free(ext);
+		ext = NULL;
+	}
+
+	/* Nothing cached; pick the longest extent still on the list. */
+	if (!ext)
+		ext = xfs_repair_allocbt_get_longest(ra);
+	*longest = ext;
+	if (!ext)
+		return NULLFSBLOCK;
+
+	/* Hand out the first block and shrink the extent from the front. */
+	agbno = ext->bno++;
+	ext->len--;
+	return XFS_AGB_TO_FSB(ra->sc->mp, ra->sc->sa.agno, agbno);
+}
+
+/*
+ * Repair the freespace btrees for some AG.
+ *
+ * The rmapbt is walked to find the gaps between reverse mappings (the
+ * free space), the OWN_AG blocks, and the rmapbt blocks; the AGFL is
+ * walked to find its blocks.  Two blocks are then carved out of the
+ * longest free extent to become the new bnobt and cntbt roots, the AGF and
+ * perag counters are reset, every collected free extent is freed into the
+ * new trees, rmap records are added for the two roots, and finally the
+ * OWN_AG blocks that belong to neither the rmapbt nor the AGFL are reaped.
+ *
+ * Returns 0 on success or a negative errno: -EOPNOTSUPP if the filesystem
+ * has no rmapbt, -EDEADLOCK if the AG has busy extents, -ENOSPC if there
+ * is not enough free space, -ENOMEM if a record cannot be allocated, or
+ * the error returned by one of the helpers.
+ */
+int
+xfs_repair_allocbt(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_repair_alloc		ra;
+	struct xfs_owner_info		oinfo;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_btree_cur		*cur = NULL;
+	struct xfs_repair_alloc_extent	*longest = NULL;
+	struct xfs_repair_alloc_extent	*rae;
+	struct xfs_repair_alloc_extent	*n;
+	struct xfs_perag		*pag;
+	struct xfs_agf			*agf;
+	struct xfs_buf			*bp;
+	xfs_fsblock_t			bnofsb;
+	xfs_fsblock_t			cntfsb;
+	xfs_extlen_t			oldf;
+	xfs_extlen_t			nr_blocks;
+	xfs_agblock_t			agend;
+	int				error;
+
+	/* We require the rmapbt to rebuild anything. */
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	/*
+	 * Make sure the busy extent list is clear because we can't put
+	 * extents on there twice.
+	 */
+	pag = xfs_perag_get(sc->mp, sc->sa.agno);
+	spin_lock(&pag->pagb_lock);
+	if (pag->pagb_tree.rb_node) {
+		spin_unlock(&pag->pagb_lock);
+		xfs_perag_put(pag);
+		return -EDEADLOCK;
+	}
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
+
+	/*
+	 * Collect all reverse mappings for free extents, and the rmapbt
+	 * blocks.  We can discover the rmapbt blocks completely from a
+	 * query_all handler because there are always rmapbt entries.
+	 * (One cannot use query_all to visit all of a btree's blocks
+	 * unless that btree is guaranteed to have at least one entry.)
+	 */
+	INIT_LIST_HEAD(&ra.extlist);
+	xfs_repair_init_extent_list(&ra.btlist);
+	xfs_repair_init_extent_list(&ra.nobtlist);
+	ra.next_bno = 0;
+	ra.nr_records = 0;
+	ra.sc = sc;
+
+	cur = xfs_rmapbt_init_cursor(mp, sc->tp, sc->sa.agf_bp, sc->sa.agno);
+	error = xfs_rmap_query_all(cur, xfs_repair_alloc_extent_fn, &ra);
+	if (error)
+		goto out;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	cur = NULL;
+
+	/* Insert a record for space between the last rmap and EOAG. */
+	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+	agend = be32_to_cpu(agf->agf_length);
+	if (ra.next_bno < agend) {
+		rae = kmem_alloc(sizeof(struct xfs_repair_alloc_extent),
+				KM_MAYFAIL);
+		if (!rae) {
+			error = -ENOMEM;
+			goto out;
+		}
+		INIT_LIST_HEAD(&rae->list);
+		rae->bno = ra.next_bno;
+		rae->len = agend - ra.next_bno;
+		list_add_tail(&rae->list, &ra.extlist);
+		ra.nr_records++;
+	}
+
+	/* Collect all the AGFL blocks. */
+	error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
+			sc->sa.agfl_bp, xfs_repair_collect_agfl_block, &ra);
+	if (error)
+		goto out;
+
+	/* Do we actually have enough space to do this? */
+	pag = xfs_perag_get(mp, sc->sa.agno);
+	nr_blocks = 2 * xfs_allocbt_calc_size(mp, ra.nr_records);
+	if (!xfs_repair_ag_has_space(pag, nr_blocks, XFS_AG_RESV_NONE)) {
+		xfs_perag_put(pag);
+		error = -ENOSPC;
+		goto out;
+	}
+	xfs_perag_put(pag);
+
+	/*
+	 * Invalidate all the bnobt/cntbt blocks in btlist.  The rmapbt and
+	 * AGFL blocks are subtracted first, leaving only the OWN_AG blocks
+	 * that belonged to the old free space btrees.
+	 */
+	error = xfs_repair_subtract_extents(sc, &ra.btlist, &ra.nobtlist);
+	if (error)
+		goto out;
+	xfs_repair_cancel_btree_extents(sc, &ra.nobtlist);
+	error = xfs_repair_invalidate_blocks(sc, &ra.btlist);
+	if (error)
+		goto out;
+
+	/* Allocate new bnobt root. */
+	bnofsb = xfs_repair_allocbt_alloc_from_longest(&ra, &longest);
+	if (bnofsb == NULLFSBLOCK) {
+		error = -ENOSPC;
+		goto out;
+	}
+
+	/* Allocate new cntbt root. */
+	cntfsb = xfs_repair_allocbt_alloc_from_longest(&ra, &longest);
+	if (cntfsb == NULLFSBLOCK) {
+		error = -ENOSPC;
+		goto out;
+	}
+
+	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+	/* Initialize new bnobt root. */
+	error = xfs_repair_init_btblock(sc, bnofsb, &bp, XFS_BTNUM_BNO,
+			&xfs_allocbt_buf_ops);
+	if (error)
+		goto out;
+	agf->agf_roots[XFS_BTNUM_BNOi] =
+			cpu_to_be32(XFS_FSB_TO_AGBNO(mp, bnofsb));
+	agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
+
+	/* Initialize new cntbt root. */
+	error = xfs_repair_init_btblock(sc, cntfsb, &bp, XFS_BTNUM_CNT,
+			&xfs_allocbt_buf_ops);
+	if (error)
+		goto out;
+	agf->agf_roots[XFS_BTNUM_CNTi] =
+			cpu_to_be32(XFS_FSB_TO_AGBNO(mp, cntfsb));
+	agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+
+	/*
+	 * Since we're abandoning the old bnobt/cntbt, we have to
+	 * decrease fdblocks by the # of blocks in those trees.
+	 * btreeblks counts the non-root blocks of the free space
+	 * and rmap btrees.  Do this before resetting the AGF counters.
+	 */
+	pag = xfs_perag_get(mp, sc->sa.agno);
+	oldf = pag->pagf_btreeblks + 2;
+	oldf -= (be32_to_cpu(agf->agf_rmap_blocks) - 1);
+	error = xfs_mod_fdblocks(mp, -(int64_t)oldf, false);
+	if (error) {
+		xfs_perag_put(pag);
+		goto out;
+	}
+
+	/* Reset the perag info. */
+	pag->pagf_btreeblks = be32_to_cpu(agf->agf_rmap_blocks) - 1;
+	pag->pagf_freeblks = 0;
+	pag->pagf_longest = 0;
+	pag->pagf_levels[XFS_BTNUM_BNOi] =
+			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+	pag->pagf_levels[XFS_BTNUM_CNTi] =
+			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+
+	/* Now reset the AGF counters. */
+	agf->agf_btreeblks = cpu_to_be32(pag->pagf_btreeblks);
+	agf->agf_freeblks = cpu_to_be32(pag->pagf_freeblks);
+	agf->agf_longest = cpu_to_be32(pag->pagf_longest);
+	xfs_perag_put(pag);
+	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp,
+			XFS_AGF_ROOTS | XFS_AGF_LEVELS | XFS_AGF_BTREEBLKS |
+			XFS_AGF_LONGEST | XFS_AGF_FREEBLKS);
+	error = xfs_repair_roll_ag_trans(sc);
+	if (error)
+		goto out;
+
+	/*
+	 * Insert the longest free extent first in case it's necessary to
+	 * refresh the AGFL with multiple blocks.  Carving the two roots out
+	 * of it may have consumed the extent entirely; a zero-length extent
+	 * must not be freed, so in that case it is only dropped from the
+	 * list.  Either way the cached extent leaves the list here so that
+	 * the loop below never sees it again.
+	 */
+	xfs_rmap_skip_owner_update(&oinfo);
+	if (longest) {
+		if (longest->len > 0) {
+			error = xfs_repair_allocbt_free_extent(sc,
+					XFS_AGB_TO_FSB(sc->mp, sc->sa.agno,
+						longest->bno),
+					longest->len, &oinfo);
+			if (error)
+				goto out;
+		}
+		list_del(&longest->list);
+		kmem_free(longest);
+	}
+
+	/* Insert records into the new btrees. */
+	list_sort(NULL, &ra.extlist, xfs_repair_allocbt_extent_cmp);
+	list_for_each_entry_safe(rae, n, &ra.extlist, list) {
+		error = xfs_repair_allocbt_free_extent(sc,
+				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
+				rae->len, &oinfo);
+		if (error)
+			goto out;
+		list_del(&rae->list);
+		kmem_free(rae);
+	}
+
+	/* Add rmap records for the btree roots */
+	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno,
+			XFS_FSB_TO_AGBNO(mp, bnofsb), 1, &oinfo);
+	if (error)
+		goto out;
+	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno,
+			XFS_FSB_TO_AGBNO(mp, cntfsb), 1, &oinfo);
+	if (error)
+		goto out;
+
+	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
+	return xfs_repair_reap_btree_extents(sc, &ra.btlist, &oinfo,
+			XFS_AG_RESV_NONE);
+out:
+	/* Error exit: release the cursor and everything still queued. */
+	xfs_repair_cancel_btree_extents(sc, &ra.btlist);
+	xfs_repair_cancel_btree_extents(sc, &ra.nobtlist);
+	if (cur)
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	list_for_each_entry_safe(rae, n, &ra.extlist, list) {
+		list_del(&rae->list);
+		kmem_free(rae);
+	}
+	return error;
+}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index f1826b4b7572..ea5c03f30dc6 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -620,8 +620,14 @@ xfs_scrub_setup_ag_btree(
* expensive operation should be performed infrequently and only
* as a last resort. Any caller that sets force_log should
* document why they need to do so.
+ *
+ * Force everything in memory out to disk if we're repairing.
+ * This ensures we won't get tripped up by btree blocks sitting
+ * in memory waiting to have LSNs stamped in. The AGF/AGI repair
+ * routines use any available rmap data to try to find a btree
+ * root that also passes the read verifiers.
*/
- if (force_log) {
+ if (force_log || (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) {
error = xfs_scrub_checkpoint_log(mp);
if (error)
return error;
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 9a22428f970c..1d41d215c030 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -107,6 +107,7 @@ int xfs_repair_superblock(struct xfs_scrub_context *sc);
int xfs_repair_agf(struct xfs_scrub_context *sc);
int xfs_repair_agfl(struct xfs_scrub_context *sc);
int xfs_repair_agi(struct xfs_scrub_context *sc);
+int xfs_repair_allocbt(struct xfs_scrub_context *sc);
#else
@@ -139,6 +140,7 @@ static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
#define xfs_repair_agf xfs_repair_notsupported
#define xfs_repair_agfl xfs_repair_notsupported
#define xfs_repair_agi xfs_repair_notsupported
+#define xfs_repair_allocbt xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index af551a2339e4..1b59a1598d6b 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -242,13 +242,13 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_allocbt,
.scrub = xfs_scrub_bnobt,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_allocbt,
},
[XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_allocbt,
.scrub = xfs_scrub_cntbt,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_allocbt,
},
[XFS_SCRUB_TYPE_INOBT] = { /* inobt */
.type = ST_PERAG,
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 12/22] xfs: repair inode btrees
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (10 preceding siblings ...)
2018-05-15 22:34 ` [PATCH 11/22] xfs: repair free space btrees Darrick J. Wong
@ 2018-05-15 22:34 ` Darrick J. Wong
2018-05-15 22:35 ` [PATCH 13/22] xfs: repair the rmapbt Darrick J. Wong
` (10 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:34 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Use the rmapbt to find inode chunks, query the chunks to compute
hole and free masks, and with that information rebuild the inobt
and finobt.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/Makefile | 1
fs/xfs/scrub/ialloc_repair.c | 468 ++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 2
fs/xfs/scrub/scrub.c | 4
4 files changed, 473 insertions(+), 2 deletions(-)
create mode 100644 fs/xfs/scrub/ialloc_repair.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index abe035ad0aa4..7c442f83b179 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -176,6 +176,7 @@ ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
alloc_repair.o \
+ ialloc_repair.o \
repair.o \
)
endif
diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c
new file mode 100644
index 000000000000..7b66181d953c
--- /dev/null
+++ b/fs/xfs/scrub/ialloc_repair.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/* Inode btree repair. */
+
+struct xfs_repair_ialloc_extent {
+	struct list_head		list;	/* link in xfs_repair_ialloc.extlist */
+	xfs_inofree_t			freemask;	/* bit set = inode not in use */
+	xfs_agino_t			startino;	/* first AG inode number of the record */
+	unsigned int			count;		/* inodes in the clusters found */
+	unsigned int			usedcount;	/* inodes found to be in use */
+	uint16_t			holemask;	/* bit set = no cluster found there */
+};
+
+struct xfs_repair_ialloc {
+	struct list_head		extlist;	/* inode chunk records found */
+	struct xfs_repair_extent_list	btlist;		/* OWN_INOBT (old btree) blocks */
+	struct xfs_scrub_context	*sc;		/* scrub context of this repair */
+	uint64_t			nr_records;	/* number of entries on extlist */
+};
+
+/*
+ * Decide whether an inode is in use and report the answer through @inuse.
+ *
+ * The inode cache is asked first.  If it cannot answer (any error), the
+ * on-disk inode at index @bpino of cluster buffer @bp is examined instead:
+ * a bad magic number, or a v3+ inode whose recorded inode number is not
+ * @fsino, yields -EFSCORRUPTED; otherwise the inode counts as in use when
+ * its mode is nonzero.  Returns 0 whenever @inuse has been set.
+ */
+STATIC int
+xfs_repair_ialloc_check_free(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
+	xfs_ino_t		fsino,
+	xfs_agino_t		bpino,
+	bool			*inuse)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_dinode	*dip;
+
+	/* Will the in-core inode tell us if it's in use? */
+	if (xfs_icache_inode_is_allocated(mp, cur->bc_tp, fsino, inuse) == 0)
+		return 0;
+
+	/* Inode uncached or half assembled, read disk buffer */
+	dip = xfs_buf_offset(bp, bpino * mp->m_sb.sb_inodesize);
+	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
+	    (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino))
+		return -EFSCORRUPTED;
+
+	*inuse = (dip->di_mode != 0);
+	return 0;
+}
+
+/*
+ * Record extents that belong to inode btrees.
+ *
+ * Callback passed to xfs_rmap_query_all(); @priv is the struct
+ * xfs_repair_ialloc that accumulates the results.  OWN_INOBT extents are
+ * added to ri->btlist so the old btree blocks can be disposed of later.
+ * For OWN_INODES extents, every inode cluster in the extent is read, each
+ * inode in it is checked for being in use, and the result is folded into
+ * an inode chunk record on ri->extlist: the last record on the list is
+ * updated if the cluster falls inside it, otherwise a new record is
+ * appended.  All other rmap owners are ignored.
+ *
+ * Returns 0 on success, -EFSCORRUPTED if the extent is misaligned or an
+ * inode looks bad, -ENOMEM if a record cannot be allocated, or the error
+ * from the termination check, buffer read or extent collection.
+ */
+STATIC int
+xfs_repair_ialloc_extent_fn(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*rec,
+	void				*priv)
+{
+	struct xfs_imap			imap;
+	struct xfs_repair_ialloc	*ri = priv;
+	struct xfs_repair_ialloc_extent	*rie;
+	struct xfs_dinode		*dip;
+	struct xfs_buf			*bp;
+	struct xfs_mount		*mp = cur->bc_mp;
+	xfs_ino_t			fsino;
+	xfs_inofree_t			usedmask;
+	xfs_fsblock_t			fsbno;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	xfs_agino_t			cdist;
+	xfs_agino_t			startino;
+	xfs_agino_t			clusterino;
+	xfs_agino_t			nr_inodes;
+	xfs_agino_t			inoalign;
+	xfs_agino_t			agino;
+	xfs_agino_t			rmino;
+	uint16_t			fillmask;
+	bool				inuse;
+	int				blks_per_cluster;
+	int				usedcount;
+	int				error = 0;
+
+	if (xfs_scrub_should_terminate(ri->sc, &error))
+		return error;
+
+	/* Fragment of the old btrees; dispose of them later. */
+	if (rec->rm_owner == XFS_RMAP_OWN_INOBT) {
+		fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+				rec->rm_startblock);
+		return xfs_repair_collect_btree_extent(ri->sc, &ri->btlist,
+				fsbno, rec->rm_blockcount);
+	}
+
+	/* Skip every extent that is not an inode chunk (OWN_INODES). */
+	if (rec->rm_owner != XFS_RMAP_OWN_INODES)
+		return 0;
+
+	agno = cur->bc_private.a.agno;
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+	nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0);
+
+	if (rec->rm_startblock % blks_per_cluster != 0)
+		return -EFSCORRUPTED;
+
+	trace_xfs_repair_ialloc_extent_fn(mp, cur->bc_private.a.agno,
+			rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
+			rec->rm_offset, rec->rm_flags);
+
+	/*
+	 * Determine the inode block alignment, and where the block
+	 * ought to start if it's aligned properly.  On a sparse inode
+	 * system the rmap doesn't have to start on an alignment boundary,
+	 * but the record does.  On pre-sparse filesystems, we /must/
+	 * start both rmap and inobt on an alignment boundary.
+	 */
+	inoalign = xfs_ialloc_cluster_alignment(mp);
+	agbno = rec->rm_startblock;
+	agino = XFS_OFFBNO_TO_AGINO(mp, agbno, 0);
+	rmino = XFS_OFFBNO_TO_AGINO(mp, rounddown(agbno, inoalign), 0);
+	if (!xfs_sb_version_hassparseinodes(&mp->m_sb) && agino != rmino)
+		return -EFSCORRUPTED;
+
+	/*
+	 * For each cluster in this blob of inodes, we must calculate the
+	 * properly aligned startino of that cluster, then iterate each
+	 * cluster to fill in used and filled masks appropriately.  We
+	 * then use the (startino, used, filled) information to construct
+	 * the appropriate inode records.
+	 */
+	for (agbno = rec->rm_startblock;
+	     agbno < rec->rm_startblock + rec->rm_blockcount;
+	     agbno += blks_per_cluster) {
+		/* The per-AG inum of this inode cluster. */
+		agino = XFS_OFFBNO_TO_AGINO(mp, agbno, 0);
+
+		/*
+		 * The per-AG inum of the inobt record; cdist is how far
+		 * into that record this cluster starts, in inodes.
+		 */
+		startino = rmino +
+				rounddown(agino - rmino, XFS_INODES_PER_CHUNK);
+		cdist = agino - startino;
+
+		/* Every inode in this holemask slot is filled. */
+		fillmask = xfs_inobt_maskn(
+				cdist / XFS_INODES_PER_HOLEMASK_BIT,
+				nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);
+
+		/* Grab the inode cluster buffer. */
+		imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+		imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+		imap.im_boffset = 0;
+
+		error = xfs_imap_to_bp(mp, cur->bc_tp, &imap,
+				&dip, &bp, 0, XFS_IGET_UNTRUSTED);
+		if (error)
+			return error;
+
+		usedmask = 0;
+		usedcount = 0;
+		/* Which inodes within this cluster are free? */
+		for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
+			fsino = XFS_AGINO_TO_INO(mp, cur->bc_private.a.agno,
+					agino + clusterino);
+			error = xfs_repair_ialloc_check_free(cur, bp, fsino,
+					clusterino, &inuse);
+			if (error) {
+				xfs_trans_brelse(cur->bc_tp, bp);
+				return error;
+			}
+			if (inuse) {
+				usedcount++;
+				usedmask |= XFS_INOBT_MASK(cdist + clusterino);
+			}
+		}
+		xfs_trans_brelse(cur->bc_tp, bp);
+
+		/*
+		 * If the last item in the list is our chunk record,
+		 * update that: clear the free bits of the used inodes and
+		 * the hole bits of the slots this cluster fills.
+		 */
+		if (!list_empty(&ri->extlist)) {
+			rie = list_last_entry(&ri->extlist,
+					struct xfs_repair_ialloc_extent, list);
+			if (rie->startino + XFS_INODES_PER_CHUNK > startino) {
+				rie->freemask &= ~usedmask;
+				rie->holemask &= ~fillmask;
+				rie->count += nr_inodes;
+				rie->usedcount += usedcount;
+				continue;
+			}
+		}
+
+		/* New inode chunk; add to the list. */
+		rie = kmem_alloc(sizeof(struct xfs_repair_ialloc_extent),
+				KM_MAYFAIL);
+		if (!rie)
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(&rie->list);
+		rie->startino = startino;
+		rie->freemask = XFS_INOBT_ALL_FREE & ~usedmask;
+		rie->holemask = XFS_INOBT_ALL_FREE & ~fillmask;
+		rie->count = nr_inodes;
+		rie->usedcount = usedcount;
+		list_add_tail(&rie->list, &ri->extlist);
+		ri->nr_records++;
+	}
+
+	return 0;
+}
+
+/*
+ * list_sort() comparator ordering inode chunk records by ascending
+ * starting inode number.  Returns -1, 0 or 1; @priv is unused.
+ */
+static int
+xfs_repair_ialloc_extent_cmp(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_repair_ialloc_extent	*left;
+	struct xfs_repair_ialloc_extent	*right;
+
+	left = container_of(a, struct xfs_repair_ialloc_extent, list);
+	right = container_of(b, struct xfs_repair_ialloc_extent, list);
+
+	if (left->startino < right->startino)
+		return -1;
+	return left->startino > right->startino;
+}
+
+/*
+ * Insert the inode chunk record @rie into the btree that @cur points at.
+ *
+ * An exact lookup is done first and must not find anything; the insert
+ * must then report success.  Either expectation failing is treated as
+ * corruption by XFS_WANT_CORRUPTED_RETURN.  Returns 0 or a negative errno.
+ */
+static int
+xfs_repair_iallocbt_insert_btrec(
+	struct xfs_btree_cur		*cur,
+	struct xfs_repair_ialloc_extent	*rie)
+{
+	int				has_rec;
+	int				inserted;
+	int				error;
+
+	/* The record must not exist in this btree yet. */
+	error = xfs_inobt_lookup(cur, rie->startino, XFS_LOOKUP_EQ, &has_rec);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, has_rec == 0);
+
+	/* The free count is whatever is present but not in use. */
+	error = xfs_inobt_insert_rec(cur, rie->holemask, rie->count,
+			rie->count - rie->usedcount, rie->freemask, &inserted);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, inserted == 1);
+	return 0;
+}
+
+/*
+ * Insert an inode chunk record into both inode btrees.
+ *
+ * The record always goes into the inobt.  It also goes into the finobt
+ * when the filesystem has one and the chunk has at least one free inode.
+ * The repair transaction is rolled once the inserts are done.  Returns 0
+ * or a negative errno; on an insert failure the cursor is torn down with
+ * XFS_BTREE_ERROR.
+ */
+static int
+xfs_repair_iallocbt_insert_rec(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_ialloc_extent	*rie)
+{
+	struct xfs_btree_cur		*cur;
+	int				error;
+
+	trace_xfs_repair_ialloc_insert(sc->mp, sc->sa.agno, rie->startino,
+			rie->holemask, rie->count, rie->count - rie->usedcount,
+			rie->freemask);
+
+	/* Insert into the inobt. */
+	cur = xfs_inobt_init_cursor(sc->mp, sc->tp, sc->sa.agi_bp, sc->sa.agno,
+			XFS_BTNUM_INO);
+	error = xfs_repair_iallocbt_insert_btrec(cur, rie);
+	if (error)
+		goto out_cur;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+	/* No finobt, or no free inodes in this chunk: nothing more to add. */
+	if (!xfs_sb_version_hasfinobt(&sc->mp->m_sb) ||
+	    rie->count == rie->usedcount)
+		return xfs_repair_roll_ag_trans(sc);
+
+	/* Insert into the finobt. */
+	cur = xfs_inobt_init_cursor(sc->mp, sc->tp, sc->sa.agi_bp,
+			sc->sa.agno, XFS_BTNUM_FINO);
+	error = xfs_repair_iallocbt_insert_btrec(cur, rie);
+	if (error)
+		goto out_cur;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+	return xfs_repair_roll_ag_trans(sc);
+out_cur:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Repair both inode btrees.
+ *
+ * The rmapbt is walked to collect the old inode btree blocks (OWN_INOBT)
+ * and to build one inode chunk record per chunk of OWN_INODES space.  New
+ * single-block roots are then allocated for the inobt and, if the
+ * filesystem has one, the finobt; the AGI is pointed at them; the records
+ * are inserted in ascending inode order; the AGI inode counters are
+ * corrected if they disagree with what was found; and finally the old
+ * btree blocks are reaped.
+ *
+ * Returns 0 on success or a negative errno: -EOPNOTSUPP if the filesystem
+ * has no rmapbt, -ENOSPC if the AG lacks the space for the new trees, or
+ * the error returned by one of the helpers.
+ */
+int
+xfs_repair_iallocbt(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_repair_ialloc	ri;
+	struct xfs_owner_info		oinfo;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_buf			*bp;
+	struct xfs_repair_ialloc_extent	*rie;
+	struct xfs_repair_ialloc_extent	*n;
+	struct xfs_agi			*agi;
+	struct xfs_btree_cur		*cur = NULL;
+	struct xfs_perag		*pag;
+	xfs_fsblock_t			inofsb;
+	xfs_fsblock_t			finofsb;
+	xfs_extlen_t			nr_blocks;
+	unsigned int			count;
+	unsigned int			usedcount;
+	int				logflags;
+	int				error = 0;
+
+	/* We require the rmapbt to rebuild anything. */
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	/*
+	 * Collect all reverse mappings for inode blocks.  The owner info is
+	 * set to OWN_INOBT here and used for both the new root allocations
+	 * and the final reaping of the old blocks.
+	 */
+	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
+	INIT_LIST_HEAD(&ri.extlist);
+	xfs_repair_init_extent_list(&ri.btlist);
+	ri.nr_records = 0;
+	ri.sc = sc;
+
+	cur = xfs_rmapbt_init_cursor(mp, sc->tp, sc->sa.agf_bp, sc->sa.agno);
+	error = xfs_rmap_query_all(cur, xfs_repair_ialloc_extent_fn, &ri);
+	if (error)
+		goto out;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	cur = NULL;
+
+	/*
+	 * Do we actually have enough space to do this?  The estimate is
+	 * doubled when a finobt has to be rebuilt as well.
+	 */
+	pag = xfs_perag_get(mp, sc->sa.agno);
+	nr_blocks = xfs_iallocbt_calc_size(mp, ri.nr_records);
+	if (xfs_sb_version_hasfinobt(&mp->m_sb))
+		nr_blocks *= 2;
+	if (!xfs_repair_ag_has_space(pag, nr_blocks, XFS_AG_RESV_NONE)) {
+		xfs_perag_put(pag);
+		error = -ENOSPC;
+		goto out;
+	}
+	xfs_perag_put(pag);
+
+	/* Invalidate all the inobt/finobt blocks in btlist. */
+	error = xfs_repair_invalidate_blocks(sc, &ri.btlist);
+	if (error)
+		goto out;
+
+	agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+	/* Initialize new btree roots; each tree starts out one level deep. */
+	error = xfs_repair_alloc_ag_block(sc, &oinfo, &inofsb,
+			XFS_AG_RESV_NONE);
+	if (error)
+		goto out;
+	error = xfs_repair_init_btblock(sc, inofsb, &bp, XFS_BTNUM_INO,
+			&xfs_inobt_buf_ops);
+	if (error)
+		goto out;
+	agi->agi_root = cpu_to_be32(XFS_FSB_TO_AGBNO(mp, inofsb));
+	agi->agi_level = cpu_to_be32(1);
+	logflags = XFS_AGI_ROOT | XFS_AGI_LEVEL;
+
+	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+		error = xfs_repair_alloc_ag_block(sc, &oinfo, &finofsb,
+				mp->m_inotbt_nores ? XFS_AG_RESV_NONE :
+						     XFS_AG_RESV_METADATA);
+		if (error)
+			goto out;
+		error = xfs_repair_init_btblock(sc, finofsb, &bp,
+				XFS_BTNUM_FINO, &xfs_inobt_buf_ops);
+		if (error)
+			goto out;
+		agi->agi_free_root = cpu_to_be32(XFS_FSB_TO_AGBNO(mp, finofsb));
+		agi->agi_free_level = cpu_to_be32(1);
+		logflags |= XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
+	}
+
+	xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, logflags);
+	error = xfs_repair_roll_ag_trans(sc);
+	if (error)
+		goto out;
+
+	/*
+	 * Insert records into the new btrees, totalling the inode counts
+	 * as we go.  Each record is freed once it has been inserted.
+	 */
+	count = 0;
+	usedcount = 0;
+	list_sort(NULL, &ri.extlist, xfs_repair_ialloc_extent_cmp);
+	list_for_each_entry_safe(rie, n, &ri.extlist, list) {
+		count += rie->count;
+		usedcount += rie->usedcount;
+
+		error = xfs_repair_iallocbt_insert_rec(sc, rie);
+		if (error)
+			goto out;
+
+		list_del(&rie->list);
+		kmem_free(rie);
+	}
+
+	/*
+	 * Update the AGI counters, but only if they disagree with what we
+	 * found.  pagi_init is cleared and sc->reset_counters is set at the
+	 * same time.  NOTE(review): presumably so the cached per-AG and
+	 * global inode counts are recomputed afterwards -- confirm.
+	 */
+	agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+	if (be32_to_cpu(agi->agi_count) != count ||
+	    be32_to_cpu(agi->agi_freecount) != count - usedcount) {
+		pag = xfs_perag_get(mp, sc->sa.agno);
+		pag->pagi_init = 0;
+		xfs_perag_put(pag);
+
+		agi->agi_count = cpu_to_be32(count);
+		agi->agi_freecount = cpu_to_be32(count - usedcount);
+		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
+				XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+		sc->reset_counters = true;
+	}
+
+	/* Free the old inode btree blocks if they're not in use. */
+	return xfs_repair_reap_btree_extents(sc, &ri.btlist, &oinfo,
+			XFS_AG_RESV_NONE);
+out:
+	if (cur)
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_repair_cancel_btree_extents(sc, &ri.btlist);
+	list_for_each_entry_safe(rie, n, &ri.extlist, list) {
+		list_del(&rie->list);
+		kmem_free(rie);
+	}
+	return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 1d41d215c030..d1d737048535 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -108,6 +108,7 @@ int xfs_repair_agf(struct xfs_scrub_context *sc);
int xfs_repair_agfl(struct xfs_scrub_context *sc);
int xfs_repair_agi(struct xfs_scrub_context *sc);
int xfs_repair_allocbt(struct xfs_scrub_context *sc);
+int xfs_repair_iallocbt(struct xfs_scrub_context *sc);
#else
@@ -141,6 +142,7 @@ static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
#define xfs_repair_agfl xfs_repair_notsupported
#define xfs_repair_agi xfs_repair_notsupported
#define xfs_repair_allocbt xfs_repair_notsupported
+#define xfs_repair_iallocbt xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 1b59a1598d6b..10ce4edf1d86 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -254,14 +254,14 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_iallocbt,
.scrub = xfs_scrub_inobt,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_iallocbt,
},
[XFS_SCRUB_TYPE_FINOBT] = { /* finobt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_iallocbt,
.scrub = xfs_scrub_finobt,
.has = xfs_sb_version_hasfinobt,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_iallocbt,
},
[XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
.type = ST_PERAG,
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 13/22] xfs: repair the rmapbt
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (11 preceding siblings ...)
2018-05-15 22:34 ` [PATCH 12/22] xfs: repair inode btrees Darrick J. Wong
@ 2018-05-15 22:35 ` Darrick J. Wong
2018-05-15 22:35 ` [PATCH 14/22] xfs: repair refcount btrees Darrick J. Wong
` (9 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:35 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Rebuild the reverse mapping btree from all primary metadata.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/Makefile | 1
fs/xfs/scrub/common.c | 6
fs/xfs/scrub/repair.c | 119 +++++++
fs/xfs/scrub/repair.h | 27 +
fs/xfs/scrub/rmap.c | 6
fs/xfs/scrub/rmap_repair.c | 802 ++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/scrub.c | 18 +
fs/xfs/scrub/scrub.h | 2
fs/xfs/xfs_mount.h | 1
fs/xfs/xfs_super.c | 27 +
fs/xfs/xfs_trans.c | 7
11 files changed, 1010 insertions(+), 6 deletions(-)
create mode 100644 fs/xfs/scrub/rmap_repair.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7c442f83b179..b9bbac3d5075 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -178,6 +178,7 @@ xfs-y += $(addprefix scrub/, \
alloc_repair.o \
ialloc_repair.o \
repair.o \
+ rmap_repair.o \
)
endif
endif
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index ea5c03f30dc6..067b1c4b7790 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -586,9 +586,13 @@ xfs_scrub_trans_alloc(
struct xfs_scrub_context *sc,
uint resblks)
{
+ uint flags = 0;
+
+ if (sc->fs_frozen)
+ flags |= XFS_TRANS_NO_WRITECOUNT;
if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
- resblks, 0, 0, &sc->tp);
+ resblks, 0, flags, &sc->tp);
return xfs_trans_alloc_empty(sc->mp, &sc->tp);
}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1679fe7cc912..2b97f54d8e1f 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -43,6 +43,8 @@
#include "xfs_ag_resv.h"
#include "xfs_trans_space.h"
#include "xfs_quota.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -1081,3 +1083,120 @@ xfs_repair_ino_dqattach(
return error;
}
+
+/*
+ * Freeze the FS against all other activity.  Repairs that rebuild structures
+ * such as the rmapbt have to take locks in unusual orders, and freezing
+ * everybody else out is how we avoid ABBA deadlocks while doing so.
+ *
+ * On success the scrub context is marked frozen and 0 is returned; otherwise
+ * the error from freeze_super() is passed back and the context is untouched.
+ */
+int
+xfs_repair_fs_freeze(
+	struct xfs_scrub_context	*sc)
+{
+	int				error = freeze_super(sc->mp->m_super);
+
+	if (!error)
+		sc->fs_frozen = true;
+	return error;
+}
+
+/*
+ * Unfreeze the FS, then release every inode that xfs_repair_frozen_iput()
+ * parked on sc->frozen_inode_list while the fs was frozen.
+ *
+ * Returns the result of thaw_super(); the parked inodes are released
+ * regardless of whether the thaw succeeded.
+ */
+int
+xfs_repair_fs_thaw(
+	struct xfs_scrub_context	*sc)
+{
+	struct inode			*inode, *next;
+	int				error;
+
+	sc->fs_frozen = false;
+	error = thaw_super(sc->mp->m_super);
+
+	/*
+	 * Detach the list from the context before draining it so that the
+	 * context never points at an inode we have already released.
+	 */
+	inode = sc->frozen_inode_list;
+	sc->frozen_inode_list = NULL;
+	while (inode) {
+		/* The side list is threaded through i_private. */
+		next = inode->i_private;
+		inode->i_private = NULL;
+		iput(inode);
+		inode = next;
+	}
+
+	return error;
+}
+
+/*
+ * Drop a reference to an inode while the fs is frozen for a repair.
+ *
+ * With the fs frozen, users can't modify anything and periodic block reclaim
+ * is stopped, so the only reclamation left is whatever happens when an inode
+ * is evicted from memory.  We can't have that either while metadata is being
+ * rebuilt, so an inode that still has something mapped past EOF (or a nonzero
+ * i_delayed_blks) is parked on sc->frozen_inode_list instead of being
+ * released; xfs_repair_fs_thaw() lets go of it after the thaw.  Every other
+ * inode is released right away, and memory reclaim may get to those.
+ */
+void
+xfs_repair_frozen_iput(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip)
+{
+	struct xfs_bmbt_irec		imap;
+	xfs_fileoff_t			end_fsb;
+	xfs_fileoff_t			last_fsb;
+	xfs_filblks_t			map_len;
+	int				nimaps = 1;
+	int				error;
+	bool				park = false;
+
+	if (xfs_can_free_eofblocks(ip, true)) {
+		/* Is there any file range beyond EOF that could be mapped? */
+		end_fsb = XFS_B_TO_FSB(sc->mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+		last_fsb = XFS_B_TO_FSB(sc->mp, sc->mp->m_super->s_maxbytes);
+		if (last_fsb > end_fsb) {
+			map_len = last_fsb - end_fsb;
+
+			xfs_ilock(ip, XFS_ILOCK_SHARED);
+			error = xfs_bmapi_read(ip, end_fsb, map_len, &imap,
+					&nimaps, 0);
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+			/*
+			 * Hang on to the inode only if the lookup worked and
+			 * it found something other than a hole (or there are
+			 * delayed blocks outstanding).
+			 */
+			if (!error && nimaps != 0 &&
+			    (imap.br_startblock != HOLESTARTBLOCK ||
+			     ip->i_delayed_blks))
+				park = true;
+		}
+	}
+
+	if (!park) {
+		iput(VFS_I(ip));
+		return;
+	}
+
+	/* Push onto the side list, which is threaded through i_private. */
+	VFS_I(ip)->i_private = sc->frozen_inode_list;
+	sc->frozen_inode_list = VFS_I(ip);
+}
+
+/*
+ * Read the AGI, AGF and AGFL of every AG in the filesystem, in AG order, and
+ * attach them to this transaction.  Stops at, and returns, the first error
+ * reported by xfs_scrub_ag_read_headers(); returns 0 if every AG was read.
+ */
+int
+xfs_repair_grab_all_ag_headers(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_buf			*agi_bp;
+	struct xfs_buf			*agf_bp;
+	struct xfs_buf			*agfl_bp;
+	xfs_agnumber_t			agno = 0;
+	int				error;
+
+	while (agno < sc->mp->m_sb.sb_agcount) {
+		error = xfs_scrub_ag_read_headers(sc, agno, &agi_bp, &agf_bp,
+				&agfl_bp);
+		if (error)
+			return error;
+		agno++;
+	}
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index d1d737048535..d45a542e1e2b 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -99,6 +99,11 @@ int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
int xfs_repair_reset_counters(struct xfs_mount *mp);
void xfs_repair_force_quotacheck(struct xfs_scrub_context *sc, uint dqtype);
int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
+int xfs_repair_fs_freeze(struct xfs_scrub_context *sc);
+int xfs_repair_fs_thaw(struct xfs_scrub_context *sc);
+void xfs_repair_frozen_iput(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+int xfs_repair_grab_all_ag_headers(struct xfs_scrub_context *sc);
+int xfs_repair_rmapbt_setup(struct xfs_scrub_context *sc, struct xfs_inode *ip);
/* Metadata repairers */
@@ -109,6 +114,7 @@ int xfs_repair_agfl(struct xfs_scrub_context *sc);
int xfs_repair_agi(struct xfs_scrub_context *sc);
int xfs_repair_allocbt(struct xfs_scrub_context *sc);
int xfs_repair_iallocbt(struct xfs_scrub_context *sc);
+int xfs_repair_rmapbt(struct xfs_scrub_context *sc);
#else
@@ -136,6 +142,26 @@ static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
return -EIO;
}
+/*
+ * Stub used when online repair is compiled out.  Nothing should ever reach
+ * this, hence the assert; fail with -EIO like the other must-not-be-called
+ * repair stubs in this header.
+ */
+static inline int xfs_repair_fs_freeze(struct xfs_scrub_context *sc)
+{
+	ASSERT(0);
+	return -EIO;
+}
+
+/*
+ * Stub used when online repair is compiled out.  It asserts because it is
+ * not expected to be called, and fails with -EIO if it is.
+ */
+static inline int xfs_repair_fs_thaw(struct xfs_scrub_context *sc)
+{
+ ASSERT(0);
+ return -EIO;
+}
+
+/*
+ * Stub used when online repair is compiled out: set up for an rmapbt scrub
+ * exactly as the scrub-only path does, without freezing the filesystem.
+ */
+static inline int xfs_repair_rmapbt_setup(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ /* We don't support rmap repair, but we can still do a scan. */
+ return xfs_scrub_setup_ag_btree(sc, ip, false);
+}
+
#define xfs_repair_probe xfs_repair_notsupported
#define xfs_repair_superblock xfs_repair_notsupported
#define xfs_repair_agf xfs_repair_notsupported
@@ -143,6 +169,7 @@ static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
#define xfs_repair_agi xfs_repair_notsupported
#define xfs_repair_allocbt xfs_repair_notsupported
#define xfs_repair_iallocbt xfs_repair_notsupported
+#define xfs_repair_rmapbt xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index b376a9a77c04..14d9b7b40f7b 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -38,6 +38,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub reverse mapping btrees.
@@ -47,7 +48,10 @@ xfs_scrub_setup_ag_rmapbt(
struct xfs_scrub_context *sc,
struct xfs_inode *ip)
{
- return xfs_scrub_setup_ag_btree(sc, ip, false);
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ return xfs_repair_rmapbt_setup(sc, ip);
+ else
+ return xfs_scrub_setup_ag_btree(sc, ip, false);
}
/* Reverse-mapping scrubber. */
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
new file mode 100644
index 000000000000..502a4a549688
--- /dev/null
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -0,0 +1,802 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/* Reverse-mapping repair. */
+
+/*
+ * Set us up to repair reverse mapping btrees.
+ *
+ * The order is: freeze the filesystem, set up the scrub context, lock the
+ * headers of every AG, then initialize the per-AG scrub state for the AG
+ * being repaired.  Returns 0 or the first error from any of those steps.
+ * Nothing is undone here on failure; xfs_scrub_teardown() thaws the
+ * filesystem whenever sc->fs_frozen is still set.
+ */
+int
+xfs_repair_rmapbt_setup(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ int error;
+
+ /*
+ * Freeze out anything that can lock an inode. We reconstruct
+ * the rmapbt by reading inode bmaps with the AGF held, which is
+ * only safe w.r.t. ABBA deadlocks if we're the only ones locking
+ * inodes.
+ */
+ error = xfs_repair_fs_freeze(sc);
+ if (error)
+ return error;
+
+ /* Check the AG number and set up the scrub context. */
+ error = xfs_scrub_setup_fs(sc, ip);
+ if (error)
+ return error;
+
+ /*
+ * Lock all the AG header buffers so that we can read all the
+ * per-AG metadata too.
+ */
+ error = xfs_repair_grab_all_ag_headers(sc);
+ if (error)
+ return error;
+
+ /* Finally, set up sc->sa for the AG whose rmapbt is being rebuilt. */
+ return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa);
+}
+
+/* One reverse mapping that is waiting to be inserted into the new rmapbt. */
+struct xfs_repair_rmapbt_extent {
+ /* Link in xfs_repair_rmapbt.rmaplist. */
+ struct list_head list;
+ /* The reverse mapping record itself. */
+ struct xfs_rmap_irec rmap;
+};
+
+/* State carried through an rmapbt rebuild and handed to the walk callbacks. */
+struct xfs_repair_rmapbt {
+ /* List of xfs_repair_rmapbt_extent records collected so far. */
+ struct list_head rmaplist;
+ /* Extents that fall in the gaps between records of the new rmapbt. */
+ struct xfs_repair_extent_list rmap_freelist;
+ /* Extents that the bnobt records as free space. */
+ struct xfs_repair_extent_list bno_freelist;
+ /* Scrub context of the repair in progress. */
+ struct xfs_scrub_context *sc;
+ /* Owner stamped on blocks found by xfs_repair_rmapbt_visit_btblock. */
+ uint64_t owner;
+ /* Number of btree blocks counted by xfs_repair_rmapbt_visit_btblock. */
+ xfs_extlen_t btblocks;
+ /* Next AG block not yet covered while scanning for rmapbt gaps. */
+ xfs_agblock_t next_bno;
+ /* Number of records on rmaplist; used to size the new btree. */
+ uint64_t nr_records;
+};
+
+/*
+ * Queue a new reverse mapping for insertion into the rebuilt rmapbt.
+ *
+ * Allocates a record describing the given extent, appends it to the tail of
+ * rr->rmaplist and bumps rr->nr_records.  Returns 0 on success, -ENOMEM if
+ * the allocation fails, or whatever error xfs_scrub_should_terminate() sets
+ * when the scrub has been asked to stop.
+ */
+static inline int
+xfs_repair_rmapbt_new_rmap(
+	struct xfs_repair_rmapbt	*rr,
+	xfs_agblock_t			startblock,
+	xfs_extlen_t			blockcount,
+	uint64_t			owner,
+	uint64_t			offset,
+	unsigned int			flags)
+{
+	struct xfs_repair_rmapbt_extent	*new;
+	int				error = 0;
+
+	trace_xfs_repair_rmap_extent_fn(rr->sc->mp, rr->sc->sa.agno,
+			startblock, blockcount, owner, offset, flags);
+
+	/* Bail out early if the operation should be abandoned. */
+	if (xfs_scrub_should_terminate(rr->sc, &error))
+		return error;
+
+	new = kmem_alloc(sizeof(*new), KM_MAYFAIL);
+	if (new == NULL)
+		return -ENOMEM;
+
+	/* Fill out the record, then hang it off the end of the list. */
+	new->rmap.rm_flags = flags;
+	new->rmap.rm_offset = offset;
+	new->rmap.rm_owner = owner;
+	new->rmap.rm_blockcount = blockcount;
+	new->rmap.rm_startblock = startblock;
+	INIT_LIST_HEAD(&new->list);
+	list_add_tail(&new->list, &rr->rmaplist);
+	rr->nr_records++;
+
+	return 0;
+}
+
+/*
+ * Add an AGFL block to the rmap list.
+ *
+ * Callback for xfs_agfl_walk(); priv is the struct xfs_repair_rmapbt.  Queues
+ * a single-block rmap owned by XFS_RMAP_OWN_AG for the given AG block.
+ */
+STATIC int
+xfs_repair_rmapbt_walk_agfl(
+ struct xfs_mount *mp,
+ xfs_agblock_t bno,
+ void *priv)
+{
+ struct xfs_repair_rmapbt *rr = priv;
+
+ return xfs_repair_rmapbt_new_rmap(rr, bno, 1, XFS_RMAP_OWN_AG, 0, 0);
+}
+
+/*
+ * Add a btree block to the rmap list.
+ *
+ * Callback for xfs_btree_visit_blocks(); priv is the struct
+ * xfs_repair_rmapbt.  Counts the block in rr->btblocks and queues a
+ * single-block rmap owned by rr->owner for it.  A level for which the cursor
+ * has no buffer is silently skipped.
+ */
+STATIC int
+xfs_repair_rmapbt_visit_btblock(
+	struct xfs_btree_cur		*cur,
+	int				level,
+	void				*priv)
+{
+	struct xfs_repair_rmapbt	*rr = priv;
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_buf			*bp;
+	xfs_agblock_t			agbno;
+
+	xfs_btree_get_block(cur, level, &bp);
+	if (bp == NULL)
+		return 0;
+
+	/* Translate the buffer's disk address into an AG block number. */
+	agbno = XFS_FSB_TO_AGBNO(mp, XFS_DADDR_TO_FSB(mp, bp->b_bn));
+	rr->btblocks++;
+	return xfs_repair_rmapbt_new_rmap(rr, agbno, 1, rr->owner, 0, 0);
+}
+
+/*
+ * Record inode btree rmaps.
+ *
+ * Callback invoked for each inobt record; priv is the struct
+ * xfs_repair_rmapbt.  Queues XFS_RMAP_OWN_INOBT rmaps for btree blocks on the
+ * cursor's path and XFS_RMAP_OWN_INODES rmaps for the inode chunk blocks that
+ * the record describes, leaving out sparse holes.  Returns 0 or the first
+ * error from xfs_repair_rmapbt_new_rmap().
+ */
+STATIC int
+xfs_repair_rmapbt_inodes(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xfs_repair_rmapbt *rr = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsb;
+ xfs_agino_t agino;
+ xfs_agino_t iperhole;
+ unsigned int i;
+ int error;
+
+ /*
+ * Record the inobt blocks.  Starting at level 0, a level's block is
+ * recorded only while the cursor's pointer at that level is 1 --
+ * presumably so that each btree block is queued once, when its first
+ * entry is visited; TODO confirm.
+ */
+ for (i = 0; i < cur->bc_nlevels && cur->bc_ptrs[i] == 1; i++) {
+ xfs_btree_get_block(cur, i, &bp);
+ if (!bp)
+ continue;
+ fsb = XFS_DADDR_TO_FSB(mp, bp->b_bn);
+ error = xfs_repair_rmapbt_new_rmap(rr,
+ XFS_FSB_TO_AGBNO(mp, fsb), 1,
+ XFS_RMAP_OWN_INOBT, 0, 0);
+ if (error)
+ return error;
+ }
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+
+ /* Record a non-sparse inode chunk. */
+ if (irec.ir_holemask == XFS_INOBT_HOLEMASK_FULL)
+ return xfs_repair_rmapbt_new_rmap(rr,
+ XFS_AGINO_TO_AGBNO(mp, irec.ir_startino),
+ XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock,
+ XFS_RMAP_OWN_INODES, 0, 0);
+
+ /*
+ * Iterate each chunk.  The step is the larger of one block's worth of
+ * inodes and one holemask bit's worth of inodes, so every iteration
+ * covers a whole number of blocks and of holemask bits.
+ */
+ iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
+ XFS_INODES_PER_HOLEMASK_BIT);
+ for (i = 0, agino = irec.ir_startino;
+ i < XFS_INOBT_HOLEMASK_BITS;
+ i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
+ /* Skip holes. */
+ if (irec.ir_holemask & (1 << i))
+ continue;
+
+ /* Record the inode chunk otherwise. */
+ error = xfs_repair_rmapbt_new_rmap(rr,
+ XFS_AGINO_TO_AGBNO(mp, agino),
+ iperhole / mp->m_sb.sb_inopblock,
+ XFS_RMAP_OWN_INODES, 0, 0);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Record a CoW staging extent.
+ *
+ * Callback for the refcountbt range query that starts at XFS_REFC_COW_START;
+ * priv is the struct xfs_repair_rmapbt.  A record whose refcount is not
+ * exactly 1 is treated as corruption.  Otherwise an XFS_RMAP_OWN_COW rmap is
+ * queued for the extent, with XFS_REFC_COW_START subtracted from the record's
+ * start block to recover the AG block number.
+ */
+STATIC int
+xfs_repair_rmapbt_refcount(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_repair_rmapbt *rr = priv;
+ struct xfs_refcount_irec refc;
+
+ xfs_refcount_btrec_to_irec(rec, &refc);
+ if (refc.rc_refcount != 1)
+ return -EFSCORRUPTED;
+
+ return xfs_repair_rmapbt_new_rmap(rr,
+ refc.rc_startblock - XFS_REFC_COW_START,
+ refc.rc_blockcount, XFS_RMAP_OWN_COW, 0, 0);
+}
+
+/*
+ * Add a bmbt block to the rmap list.
+ *
+ * Callback for xfs_btree_visit_blocks() on an inode fork's bmbt; priv is the
+ * struct xfs_repair_rmapbt.  Blocks that live outside the AG being repaired,
+ * and levels for which the cursor has no buffer, are ignored.  Anything else
+ * is queued as a single-block rmap owned by the inode and flagged
+ * XFS_RMAP_BMBT_BLOCK (plus XFS_RMAP_ATTR_FORK for the attr fork).
+ */
+STATIC int
+xfs_repair_rmapbt_visit_bmbt(
+	struct xfs_btree_cur		*cur,
+	int				level,
+	void				*priv)
+{
+	struct xfs_repair_rmapbt	*rr = priv;
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_buf			*bp;
+	xfs_fsblock_t			fsbno;
+	unsigned int			rflags;
+
+	xfs_btree_get_block(cur, level, &bp);
+	if (bp == NULL)
+		return 0;
+
+	/* Only blocks inside the AG under repair are interesting. */
+	fsbno = XFS_DADDR_TO_FSB(mp, bp->b_bn);
+	if (XFS_FSB_TO_AGNO(mp, fsbno) != rr->sc->sa.agno)
+		return 0;
+
+	rflags = XFS_RMAP_BMBT_BLOCK;
+	if (cur->bc_private.b.whichfork == XFS_ATTR_FORK)
+		rflags |= XFS_RMAP_ATTR_FORK;
+
+	return xfs_repair_rmapbt_new_rmap(rr, XFS_FSB_TO_AGBNO(mp, fsbno), 1,
+			cur->bc_private.b.ip->i_ino, 0, rflags);
+}
+
+/*
+ * Work out the rmap flags for a file mapping: XFS_RMAP_ATTR_FORK if it is in
+ * the attr fork, XFS_RMAP_UNWRITTEN if the extent state is unwritten.
+ */
+static inline unsigned int
+xfs_repair_rmapbt_bmap_flags(
+	int			whichfork,
+	xfs_exntst_t		state)
+{
+	unsigned int		rflags = 0;
+
+	if (whichfork == XFS_ATTR_FORK)
+		rflags |= XFS_RMAP_ATTR_FORK;
+	if (state == XFS_EXT_UNWRITTEN)
+		rflags |= XFS_RMAP_UNWRITTEN;
+	return rflags;
+}
+
+/*
+ * Find all the extents from a given AG in an inode fork.
+ *
+ * Queues an rmap for every bmbt block and every mapped extent of the given
+ * fork that lands in the AG being repaired (rr->sc->sa.agno).  Forks that do
+ * not exist, or that are in neither extents nor btree format, contribute
+ * nothing.  For a realtime inode's data fork only the bmbt blocks are
+ * recorded.  Returns 0 or a negative errno.
+ */
+STATIC int
+xfs_repair_rmapbt_scan_ifork(
+	struct xfs_repair_rmapbt	*rr,
+	struct xfs_inode		*ip,
+	int				whichfork)
+{
+	struct xfs_bmbt_irec		rec;
+	struct xfs_iext_cursor		icur;
+	struct xfs_mount		*mp = rr->sc->mp;
+	struct xfs_btree_cur		*cur = NULL;
+	struct xfs_ifork		*ifp;
+	unsigned int			rflags;
+	int				fmt;
+	int				error = 0;
+
+	/*
+	 * No fork, no mappings.  This has to be checked before the format
+	 * switch below, which looks at ifp->if_flags.
+	 */
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!ifp)
+		return 0;
+
+	/* Do we even have data mapping extents? */
+	fmt = XFS_IFORK_FORMAT(ip, whichfork);
+	switch (fmt) {
+	case XFS_DINODE_FMT_BTREE:
+		/* Pull the extent list into memory if it isn't there yet. */
+		if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+			error = xfs_iread_extents(rr->sc->tp, ip, whichfork);
+			if (error)
+				return error;
+		}
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		break;
+	default:
+		return 0;
+	}
+
+	/* Find all the BMBT blocks in the AG. */
+	if (fmt == XFS_DINODE_FMT_BTREE) {
+		cur = xfs_bmbt_init_cursor(mp, rr->sc->tp, ip, whichfork);
+		error = xfs_btree_visit_blocks(cur,
+				xfs_repair_rmapbt_visit_bmbt, rr);
+		if (error)
+			goto out;
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		cur = NULL;
+	}
+
+	/* We're done if this is an rt inode's data fork. */
+	if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip))
+		return 0;
+
+	/* Find all the extents in the AG. */
+	for_each_xfs_iext(ifp, &icur, &rec) {
+		/* Mappings with a null start block have nothing on disk. */
+		if (isnullstartblock(rec.br_startblock))
+			continue;
+		/* Stash non-hole extent. */
+		if (XFS_FSB_TO_AGNO(mp, rec.br_startblock) == rr->sc->sa.agno) {
+			rflags = xfs_repair_rmapbt_bmap_flags(whichfork,
+					rec.br_state);
+			error = xfs_repair_rmapbt_new_rmap(rr,
+					XFS_FSB_TO_AGBNO(mp, rec.br_startblock),
+					rec.br_blockcount, ip->i_ino,
+					rec.br_startoff, rflags);
+			if (error)
+				goto out;
+		}
+	}
+out:
+	/* Only a cursor that failed mid-walk is still live here. */
+	if (cur)
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Iterate all the inodes described by one inobt record.
+ *
+ * Callback invoked for each inobt record; priv is the struct
+ * xfs_repair_rmapbt.  Every inode that the record does not mark free is
+ * grabbed and locked, and both of its forks are scanned for mappings via
+ * xfs_repair_rmapbt_scan_ifork().  Returns -EDEADLOCK if
+ * xfs_icache_inode_is_allocated() reports -EAGAIN, -EBUSY if the ILOCK
+ * cannot be taken without blocking, or any error from xfs_iget() or the fork
+ * scans.
+ */
+STATIC int
+xfs_repair_rmapbt_scan_inobt(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xfs_repair_rmapbt *rr = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = NULL;
+ xfs_ino_t ino;
+ xfs_agino_t agino;
+ int chunkidx;
+ int lock_mode = 0;
+ int error = 0;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+
+ for (chunkidx = 0, agino = irec.ir_startino;
+ chunkidx < XFS_INODES_PER_CHUNK;
+ chunkidx++, agino++) {
+ bool inuse;
+
+ /* Skip if this inode is free */
+ if (XFS_INOBT_MASK(chunkidx) & irec.ir_free)
+ continue;
+ ino = XFS_AGINO_TO_INO(mp, cur->bc_private.a.agno, agino);
+
+ /*
+ * Back off and try again if an inode is being reclaimed.
+ * NOTE(review): only -EAGAIN is acted upon; any other error is
+ * overwritten by the xfs_iget() call below and "inuse" is never
+ * consulted -- confirm that this is intended.
+ */
+ error = xfs_icache_inode_is_allocated(mp, cur->bc_tp, ino,
+ &inuse);
+ if (error == -EAGAIN)
+ return -EDEADLOCK;
+
+ /*
+ * Grab inode for scanning. We cannot use DONTCACHE here
+ * because we already have a transaction so the iput must not
+ * trigger inode reclaim (which might allocate a transaction
+ * to clean up posteof blocks).
+ */
+ error = xfs_iget(mp, cur->bc_tp, ino, 0, 0, &ip);
+ if (error)
+ return error;
+
+ /*
+ * Take the ILOCK exclusively if either fork is in btree format
+ * without its extent list loaded, since scanning such a fork
+ * reads the extents in; a shared lock is used otherwise.  The
+ * lock is only tried, never waited for.
+ */
+ if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ !(ip->i_df.if_flags & XFS_IFEXTENTS)) ||
+ (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
+ !(ip->i_afp->if_flags & XFS_IFEXTENTS)))
+ lock_mode = XFS_ILOCK_EXCL;
+ else
+ lock_mode = XFS_ILOCK_SHARED;
+ if (!xfs_ilock_nowait(ip, lock_mode)) {
+ error = -EBUSY;
+ goto out_rele;
+ }
+
+ /* Check the data fork. */
+ error = xfs_repair_rmapbt_scan_ifork(rr, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ /* Check the attr fork. */
+ error = xfs_repair_rmapbt_scan_ifork(rr, ip, XFS_ATTR_FORK);
+ if (error)
+ goto out_unlock;
+
+ /* Done with this inode; release it the freeze-safe way. */
+ xfs_iunlock(ip, lock_mode);
+ xfs_repair_frozen_iput(rr->sc, ip);
+ ip = NULL;
+ }
+
+ return error;
+out_unlock:
+ xfs_iunlock(ip, lock_mode);
+out_rele:
+ /*
+ * NOTE(review): the error paths use a plain iput() whereas the success
+ * path uses xfs_repair_frozen_iput() -- confirm that releasing the
+ * inode directly is safe while the fs is frozen.
+ */
+ iput(VFS_I(ip));
+ return error;
+}
+
+/*
+ * Record extents that aren't in use from gaps in the rmap records.
+ *
+ * Callback for xfs_rmap_query_all(); priv is the struct xfs_repair_rmapbt.
+ * rr->next_bno tracks the first AG block not covered by any record seen so
+ * far.  If this record starts beyond it, the blocks in between are added to
+ * rr->rmap_freelist.  next_bno is then pushed forward to the end of this
+ * record, and never moved backwards.
+ */
+STATIC int
+xfs_repair_rmapbt_record_rmap_freesp(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*rec,
+	void				*priv)
+{
+	struct xfs_repair_rmapbt	*rr = priv;
+	xfs_agblock_t			rec_end;
+	int				error;
+
+	/* Record the free space we find. */
+	if (rr->next_bno < rec->rm_startblock) {
+		error = xfs_repair_collect_btree_extent(rr->sc,
+				&rr->rmap_freelist,
+				XFS_AGB_TO_FSB(cur->bc_mp,
+					cur->bc_private.a.agno, rr->next_bno),
+				rec->rm_startblock - rr->next_bno);
+		if (error)
+			return error;
+	}
+
+	/* Advance the high-water mark past this record. */
+	rec_end = rec->rm_startblock + rec->rm_blockcount;
+	if (rec_end > rr->next_bno)
+		rr->next_bno = rec_end;
+	return 0;
+}
+
+/*
+ * Record extents that aren't in use from the bnobt records.
+ *
+ * Callback for xfs_alloc_query_all(); priv is the struct xfs_repair_rmapbt.
+ * Each free space record is added to rr->bno_freelist.
+ */
+STATIC int
+xfs_repair_rmapbt_record_bno_freesp(
+	struct xfs_btree_cur		*cur,
+	struct xfs_alloc_rec_incore	*rec,
+	void				*priv)
+{
+	struct xfs_repair_rmapbt	*rr = priv;
+
+	/* Record the free space we find. */
+	return xfs_repair_collect_btree_extent(rr->sc, &rr->bno_freelist,
+			XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+				rec->ar_startblock),
+			rec->ar_blockcount);
+}
+
+/*
+ * list_sort() comparison function for queued rmapbt extents; the ordering is
+ * whatever xfs_rmap_compare() says about the two embedded rmap records.
+ */
+static int
+xfs_repair_rmapbt_extent_cmp(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_repair_rmapbt_extent	*left =
+		container_of(a, struct xfs_repair_rmapbt_extent, list);
+	struct xfs_repair_rmapbt_extent	*right =
+		container_of(b, struct xfs_repair_rmapbt_extent, list);
+
+	return xfs_rmap_compare(&left->rmap, &right->rmap);
+}
+
+/* Queue an rmap with a fixed XFS_RMAP_OWN_* owner, zero offset and no flags. */
+#define RMAP(type, startblock, blockcount) xfs_repair_rmapbt_new_rmap( \
+		&rr, (startblock), (blockcount), \
+		XFS_RMAP_OWN_##type, 0, 0)
+/*
+ * Repair the rmap btree for some AG.
+ *
+ * The rebuild proceeds in four stages:
+ *
+ *  1. Collect, in memory, an rmap for everything that owns space in this AG:
+ *     the AG headers, the AGFL blocks, the log (if it lives here), the
+ *     bnobt/cntbt/inobt/finobt/refcountbt blocks, inode chunks, CoW staging
+ *     extents, and the fork mappings and bmbt blocks of every inode in the
+ *     filesystem that point into this AG.
+ *  2. Check that there is room for the new btree, allocate a fresh root
+ *     block, point the AGF at it and roll the transaction.
+ *  3. Sort the collected records and insert them one per transaction,
+ *     topping up the AGFL as we go.
+ *  4. Work out which blocks neither the new rmapbt nor the bnobt accounts
+ *     for -- those are the old rmapbt blocks -- and reap them.
+ *
+ * Returns 0 on success or a negative errno; on failure everything collected
+ * so far is freed.
+ */
+int
+xfs_repair_rmapbt(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_repair_rmapbt	rr;
+	struct xfs_owner_info		oinfo;
+	struct xfs_repair_rmapbt_extent	*rre;
+	struct xfs_repair_rmapbt_extent	*n;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_btree_cur		*cur = NULL;
+	struct xfs_buf			*bp = NULL;
+	struct xfs_agf			*agf;
+	struct xfs_agi			*agi;
+	struct xfs_perag		*pag;
+	xfs_fsblock_t			btfsb;
+	xfs_agnumber_t			ag;
+	xfs_agblock_t			agend;
+	xfs_extlen_t			freesp_btblocks;
+	int				error;
+
+	INIT_LIST_HEAD(&rr.rmaplist);
+	xfs_repair_init_extent_list(&rr.rmap_freelist);
+	xfs_repair_init_extent_list(&rr.bno_freelist);
+	rr.sc = sc;
+	rr.nr_records = 0;
+
+	/*
+	 * Collect rmaps for all AG headers.  A header is skipped if its block
+	 * number matches that of the record queued just before it.
+	 */
+	error = RMAP(FS, XFS_SB_BLOCK(mp), 1);
+	if (error)
+		goto out;
+	rre = list_last_entry(&rr.rmaplist, struct xfs_repair_rmapbt_extent,
+			list);
+
+	if (rre->rmap.rm_startblock != XFS_AGF_BLOCK(mp)) {
+		error = RMAP(FS, XFS_AGF_BLOCK(mp), 1);
+		if (error)
+			goto out;
+		rre = list_last_entry(&rr.rmaplist,
+				struct xfs_repair_rmapbt_extent, list);
+	}
+
+	if (rre->rmap.rm_startblock != XFS_AGI_BLOCK(mp)) {
+		error = RMAP(FS, XFS_AGI_BLOCK(mp), 1);
+		if (error)
+			goto out;
+		rre = list_last_entry(&rr.rmaplist,
+				struct xfs_repair_rmapbt_extent, list);
+	}
+
+	if (rre->rmap.rm_startblock != XFS_AGFL_BLOCK(mp)) {
+		error = RMAP(FS, XFS_AGFL_BLOCK(mp), 1);
+		if (error)
+			goto out;
+	}
+
+	/* Collect rmaps for the blocks sitting on the AGFL. */
+	error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
+			sc->sa.agfl_bp, xfs_repair_rmapbt_walk_agfl, &rr);
+	if (error)
+		goto out;
+
+	/* Collect rmap for the log if it's in this AG. */
+	if (mp->m_sb.sb_logstart &&
+	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == sc->sa.agno) {
+		error = RMAP(LOG, XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart),
+				mp->m_sb.sb_logblocks);
+		if (error)
+			goto out;
+	}
+
+	/* Collect rmaps for the free space btrees. */
+	rr.owner = XFS_RMAP_OWN_AG;
+	rr.btblocks = 0;
+	cur = xfs_allocbt_init_cursor(mp, sc->tp, sc->sa.agf_bp, sc->sa.agno,
+			XFS_BTNUM_BNO);
+	error = xfs_btree_visit_blocks(cur, xfs_repair_rmapbt_visit_btblock,
+			&rr);
+	if (error)
+		goto out;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	cur = NULL;
+
+	/* Collect rmaps for the cntbt. */
+	cur = xfs_allocbt_init_cursor(mp, sc->tp, sc->sa.agf_bp, sc->sa.agno,
+			XFS_BTNUM_CNT);
+	error = xfs_btree_visit_blocks(cur, xfs_repair_rmapbt_visit_btblock,
+			&rr);
+	if (error)
+		goto out;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	cur = NULL;
+	/* Remember how many bnobt + cntbt blocks were counted. */
+	freesp_btblocks = rr.btblocks;
+
+	/* Collect rmaps for the inode btree. */
+	cur = xfs_inobt_init_cursor(mp, sc->tp, sc->sa.agi_bp, sc->sa.agno,
+			XFS_BTNUM_INO);
+	error = xfs_btree_query_all(cur, xfs_repair_rmapbt_inodes, &rr);
+	if (error)
+		goto out;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	/*
+	 * The cursor is gone; forget it so that a failure below cannot make
+	 * the error path delete it a second time.
+	 */
+	cur = NULL;
+
+	/* If there are no inodes, we have to include the inobt root. */
+	agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+	if (agi->agi_count == cpu_to_be32(0)) {
+		error = xfs_repair_rmapbt_new_rmap(&rr,
+				be32_to_cpu(agi->agi_root), 1,
+				XFS_RMAP_OWN_INOBT, 0, 0);
+		if (error)
+			goto out;
+	}
+
+	/* Collect rmaps for the free inode btree. */
+	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+		rr.owner = XFS_RMAP_OWN_INOBT;
+		cur = xfs_inobt_init_cursor(mp, sc->tp, sc->sa.agi_bp,
+				sc->sa.agno, XFS_BTNUM_FINO);
+		error = xfs_btree_visit_blocks(cur,
+				xfs_repair_rmapbt_visit_btblock, &rr);
+		if (error)
+			goto out;
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		cur = NULL;
+	}
+
+	/* Collect rmaps for the refcount btree. */
+	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+		union xfs_btree_irec		low;
+		union xfs_btree_irec		high;
+
+		rr.owner = XFS_RMAP_OWN_REFC;
+		cur = xfs_refcountbt_init_cursor(mp, sc->tp, sc->sa.agf_bp,
+				sc->sa.agno, NULL);
+		error = xfs_btree_visit_blocks(cur,
+				xfs_repair_rmapbt_visit_btblock, &rr);
+		if (error)
+			goto out;
+
+		/* Collect rmaps for CoW staging extents. */
+		memset(&low, 0, sizeof(low));
+		low.rc.rc_startblock = XFS_REFC_COW_START;
+		memset(&high, 0xFF, sizeof(high));
+		error = xfs_btree_query_range(cur, &low, &high,
+				xfs_repair_rmapbt_refcount, &rr);
+		if (error)
+			goto out;
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		cur = NULL;
+	}
+
+	/*
+	 * Iterate all AGs for inodes.  Any inode in the filesystem may map
+	 * blocks in this AG, so every AG's inobt has to be walked.
+	 */
+	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+		error = xfs_ialloc_read_agi(mp, sc->tp, ag, &bp);
+		if (error)
+			goto out;
+		cur = xfs_inobt_init_cursor(mp, sc->tp, bp, ag, XFS_BTNUM_INO);
+		error = xfs_btree_query_all(cur, xfs_repair_rmapbt_scan_inobt,
+				&rr);
+		if (error)
+			goto out;
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		cur = NULL;
+		xfs_trans_brelse(sc->tp, bp);
+		bp = NULL;
+	}
+
+	/* Do we actually have enough space to do this? */
+	pag = xfs_perag_get(mp, sc->sa.agno);
+	if (!xfs_repair_ag_has_space(pag,
+			xfs_rmapbt_calc_size(mp, rr.nr_records),
+			XFS_AG_RESV_RMAPBT)) {
+		xfs_perag_put(pag);
+		error = -ENOSPC;
+		goto out;
+	}
+
+	/* Initialize a new rmapbt root. */
+	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_UNKNOWN);
+	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+	error = xfs_repair_alloc_ag_block(sc, &oinfo, &btfsb,
+			XFS_AG_RESV_RMAPBT);
+	if (error) {
+		xfs_perag_put(pag);
+		goto out;
+	}
+	error = xfs_repair_init_btblock(sc, btfsb, &bp, XFS_BTNUM_RMAP,
+			&xfs_rmapbt_buf_ops);
+	if (error) {
+		xfs_perag_put(pag);
+		goto out;
+	}
+	agf->agf_roots[XFS_BTNUM_RMAPi] = cpu_to_be32(XFS_FSB_TO_AGBNO(mp,
+			btfsb));
+	agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+	agf->agf_rmap_blocks = cpu_to_be32(1);
+
+	/* Reset the perag info. */
+	pag->pagf_btreeblks = freesp_btblocks - 2;
+	pag->pagf_levels[XFS_BTNUM_RMAPi] =
+			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+
+	/*
+	 * Now reset the AGF counters.  This is the last use of the perag
+	 * reference; it must not be put again after this point.
+	 */
+	agf->agf_btreeblks = cpu_to_be32(pag->pagf_btreeblks);
+	xfs_perag_put(pag);
+	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_ROOTS |
+			XFS_AGF_LEVELS | XFS_AGF_RMAP_BLOCKS |
+			XFS_AGF_BTREEBLKS);
+	/* Stop tracking the new root buffer so the error path leaves it be. */
+	bp = NULL;
+	error = xfs_repair_roll_ag_trans(sc);
+	if (error)
+		goto out;
+
+	/* Insert all the metadata rmaps. */
+	list_sort(NULL, &rr.rmaplist, xfs_repair_rmapbt_extent_cmp);
+	list_for_each_entry_safe(rre, n, &rr.rmaplist, list) {
+		/* Add the rmap. */
+		cur = xfs_rmapbt_init_cursor(mp, sc->tp, sc->sa.agf_bp,
+				sc->sa.agno);
+		error = xfs_rmap_map_raw(cur, &rre->rmap);
+		if (error)
+			goto out;
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		cur = NULL;
+
+		error = xfs_repair_roll_ag_trans(sc);
+		if (error)
+			goto out;
+
+		list_del(&rre->list);
+		kmem_free(rre);
+
+		/*
+		 * Ensure the freelist is full, but don't let it shrink.
+		 * The rmapbt isn't fully set up yet, which means that
+		 * the current AGFL blocks might not be reflected in the
+		 * rmapbt, which is a problem if we want to unmap blocks
+		 * from the AGFL.
+		 */
+		error = xfs_repair_fix_freelist(sc, false);
+		if (error)
+			goto out;
+	}
+
+	/* Compute free space from the new rmapbt. */
+	rr.next_bno = 0;
+	cur = xfs_rmapbt_init_cursor(mp, sc->tp, sc->sa.agf_bp, sc->sa.agno);
+	error = xfs_rmap_query_all(cur, xfs_repair_rmapbt_record_rmap_freesp,
+			&rr);
+	if (error)
+		goto out;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	cur = NULL;
+
+	/* Insert a record for space between the last rmap and EOAG. */
+	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+	agend = be32_to_cpu(agf->agf_length);
+	if (rr.next_bno < agend) {
+		btfsb = XFS_AGB_TO_FSB(mp, sc->sa.agno, rr.next_bno);
+		error = xfs_repair_collect_btree_extent(sc, &rr.rmap_freelist,
+				btfsb, agend - rr.next_bno);
+		if (error)
+			goto out;
+	}
+
+	/* Compute free space from the existing bnobt. */
+	cur = xfs_allocbt_init_cursor(mp, sc->tp, sc->sa.agf_bp, sc->sa.agno,
+			XFS_BTNUM_BNO);
+	error = xfs_alloc_query_all(cur, xfs_repair_rmapbt_record_bno_freesp,
+			&rr);
+	if (error)
+		goto out;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	cur = NULL;
+
+	/*
+	 * Free the "free" blocks that the new rmapbt knows about but
+	 * the old bnobt doesn't.  These are the old rmapbt blocks.
+	 */
+	error = xfs_repair_subtract_extents(sc, &rr.rmap_freelist,
+			&rr.bno_freelist);
+	if (error)
+		goto out;
+	xfs_repair_cancel_btree_extents(sc, &rr.bno_freelist);
+	error = xfs_repair_invalidate_blocks(sc, &rr.rmap_freelist);
+	if (error)
+		goto out;
+	return xfs_repair_reap_btree_extents(sc, &rr.rmap_freelist, &oinfo,
+			XFS_AG_RESV_RMAPBT);
+out:
+	/* Tear down whatever is still live, then free the collected state. */
+	if (cur)
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	if (bp)
+		xfs_trans_brelse(sc->tp, bp);
+	xfs_repair_cancel_btree_extents(sc, &rr.bno_freelist);
+	xfs_repair_cancel_btree_extents(sc, &rr.rmap_freelist);
+	list_for_each_entry_safe(rre, n, &rr.rmaplist, list) {
+		list_del(&rre->list);
+		kmem_free(rre);
+	}
+	return error;
+}
+#undef RMAP
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 10ce4edf1d86..8a7d4b2288d8 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -178,6 +178,8 @@ xfs_scrub_teardown(
struct xfs_inode *ip_in,
int error)
{
+ int err2;
+
xfs_scrub_ag_free(sc, &sc->sa);
if (sc->tp) {
if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
@@ -194,6 +196,12 @@ xfs_scrub_teardown(
iput(VFS_I(sc->ip));
sc->ip = NULL;
}
+ if (sc->fs_frozen) {
+ err2 = xfs_repair_fs_thaw(sc);
+ if (!error && err2)
+ error = err2;
+ sc->fs_frozen = false;
+ }
if (sc->has_quotaofflock)
mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
if (sc->buf) {
@@ -268,7 +276,7 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.setup = xfs_scrub_setup_ag_rmapbt,
.scrub = xfs_scrub_rmapbt,
.has = xfs_sb_version_hasrmapbt,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_rmapbt,
},
[XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
.type = ST_PERAG,
@@ -514,6 +522,8 @@ xfs_scrub_metadata(
xfs_scrub_experimental_warning(mp);
+ atomic_inc(&mp->m_scrubbers);
+
retry_op:
/* Set up for the operation. */
memset(&sc, 0, sizeof(sc));
@@ -536,7 +546,7 @@ xfs_scrub_metadata(
*/
error = xfs_scrub_teardown(&sc, ip, 0);
if (error)
- goto out;
+ goto out_dec;
try_harder = true;
goto retry_op;
} else if (error)
@@ -572,7 +582,7 @@ xfs_scrub_metadata(
error = xfs_scrub_teardown(&sc, ip, 0);
if (error) {
xfs_repair_failure(mp);
- goto out;
+ goto out_dec;
}
goto retry_op;
}
@@ -582,6 +592,8 @@ xfs_scrub_metadata(
xfs_scrub_postmortem(&sc);
out_teardown:
error = xfs_scrub_teardown(&sc, ip, error);
+out_dec:
+ atomic_dec(&mp->m_scrubbers);
out:
trace_xfs_scrub_done(ip, sm, error);
if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 1aaea393c2d1..8cf4062e069d 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -68,6 +68,7 @@ struct xfs_scrub_ag {
struct xfs_scrub_context {
/* General scrub state. */
+ struct inode *frozen_inode_list;
struct xfs_mount *mp;
struct xfs_scrub_metadata *sm;
const struct xfs_scrub_meta_ops *ops;
@@ -78,6 +79,7 @@ struct xfs_scrub_context {
bool try_harder;
bool has_quotaofflock;
bool reset_counters;
+ bool fs_frozen;
/* State tracking for single-AG operations. */
struct xfs_scrub_ag sa;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 10b90bbc5162..44ad46182077 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -205,6 +205,7 @@ typedef struct xfs_mount {
unsigned int *m_errortag;
struct xfs_kobj m_errortag_kobj;
#endif
+ atomic_t m_scrubbers; /* # of active scrub processes */
} xfs_mount_t;
/*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 39e5ec3d407f..7f5d335a3f70 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1457,6 +1457,30 @@ xfs_fs_unfreeze(
return 0;
}
+/*
+ * ->freeze_super handler.  Refuse with -EBUSY while any scrub is active on
+ * this mount (mp->m_scrubbers is nonzero), so that userspace cannot freeze
+ * the filesystem underneath a scrubber; otherwise freeze as usual.
+ */
+STATIC int
+xfs_fs_freeze_super(
+	struct super_block	*sb)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	return atomic_read(&mp->m_scrubbers) > 0 ? -EBUSY : freeze_super(sb);
+}
+
+/*
+ * ->thaw_super handler.  Refuse with -EBUSY while any scrub is active on
+ * this mount (mp->m_scrubbers is nonzero), so that userspace cannot thaw a
+ * filesystem that a scrubber may have frozen; otherwise thaw as usual.
+ */
+STATIC int
+xfs_fs_thaw_super(
+	struct super_block	*sb)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	return atomic_read(&mp->m_scrubbers) > 0 ? -EBUSY : thaw_super(sb);
+}
+
STATIC int
xfs_fs_show_options(
struct seq_file *m,
@@ -1595,6 +1619,7 @@ xfs_mount_alloc(
spin_lock_init(&mp->m_perag_lock);
mutex_init(&mp->m_growlock);
atomic_set(&mp->m_active_trans, 0);
+ atomic_set(&mp->m_scrubbers, 0);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
@@ -1852,6 +1877,8 @@ static const struct super_operations xfs_super_operations = {
.show_options = xfs_fs_show_options,
.nr_cached_objects = xfs_fs_nr_cached_objects,
.free_cached_objects = xfs_fs_free_cached_objects,
+ .freeze_super = xfs_fs_freeze_super,
+ .thaw_super = xfs_fs_thaw_super,
};
static struct file_system_type xfs_fs_type = {
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fc7ba75b8b69..449771d98561 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -270,7 +270,12 @@ xfs_trans_alloc(
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
- WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+ /*
+ * Scrub is allowed to freeze the filesystem in order to obtain
+ * exclusive access to the filesystem.
+ */
+ WARN_ON(atomic_read(&mp->m_scrubbers) == 0 &&
+ mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
atomic_inc(&mp->m_active_trans);
tp = kmem_zone_zalloc(xfs_trans_zone,
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 14/22] xfs: repair refcount btrees
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (12 preceding siblings ...)
2018-05-15 22:35 ` [PATCH 13/22] xfs: repair the rmapbt Darrick J. Wong
@ 2018-05-15 22:35 ` Darrick J. Wong
2018-05-15 22:35 ` [PATCH 15/22] xfs: repair inode records Darrick J. Wong
` (8 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:35 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Reconstruct the refcount data from the rmap btree.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/Makefile | 1
fs/xfs/scrub/refcount_repair.c | 529 ++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 2
fs/xfs/scrub/scrub.c | 2
4 files changed, 533 insertions(+), 1 deletion(-)
create mode 100644 fs/xfs/scrub/refcount_repair.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b9bbac3d5075..36ad73145c25 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -177,6 +177,7 @@ xfs-y += $(addprefix scrub/, \
agheader_repair.o \
alloc_repair.o \
ialloc_repair.o \
+ refcount_repair.o \
repair.o \
rmap_repair.o \
)
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
new file mode 100644
index 000000000000..aa0b6db3ca14
--- /dev/null
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -0,0 +1,529 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_itable.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_error.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Rebuilding the Reference Count Btree
+ *
+ * This algorithm is "borrowed" from xfs_repair. Imagine the rmap
+ * entries as rectangles representing extents of physical blocks, and
+ * that the rectangles can be laid down to allow them to overlap each
+ * other; then we know that we must emit a refcnt btree entry wherever
+ * the amount of overlap changes, i.e. the emission stimulus is
+ * level-triggered:
+ *
+ * - ---
+ * -- ----- ---- --- ------
+ * -- ---- ----------- ---- ---------
+ * -------------------------------- -----------
+ * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^
+ * 2 1 23 21 3 43 234 2123 1 01 2 3 0
+ *
+ * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
+ *
+ * Note that in the actual refcnt btree we don't store the refcount < 2
+ * cases because the bnobt tells us which blocks are free; single-use
+ * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt
+ * supports storing multiple entries covering a given block we could
+ * theoretically dispense with the refcntbt and simply count rmaps, but
+ * that's inefficient in the (hot) write path, so we'll take the cost of
+ * the extra tree to save time. Also there's no guarantee that rmap
+ * will be enabled.
+ *
+ * Given an array of rmaps sorted by physical block number, a starting
+ * physical block (sp), a bag to hold rmaps that cover sp, and the next
+ * physical block where the level changes (np), we can reconstruct the
+ * refcount btree as follows:
+ *
+ * While there are still unprocessed rmaps in the array,
+ * - Set sp to the physical block (pblk) of the next unprocessed rmap.
+ * - Add to the bag all rmaps in the array where startblock == sp.
+ * - Set np to the physical block where the bag size will change. This
+ * is the minimum of (the pblk of the next unprocessed rmap) and
+ * (startblock + len of each rmap in the bag).
+ * - Record the bag size as old_bag_size.
+ *
+ * - While the bag isn't empty,
+ * - Remove from the bag all rmaps where startblock + len == np.
+ * - Add to the bag all rmaps in the array where startblock == np.
+ * - If the bag size isn't old_bag_size, store the refcount entry
+ * (sp, np - sp, old_bag_size) in the refcnt btree.
+ * - If the bag is empty, break out of the inner loop.
+ * - Set old_bag_size to the bag size
+ * - Set sp = np.
+ * - Set np to the physical block where the bag size will change.
+ * This is the minimum of (the pblk of the next unprocessed rmap)
+ * and (startblock + len of each rmap in the bag).
+ *
+ * Like all the other repairers, we make a list of all the refcount
+ * records we need, then reinitialize the refcount btree root and
+ * insert all the records.
+ */
+
+/*
+ * A copy of one rmap record.  Entries live on either the rmap_bag list
+ * (currently being tracked) or the rmap_idle list (available for reuse)
+ * of struct xfs_repair_refc.
+ */
+struct xfs_repair_refc_rmap {
+ struct list_head list;
+ struct xfs_rmap_irec rmap;
+};
+
+/*
+ * One refcount record waiting on xfs_repair_refc.extlist to be inserted
+ * into the rebuilt refcount btree.
+ */
+struct xfs_repair_refc_extent {
+ struct list_head list;
+ struct xfs_refcount_irec refc;
+};
+
+/* Working state for one refcount btree rebuild. */
+struct xfs_repair_refc {
+ struct list_head rmap_bag; /* rmaps we're tracking */
+ struct list_head rmap_idle; /* idle rmaps */
+ struct list_head extlist; /* refcount extents */
+ struct xfs_repair_extent_list btlist; /* old refcountbt blocks */
+ struct xfs_scrub_context *sc;
+ unsigned long nr_records;/* nr refcount extents */
+ xfs_extlen_t btblocks; /* # of refcountbt blocks */
+};
+
+/*
+ * Grab the next record from the rmapbt.
+ *
+ * Advances @cur one record at a time until it finds a record that does not
+ * match the skip condition at the bottom of the loop (non-inode owner,
+ * internal inode owner, or an attr fork / bmbt block / unwritten mapping).
+ * Two kinds of record are harvested on the way past:
+ *  - XFS_RMAP_OWN_COW records are queued on rr->extlist as refcount-1
+ *    extents whose start is offset by XFS_REFC_COW_START;
+ *  - XFS_RMAP_OWN_REFC records are added to rr->btblocks and collected in
+ *    rr->btlist so the old btree blocks can be disposed of later.
+ *
+ * Returns 0 with *have_rec == true and *rec filled in when a record was
+ * found, 0 with *have_rec == false when the cursor ran off the end of the
+ * btree, or a negative errno.  An allocation failure while queueing a CoW
+ * staging extent returns -ENOMEM.
+ */
+STATIC int
+xfs_repair_refcountbt_next_rmap(
+	struct xfs_btree_cur		*cur,
+	struct xfs_repair_refc		*rr,
+	struct xfs_rmap_irec		*rec,
+	bool				*have_rec)
+{
+	struct xfs_rmap_irec		rmap;
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_repair_refc_extent	*rre;
+	xfs_fsblock_t			fsbno;
+	int				have_gt;
+	int				error = 0;
+
+	*have_rec = false;
+	/*
+	 * Loop through the remaining rmaps.  Remember CoW staging
+	 * extents and the refcountbt blocks from the old tree for later
+	 * disposal.  We can only share written data fork extents, so
+	 * keep looping until we find an rmap for one.
+	 */
+	do {
+		if (xfs_scrub_should_terminate(rr->sc, &error))
+			goto out_error;
+
+		error = xfs_btree_increment(cur, 0, &have_gt);
+		if (error)
+			goto out_error;
+		if (!have_gt)
+			return 0;
+
+		error = xfs_rmap_get_rec(cur, &rmap, &have_gt);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
+
+		if (rmap.rm_owner == XFS_RMAP_OWN_COW) {
+			/* Pass CoW staging extents right through. */
+			rre = kmem_alloc(sizeof(struct xfs_repair_refc_extent),
+					KM_MAYFAIL);
+			if (!rre) {
+				/*
+				 * error is still 0 here; without this the
+				 * caller would mistake OOM for end-of-btree.
+				 */
+				error = -ENOMEM;
+				goto out_error;
+			}
+
+			INIT_LIST_HEAD(&rre->list);
+			rre->refc.rc_startblock = rmap.rm_startblock +
+					XFS_REFC_COW_START;
+			rre->refc.rc_blockcount = rmap.rm_blockcount;
+			rre->refc.rc_refcount = 1;
+			list_add_tail(&rre->list, &rr->extlist);
+		} else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) {
+			/* refcountbt block, dump it when we're done. */
+			rr->btblocks += rmap.rm_blockcount;
+			fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+					cur->bc_private.a.agno,
+					rmap.rm_startblock);
+			error = xfs_repair_collect_btree_extent(rr->sc,
+					&rr->btlist, fsbno, rmap.rm_blockcount);
+			if (error)
+				goto out_error;
+		}
+	} while (XFS_RMAP_NON_INODE_OWNER(rmap.rm_owner) ||
+		 xfs_internal_inum(mp, rmap.rm_owner) ||
+		 (rmap.rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+				   XFS_RMAP_UNWRITTEN)));
+
+	*rec = rmap;
+	*have_rec = true;
+	return 0;
+
+out_error:
+	return error;
+}
+
+/*
+ * Hand out an rmap tracking structure, taking one off the idle list when
+ * one is available and allocating otherwise.  Returns NULL if the
+ * allocation fails.
+ */
+static struct xfs_repair_refc_rmap *
+xfs_repair_refcountbt_get_rmap(
+	struct xfs_repair_refc		*rr)
+{
+	struct xfs_repair_refc_rmap	*item;
+
+	/* Prefer whatever sits at the head of the idle list. */
+	if (!list_empty(&rr->rmap_idle)) {
+		item = list_first_entry(&rr->rmap_idle,
+				struct xfs_repair_refc_rmap, list);
+		list_del_init(&item->list);
+		return item;
+	}
+
+	/* Nothing idle, so make a fresh one. */
+	item = kmem_alloc(sizeof(struct xfs_repair_refc_rmap), KM_MAYFAIL);
+	if (item)
+		INIT_LIST_HEAD(&item->list);
+	return item;
+}
+
+/*
+ * List comparison callback: order two refcount extents by ascending
+ * rc_startblock.  Returns -1, 0 or 1.  @priv is unused.
+ */
+static int
+xfs_repair_refcount_extent_cmp(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_repair_refc_extent	*left;
+	struct xfs_repair_refc_extent	*right;
+
+	left = container_of(a, struct xfs_repair_refc_extent, list);
+	right = container_of(b, struct xfs_repair_refc_extent, list);
+
+	if (left->refc.rc_startblock < right->refc.rc_startblock)
+		return -1;
+	/* The comparison yields 1 when left sorts after right, else 0. */
+	return left->refc.rc_startblock > right->refc.rc_startblock;
+}
+
+/*
+ * Queue a reference count extent (agbno, len, refcount) at the tail of
+ * rr->extlist.  The extent is traced before the allocation is attempted.
+ * Returns 0 on success or -ENOMEM if the list entry cannot be allocated.
+ */
+STATIC int
+xfs_repair_refcountbt_new_refc(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_refc		*rr,
+	xfs_agblock_t			agbno,
+	xfs_extlen_t			len,
+	xfs_nlink_t			refcount)
+{
+	struct xfs_repair_refc_extent	*entry;
+	struct xfs_refcount_irec	irec = {
+		.rc_startblock		= agbno,
+		.rc_blockcount		= len,
+		.rc_refcount		= refcount,
+	};
+
+	trace_xfs_repair_refcount_extent_fn(sc->mp, sc->sa.agno, &irec);
+
+	entry = kmem_alloc(sizeof(struct xfs_repair_refc_extent), KM_MAYFAIL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->refc = irec;
+	INIT_LIST_HEAD(&entry->list);
+	list_add_tail(&entry->list, &rr->extlist);
+	return 0;
+}
+
+/*
+ * Iterate all the rmap records to generate reference count data.
+ *
+ * Walks the rmapbt of AG sc->sa.agno from the leftmost record, keeping the
+ * rmaps that cover the current block on rr->rmap_bag ("the stack").  Each
+ * time the number of tracked rmaps changes, and the previous count was
+ * greater than one, a refcount extent covering [cbno, nbno) with the
+ * previous count is queued via xfs_repair_refcountbt_new_refc() and
+ * rr->nr_records is incremented.
+ *
+ * Returns 0 on success, in which case rr->rmap_idle has been freed.
+ * Returns a negative errno on failure, including -ENOMEM when an rmap
+ * tracking structure cannot be obtained.  On failure this function only
+ * tears down its btree cursor; entries may remain on the rr lists.
+ */
+#define RMAP_NEXT(r)	((r).rm_startblock + (r).rm_blockcount)
+STATIC int
+xfs_repair_refcountbt_generate_refcounts(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_refc		*rr)
+{
+	struct xfs_rmap_irec		rmap;
+	struct xfs_btree_cur		*cur;
+	struct xfs_repair_refc_rmap	*rrm;
+	struct xfs_repair_refc_rmap	*n;
+	xfs_agblock_t			sbno;
+	xfs_agblock_t			cbno;
+	xfs_agblock_t			nbno;
+	size_t				old_stack_sz;
+	size_t				stack_sz = 0;
+	bool				have;
+	int				have_gt;
+	int				error;
+
+	/* Start the rmapbt cursor to the left of all records. */
+	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+			sc->sa.agno);
+	error = xfs_rmap_lookup_le(cur, 0, 0, 0, 0, 0, &have_gt);
+	if (error)
+		goto out;
+	ASSERT(have_gt == 0);
+
+	/* Process reverse mappings into refcount data. */
+	while (xfs_btree_has_more_records(cur)) {
+		/* Push all rmaps with pblk == sbno onto the stack */
+		error = xfs_repair_refcountbt_next_rmap(cur, rr, &rmap, &have);
+		if (error)
+			goto out;
+		if (!have)
+			break;
+		sbno = cbno = rmap.rm_startblock;
+		while (have && rmap.rm_startblock == sbno) {
+			rrm = xfs_repair_refcountbt_get_rmap(rr);
+			if (!rrm) {
+				/* error is 0 here; report the OOM. */
+				error = -ENOMEM;
+				goto out;
+			}
+			rrm->rmap = rmap;
+			list_add_tail(&rrm->list, &rr->rmap_bag);
+			stack_sz++;
+			error = xfs_repair_refcountbt_next_rmap(cur, rr, &rmap,
+					&have);
+			if (error)
+				goto out;
+		}
+		/* Step back over the record we read but did not push. */
+		error = xfs_btree_decrement(cur, 0, &have_gt);
+		if (error)
+			goto out;
+		XFS_WANT_CORRUPTED_GOTO(sc->mp, have_gt, out);
+
+		/* Set nbno to the bno of the next refcount change */
+		nbno = have ? rmap.rm_startblock : NULLAGBLOCK;
+		list_for_each_entry(rrm, &rr->rmap_bag, list)
+			nbno = min_t(xfs_agblock_t, nbno, RMAP_NEXT(rrm->rmap));
+
+		ASSERT(nbno > sbno);
+		old_stack_sz = stack_sz;
+
+		/* While stack isn't empty... */
+		while (stack_sz) {
+			/* Pop all rmaps that end at nbno */
+			list_for_each_entry_safe(rrm, n, &rr->rmap_bag, list) {
+				if (RMAP_NEXT(rrm->rmap) != nbno)
+					continue;
+				stack_sz--;
+				list_move(&rrm->list, &rr->rmap_idle);
+			}
+
+			/* Push array items that start at nbno */
+			error = xfs_repair_refcountbt_next_rmap(cur, rr, &rmap,
+					&have);
+			if (error)
+				goto out;
+			while (have && rmap.rm_startblock == nbno) {
+				rrm = xfs_repair_refcountbt_get_rmap(rr);
+				if (!rrm) {
+					/* error is 0 here; report the OOM. */
+					error = -ENOMEM;
+					goto out;
+				}
+				rrm->rmap = rmap;
+				list_add_tail(&rrm->list, &rr->rmap_bag);
+				stack_sz++;
+				error = xfs_repair_refcountbt_next_rmap(cur,
+						rr, &rmap, &have);
+				if (error)
+					goto out;
+			}
+			/* Step back over the record we read but did not push. */
+			error = xfs_btree_decrement(cur, 0, &have_gt);
+			if (error)
+				goto out;
+			XFS_WANT_CORRUPTED_GOTO(sc->mp, have_gt, out);
+
+			/* Emit refcount if necessary */
+			ASSERT(nbno > cbno);
+			if (stack_sz != old_stack_sz) {
+				if (old_stack_sz > 1) {
+					error = xfs_repair_refcountbt_new_refc(
+							sc, rr, cbno,
+							nbno - cbno,
+							old_stack_sz);
+					if (error)
+						goto out;
+					rr->nr_records++;
+				}
+				cbno = nbno;
+			}
+
+			/* Stack empty, go find the next rmap */
+			if (stack_sz == 0)
+				break;
+			old_stack_sz = stack_sz;
+			sbno = nbno;
+
+			/* Set nbno to the bno of the next refcount change */
+			nbno = have ? rmap.rm_startblock : NULLAGBLOCK;
+			list_for_each_entry(rrm, &rr->rmap_bag, list)
+				nbno = min_t(xfs_agblock_t, nbno,
+						RMAP_NEXT(rrm->rmap));
+
+			ASSERT(nbno > sbno);
+		}
+	}
+
+	/* Free all the leftover rmap records. */
+	list_for_each_entry_safe(rrm, n, &rr->rmap_idle, list) {
+		list_del(&rrm->list);
+		kmem_free(rrm);
+	}
+
+	ASSERT(list_empty(&rr->rmap_bag));
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+out:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+#undef RMAP_NEXT
+
+/*
+ * Rebuild the refcount btree.
+ *
+ * Steps, in order: generate the list of refcount records from the rmapbt,
+ * check that the AG has room for the new tree, invalidate the old
+ * refcountbt blocks, allocate and log a new single-block root in the AGF,
+ * insert the sorted records one at a time (rolling the transaction after
+ * each insert), and finally reap the old btree blocks.
+ *
+ * Returns -EOPNOTSUPP if the filesystem has no rmapbt, -ENOSPC if the AG
+ * lacks space, or another negative errno from the helpers.  On the error
+ * path every list entry still held in @rr is freed.
+ */
+int
+xfs_repair_refcountbt(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_repair_refc rr;
+ struct xfs_owner_info oinfo;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_repair_refc_rmap *rrm;
+ struct xfs_repair_refc_rmap *n;
+ struct xfs_repair_refc_extent *rre;
+ struct xfs_repair_refc_extent *o;
+ struct xfs_buf *bp = NULL;
+ struct xfs_agf *agf;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_perag *pag;
+ xfs_fsblock_t btfsb;
+ int have_gt;
+ int error = 0;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ INIT_LIST_HEAD(&rr.rmap_bag);
+ INIT_LIST_HEAD(&rr.rmap_idle);
+ INIT_LIST_HEAD(&rr.extlist);
+ xfs_repair_init_extent_list(&rr.btlist);
+ rr.btblocks = 0;
+ rr.sc = sc;
+ rr.nr_records = 0;
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
+
+ error = xfs_repair_refcountbt_generate_refcounts(sc, &rr);
+ if (error)
+ goto out;
+
+ /*
+ * Do we actually have enough space to do this?
+ *
+ * NOTE(review): rr.nr_records is only incremented for the shared
+ * extents emitted by generate_refcounts; the CoW staging records that
+ * next_rmap adds to rr.extlist are not counted.  Confirm whether the
+ * size estimate is meant to exclude them.
+ */
+ pag = xfs_perag_get(mp, sc->sa.agno);
+ if (!xfs_repair_ag_has_space(pag,
+ xfs_refcountbt_calc_size(mp, rr.nr_records),
+ XFS_AG_RESV_METADATA)) {
+ xfs_perag_put(pag);
+ error = -ENOSPC;
+ goto out;
+ }
+ xfs_perag_put(pag);
+
+ /* Invalidate all the refcountbt blocks in btlist. */
+ error = xfs_repair_invalidate_blocks(sc, &rr.btlist);
+ if (error)
+ goto out;
+
+ agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ /* Initialize a new btree root. */
+ error = xfs_repair_alloc_ag_block(sc, &oinfo, &btfsb,
+ XFS_AG_RESV_METADATA);
+ if (error)
+ goto out;
+ error = xfs_repair_init_btblock(sc, btfsb, &bp, XFS_BTNUM_REFC,
+ &xfs_refcountbt_buf_ops);
+ if (error)
+ goto out;
+ /* Point the AGF at the new one-block, one-level tree and log it. */
+ agf->agf_refcount_root = cpu_to_be32(XFS_FSB_TO_AGBNO(mp, btfsb));
+ agf->agf_refcount_level = cpu_to_be32(1);
+ agf->agf_refcount_blocks = cpu_to_be32(1);
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_REFCOUNT_BLOCKS |
+ XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL);
+ error = xfs_repair_roll_ag_trans(sc);
+ if (error)
+ goto out;
+
+ /* Insert records into the new btree. */
+ list_sort(NULL, &rr.extlist, xfs_repair_refcount_extent_cmp);
+ list_for_each_entry_safe(rre, o, &rr.extlist, list) {
+ /* Insert into the refcountbt. */
+ cur = xfs_refcountbt_init_cursor(mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.agno, NULL);
+ error = xfs_refcount_lookup_eq(cur, rre->refc.rc_startblock,
+ &have_gt);
+ if (error)
+ goto out;
+ /* A record at this start block must not exist yet. */
+ XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 0, out);
+ error = xfs_refcount_insert(cur, &rre->refc, &have_gt);
+ if (error)
+ goto out;
+ XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ /* Cleared so the error path does not delete the cursor twice. */
+ cur = NULL;
+
+ error = xfs_repair_roll_ag_trans(sc);
+ if (error)
+ goto out;
+
+ list_del(&rre->list);
+ kmem_free(rre);
+ }
+
+ /* Free the old refcountbt blocks if they're not in use. */
+ return xfs_repair_reap_btree_extents(sc, &rr.btlist, &oinfo,
+ XFS_AG_RESV_METADATA);
+out:
+ if (cur)
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ xfs_repair_cancel_btree_extents(sc, &rr.btlist);
+ /* Release whatever is still queued on any of the working lists. */
+ list_for_each_entry_safe(rrm, n, &rr.rmap_idle, list) {
+ list_del(&rrm->list);
+ kmem_free(rrm);
+ }
+ list_for_each_entry_safe(rrm, n, &rr.rmap_bag, list) {
+ list_del(&rrm->list);
+ kmem_free(rrm);
+ }
+ list_for_each_entry_safe(rre, o, &rr.extlist, list) {
+ list_del(&rre->list);
+ kmem_free(rre);
+ }
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index d45a542e1e2b..d663fc4c464d 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -115,6 +115,7 @@ int xfs_repair_agi(struct xfs_scrub_context *sc);
int xfs_repair_allocbt(struct xfs_scrub_context *sc);
int xfs_repair_iallocbt(struct xfs_scrub_context *sc);
int xfs_repair_rmapbt(struct xfs_scrub_context *sc);
+int xfs_repair_refcountbt(struct xfs_scrub_context *sc);
#else
@@ -170,6 +171,7 @@ static inline int xfs_repair_rmapbt_setup(
#define xfs_repair_allocbt xfs_repair_notsupported
#define xfs_repair_iallocbt xfs_repair_notsupported
#define xfs_repair_rmapbt xfs_repair_notsupported
+#define xfs_repair_refcountbt xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 8a7d4b2288d8..aa35d0384d26 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -283,7 +283,7 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.setup = xfs_scrub_setup_ag_refcountbt,
.scrub = xfs_scrub_refcountbt,
.has = xfs_sb_version_hasreflink,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_refcountbt,
},
[XFS_SCRUB_TYPE_INODE] = { /* inode record */
.type = ST_INODE,
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 15/22] xfs: repair inode records
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (13 preceding siblings ...)
2018-05-15 22:35 ` [PATCH 14/22] xfs: repair refcount btrees Darrick J. Wong
@ 2018-05-15 22:35 ` Darrick J. Wong
2018-05-15 22:35 ` [PATCH 16/22] xfs: zap broken inode forks Darrick J. Wong
` (7 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:35 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Try to reinitialize corrupt inodes, or clear the reflink flag
if it's not needed.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/Makefile | 1
fs/xfs/scrub/inode_repair.c | 392 +++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 2
fs/xfs/scrub/scrub.c | 2
4 files changed, 396 insertions(+), 1 deletion(-)
create mode 100644 fs/xfs/scrub/inode_repair.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 36ad73145c25..b0f25bf07207 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -177,6 +177,7 @@ xfs-y += $(addprefix scrub/, \
agheader_repair.o \
alloc_repair.o \
ialloc_repair.o \
+ inode_repair.o \
refcount_repair.o \
repair.o \
rmap_repair.o \
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
new file mode 100644
index 000000000000..90208a58a1d1
--- /dev/null
+++ b/fs/xfs/scrub/inode_repair.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_ialloc.h"
+#include "xfs_da_format.h"
+#include "xfs_reflink.h"
+#include "xfs_rmap.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_dir2.h"
+#include "xfs_quota_defs.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Make sure this buffer can pass the inode buffer verifier.
+ *
+ * Visits every inode slot in @bp.  A slot is left alone when its magic and
+ * version are good and its di_next_unlinked is either NULLAGINO or a valid
+ * agino for this AG.  Otherwise the magic is rewritten, the version is set
+ * to 3, a bad unlinked pointer is reset to NULLAGINO, the CRC is
+ * recalculated, and the start of the slot is logged in sc->tp.
+ */
+STATIC void
+xfs_repair_inode_buf(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_dinode *dip;
+ xfs_agnumber_t agno;
+ xfs_agino_t agino;
+ int ioff;
+ int i;
+ int ni;
+ int di_ok;
+ bool unlinked_ok;
+
+ /* Number of inode slots held by this buffer. */
+ ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+ agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
+ for (i = 0; i < ni; i++) {
+ /* Byte offset of slot i within the buffer. */
+ ioff = i << mp->m_sb.sb_inodelog;
+ dip = xfs_buf_offset(bp, ioff);
+ agino = be32_to_cpu(dip->di_next_unlinked);
+ unlinked_ok = (agino == NULLAGINO ||
+ xfs_verify_agino(sc->mp, agno, agino));
+ di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ xfs_dinode_good_version(mp, dip->di_version);
+ if (di_ok && unlinked_ok)
+ continue;
+ /* Note: the version is forced to 3 even if only the unlinked pointer was bad. */
+ dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ dip->di_version = 3;
+ if (!unlinked_ok)
+ dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+ xfs_trans_log_buf(tp, bp, ioff, ioff + sizeof(*dip) - 1);
+ }
+}
+
+/*
+ * Inode didn't pass verifiers, so fix the raw buffer and retry iget.
+ *
+ * Maps and reads the inode cluster buffer for sc->sm->sm_ino, patches up
+ * the buffer and the on-disk inode core fields, commits the transaction,
+ * then loads the in-core inode into sc->ip, takes the IOLOCK, MMAPLOCK and
+ * ILOCK exclusively (recorded in sc->ilock_flags) and allocates a fresh
+ * scrub transaction.
+ *
+ * Returns 0 on success or a negative errno.  sc->tp is always cleared once
+ * xfs_trans_commit() has been called, whether or not the commit succeeded,
+ * so that the caller never sees a pointer to a transaction that has
+ * already been handed to commit.
+ */
+STATIC int
+xfs_repair_inode_core(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_imap			imap;
+	struct xfs_buf			*bp;
+	struct xfs_dinode		*dip;
+	xfs_ino_t			ino;
+	uint64_t			flags2;
+	uint16_t			flags;
+	uint16_t			mode;
+	int				error;
+
+	/* Map & read inode. */
+	ino = sc->sm->sm_ino;
+	error = xfs_imap(sc->mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED);
+	if (error)
+		return error;
+
+	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
+			imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp, NULL);
+	if (error)
+		return error;
+
+	/* Make sure we can pass the inode buffer verifier. */
+	xfs_repair_inode_buf(sc, bp);
+	bp->b_ops = &xfs_inode_buf_ops;
+
+	/* Fix everything the verifier will complain about. */
+	dip = xfs_buf_offset(bp, imap.im_boffset);
+	mode = be16_to_cpu(dip->di_mode);
+	if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) {
+		/* bad mode, so we set it to a file that only root can read */
+		mode = S_IFREG;
+		dip->di_mode = cpu_to_be16(mode);
+		dip->di_uid = 0;
+		dip->di_gid = 0;
+	}
+	dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+	if (!xfs_dinode_good_version(sc->mp, dip->di_version))
+		dip->di_version = 3;
+	dip->di_ino = cpu_to_be64(ino);
+	uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
+	flags = be16_to_cpu(dip->di_flags);
+	flags2 = be64_to_cpu(dip->di_flags2);
+	/*
+	 * Set the reflink flag on regular files of reflink filesystems;
+	 * clear it and the cowextsize flag otherwise.
+	 */
+	if (xfs_sb_version_hasreflink(&sc->mp->m_sb) && S_ISREG(mode))
+		flags2 |= XFS_DIFLAG2_REFLINK;
+	else
+		flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
+	if (flags & XFS_DIFLAG_REALTIME)
+		flags2 &= ~XFS_DIFLAG2_REFLINK;
+	if (flags2 & XFS_DIFLAG2_REFLINK)
+		flags2 &= ~XFS_DIFLAG2_DAX;
+	dip->di_flags = cpu_to_be16(flags);
+	dip->di_flags2 = cpu_to_be64(flags2);
+	dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
+	/* Clamp a size with the top bit set to the largest positive value. */
+	if (be64_to_cpu(dip->di_size) & (1ULL << 63))
+		dip->di_size = cpu_to_be64((1ULL << 63) - 1);
+
+	/* Write out the inode... */
+	xfs_dinode_calc_crc(sc->mp, dip);
+	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
+	xfs_trans_log_buf(sc->tp, bp, imap.im_boffset,
+			imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);
+	error = xfs_trans_commit(sc->tp);
+	/* The transaction is gone either way; don't leave a stale pointer. */
+	sc->tp = NULL;
+	if (error)
+		return error;
+
+	/* ...and reload it? */
+	error = xfs_iget(sc->mp, sc->tp, ino,
+			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &sc->ip);
+	if (error)
+		return error;
+	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	xfs_ilock(sc->ip, sc->ilock_flags);
+	error = xfs_scrub_trans_alloc(sc, 0);
+	if (error)
+		return error;
+	sc->ilock_flags |= XFS_ILOCK_EXCL;
+	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+	return 0;
+}
+
+/*
+ * Fix di_extsize hint.  If xfs_inode_validate_extsize() objects to the
+ * current hint, zero it and drop the EXTSIZE and EXTSZINHERIT flags.
+ */
+STATIC void
+xfs_repair_inode_extsize(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_inode		*ip = sc->ip;
+	xfs_failaddr_t			bad;
+
+	bad = xfs_inode_validate_extsize(sc->mp, ip->i_d.di_extsize,
+			VFS_I(ip)->i_mode, ip->i_d.di_flags);
+	if (bad) {
+		ip->i_d.di_extsize = 0;
+		ip->i_d.di_flags &= ~(XFS_DIFLAG_EXTSIZE |
+				      XFS_DIFLAG_EXTSZINHERIT);
+	}
+}
+
+/*
+ * Fix di_cowextsize hint.  Inodes older than version 3 are left alone.
+ * If xfs_inode_validate_cowextsize() objects to the current hint, zero it
+ * and drop the COWEXTSIZE flag.
+ */
+STATIC void
+xfs_repair_inode_cowextsize(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_inode		*ip = sc->ip;
+	xfs_failaddr_t			bad;
+
+	if (ip->i_d.di_version >= 3) {
+		bad = xfs_inode_validate_cowextsize(sc->mp,
+				ip->i_d.di_cowextsize, VFS_I(ip)->i_mode,
+				ip->i_d.di_flags, ip->i_d.di_flags2);
+		if (bad) {
+			ip->i_d.di_cowextsize = 0;
+			ip->i_d.di_flags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+		}
+	}
+}
+
+/*
+ * Fix inode flags.
+ *
+ * Operates on the in-core sc->ip->i_d.di_flags:
+ *  - strips every bit outside XFS_DIFLAG_ANY (unknown/junk flags);
+ *  - sets NEWRTBM only on the inode whose number is sb_rbmino;
+ *  - strips the directory-only inherit/nosymlinks flags from
+ *    non-directories;
+ *  - strips REALTIME and EXTSIZE from non-regular files;
+ *  - strips FILESTREAM from realtime files.
+ */
+STATIC void
+xfs_repair_inode_flags(
+	struct xfs_scrub_context	*sc)
+{
+	uint16_t			mode;
+
+	mode = VFS_I(sc->ip)->i_mode;
+
+	/* Keep only the known flags; mask with ANY, not its complement. */
+	sc->ip->i_d.di_flags &= XFS_DIFLAG_ANY;
+
+	if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
+		sc->ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+	else
+		sc->ip->i_d.di_flags &= ~XFS_DIFLAG_NEWRTBM;
+
+	if (!S_ISDIR(mode))
+		sc->ip->i_d.di_flags &= ~(XFS_DIFLAG_RTINHERIT |
+					  XFS_DIFLAG_EXTSZINHERIT |
+					  XFS_DIFLAG_PROJINHERIT |
+					  XFS_DIFLAG_NOSYMLINKS);
+	if (!S_ISREG(mode))
+		sc->ip->i_d.di_flags &= ~(XFS_DIFLAG_REALTIME |
+					  XFS_DIFLAG_EXTSIZE);
+
+	if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
+		sc->ip->i_d.di_flags &= ~XFS_DIFLAG_FILESTREAM;
+}
+
+/*
+ * Fix inode flags2.
+ *
+ * Does nothing for inodes older than version 3.  Otherwise, on the in-core
+ * sc->ip->i_d.di_flags2:
+ *  - strips every bit outside XFS_DIFLAG2_ANY (unknown/junk flags);
+ *  - strips REFLINK if the filesystem lacks reflink support, the inode is
+ *    not a regular file, or the inode is realtime;
+ *  - strips DAX from anything that is neither a regular file nor a
+ *    directory, and from inodes that still have REFLINK set.
+ */
+STATIC void
+xfs_repair_inode_flags2(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_mount		*mp = sc->mp;
+	uint16_t			mode;
+
+	if (sc->ip->i_d.di_version < 3)
+		return;
+
+	mode = VFS_I(sc->ip)->i_mode;
+
+	/* Keep only the known flags; mask with ANY, not its complement. */
+	sc->ip->i_d.di_flags2 &= XFS_DIFLAG2_ANY;
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb) ||
+	    !S_ISREG(mode))
+		sc->ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+
+	if (!(S_ISREG(mode) || S_ISDIR(mode)))
+		sc->ip->i_d.di_flags2 &= ~XFS_DIFLAG2_DAX;
+
+	if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
+		sc->ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+
+	if (sc->ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)
+		sc->ip->i_d.di_flags2 &= ~XFS_DIFLAG2_DAX;
+}
+
+/*
+ * Repair an inode's fields.
+ *
+ * Requires a CRC-enabled filesystem (-EOPNOTSUPP otherwise).  If no in-core
+ * inode could be loaded (sc->ip is NULL), the on-disk core is repaired
+ * first via xfs_repair_inode_core() and a quotacheck is forced for every
+ * enabled quota type.  The in-core timestamps, size, flags, extent and
+ * block counts, uid/gid and extent size hints are then corrected, logged,
+ * and the transaction rolled.  Finally, for reflink inodes,
+ * xfs_reflink_clear_inode_flag() is called.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int
+xfs_repair_inode(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip;
+ xfs_filblks_t count;
+ xfs_filblks_t acount;
+ xfs_extnum_t nextents;
+ uint16_t flags;
+ int error = 0;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ /* Skip inode core repair if we're here only for preening. */
+ if (sc->ip &&
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_PREEN) &&
+ !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) &&
+ !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ goto preen_only;
+ }
+
+ /* No in-core inode: fix the on-disk core so that iget can load it. */
+ if (!sc->ip) {
+ error = xfs_repair_inode_core(sc);
+ if (error)
+ goto out;
+ if (XFS_IS_UQUOTA_ON(mp))
+ xfs_repair_force_quotacheck(sc, XFS_DQ_USER);
+ if (XFS_IS_GQUOTA_ON(mp))
+ xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP);
+ if (XFS_IS_PQUOTA_ON(mp))
+ xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ);
+ }
+ ASSERT(sc->ip);
+
+ ip = sc->ip;
+ xfs_trans_ijoin(sc->tp, ip, 0);
+
+ /* di_[acm]time.nsec */
+ if ((unsigned long)VFS_I(ip)->i_atime.tv_nsec >= NSEC_PER_SEC)
+ VFS_I(ip)->i_atime.tv_nsec = 0;
+ if ((unsigned long)VFS_I(ip)->i_mtime.tv_nsec >= NSEC_PER_SEC)
+ VFS_I(ip)->i_mtime.tv_nsec = 0;
+ if ((unsigned long)VFS_I(ip)->i_ctime.tv_nsec >= NSEC_PER_SEC)
+ VFS_I(ip)->i_ctime.tv_nsec = 0;
+ if (ip->i_d.di_version > 2 &&
+ (unsigned long)ip->i_d.di_crtime.t_nsec >= NSEC_PER_SEC)
+ ip->i_d.di_crtime.t_nsec = 0;
+
+ /* di_size: zeroed for anything but directories, regular files and symlinks */
+ if (!S_ISDIR(VFS_I(ip)->i_mode) && !S_ISREG(VFS_I(ip)->i_mode) &&
+ !S_ISLNK(VFS_I(ip)->i_mode)) {
+ i_size_write(VFS_I(ip), 0);
+ ip->i_d.di_size = 0;
+ }
+
+ /* di_flags: resolve mutually exclusive flag pairs */
+ flags = ip->i_d.di_flags;
+ if ((flags & XFS_DIFLAG_IMMUTABLE) && (flags & XFS_DIFLAG_APPEND))
+ flags &= ~XFS_DIFLAG_APPEND;
+
+ if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME))
+ flags &= ~XFS_DIFLAG_FILESTREAM;
+ ip->i_d.di_flags = flags;
+
+ /* di_nblocks/di_nextents/di_anextents */
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
+ &nextents, &count);
+ if (error)
+ goto out;
+ ip->i_d.di_nextents = nextents;
+
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
+ &nextents, &acount);
+ if (error)
+ goto out;
+ ip->i_d.di_anextents = nextents;
+
+ ip->i_d.di_nblocks = count + acount;
+ /* No attr fork offset means there cannot be attr extents. */
+ if (ip->i_d.di_anextents != 0 && ip->i_d.di_forkoff == 0)
+ ip->i_d.di_anextents = 0;
+
+ /* Invalid uid/gid? */
+ if (ip->i_d.di_uid == -1U) {
+ ip->i_d.di_uid = 0;
+ VFS_I(ip)->i_mode &= ~(S_ISUID | S_ISGID);
+ if (XFS_IS_UQUOTA_ON(mp))
+ xfs_repair_force_quotacheck(sc, XFS_DQ_USER);
+ }
+ if (ip->i_d.di_gid == -1U) {
+ ip->i_d.di_gid = 0;
+ VFS_I(ip)->i_mode &= ~(S_ISUID | S_ISGID);
+ if (XFS_IS_GQUOTA_ON(mp))
+ xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP);
+ }
+
+ /* Invalid flags? */
+ xfs_repair_inode_flags(sc);
+ xfs_repair_inode_flags2(sc);
+
+ /* Invalid extent size hints? */
+ xfs_repair_inode_extsize(sc);
+ xfs_repair_inode_cowextsize(sc);
+
+ /* Commit inode core changes. */
+ xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
+ error = xfs_trans_roll_inode(&sc->tp, ip);
+ if (error)
+ goto out;
+
+preen_only:
+ /* Inode must be _trans_ijoin'd here */
+ if (xfs_is_reflink_inode(sc->ip))
+ return xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
+
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index d663fc4c464d..8d13986ae824 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -116,6 +116,7 @@ int xfs_repair_allocbt(struct xfs_scrub_context *sc);
int xfs_repair_iallocbt(struct xfs_scrub_context *sc);
int xfs_repair_rmapbt(struct xfs_scrub_context *sc);
int xfs_repair_refcountbt(struct xfs_scrub_context *sc);
+int xfs_repair_inode(struct xfs_scrub_context *sc);
#else
@@ -172,6 +173,7 @@ static inline int xfs_repair_rmapbt_setup(
#define xfs_repair_iallocbt xfs_repair_notsupported
#define xfs_repair_rmapbt xfs_repair_notsupported
#define xfs_repair_refcountbt xfs_repair_notsupported
+#define xfs_repair_inode xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index aa35d0384d26..d9b1511cb0d3 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -289,7 +289,7 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_INODE,
.setup = xfs_scrub_setup_inode,
.scrub = xfs_scrub_inode,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_inode,
},
[XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */
.type = ST_INODE,
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 16/22] xfs: zap broken inode forks
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (14 preceding siblings ...)
2018-05-15 22:35 ` [PATCH 15/22] xfs: repair inode records Darrick J. Wong
@ 2018-05-15 22:35 ` Darrick J. Wong
2018-05-15 22:35 ` [PATCH 17/22] xfs: repair inode block maps Darrick J. Wong
` (6 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:35 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Determine if inode fork damage is responsible for the inode being unable
to pass the ifork verifiers in xfs_iget and zap the fork contents if
this is true. Once this is done the fork will be empty but we'll be
able to construct an in-core inode, and a subsequent call to the inode
fork repair ioctl will search the rmapbt to rebuild the records that
were in the fork.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/libxfs/xfs_attr_leaf.c | 32 ++-
fs/xfs/libxfs/xfs_attr_leaf.h | 2
fs/xfs/libxfs/xfs_bmap.c | 21 ++
fs/xfs/libxfs/xfs_bmap.h | 2
fs/xfs/scrub/inode_repair.c | 393 +++++++++++++++++++++++++++++++++++++++++
5 files changed, 433 insertions(+), 17 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2135b8e67dcc..01ce59a4fc92 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -889,23 +889,16 @@ xfs_attr_shortform_allfit(
return xfs_attr_shortform_bytesfit(dp, bytes);
}
-/* Verify the consistency of an inline attribute fork. */
+/* Verify the consistency of a raw inline attribute fork. */
xfs_failaddr_t
-xfs_attr_shortform_verify(
- struct xfs_inode *ip)
+xfs_attr_shortform_verify_struct(
+ struct xfs_attr_shortform *sfp,
+ size_t size)
{
- struct xfs_attr_shortform *sfp;
struct xfs_attr_sf_entry *sfep;
struct xfs_attr_sf_entry *next_sfep;
char *endp;
- struct xfs_ifork *ifp;
int i;
- int size;
-
- ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL);
- ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
- sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- size = ifp->if_bytes;
/*
* Give up if the attribute is way too short.
@@ -963,6 +956,23 @@ xfs_attr_shortform_verify(
return NULL;
}
+/* Verify the consistency of an inline attribute fork. */
+xfs_failaddr_t
+xfs_attr_shortform_verify(
+ struct xfs_inode *ip)
+{
+ struct xfs_attr_shortform *sfp;
+ struct xfs_ifork *ifp;
+ int size;
+
+ ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL);
+ ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
+ sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ size = ifp->if_bytes;
+
+ return xfs_attr_shortform_verify_struct(sfp, size);
+}
+
/*
* Convert a leaf attribute list to shortform attribute list
*/
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 4da08af5b134..e5b4102772c1 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -53,6 +53,8 @@ int xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
int xfs_attr_shortform_remove(struct xfs_da_args *args);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
+xfs_failaddr_t xfs_attr_shortform_verify_struct(struct xfs_attr_shortform *sfp,
+ size_t size);
xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip);
void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 7b0e2b551e23..16d17e6a16d2 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6187,18 +6187,16 @@ xfs_bmap_finish_one(
return error;
}
-/* Check that an inode's extent does not have invalid flags or bad ranges. */
+/* Check that an extent does not have invalid flags or bad ranges. */
xfs_failaddr_t
-xfs_bmap_validate_extent(
- struct xfs_inode *ip,
+xfs_bmbt_validate_extent(
+ struct xfs_mount *mp,
+ bool isrt,
int whichfork,
struct xfs_bmbt_irec *irec)
{
- struct xfs_mount *mp = ip->i_mount;
xfs_fsblock_t endfsb;
- bool isrt;
- isrt = XFS_IS_REALTIME_INODE(ip);
endfsb = irec->br_startblock + irec->br_blockcount - 1;
if (isrt) {
if (!xfs_verify_rtbno(mp, irec->br_startblock))
@@ -6222,3 +6220,14 @@ xfs_bmap_validate_extent(
}
return NULL;
}
+
+/* Check that an inode's extent does not have invalid flags or bad ranges. */
+xfs_failaddr_t
+xfs_bmap_validate_extent(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *irec)
+{
+ return xfs_bmbt_validate_extent(ip->i_mount, XFS_IS_REALTIME_INODE(ip),
+ whichfork, irec);
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 2c233f9f1a26..3b9a83e054c9 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -294,6 +294,8 @@ static inline int xfs_bmap_fork_to_state(int whichfork)
}
}
+xfs_failaddr_t xfs_bmbt_validate_extent(struct xfs_mount *mp, bool isrt,
+ int whichfork, struct xfs_bmbt_irec *irec);
xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *irec);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 90208a58a1d1..da37e04bc4df 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -36,11 +36,15 @@
#include "xfs_ialloc.h"
#include "xfs_da_format.h"
#include "xfs_reflink.h"
+#include "xfs_alloc.h"
#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_quota_defs.h"
+#include "xfs_attr_leaf.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -87,11 +91,387 @@ xfs_repair_inode_buf(
}
}
+struct xfs_repair_inode_fork_counters {
+ struct xfs_scrub_context *sc;
+ xfs_rfsblock_t data_blocks;
+ xfs_rfsblock_t rt_blocks;
+ xfs_rfsblock_t attr_blocks;
+ xfs_extnum_t data_extents;
+ xfs_extnum_t rt_extents;
+ xfs_aextnum_t attr_extents;
+};
+
+/*
+ * Count extents and blocks for an inode given an rmap.
+ *
+ * Passed to xfs_rmap_query_all as the per-record callback; @priv is the
+ * struct xfs_repair_inode_fork_counters being filled in. Always returns 0.
+ */
+STATIC int
+xfs_repair_inode_count_rmap(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_repair_inode_fork_counters *rifc = priv;
+
+ /* Skip rmaps that are not owned by the inode being repaired. */
+ if (rec->rm_owner != rifc->sc->sm->sm_ino)
+ return 0;
+ /*
+ * Every mapped block is added to the fork's block count, but records
+ * flagged XFS_RMAP_BMBT_BLOCK are not counted as extents.
+ */
+ if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
+ rifc->attr_blocks += rec->rm_blockcount;
+ if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ rifc->attr_extents++;
+ } else {
+ rifc->data_blocks += rec->rm_blockcount;
+ if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ rifc->data_extents++;
+ }
+ return 0;
+}
+
+/* Count extents and blocks for an inode from all AG rmap data. */
+STATIC int
+xfs_repair_inode_count_ag_rmaps(
+ struct xfs_repair_inode_fork_counters *rifc,
+ xfs_agnumber_t agno)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf;
+ int error;
+
+ error = xfs_alloc_read_agf(rifc->sc->mp, rifc->sc->tp, agno, 0, &agf);
+ if (error)
+ return error;
+
+ cur = xfs_rmapbt_init_cursor(rifc->sc->mp, rifc->sc->tp, agf, agno);
+ if (!cur) {
+ error = -ENOMEM;
+ goto out_agf;
+ }
+
+ error = xfs_rmap_query_all(cur, xfs_repair_inode_count_rmap, rifc);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ error = 0;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+out_agf:
+ xfs_trans_brelse(rifc->sc->tp, agf);
+ return error;
+}
+
+/* Count extents and blocks for a given inode from all rmap data. */
+STATIC int
+xfs_repair_inode_count_rmaps(
+ struct xfs_repair_inode_fork_counters *rifc)
+{
+ xfs_agnumber_t agno;
+ int error;
+
+ if (!xfs_sb_version_hasrmapbt(&rifc->sc->mp->m_sb) ||
+ xfs_sb_version_hasrealtime(&rifc->sc->mp->m_sb))
+ return -EOPNOTSUPP;
+
+ /* XXX: find rt blocks too */
+
+ for (agno = 0; agno < rifc->sc->mp->m_sb.sb_agcount; agno++) {
+ error = xfs_repair_inode_count_ag_rmaps(rifc, agno);
+ if (error)
+ return error;
+ }
+
+ /* Can't have extents on both the rt and the data device. */
+ if (rifc->data_extents && rifc->rt_extents)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/*
+ * Figure out if we need to zap this extents format fork.
+ *
+ * Returns true if the on-disk extent count cannot fit in @dfork_size bytes
+ * of fork area, or if any extent record in the fork fails
+ * xfs_bmbt_validate_extent.  Returns false if the fork looks usable.
+ */
+STATIC bool
+xfs_repair_inode_core_check_extents_fork(
+	struct xfs_scrub_context	*sc,
+	struct xfs_dinode		*dip,
+	int				dfork_size,
+	int				whichfork)
+{
+	struct xfs_bmbt_irec		new;
+	struct xfs_bmbt_rec		*dp;
+	bool				isrt;
+	int				i;
+	int				nex;
+
+	/*
+	 * The extent count comes straight from an inode that already failed
+	 * verification, so bound it by dividing the fork size rather than
+	 * multiplying the count by the record size.  The product can wrap
+	 * when squeezed into an int, which would let an oversized count pass
+	 * the check and send the loop below past the end of the fork area.
+	 */
+	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+	if (nex < 0 || dfork_size < 0 ||
+	    nex > dfork_size / (int)sizeof(struct xfs_bmbt_rec))
+		return true;
+	dp = (struct xfs_bmbt_rec *)XFS_DFORK_PTR(dip, whichfork);
+
+	/* Validate each on-disk record after unpacking it. */
+	isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
+	for (i = 0; i < nex; i++, dp++) {
+		xfs_failaddr_t	fa;
+
+		xfs_bmbt_disk_get_all(dp, &new);
+		fa = xfs_bmbt_validate_extent(sc->mp, isrt, whichfork, &new);
+		if (fa)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Figure out if we need to zap this btree format fork.
+ *
+ * Returns true if the extent count is small enough that the records would
+ * have fit inline in @dfork_size bytes, or if the bmbt root header in the
+ * fork has an impossible record count or level.  Returns false otherwise.
+ */
+STATIC bool
+xfs_repair_inode_core_check_btree_fork(
+	struct xfs_scrub_context	*sc,
+	struct xfs_dinode		*dip,
+	int				dfork_size,
+	int				whichfork)
+{
+	struct xfs_bmdr_block		*dfp;
+	int				nrecs;
+	int				level;
+
+	/*
+	 * Inline capacity is measured in on-disk records (struct
+	 * xfs_bmbt_rec), the same unit the extents format check uses.
+	 * struct xfs_bmbt_irec is the unpacked in-core form and must not be
+	 * used to size the fork area.
+	 */
+	if (XFS_DFORK_NEXTENTS(dip, whichfork) <=
+			dfork_size / sizeof(struct xfs_bmbt_rec))
+		return true;
+
+	dfp = (struct xfs_bmdr_block *)XFS_DFORK_PTR(dip, whichfork);
+	nrecs = be16_to_cpu(dfp->bb_numrecs);
+	level = be16_to_cpu(dfp->bb_level);
+
+	/* The root must hold at least one record and fit in the fork. */
+	if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
+		return true;
+	/* A bmbt root is never a leaf, and the tree height is bounded. */
+	if (level == 0 || level > XFS_BTREE_MAXLEVELS)
+		return true;
+	return false;
+}
+
+/*
+ * Check the data fork for things that will fail the ifork verifiers or the
+ * ifork formatters.
+ */
+STATIC bool
+xfs_repair_inode_core_check_data_fork(
+ struct xfs_scrub_context *sc,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ uint64_t size;
+ int dfork_size;
+
+ size = be64_to_cpu(dip->di_size);
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ if (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK) != XFS_DINODE_FMT_DEV)
+ return true;
+ break;
+ case S_IFREG:
+ case S_IFLNK:
+ case S_IFDIR:
+ switch (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK)) {
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ break;
+ default:
+ return true;
+ }
+ break;
+ default:
+ return true;
+ }
+ dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
+ switch (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK)) {
+ case XFS_DINODE_FMT_DEV:
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ if (size > dfork_size)
+ return true;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (xfs_repair_inode_core_check_extents_fork(sc, dip,
+ dfork_size, XFS_DATA_FORK))
+ return true;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (xfs_repair_inode_core_check_btree_fork(sc, dip,
+ dfork_size, XFS_DATA_FORK))
+ return true;
+ break;
+ default:
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Reset the data fork to something sane.
+ *
+ * Rewrites the data fork format/size fields of the raw on-disk inode @dip
+ * based on the file type in @mode and the rmap-derived counts in @rifc.
+ * File types not handled by any of the cases below are left unchanged.
+ */
+STATIC void
+xfs_repair_inode_core_zap_data_fork(
+ struct xfs_scrub_context *sc,
+ struct xfs_dinode *dip,
+ uint16_t mode,
+ struct xfs_repair_inode_fork_counters *rifc)
+{
+ char *p;
+ const struct xfs_dir_ops *ops;
+ struct xfs_dir2_sf_hdr *sfp;
+ int i8count;
+
+ /* Special files always get reset to DEV */
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ dip->di_format = XFS_DINODE_FMT_DEV;
+ dip->di_size = 0;
+ return;
+ }
+
+ /*
+ * If we have data extents, reset to an empty map and hope the user
+ * will run the bmapbtd checker next. Regular files always take this
+ * path, whether or not any extents were found.
+ */
+ if (rifc->data_extents || rifc->rt_extents || S_ISREG(mode)) {
+ dip->di_format = XFS_DINODE_FMT_EXTENTS;
+ dip->di_nextents = 0;
+ return;
+ }
+
+ /* Otherwise, reset the local format to the minimum. */
+ switch (mode & S_IFMT) {
+ case S_IFLNK:
+ /* Blow out symlink; now it points to root dir */
+ dip->di_format = XFS_DINODE_FMT_LOCAL;
+ dip->di_size = cpu_to_be64(1);
+ p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ *p = '/';
+ break;
+ case S_IFDIR:
+ /*
+ * Blow out dir, make it point to the root. In the
+ * future the directory repair will reconstruct this
+ * dir for us.
+ */
+ dip->di_format = XFS_DINODE_FMT_LOCAL;
+ /* Use the 8-byte inode number form if the root ino needs it. */
+ i8count = sc->mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
+ ops = xfs_dir_get_ops(sc->mp, NULL);
+ sfp = (struct xfs_dir2_sf_hdr *)XFS_DFORK_PTR(dip,
+ XFS_DATA_FORK);
+ sfp->count = 0;
+ sfp->i8count = i8count;
+ ops->sf_put_parent_ino(sfp, sc->mp->m_sb.sb_rootino);
+ dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
+ break;
+ }
+}
+
+/*
+ * Check the attr fork for things that will fail the ifork verifiers or the
+ * ifork formatters.
+ */
+STATIC bool
+xfs_repair_inode_core_check_attr_fork(
+ struct xfs_scrub_context *sc,
+ struct xfs_dinode *dip)
+{
+ struct xfs_attr_shortform *sfp;
+ int size;
+
+ if (XFS_DFORK_BOFF(dip) == 0)
+ return dip->di_aformat != XFS_DINODE_FMT_EXTENTS ||
+ dip->di_anextents != 0;
+
+ size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
+ switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
+ case XFS_DINODE_FMT_LOCAL:
+ sfp = (struct xfs_attr_shortform *)XFS_DFORK_PTR(dip,
+ XFS_ATTR_FORK);
+ return xfs_attr_shortform_verify_struct(sfp, size) != NULL;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (xfs_repair_inode_core_check_extents_fork(sc, dip, size,
+ XFS_ATTR_FORK))
+ return true;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (xfs_repair_inode_core_check_btree_fork(sc, dip, size,
+ XFS_ATTR_FORK))
+ return true;
+ break;
+ default:
+ return true;
+ }
+
+ return false;
+}
+
+/* Reset the attr fork to something sane. */
+STATIC void
+xfs_repair_inode_core_zap_attr_fork(
+ struct xfs_scrub_context *sc,
+ struct xfs_dinode *dip,
+ struct xfs_repair_inode_fork_counters *rifc)
+{
+ dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
+ dip->di_anextents = 0;
+ /*
+ * We leave a nonzero forkoff so that the bmap scrub will look for
+ * attr rmaps.
+ */
+ dip->di_forkoff = rifc->attr_extents ? 1 : 0;
+}
+
+/*
+ * Zap the data/attr forks if we spot anything that isn't going to pass the
+ * ifork verifiers or the ifork formatters, because we need to get the inode
+ * into good enough shape that the higher level repair functions can run.
+ *
+ * @dip is the raw on-disk inode, @mode the file mode, and @rifc the block
+ * and extent counts gathered from the rmap data.  di_nblocks is always
+ * recomputed: it is zeroed and then credited with the rmap block counts of
+ * each fork that was not zapped.
+ */
+STATIC void
+xfs_repair_inode_core_zap_forks(
+	struct xfs_scrub_context	*sc,
+	struct xfs_dinode		*dip,
+	uint16_t			mode,
+	struct xfs_repair_inode_fork_counters	*rifc)
+{
+	bool				zap_datafork = false;
+	bool				zap_attrfork = false;
+
+	/* Inode counters don't make sense? */
+	if (be32_to_cpu(dip->di_nextents) > be64_to_cpu(dip->di_nblocks))
+		zap_datafork = true;
+	if (be16_to_cpu(dip->di_anextents) > be64_to_cpu(dip->di_nblocks))
+		zap_attrfork = true;
+	/*
+	 * Widen before adding so that the sum of the two extent counts
+	 * cannot wrap in 32-bit arithmetic before it is compared against
+	 * the 64-bit block count.
+	 */
+	if ((uint64_t)be32_to_cpu(dip->di_nextents) +
+			be16_to_cpu(dip->di_anextents) >
+	    be64_to_cpu(dip->di_nblocks))
+		zap_datafork = zap_attrfork = true;
+
+	/* Only run the detailed checks on forks not already condemned. */
+	if (!zap_datafork)
+		zap_datafork = xfs_repair_inode_core_check_data_fork(sc, dip,
+				mode);
+	if (!zap_attrfork)
+		zap_attrfork = xfs_repair_inode_core_check_attr_fork(sc, dip);
+
+	/* Zap whatever's bad. */
+	if (zap_attrfork)
+		xfs_repair_inode_core_zap_attr_fork(sc, dip, rifc);
+	if (zap_datafork)
+		xfs_repair_inode_core_zap_data_fork(sc, dip, mode, rifc);
+
+	/* Rebuild the block count from the forks that survived. */
+	dip->di_nblocks = 0;
+	if (!zap_attrfork)
+		be64_add_cpu(&dip->di_nblocks, rifc->attr_blocks);
+	if (!zap_datafork) {
+		be64_add_cpu(&dip->di_nblocks, rifc->data_blocks);
+		be64_add_cpu(&dip->di_nblocks, rifc->rt_blocks);
+	}
+}
+
/* Inode didn't pass verifiers, so fix the raw buffer and retry iget. */
STATIC int
xfs_repair_inode_core(
struct xfs_scrub_context *sc)
{
+ struct xfs_repair_inode_fork_counters rifc;
struct xfs_imap imap;
struct xfs_buf *bp;
struct xfs_dinode *dip;
@@ -101,6 +481,13 @@ xfs_repair_inode_core(
uint16_t mode;
int error;
+ /* Figure out what this inode had mapped in both forks. */
+ memset(&rifc, 0, sizeof(rifc));
+ rifc.sc = sc;
+ error = xfs_repair_inode_count_rmaps(&rifc);
+ if (error)
+ return error;
+
/* Map & read inode. */
ino = sc->sm->sm_ino;
error = xfs_imap(sc->mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED);
@@ -133,6 +520,10 @@ xfs_repair_inode_core(
uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
flags = be16_to_cpu(dip->di_flags);
flags2 = be64_to_cpu(dip->di_flags2);
+ if (rifc.rt_extents)
+ flags |= XFS_DIFLAG_REALTIME;
+ else
+ flags &= ~XFS_DIFLAG_REALTIME;
if (xfs_sb_version_hasreflink(&sc->mp->m_sb) && S_ISREG(mode))
flags2 |= XFS_DIFLAG2_REFLINK;
else
@@ -147,6 +538,8 @@ xfs_repair_inode_core(
if (be64_to_cpu(dip->di_size) & (1ULL << 63))
dip->di_size = cpu_to_be64((1ULL << 63) - 1);
+ xfs_repair_inode_core_zap_forks(sc, dip, mode, &rifc);
+
/* Write out the inode... */
xfs_dinode_calc_crc(sc->mp, dip);
xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 17/22] xfs: repair inode block maps
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (15 preceding siblings ...)
2018-05-15 22:35 ` [PATCH 16/22] xfs: zap broken inode forks Darrick J. Wong
@ 2018-05-15 22:35 ` Darrick J. Wong
2018-05-15 22:35 ` [PATCH 18/22] xfs: repair damaged symlinks Darrick J. Wong
` (5 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:35 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Use the reverse-mapping btree information to rebuild an inode fork.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/Makefile | 1
fs/xfs/scrub/bmap.c | 8 +
fs/xfs/scrub/bmap_repair.c | 399 ++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 4
fs/xfs/scrub/scrub.c | 4
fs/xfs/xfs_trans.c | 54 ++++++
fs/xfs/xfs_trans.h | 2
7 files changed, 470 insertions(+), 2 deletions(-)
create mode 100644 fs/xfs/scrub/bmap_repair.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b0f25bf07207..653da1fe6b26 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -176,6 +176,7 @@ ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
alloc_repair.o \
+ bmap_repair.o \
ialloc_repair.o \
inode_repair.o \
refcount_repair.o \
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index eeadb33a701c..bbbac1083181 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -71,6 +71,14 @@ xfs_scrub_setup_inode_bmap(
error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping);
if (error)
goto out;
+
+ /* Drop the page cache if we're repairing block mappings. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
+ error = invalidate_inode_pages2(
+ VFS_I(sc->ip)->i_mapping);
+ if (error)
+ goto out;
+ }
}
/* Got the inode, lock it and we're ready to go. */
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
new file mode 100644
index 000000000000..aae780a0032c
--- /dev/null
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_quota.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/* Inode fork block mapping (BMBT) repair. */
+
+struct xfs_repair_bmap_extent {
+ struct list_head list;
+ struct xfs_rmap_irec rmap;
+ xfs_agnumber_t agno;
+};
+
+struct xfs_repair_bmap {
+ struct list_head extlist;
+ struct xfs_repair_extent_list btlist;
+ struct xfs_repair_bmap_extent ext; /* most files have 1 extent */
+ struct xfs_scrub_context *sc;
+ xfs_ino_t ino;
+ xfs_rfsblock_t otherfork_blocks;
+ xfs_rfsblock_t bmbt_blocks;
+ xfs_extnum_t extents;
+ int whichfork;
+};
+
+/* Record extents that belong to this inode's fork. */
+STATIC int
+xfs_repair_bmap_extent_fn(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_repair_bmap *rb = priv;
+ struct xfs_repair_bmap_extent *rbe;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_fsblock_t fsbno;
+ int error = 0;
+
+ if (xfs_scrub_should_terminate(rb->sc, &error))
+ return error;
+
+ /* Skip extents which are not owned by this inode and fork. */
+ if (rec->rm_owner != rb->ino) {
+ return 0;
+ } else if (rb->whichfork == XFS_DATA_FORK &&
+ (rec->rm_flags & XFS_RMAP_ATTR_FORK)) {
+ rb->otherfork_blocks += rec->rm_blockcount;
+ return 0;
+ } else if (rb->whichfork == XFS_ATTR_FORK &&
+ !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) {
+ rb->otherfork_blocks += rec->rm_blockcount;
+ return 0;
+ }
+
+ rb->extents++;
+
+ /* Delete the old bmbt blocks later. */
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
+ fsbno = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno,
+ rec->rm_startblock);
+ rb->bmbt_blocks += rec->rm_blockcount;
+ return xfs_repair_collect_btree_extent(rb->sc, &rb->btlist,
+ fsbno, rec->rm_blockcount);
+ }
+
+ /* Remember this rmap. */
+ trace_xfs_repair_bmap_extent_fn(mp, cur->bc_private.a.agno,
+ rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
+ rec->rm_offset, rec->rm_flags);
+
+ if (list_empty(&rb->extlist)) {
+ rbe = &rb->ext;
+ } else {
+ rbe = kmem_alloc(sizeof(struct xfs_repair_bmap_extent),
+ KM_MAYFAIL);
+ if (!rbe)
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&rbe->list);
+ rbe->rmap = *rec;
+ rbe->agno = cur->bc_private.a.agno;
+ list_add_tail(&rbe->list, &rb->extlist);
+
+ return 0;
+}
+
+/* Compare two bmap extents. */
+static int
+xfs_repair_bmap_extent_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_repair_bmap_extent *ap;
+ struct xfs_repair_bmap_extent *bp;
+
+ ap = container_of(a, struct xfs_repair_bmap_extent, list);
+ bp = container_of(b, struct xfs_repair_bmap_extent, list);
+
+ if (ap->rmap.rm_offset > bp->rmap.rm_offset)
+ return 1;
+ else if (ap->rmap.rm_offset < bp->rmap.rm_offset)
+ return -1;
+ return 0;
+}
+
+/* Scan one AG for reverse mappings that we can turn into extent maps. */
+STATIC int
+xfs_repair_bmap_scan_ag(
+ struct xfs_repair_bmap *rb,
+ xfs_agnumber_t agno)
+{
+ struct xfs_scrub_context *sc = rb->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agf_bp = NULL;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
+ if (error)
+ return error;
+ if (!agf_bp)
+ return -ENOMEM;
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, agno);
+ error = xfs_rmap_query_all(cur, xfs_repair_bmap_extent_fn, rb);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ error = 0;
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR);
+ xfs_trans_brelse(sc->tp, agf_bp);
+ return error;
+}
+
+/* Insert bmap records into an inode fork, given an rmap. */
+STATIC int
+xfs_repair_bmap_insert_rec(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_bmap_extent *rbe,
+ int baseflags)
+{
+ struct xfs_bmbt_irec bmap;
+ struct xfs_defer_ops dfops;
+ xfs_fsblock_t firstfsb;
+ xfs_extlen_t extlen;
+ int flags;
+ int error = 0;
+
+ /* Form the "new" mapping... */
+ bmap.br_startblock = XFS_AGB_TO_FSB(sc->mp, rbe->agno,
+ rbe->rmap.rm_startblock);
+ bmap.br_startoff = rbe->rmap.rm_offset;
+
+ flags = 0;
+ if (rbe->rmap.rm_flags & XFS_RMAP_UNWRITTEN)
+ flags = XFS_BMAPI_PREALLOC;
+ while (rbe->rmap.rm_blockcount > 0) {
+ xfs_defer_init(&dfops, &firstfsb);
+ extlen = min_t(xfs_extlen_t, rbe->rmap.rm_blockcount,
+ MAXEXTLEN);
+ bmap.br_blockcount = extlen;
+
+ /* Re-add the extent to the fork. */
+ error = xfs_bmapi_remap(sc->tp, sc->ip,
+ bmap.br_startoff, extlen,
+ bmap.br_startblock, &dfops,
+ baseflags | flags);
+ if (error)
+ goto out_cancel;
+
+ bmap.br_startblock += extlen;
+ bmap.br_startoff += extlen;
+ rbe->rmap.rm_blockcount -= extlen;
+ error = xfs_defer_ijoin(&dfops, sc->ip);
+ if (error)
+ goto out_cancel;
+ error = xfs_defer_finish(&sc->tp, &dfops);
+ if (error)
+ goto out;
+ /* Make sure we roll the transaction. */
+ error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+ if (error)
+ goto out;
+ }
+
+ return 0;
+out_cancel:
+ xfs_defer_cancel(&dfops);
+out:
+ return error;
+}
+
+/* Repair an inode fork. */
+STATIC int
+xfs_repair_bmap(
+ struct xfs_scrub_context *sc,
+ int whichfork)
+{
+ struct xfs_repair_bmap rb;
+ struct xfs_owner_info oinfo;
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_repair_bmap_extent *rbe;
+ struct xfs_repair_bmap_extent *n;
+ xfs_agnumber_t agno;
+ unsigned int resblks;
+ int baseflags;
+ int error = 0;
+
+ ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK);
+
+ /* Don't know how to repair the other fork formats. */
+ if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ return -EOPNOTSUPP;
+
+ /* Only files, symlinks, and directories get to have data forks. */
+ if (whichfork == XFS_DATA_FORK && !S_ISREG(VFS_I(ip)->i_mode) &&
+ !S_ISDIR(VFS_I(ip)->i_mode) && !S_ISLNK(VFS_I(ip)->i_mode))
+ return -EINVAL;
+
+ /* If we somehow have delalloc extents, forget it. */
+ if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks)
+ return -EBUSY;
+
+ /*
+ * If there's no attr fork area in the inode, there's
+ * no attr fork to rebuild.
+ */
+ if (whichfork == XFS_ATTR_FORK && !XFS_IFORK_Q(ip))
+ return -ENOENT;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ /* Don't know how to rebuild realtime data forks. */
+ if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK)
+ return -EOPNOTSUPP;
+
+ /*
+ * If this is a file data fork, wait for all pending directio to
+ * complete, then tear everything out of the page cache.
+ */
+ if (S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) {
+ inode_dio_wait(VFS_I(ip));
+ truncate_inode_pages(VFS_I(ip)->i_mapping, 0);
+ }
+
+ /* Collect all reverse mappings for this fork's extents. */
+ memset(&rb, 0, sizeof(rb));
+ INIT_LIST_HEAD(&rb.extlist);
+ xfs_repair_init_extent_list(&rb.btlist);
+ rb.ino = ip->i_ino;
+ rb.whichfork = whichfork;
+ rb.sc = sc;
+
+ /* Iterate the rmaps for extents. */
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ error = xfs_repair_bmap_scan_ag(&rb, agno);
+ if (error)
+ goto out;
+ }
+
+ /*
+ * Guess how many blocks we're going to need to rebuild an entire bmap
+ * from the number of extents we found, and get ourselves a new
+ * transaction with proper block reservations.
+ */
+ resblks = xfs_bmbt_calc_size(mp, rb.extents);
+ error = xfs_trans_reserve_more(sc->tp, resblks, 0);
+ if (error)
+ goto out;
+
+ /* Blow out the in-core fork and zero the on-disk fork. */
+ sc->ip->i_d.di_nblocks = rb.otherfork_blocks;
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ if (XFS_IFORK_PTR(ip, whichfork) != NULL)
+ xfs_idestroy_fork(sc->ip, whichfork);
+ XFS_IFORK_FMT_SET(sc->ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ XFS_IFORK_NEXT_SET(sc->ip, whichfork, 0);
+
+ /* Reinitialize the on-disk fork. */
+ if (whichfork == XFS_DATA_FORK) {
+ memset(&ip->i_df, 0, sizeof(struct xfs_ifork));
+ ip->i_df.if_flags |= XFS_IFEXTENTS;
+ } else if (whichfork == XFS_ATTR_FORK) {
+ if (list_empty(&rb.extlist))
+ ip->i_afp = NULL;
+ else {
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+ ip->i_afp->if_flags |= XFS_IFEXTENTS;
+ }
+ }
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+ if (error)
+ goto out;
+
+ baseflags = XFS_BMAPI_NORMAP;
+ if (whichfork == XFS_ATTR_FORK)
+ baseflags |= XFS_BMAPI_ATTRFORK;
+
+ /* Release quota counts for the old bmbt blocks. */
+ if (rb.bmbt_blocks) {
+ error = xfs_repair_ino_dqattach(sc);
+ if (error)
+ goto out;
+ xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT,
+ -rb.bmbt_blocks);
+ error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+ if (error)
+ goto out;
+ }
+
+ /* "Remap" the extents into the fork. */
+ list_sort(NULL, &rb.extlist, xfs_repair_bmap_extent_cmp);
+ list_for_each_entry_safe(rbe, n, &rb.extlist, list) {
+ error = xfs_repair_bmap_insert_rec(sc, rbe, baseflags);
+ if (error)
+ goto out;
+ list_del(&rbe->list);
+ if (rbe != &rb.ext)
+ kmem_free(rbe);
+ }
+
+ /* Dispose of all the old bmbt blocks. */
+ xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, whichfork);
+ return xfs_repair_reap_btree_extents(sc, &rb.btlist, &oinfo,
+ XFS_AG_RESV_NONE);
+out:
+ xfs_repair_cancel_btree_extents(sc, &rb.btlist);
+ list_for_each_entry_safe(rbe, n, &rb.extlist, list) {
+ list_del(&rbe->list);
+ if (rbe != &rb.ext)
+ kmem_free(rbe);
+ }
+ return error;
+}
+
+/* Repair an inode's data fork. */
+int
+xfs_repair_bmap_data(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_repair_bmap(sc, XFS_DATA_FORK);
+}
+
+/* Repair an inode's attr fork. */
+int
+xfs_repair_bmap_attr(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_repair_bmap(sc, XFS_ATTR_FORK);
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 8d13986ae824..af5b8bf17858 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -117,6 +117,8 @@ int xfs_repair_iallocbt(struct xfs_scrub_context *sc);
int xfs_repair_rmapbt(struct xfs_scrub_context *sc);
int xfs_repair_refcountbt(struct xfs_scrub_context *sc);
int xfs_repair_inode(struct xfs_scrub_context *sc);
+int xfs_repair_bmap_data(struct xfs_scrub_context *sc);
+int xfs_repair_bmap_attr(struct xfs_scrub_context *sc);
#else
@@ -174,6 +176,8 @@ static inline int xfs_repair_rmapbt_setup(
#define xfs_repair_rmapbt xfs_repair_notsupported
#define xfs_repair_refcountbt xfs_repair_notsupported
#define xfs_repair_inode xfs_repair_notsupported
+#define xfs_repair_bmap_data xfs_repair_notsupported
+#define xfs_repair_bmap_attr xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index d9b1511cb0d3..c45be77c6c16 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -295,13 +295,13 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_INODE,
.setup = xfs_scrub_setup_inode_bmap,
.scrub = xfs_scrub_bmap_data,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_bmap_data,
},
[XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */
.type = ST_INODE,
.setup = xfs_scrub_setup_inode_bmap,
.scrub = xfs_scrub_bmap_attr,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_bmap_attr,
},
[XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */
.type = ST_INODE,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 449771d98561..5583c20a91fe 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -138,6 +138,60 @@ xfs_trans_dup(
return ntp;
}
+/*
+ * Grow the block and/or realtime extent reservation of an existing, clean
+ * transaction.  Online repair is the only supported user: it gathers data
+ * under a transaction (so that btree cycles cannot deadlock it), works out
+ * how much space the rebuild really needs, and only then asks for the space
+ * before modifying anything.  The request can fail with ENOSPC, so callers
+ * must be able to cancel the transaction.
+ *
+ * Returns 0 on success or -ENOSPC if either reservation could not be made.
+ * When the realtime reservation fails, any block reservation made by this
+ * call is given back first.
+ */
+int
+xfs_trans_reserve_more(
+	struct xfs_trans	*tp,
+	uint			blocks,
+	uint			rtextents)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	bool			use_rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+
+	ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
+	/*
+	 * Subtract the requested disk blocks from the number available; this
+	 * fails if the count would go below zero.
+	 */
+	if (blocks > 0) {
+		if (xfs_mod_fdblocks(mp, -((int64_t)blocks), use_rsvd) != 0)
+			return -ENOSPC;
+		tp->t_blk_res += blocks;
+	}
+
+	/* No realtime extents requested, so we're done. */
+	if (rtextents == 0)
+		return 0;
+
+	/*
+	 * Subtract the requested realtime extents from the number available.
+	 * On failure, hand back the blocks we took above.
+	 */
+	if (xfs_mod_frextents(mp, -((int64_t)rtextents))) {
+		if (blocks > 0) {
+			xfs_mod_fdblocks(mp, (int64_t)blocks, use_rsvd);
+			tp->t_blk_res -= blocks;
+		}
+		return -ENOSPC;
+	}
+	tp->t_rtx_res += rtextents;
+
+	return 0;
+}
+
/*
* This is called to reserve free disk blocks and log space for the
* given transaction. This must be done before allocating any resources
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 29706b8b3bd4..7284555c4801 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -165,6 +165,8 @@ typedef struct xfs_trans {
int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
uint blocks, uint rtextents, uint flags,
struct xfs_trans **tpp);
+int xfs_trans_reserve_more(struct xfs_trans *tp, uint blocks,
+ uint rtextents);
int xfs_trans_alloc_empty(struct xfs_mount *mp,
struct xfs_trans **tpp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 18/22] xfs: repair damaged symlinks
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (16 preceding siblings ...)
2018-05-15 22:35 ` [PATCH 17/22] xfs: repair inode block maps Darrick J. Wong
@ 2018-05-15 22:35 ` Darrick J. Wong
2018-05-15 22:35 ` [PATCH 19/22] xfs: repair extended attributes Darrick J. Wong
` (4 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:35 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Repair inconsistent symbolic link data.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/Makefile | 1
fs/xfs/scrub/repair.h | 2
fs/xfs/scrub/scrub.c | 2
fs/xfs/scrub/symlink.c | 2
fs/xfs/scrub/symlink_repair.c | 284 +++++++++++++++++++++++++++++++++++++++++
5 files changed, 289 insertions(+), 2 deletions(-)
create mode 100644 fs/xfs/scrub/symlink_repair.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 653da1fe6b26..5e336892f21f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -182,6 +182,7 @@ xfs-y += $(addprefix scrub/, \
refcount_repair.o \
repair.o \
rmap_repair.o \
+ symlink_repair.o \
)
endif
endif
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index af5b8bf17858..4c48da0dfc02 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -119,6 +119,7 @@ int xfs_repair_refcountbt(struct xfs_scrub_context *sc);
int xfs_repair_inode(struct xfs_scrub_context *sc);
int xfs_repair_bmap_data(struct xfs_scrub_context *sc);
int xfs_repair_bmap_attr(struct xfs_scrub_context *sc);
+int xfs_repair_symlink(struct xfs_scrub_context *sc);
#else
@@ -178,6 +179,7 @@ static inline int xfs_repair_rmapbt_setup(
#define xfs_repair_inode xfs_repair_notsupported
#define xfs_repair_bmap_data xfs_repair_notsupported
#define xfs_repair_bmap_attr xfs_repair_notsupported
+#define xfs_repair_symlink xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index c45be77c6c16..e22b48e0b2f3 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -325,7 +325,7 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_INODE,
.setup = xfs_scrub_setup_symlink,
.scrub = xfs_scrub_symlink,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_symlink,
},
[XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */
.type = ST_INODE,
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 3aa3d60f7c16..a370aad5233f 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -48,7 +48,7 @@ xfs_scrub_setup_symlink(
if (!sc->buf)
return -ENOMEM;
- return xfs_scrub_setup_inode_contents(sc, ip, 0);
+ return xfs_scrub_setup_inode_contents(sc, ip, XFS_SYMLINK_MAPS);
}
/* Symbolic links. */
diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c
new file mode 100644
index 000000000000..a58eb96dd448
--- /dev/null
+++ b/fs/xfs/scrub/symlink_repair.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_symlink.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Blow out the whole symlink; replace contents.
+ *
+ * @tpp:         in/out transaction pointer; handed by reference to
+ *               xfs_itruncate_extents() and xfs_defer_finish()
+ * @ip:          symlink inode being rewritten
+ * @target_path: new link target
+ * @pathlen:     number of bytes of @target_path to store
+ *
+ * The existing data fork is truncated (if it was not inline) and reset, and
+ * the target is then written back either inline, when it fits in the data
+ * fork area, or into newly mapped remote blocks.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+STATIC int
+xfs_repair_symlink_rewrite(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ const char *target_path,
+ int pathlen)
+{
+ struct xfs_defer_ops dfops;
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_ifork *ifp;
+ const char *cur_chunk;
+ struct xfs_mount *mp = (*tpp)->t_mountp;
+ struct xfs_buf *bp;
+ xfs_fsblock_t first_block;
+ xfs_fileoff_t first_fsb;
+ xfs_filblks_t fs_blocks;
+ xfs_daddr_t d;
+ int byte_cnt;
+ int n;
+ int nmaps;
+ int offset;
+ int error = 0;
+
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+
+ /* Truncate the whole data fork if it wasn't inline. */
+ if (!(ifp->if_flags & XFS_IFINLINE)) {
+ error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, 0);
+ if (error)
+ goto out;
+ }
+
+ /* Blow out the in-core fork and zero the on-disk fork. */
+ xfs_idestroy_fork(ip, XFS_DATA_FORK);
+ ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+ ip->i_d.di_nextents = 0;
+ memset(&ip->i_df, 0, sizeof(struct xfs_ifork));
+ ip->i_df.if_flags |= XFS_IFEXTENTS;
+
+ /* Rewrite an inline symlink. */
+ if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+ xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
+
+ /* Keep the VFS and on-disk sizes in sync with the new target. */
+ i_size_write(VFS_I(ip), pathlen);
+ ip->i_d.di_size = pathlen;
+ ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ xfs_trans_log_inode(*tpp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+ goto out;
+
+ }
+
+ /* Rewrite a remote symlink. */
+ fs_blocks = xfs_symlink_blocks(mp, pathlen);
+ first_fsb = 0;
+ nmaps = XFS_SYMLINK_MAPS;
+
+ /* Reserve quota for new blocks. */
+ error = xfs_trans_reserve_quota_nblks(*tpp, ip, fs_blocks, 0,
+ XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto out;
+
+ /* Map blocks, write symlink target. */
+ xfs_defer_init(&dfops, &first_block);
+
+ error = xfs_bmapi_write(*tpp, ip, first_fsb, fs_blocks,
+ XFS_BMAPI_METADATA, &first_block, fs_blocks,
+ mval, &nmaps, &dfops);
+ if (error)
+ goto out_bmap_cancel;
+
+ ip->i_d.di_size = pathlen;
+ i_size_write(VFS_I(ip), pathlen);
+ xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+
+ /*
+ * Walk the new mappings, copying as much of the remaining target as
+ * fits into each buffer after its symlink header.
+ */
+ cur_chunk = target_path;
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ char *buf;
+
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+ bp = xfs_trans_get_buf(*tpp, mp->m_ddev_targp, d,
+ BTOBB(byte_cnt), 0);
+ if (!bp) {
+ error = -ENOMEM;
+ goto out_bmap_cancel;
+ }
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ /* Payload space in this buffer, capped at what is left to write. */
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+ byte_cnt = min(byte_cnt, pathlen);
+
+ /* Write the header; the payload starts right after it. */
+ buf = bp->b_addr;
+ buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
+ byte_cnt, bp);
+
+ memcpy(buf, cur_chunk, byte_cnt);
+
+ cur_chunk += byte_cnt;
+ pathlen -= byte_cnt;
+ offset += byte_cnt;
+
+ /* Log from the start of the buffer through the last payload byte. */
+ xfs_trans_buf_set_type(*tpp, bp, XFS_BLFT_SYMLINK_BUF);
+ xfs_trans_log_buf(*tpp, bp, 0, (buf + byte_cnt - 1) -
+ (char *)bp->b_addr);
+ }
+ ASSERT(pathlen == 0);
+
+ error = xfs_defer_finish(tpp, &dfops);
+ if (error)
+ goto out_bmap_cancel;
+
+ return 0;
+
+out_bmap_cancel:
+ xfs_defer_cancel(&dfops);
+out:
+ return error;
+}
+
+/*
+ * Fix everything that fails the verifiers in the remote blocks.
+ *
+ * @sc:  scrub context; sc->ip is the symlink and sc->tp the transaction
+ * @len: symlink target length in bytes
+ *
+ * Reads the mappings that cover @len bytes of target and rewrites the
+ * symlink header of every mapped buffer, logging just the header bytes.
+ *
+ * Returns 0 on success, -EFSCORRUPTED if the mapped buffers do not account
+ * for exactly @len bytes, or another negative errno from the mapping/read
+ * helpers.
+ */
+STATIC int
+xfs_repair_symlink_fix_remotes(
+ struct xfs_scrub_context *sc,
+ loff_t len)
+{
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_buf *bp;
+ xfs_filblks_t fsblocks;
+ xfs_daddr_t d;
+ loff_t offset;
+ unsigned int byte_cnt;
+ int n;
+ int nmaps = XFS_SYMLINK_MAPS;
+ int nr;
+ int error;
+
+ fsblocks = xfs_symlink_blocks(sc->mp, len);
+ error = xfs_bmapi_read(sc->ip, 0, fsblocks, mval, &nmaps, 0);
+ if (error)
+ return error;
+
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ d = XFS_FSB_TO_DADDR(sc->mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(sc->mp, mval[n].br_blockcount);
+
+ /* Read with no buffer ops (NULL); the ops are attached afterwards. */
+ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
+ d, BTOBB(byte_cnt), 0, &bp, NULL);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ /* Payload bytes expected in this buffer, capped at what remains. */
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(sc->mp, byte_cnt);
+ if (len < byte_cnt)
+ byte_cnt = len;
+
+ /* Rewrite the header; nr is the value xfs_symlink_hdr_set() returns. */
+ nr = xfs_symlink_hdr_set(sc->mp, sc->ip->i_ino, offset,
+ byte_cnt, bp);
+
+ len -= byte_cnt;
+ offset += byte_cnt;
+
+ /* Only bytes 0..nr-1 are logged; the payload itself is not touched. */
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SYMLINK_BUF);
+ xfs_trans_log_buf(sc->tp, bp, 0, nr - 1);
+ xfs_trans_brelse(sc->tp, bp);
+ }
+ /* The mappings must account for the whole target. */
+ if (len != 0)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/*
+ * Repair a symbolic link.
+ *
+ * Inline symlinks have their size shrunk to the length of the target string
+ * found in the data fork.  Remote symlinks have their block headers rewritten,
+ * the transaction rolled, the target read back with xfs_readlink(), and, if
+ * the recorded size is longer than the target that was read, the whole link
+ * rewritten with xfs_repair_symlink_rewrite().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int
+xfs_repair_symlink(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_ifork *ifp;
+ loff_t len;
+ size_t newlen;
+ int error = 0;
+
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ len = i_size_read(VFS_I(ip));
+ xfs_trans_ijoin(sc->tp, ip, 0);
+
+ /* Truncate the inode if there's a zero inside the length. */
+ if (ifp->if_flags & XFS_IFINLINE) {
+ /*
+ * NOTE(review): strnlen() is bounded by XFS_IFORK_DSIZE(), not by
+ * the size of the if_data allocation -- confirm the buffer is
+ * always NUL-terminated or at least that large.
+ */
+ if (ifp->if_u1.if_data)
+ newlen = strnlen(ifp->if_u1.if_data,
+ XFS_IFORK_DSIZE(ip));
+ else {
+ /* Zero length symlink becomes a root symlink. */
+ ifp->if_u1.if_data = kmem_alloc(4, KM_SLEEP);
+ snprintf(ifp->if_u1.if_data, 4, "/");
+ newlen = 1;
+ }
+ /*
+ * NOTE(review): the size is only updated and logged when the
+ * recorded length exceeds newlen; if len was 0 in the "/" case
+ * above nothing is logged here -- confirm that is intended.
+ */
+ if (len > newlen) {
+ i_size_write(VFS_I(ip), newlen);
+ ip->i_d.di_size = newlen;
+ xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_DDATA |
+ XFS_ILOG_CORE);
+ }
+ goto out;
+ }
+
+ error = xfs_repair_symlink_fix_remotes(sc, len);
+ if (error)
+ goto out;
+
+ /* Roll transaction, release buffers. */
+ error = xfs_trans_roll_inode(&sc->tp, ip);
+ if (error)
+ goto out;
+
+ /* Size set correctly? */
+ len = i_size_read(VFS_I(ip));
+ /* The ILOCK is dropped around xfs_readlink() and retaken afterwards. */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = xfs_readlink(ip, sc->buf);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ goto out;
+
+ /*
+ * Figure out the new target length. We can't handle zero-length
+ * symlinks, so make sure that we don't write that out.
+ */
+ newlen = strnlen(sc->buf, XFS_SYMLINK_MAXLEN);
+ if (newlen == 0) {
+ *((char *)sc->buf) = '/';
+ newlen = 1;
+ }
+
+ /* Only rewrite when the recorded size is longer than the real target. */
+ if (len > newlen)
+ error = xfs_repair_symlink_rewrite(&sc->tp, ip, sc->buf,
+ newlen);
+out:
+ return error;
+}
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 19/22] xfs: repair extended attributes
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (17 preceding siblings ...)
2018-05-15 22:35 ` [PATCH 18/22] xfs: repair damaged symlinks Darrick J. Wong
@ 2018-05-15 22:35 ` Darrick J. Wong
2018-05-15 22:35 ` [PATCH 20/22] xfs: scrub should set preen if attr leaf has holes Darrick J. Wong
` (3 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:35 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
If the extended attributes look bad, try to sift through the rubble to
find whatever keys/values we can, zap the attr tree, and re-add the
values.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/Makefile | 1
fs/xfs/scrub/attr.c | 2
fs/xfs/scrub/attr_repair.c | 519 ++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.h | 2
fs/xfs/scrub/scrub.c | 2
fs/xfs/scrub/scrub.h | 3
6 files changed, 527 insertions(+), 2 deletions(-)
create mode 100644 fs/xfs/scrub/attr_repair.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5e336892f21f..5bc7e2deacbd 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -175,6 +175,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
+ attr_repair.o \
alloc_repair.o \
bmap_repair.o \
ialloc_repair.o \
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 84b6d6b66578..ac25d624286e 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -139,7 +139,7 @@ xfs_scrub_xattr_listent(
* Within a char, the lowest bit of the char represents the byte with
* the smallest address
*/
-STATIC bool
+bool
xfs_scrub_xattr_set_map(
struct xfs_scrub_context *sc,
unsigned long *map,
diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c
new file mode 100644
index 000000000000..c7a50fd8f0f5
--- /dev/null
+++ b/fs/xfs/scrub/attr_repair.c
@@ -0,0 +1,519 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/* Extended attribute repair. */
+
+/* One salvaged extended attribute, queued for reinsertion into the inode. */
+struct xfs_attr_key {
+ /* Linkage on the xfs_repair_xattr attrlist. */
+ struct list_head list;
+ /* Separately allocated copy of the attribute value. */
+ unsigned char *value;
+ /* Length of @value in bytes. */
+ int valuelen;
+ /* Namespace flags handed to xfs_attr_set() when the attr is re-added. */
+ int flags;
+ /* Length of @name in bytes, not counting the trailing NUL. */
+ int namelen;
+ /* Attribute name, stored inline after the struct and NUL-terminated. */
+ unsigned char name[0];
+};
+
+/* Bytes to allocate for an xfs_attr_key: the struct, the name, and a NUL. */
+#define XFS_ATTR_KEY_LEN(namelen) (sizeof(struct xfs_attr_key) + (namelen) + 1)
+
+/* State carried through an extended attribute salvage pass. */
+struct xfs_repair_xattr {
+ /* List of struct xfs_attr_key recovered so far. */
+ struct list_head attrlist;
+ /* Scrub context of the inode being repaired. */
+ struct xfs_scrub_context *sc;
+};
+
+/*
+ * Iterate each block in an attr fork extent.  @dabno starts at the extent's
+ * br_startoff rounded up to a multiple of the attr geometry's fsbcount and
+ * advances by fsbcount until it reaches the end of the extent.
+ */
+#define for_each_xfs_attr_block(mp, irec, dabno) \
+ for ((dabno) = roundup((xfs_dablk_t)(irec)->br_startoff, \
+ (mp)->m_attr_geo->fsbcount); \
+ (dabno) < (irec)->br_startoff + (irec)->br_blockcount; \
+ (dabno) += (mp)->m_attr_geo->fsbcount)
+
+/*
+ * Record an extended attribute key & value for later reinsertion into the
+ * inode.  Use the helpers below, don't call this directly.
+ *
+ * @rx:       salvage state; recorded keys are appended to rx->attrlist
+ * @bp:       attr leaf buffer holding the entry (used for remote values only)
+ * @flags:    on-disk entry flags (XFS_ATTR_*)
+ * @idx:      index of the entry in the leaf (used for remote values only)
+ * @name:     attribute name; need not be NUL-terminated
+ * @namelen:  length of @name in bytes
+ * @value:    attribute value, or NULL to look up a remote value
+ * @valuelen: length of the value in bytes
+ *
+ * Returns 0 if the attr was recorded or deliberately skipped (incomplete,
+ * oversized, or an unreadable remote value), or a negative errno such as
+ * -ENOMEM if salvaging cannot continue.
+ */
+STATIC int
+__xfs_repair_xattr_salvage_attr(
+	struct xfs_repair_xattr		*rx,
+	struct xfs_buf			*bp,
+	int				flags,
+	int				idx,
+	unsigned char			*name,
+	int				namelen,
+	unsigned char			*value,
+	int				valuelen)
+{
+	struct xfs_attr_key		*key;
+	struct xfs_da_args		args;
+	int				error = -ENOMEM;
+
+	/* Ignore incomplete or oversized attributes. */
+	if ((flags & XFS_ATTR_INCOMPLETE) ||
+	    namelen > XATTR_NAME_MAX || namelen < 0 ||
+	    valuelen > XATTR_SIZE_MAX || valuelen < 0)
+		return 0;
+
+	/* Store attr key. */
+	key = kmem_alloc(XFS_ATTR_KEY_LEN(namelen), KM_MAYFAIL);
+	if (!key)
+		goto err;
+	INIT_LIST_HEAD(&key->list);
+	key->value = kmem_zalloc_large(valuelen, KM_MAYFAIL);
+	if (!key->value)
+		goto err_key;
+	key->valuelen = valuelen;
+
+	/*
+	 * @flags carries the on-disk namespace bits (XFS_ATTR_ROOT and
+	 * XFS_ATTR_SECURE), but the key is later passed to xfs_attr_set(),
+	 * which takes the ATTR_* API flags.  The two sets do not share bit
+	 * positions, so translate explicitly instead of masking; otherwise
+	 * secure attrs would be re-added in the wrong namespace.
+	 */
+	key->flags = 0;
+	if (flags & XFS_ATTR_ROOT)
+		key->flags |= ATTR_ROOT;
+	if (flags & XFS_ATTR_SECURE)
+		key->flags |= ATTR_SECURE;
+
+	key->namelen = namelen;
+	key->name[namelen] = 0;
+	memcpy(key->name, name, namelen);
+
+	/* Caller already had the value, so copy it and exit. */
+	if (value) {
+		memcpy(key->value, value, valuelen);
+		goto out_ok;
+	}
+
+	/* Otherwise look up the remote value directly. */
+	memset(&args, 0, sizeof(args));
+	args.geo = rx->sc->mp->m_attr_geo;
+	args.index = idx;
+	args.namelen = namelen;
+	args.name = key->name;
+	args.valuelen = valuelen;
+	args.value = key->value;
+	args.dp = rx->sc->ip;
+	args.trans = rx->sc->tp;
+	error = xfs_attr3_leaf_getvalue(bp, &args);
+	/* No remote block means nothing to fetch; drop the key (error may be 0). */
+	if (error || args.rmtblkno == 0)
+		goto err_value;
+
+	error = xfs_attr_rmtval_get(&args);
+	switch (error) {
+	case 0:
+		break;
+	case -EFSBADCRC:
+	case -EFSCORRUPTED:
+		/* A damaged remote value just means we skip this attr. */
+		error = 0;
+		/* fall through */
+	default:
+		goto err_value;
+	}
+
+out_ok:
+	list_add_tail(&key->list, &rx->attrlist);
+	return 0;
+
+err_value:
+	kmem_free(key->value);
+err_key:
+	kmem_free(key);
+err:
+	return error;
+}
+
+/*
+ * Record a local format extended attribute key & value for later reinsertion
+ * into the inode.  The caller supplies the value directly, so no leaf buffer
+ * or entry index is passed down (NULL and 0 respectively).
+ */
+static inline int
+xfs_repair_xattr_salvage_local_attr(
+ struct xfs_repair_xattr *rx,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ unsigned char *value,
+ int valuelen)
+{
+ return __xfs_repair_xattr_salvage_attr(rx, NULL, flags, 0, name,
+ namelen, value, valuelen);
+}
+
+/*
+ * Record a remote format extended attribute key & value for later reinsertion
+ * into the inode.  No value pointer is passed (NULL), so the common helper
+ * looks the value up using the leaf buffer and entry index given here.
+ */
+static inline int
+xfs_repair_xattr_salvage_remote_attr(
+ struct xfs_repair_xattr *rx,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ struct xfs_buf *leaf_bp,
+ int idx,
+ int valuelen)
+{
+ return __xfs_repair_xattr_salvage_attr(rx, leaf_bp, flags, idx,
+ name, namelen, NULL, valuelen);
+}
+
+/*
+ * Extract every xattr key that we can from this attr fork block.
+ *
+ * @rx: salvage state; recovered attrs are queued on rx->attrlist
+ * @bp: buffer containing an attr leaf block
+ *
+ * A bitmap in sc->buf tracks which bytes of the block have been claimed so
+ * far; entries whose header or name area cannot be claimed via
+ * xfs_scrub_xattr_set_map(), or whose fields fail the sanity checks below,
+ * are skipped.  Returns 0, or the first error from the salvage helpers.
+ */
+STATIC int
+xfs_repair_xattr_recover_leaf(
+ struct xfs_repair_xattr *rx,
+ struct xfs_buf *bp)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xfs_scrub_context *sc = rx->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_attr_leafblock *leaf;
+ unsigned long *usedmap = sc->buf;
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ struct xfs_attr_leaf_entry *ent;
+ struct xfs_attr_leaf_entry *entries;
+ char *buf_end;
+ char *name;
+ char *name_end;
+ char *value;
+ size_t off;
+ unsigned int nameidx;
+ unsigned int namesize;
+ unsigned int hdrsize;
+ unsigned int namelen;
+ unsigned int valuelen;
+ int i;
+ int error;
+
+ bitmap_zero(usedmap, mp->m_attr_geo->blksize);
+
+ /* Check the leaf header */
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+ hdrsize = xfs_attr3_leaf_hdr_size(leaf);
+ xfs_scrub_xattr_set_map(sc, usedmap, 0, hdrsize);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
+ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+ for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) {
+ /* Skip key if it conflicts with something else? */
+ off = (char *)ent - (char *)leaf;
+ if (!xfs_scrub_xattr_set_map(sc, usedmap, off,
+ sizeof(xfs_attr_leaf_entry_t)))
+ continue;
+
+ /* Check the name information. */
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < leafhdr.firstused ||
+ nameidx >= mp->m_attr_geo->blksize)
+ continue;
+
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ /* Local entry: the value is stored right after the name. */
+ lentry = xfs_attr3_leaf_name_local(leaf, i);
+ namesize = xfs_attr_leaf_entsize_local(lentry->namelen,
+ be16_to_cpu(lentry->valuelen));
+ name_end = (char *)lentry + namesize;
+ if (lentry->namelen == 0)
+ continue;
+ name = lentry->nameval;
+ namelen = lentry->namelen;
+ valuelen = be16_to_cpu(lentry->valuelen);
+ value = &name[namelen];
+ } else {
+ /* Remote entry: only the name is here; value is fetched later. */
+ rentry = xfs_attr3_leaf_name_remote(leaf, i);
+ namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+ name_end = (char *)rentry + namesize;
+ if (rentry->namelen == 0 || rentry->valueblk == 0)
+ continue;
+ name = rentry->name;
+ namelen = rentry->namelen;
+ valuelen = be32_to_cpu(rentry->valuelen);
+ value = NULL;
+ }
+ /* The name area must end inside the block and be unclaimed. */
+ if (name_end > buf_end)
+ continue;
+ if (!xfs_scrub_xattr_set_map(sc, usedmap, nameidx, namesize))
+ continue;
+
+ /* Ok, let's save this key/value. */
+ if (ent->flags & XFS_ATTR_LOCAL)
+ error = xfs_repair_xattr_salvage_local_attr(rx,
+ ent->flags, name, namelen, value, valuelen);
+ else
+ error = xfs_repair_xattr_salvage_remote_attr(rx,
+ ent->flags, name, namelen, bp, i, valuelen);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to recover shortform attrs.
+ *
+ * Walks the entries of the inline attr fork, stopping early if an entry
+ * would extend past the end of the fork data, and queues each one on
+ * rx->attrlist.  Returns 0, or the first error from the salvage helper.
+ */
+STATIC int
+xfs_repair_xattr_recover_sf(
+ struct xfs_repair_xattr *rx)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ struct xfs_attr_sf_entry *next;
+ struct xfs_ifork *ifp;
+ unsigned char *end;
+ int i;
+ int error;
+
+ ifp = XFS_IFORK_PTR(rx->sc->ip, XFS_ATTR_FORK);
+ sf = (struct xfs_attr_shortform *)rx->sc->ip->i_afp->if_u1.if_data;
+ end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes;
+
+ for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ /*
+ * NOTE(review): XFS_ATTR_SF_NEXTENTRY presumably reads the entry's
+ * length fields before the bounds check below -- confirm sfe
+ * itself is always inside the fork data here.
+ */
+ next = XFS_ATTR_SF_NEXTENTRY(sfe);
+ if ((unsigned char *)next > end)
+ break;
+
+ /* Ok, let's save this key/value. */
+ error = xfs_repair_xattr_salvage_local_attr(rx, sfe->flags,
+ sfe->nameval, sfe->namelen,
+ &sfe->nameval[sfe->namelen], sfe->valuelen);
+ if (error)
+ return error;
+
+ sfe = next;
+ }
+
+ return 0;
+}
+
+/*
+ * Extract as many attribute keys and values as we can.
+ *
+ * Local-format attr forks are handed to xfs_repair_xattr_recover_sf().
+ * Otherwise every attr block of every extent in the attr fork is read;
+ * blocks that cannot be read or that do not look like valid attr leaves are
+ * silently skipped.  Returns 0, or the first error from leaf recovery.
+ */
+STATIC int
+xfs_repair_xattr_recover(
+ struct xfs_repair_xattr *rx)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_scrub_context *sc = rx->sc;
+ struct xfs_ifork *ifp;
+ struct xfs_da_blkinfo *info;
+ struct xfs_buf *bp;
+ xfs_dablk_t dabno;
+ int error = 0;
+
+ if (sc->ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+ return xfs_repair_xattr_recover_sf(rx);
+
+ /* Iterate each attr block in the attr fork. */
+ ifp = XFS_IFORK_PTR(sc->ip, XFS_ATTR_FORK);
+ for_each_xfs_iext(ifp, &icur, &got) {
+ for_each_xfs_attr_block(sc->mp, &got, dabno) {
+ /*
+ * Try to read buffer. We invalidate them in the next
+ * step so we don't bother to set a buffer type or
+ * ops.
+ */
+ error = xfs_da_read_buf(sc->tp, sc->ip, dabno, -1, &bp,
+ XFS_ATTR_FORK, NULL);
+ /* Read failures are not fatal; move on to the next block. */
+ if (error || !bp)
+ continue;
+
+ /* Screen out non-leaves & other garbage. */
+ info = bp->b_addr;
+ if (info->magic != cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) ||
+ xfs_attr3_leaf_buf_ops.verify_struct(bp) != NULL)
+ continue;
+
+ error = xfs_repair_xattr_recover_leaf(rx, bp);
+ if (error)
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Free all the attribute fork blocks and delete the fork.
+ *
+ * For non-local attr forks, each attr block is invalidated in its own
+ * transaction roll and the fork's extents are then truncated.  In all cases
+ * the attr fork is finally removed with xfs_attr_fork_remove().
+ * Returns 0 on success or a negative errno from the roll or truncate.
+ */
+STATIC int
+xfs_repair_xattr_zap(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_ifork *ifp;
+ struct xfs_buf *bp;
+ xfs_fileoff_t lblk;
+ int error;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* A local-format fork has no blocks to invalidate or truncate. */
+ if (sc->ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+ goto out_fork_remove;
+
+ /* Invalidate each attr block in the attr fork. */
+ ifp = XFS_IFORK_PTR(sc->ip, XFS_ATTR_FORK);
+ for_each_xfs_iext(ifp, &icur, &got) {
+ for_each_xfs_attr_block(sc->mp, &got, lblk) {
+ error = xfs_da_get_buf(sc->tp, sc->ip, lblk, -1, &bp,
+ XFS_ATTR_FORK);
+ /* Blocks we cannot get a buffer for are simply skipped. */
+ if (error || !bp)
+ continue;
+ xfs_trans_binval(sc->tp, bp);
+ error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+ if (error)
+ return error;
+ }
+ }
+
+ error = xfs_itruncate_extents(&sc->tp, sc->ip, XFS_ATTR_FORK, 0);
+ if (error)
+ return error;
+
+out_fork_remove:
+ /* Reset the attribute fork - this also destroys the in-core fork */
+ xfs_attr_fork_remove(sc->ip, sc->tp);
+ return 0;
+}
+
+/*
+ * Compare two xattr keys for list_sort().  ATTR_SECURE keys come before
+ * ATTR_ROOT and ATTR_ROOT keys come before user attrs.  Otherwise sort in
+ * hash order.
+ *
+ * @priv: unused
+ * @a:    list node embedded in the first struct xfs_attr_key
+ * @b:    list node embedded in the second struct xfs_attr_key
+ *
+ * Returns a negative value if @a should sort before @b, a positive value if
+ * it should sort after, and 0 if the two keys are equivalent.
+ */
+static int
+xfs_repair_xattr_key_cmp(
+	void			*priv,
+	struct list_head	*a,
+	struct list_head	*b)
+{
+	struct xfs_attr_key	*ap;
+	struct xfs_attr_key	*bp;
+	uint			ahash, bhash;
+
+	ap = container_of(a, struct xfs_attr_key, list);
+	bp = container_of(b, struct xfs_attr_key, list);
+
+	/*
+	 * list_sort() sorts ascending, so the key with the numerically larger
+	 * namespace flags (ATTR_SECURE > ATTR_ROOT > 0 for user attrs) must
+	 * compare as "less than" in order to be reinserted first.
+	 */
+	if (ap->flags > bp->flags)
+		return -1;
+	else if (ap->flags < bp->flags)
+		return 1;
+
+	/* Same namespace: order by name hash. */
+	ahash = xfs_da_hashname(ap->name, ap->namelen);
+	bhash = xfs_da_hashname(bp->name, bp->namelen);
+	if (ahash > bhash)
+		return 1;
+	else if (ahash < bhash)
+		return -1;
+	return 0;
+}
+
+/*
+ * Repair the extended attribute metadata.
+ *
+ * Salvages every attr key/value that can still be read from the attr fork,
+ * zaps the fork, commits the repair transaction, drops the ILOCK, and then
+ * re-adds the salvaged attrs one by one with xfs_attr_set().
+ *
+ * Returns 0 on success, -ENOENT if the inode has no attr fork, or another
+ * negative errno.  The in-memory list of salvaged attrs is always freed
+ * before returning, including when salvaging itself fails part way.
+ */
+int
+xfs_repair_xattr(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_repair_xattr		rx;
+	struct xfs_attr_key		*key, *next;
+	struct xfs_ifork		*ifp;
+	int				error;
+
+	if (!xfs_inode_hasattr(sc->ip))
+		return -ENOENT;
+	error = xfs_repair_ino_dqattach(sc);
+	if (error)
+		return error;
+
+	/* Extent map should be loaded. */
+	ifp = XFS_IFORK_PTR(sc->ip, XFS_ATTR_FORK);
+	if (XFS_IFORK_FORMAT(sc->ip, XFS_ATTR_FORK) != XFS_DINODE_FMT_LOCAL &&
+	    !(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(sc->tp, sc->ip, XFS_ATTR_FORK);
+		if (error)
+			return error;
+	}
+
+	memset(&rx, 0, sizeof(rx));
+	rx.sc = sc;
+	INIT_LIST_HEAD(&rx.attrlist);
+
+	/*
+	 * Read every attr key and value and record them in memory.  A failure
+	 * here (e.g. -ENOMEM) can happen after some keys were already queued,
+	 * so bail out through the cleanup path rather than returning directly.
+	 */
+	error = xfs_repair_xattr_recover(&rx);
+	if (error)
+		goto out_attrs;
+
+	/* Reinsert the security and root attrs first. */
+	list_sort(NULL, &rx.attrlist, xfs_repair_xattr_key_cmp);
+
+	/*
+	 * Invalidate and truncate the attribute fork extents, commit the
+	 * repair transaction, and drop the ilock.  The attribute setting code
+	 * needs to be able to allocate special transactions and take the
+	 * ilock on its own.  This means that we can't 100% prevent other
+	 * programs from accessing the inode while we're rebuilding the
+	 * attributes.
+	 */
+	error = xfs_repair_xattr_zap(sc);
+	if (error)
+		goto out_attrs;
+	error = xfs_trans_commit(sc->tp);
+	sc->tp = NULL;
+	if (error)
+		goto out_attrs;
+	xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
+	sc->ilock_flags &= ~XFS_ILOCK_EXCL;
+
+	/* Re-add every attr to the file. */
+	list_for_each_entry_safe(key, next, &rx.attrlist, list) {
+		error = xfs_attr_set(sc->ip, key->name, key->value,
+				key->valuelen, key->flags);
+		if (error)
+			goto out_attrs;
+
+		/*
+		 * If the attr value is larger than a single page, free the
+		 * key now so that we aren't hogging memory while doing a lot
+		 * of metadata updates.  Otherwise, we want to spend as little
+		 * time reconstructing the attrs as we possibly can.
+		 */
+		if (key->valuelen <= PAGE_SIZE)
+			continue;
+		list_del(&key->list);
+		kmem_free(key->value);
+		kmem_free(key);
+	}
+
+out_attrs:
+	/* Free attribute list. */
+	list_for_each_entry_safe(key, next, &rx.attrlist, list) {
+		list_del(&key->list);
+		kmem_free(key->value);
+		kmem_free(key);
+	}
+
+	return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 4c48da0dfc02..bac0668b0396 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -120,6 +120,7 @@ int xfs_repair_inode(struct xfs_scrub_context *sc);
int xfs_repair_bmap_data(struct xfs_scrub_context *sc);
int xfs_repair_bmap_attr(struct xfs_scrub_context *sc);
int xfs_repair_symlink(struct xfs_scrub_context *sc);
+int xfs_repair_xattr(struct xfs_scrub_context *sc);
#else
@@ -180,6 +181,7 @@ static inline int xfs_repair_rmapbt_setup(
#define xfs_repair_bmap_data xfs_repair_notsupported
#define xfs_repair_bmap_attr xfs_repair_notsupported
#define xfs_repair_symlink xfs_repair_notsupported
+#define xfs_repair_xattr xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index e22b48e0b2f3..4fc0b6a4ae70 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -319,7 +319,7 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_INODE,
.setup = xfs_scrub_setup_xattr,
.scrub = xfs_scrub_xattr,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_xattr,
},
[XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */
.type = ST_INODE,
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 8cf4062e069d..336c3169ae92 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -156,4 +156,7 @@ void xfs_scrub_xref_is_used_rt_space(struct xfs_scrub_context *sc,
# define xfs_scrub_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
#endif
+bool xfs_scrub_xattr_set_map(struct xfs_scrub_context *sc, unsigned long *map,
+ unsigned int start, unsigned int len);
+
#endif /* __XFS_SCRUB_SCRUB_H__ */
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 20/22] xfs: scrub should set preen if attr leaf has holes
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (18 preceding siblings ...)
2018-05-15 22:35 ` [PATCH 19/22] xfs: repair extended attributes Darrick J. Wong
@ 2018-05-15 22:35 ` Darrick J. Wong
2018-05-15 22:35 ` [PATCH 21/22] xfs: repair quotas Darrick J. Wong
` (2 subsequent siblings)
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:35 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
If an attr block indicates that it could use compaction, set the preen
flag to have the attr fork rebuilt, since the attr fork rebuilder can
take care of that for us.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/attr.c | 2 ++
fs/xfs/scrub/dabtree.c | 15 +++++++++++++++
fs/xfs/scrub/dabtree.h | 1 +
fs/xfs/scrub/trace.h | 1 +
4 files changed, 19 insertions(+)
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index ac25d624286e..ce27357a8dd1 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -307,6 +307,8 @@ xfs_scrub_xattr_block(
xfs_scrub_da_set_corrupt(ds, level);
if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, 0, hdrsize))
xfs_scrub_da_set_corrupt(ds, level);
+ if (leafhdr.holes)
+ xfs_scrub_da_set_preen(ds, level);
if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out;
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index bffdb7dc09bf..d11364d48286 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -99,6 +99,21 @@ xfs_scrub_da_set_corrupt(
__return_address);
}
+/*
+ * Flag a da btree node in need of optimization.
+ *
+ * Sets XFS_SCRUB_OFLAG_PREEN in the scrub request's output flags and emits
+ * the xfs_scrub_fblock_preen tracepoint for the block recorded at @level of
+ * the current da btree path.
+ */
+void
+xfs_scrub_da_set_preen(
+ struct xfs_scrub_da_btree *ds,
+ int level)
+{
+ struct xfs_scrub_context *sc = ds->sc;
+
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+ trace_xfs_scrub_fblock_preen(sc, ds->dargs.whichfork,
+ xfs_dir2_da_to_db(ds->dargs.geo,
+ ds->state->path.blk[level].blkno),
+ __return_address);
+}
+
/* Find an entry at a certain level in a da btree. */
STATIC void *
xfs_scrub_da_btree_entry(
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
index d31468d68cef..681f82faee3e 100644
--- a/fs/xfs/scrub/dabtree.h
+++ b/fs/xfs/scrub/dabtree.h
@@ -50,6 +50,7 @@ bool xfs_scrub_da_process_error(struct xfs_scrub_da_btree *ds, int level, int *e
/* Check for da btree corruption. */
void xfs_scrub_da_set_corrupt(struct xfs_scrub_da_btree *ds, int level);
+void xfs_scrub_da_set_preen(struct xfs_scrub_da_btree *ds, int level);
int xfs_scrub_da_btree_hash(struct xfs_scrub_da_btree *ds, int level,
__be32 *hashp);
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 794d56bb1af8..1e25cc1cf34b 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -244,6 +244,7 @@ DEFINE_EVENT(xfs_scrub_fblock_error_class, name, \
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_error);
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_warning);
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_preen);
TRACE_EVENT(xfs_scrub_incomplete,
TP_PROTO(struct xfs_scrub_context *sc, void *ret_ip),
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 21/22] xfs: repair quotas
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (19 preceding siblings ...)
2018-05-15 22:35 ` [PATCH 20/22] xfs: scrub should set preen if attr leaf has holes Darrick J. Wong
@ 2018-05-15 22:35 ` Darrick J. Wong
2018-05-15 22:36 ` [PATCH 22/22] xfs: implement live quotacheck as part of quota repair Darrick J. Wong
2018-05-18 3:47 ` [PATCH 0.5/22] xfs: grab the per-ag structure whenever relevant Darrick J. Wong
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:35 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Fix anything that causes the quota verifiers to fail.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/Makefile | 1
fs/xfs/scrub/attr_repair.c | 2
fs/xfs/scrub/common.h | 8 +
fs/xfs/scrub/quota.c | 2
fs/xfs/scrub/quota_repair.c | 355 +++++++++++++++++++++++++++++++++++++++++++
fs/xfs/scrub/repair.c | 58 +++++++
fs/xfs/scrub/repair.h | 8 +
fs/xfs/scrub/scrub.c | 11 +
fs/xfs/scrub/scrub.h | 1
9 files changed, 438 insertions(+), 8 deletions(-)
create mode 100644 fs/xfs/scrub/quota_repair.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5bc7e2deacbd..0018ba84944d 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -185,5 +185,6 @@ xfs-y += $(addprefix scrub/, \
rmap_repair.o \
symlink_repair.o \
)
+xfs-$(CONFIG_XFS_QUOTA) += scrub/quota_repair.o
endif
endif
diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c
index c7a50fd8f0f5..d66855860b7f 100644
--- a/fs/xfs/scrub/attr_repair.c
+++ b/fs/xfs/scrub/attr_repair.c
@@ -360,7 +360,7 @@ xfs_repair_xattr_recover(
}
/* Free all the attribute fork blocks and delete the fork. */
-STATIC int
+int
xfs_repair_xattr_zap(
struct xfs_scrub_context *sc)
{
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 6012049a8617..4079a7a65c87 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -151,6 +151,14 @@ static inline bool xfs_scrub_skip_xref(struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT);
}
+/*
+ * Do we need to invoke the repair tool?  True if any of the CORRUPT,
+ * XCORRUPT, or PREEN output flags is set in @sm.
+ */
+static inline bool xfs_scrub_needs_repair(struct xfs_scrub_metadata *sm)
+{
+	const __u32	repair_mask = XFS_SCRUB_OFLAG_CORRUPT |
+				      XFS_SCRUB_OFLAG_XCORRUPT |
+				      XFS_SCRUB_OFLAG_PREEN;
+
+	return sm->sm_flags & repair_mask;
+}
+
int xfs_scrub_metadata_inode_forks(struct xfs_scrub_context *sc);
int xfs_scrub_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 15ae4d23d6ac..64776257fd88 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -43,7 +43,7 @@
#include "scrub/trace.h"
/* Convert a scrub type code to a DQ flag, or return 0 if error. */
-static inline uint
+uint
xfs_scrub_quota_to_dqtype(
struct xfs_scrub_context *sc)
{
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
new file mode 100644
index 000000000000..68b7082af30a
--- /dev/null
+++ b/fs/xfs/scrub/quota_repair.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_dquot.h"
+#include "xfs_dquot_item.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/* Quota repair. */
+
+/* State shared with xfs_repair_quota_item across the dquot walk. */
+struct xfs_repair_quota_info {
+ /* Scrub context whose transaction is used to log repaired dquots. */
+ struct xfs_scrub_context *sc;
+ /* Set when a usage counter had to be clamped and must be recounted. */
+ bool need_quotacheck;
+};
+
+/*
+ * Repair the fields in an individual quota item. This is the callback that
+ * xfs_repair_quota passes to xfs_qm_dqiterate; @priv points to the struct
+ * xfs_repair_quota_info for this repair. Soft limits that exceed their hard
+ * limit are clamped to the hard limit, and usage counts that exceed what the
+ * filesystem can hold are clamped and flagged for a later quotacheck. If
+ * anything was changed, the dquot is logged and the transaction is rolled.
+ * Returns 0 or the error from rolling the transaction.
+ */
+STATIC int
+xfs_repair_quota_item(
+ struct xfs_dquot *dq,
+ uint dqtype,
+ void *priv)
+{
+ struct xfs_repair_quota_info *rqi = priv;
+ struct xfs_scrub_context *sc = rqi->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_disk_dquot *d = &dq->q_core;
+ unsigned long long bsoft;
+ unsigned long long isoft;
+ unsigned long long rsoft;
+ unsigned long long bhard;
+ unsigned long long ihard;
+ unsigned long long rhard;
+ unsigned long long bcount;
+ unsigned long long icount;
+ unsigned long long rcount;
+ xfs_ino_t fs_icount;
+ bool dirty = false;
+ int error;
+
+ /* Did we get the dquot type we wanted? */
+ if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES)) {
+ d->d_flags = dqtype;
+ dirty = true;
+ }
+
+ /* Padding fields must be zero. */
+ if (d->d_pad0 || d->d_pad) {
+ d->d_pad0 = 0;
+ d->d_pad = 0;
+ dirty = true;
+ }
+
+ /* Check the limits. */
+ bhard = be64_to_cpu(d->d_blk_hardlimit);
+ ihard = be64_to_cpu(d->d_ino_hardlimit);
+ rhard = be64_to_cpu(d->d_rtb_hardlimit);
+
+ bsoft = be64_to_cpu(d->d_blk_softlimit);
+ isoft = be64_to_cpu(d->d_ino_softlimit);
+ rsoft = be64_to_cpu(d->d_rtb_softlimit);
+
+ /* A soft limit above its hard limit is pulled down to the hard limit. */
+ if (bsoft > bhard) {
+ d->d_blk_softlimit = d->d_blk_hardlimit;
+ dirty = true;
+ }
+
+ if (isoft > ihard) {
+ d->d_ino_softlimit = d->d_ino_hardlimit;
+ dirty = true;
+ }
+
+ if (rsoft > rhard) {
+ d->d_rtb_softlimit = d->d_rtb_hardlimit;
+ dirty = true;
+ }
+
+ /* Check the resource counts. */
+ bcount = be64_to_cpu(d->d_bcount);
+ icount = be64_to_cpu(d->d_icount);
+ rcount = be64_to_cpu(d->d_rtbcount);
+ fs_icount = percpu_counter_sum(&mp->m_icount);
+
+ /*
+ * Check that usage doesn't exceed physical limits. However, on
+ * a reflink filesystem we're allowed to exceed physical space
+ * if there are no quota limits. We don't know what the real number
+ * is, but we can make quotacheck find out for us.
+ *
+ * In each case below the in-core reservation counter is adjusted by
+ * the same amount as the on-disk counter that is being clamped.
+ */
+ if (!xfs_sb_version_hasreflink(&mp->m_sb) &&
+ mp->m_sb.sb_dblocks < bcount) {
+ dq->q_res_bcount -= be64_to_cpu(dq->q_core.d_bcount);
+ dq->q_res_bcount += mp->m_sb.sb_dblocks;
+ d->d_bcount = cpu_to_be64(mp->m_sb.sb_dblocks);
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+ if (icount > fs_icount) {
+ dq->q_res_icount -= be64_to_cpu(dq->q_core.d_icount);
+ dq->q_res_icount += fs_icount;
+ d->d_icount = cpu_to_be64(fs_icount);
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+ if (rcount > mp->m_sb.sb_rblocks) {
+ dq->q_res_rtbcount -= be64_to_cpu(dq->q_core.d_rtbcount);
+ dq->q_res_rtbcount += mp->m_sb.sb_rblocks;
+ d->d_rtbcount = cpu_to_be64(mp->m_sb.sb_rblocks);
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+
+ /* Nothing was changed, so there is nothing to log. */
+ if (!dirty)
+ return 0;
+
+ dq->dq_flags |= XFS_DQ_DIRTY;
+ xfs_trans_dqjoin(sc->tp, dq);
+ xfs_trans_log_dquot(sc->tp, dq);
+ error = xfs_trans_roll(&sc->tp);
+ /*
+ * NOTE(review): presumably the roll dropped the dquot lock and the
+ * iterator expects it held on return -- confirm against
+ * xfs_qm_dqiterate.
+ */
+ xfs_dqlock(dq);
+ return error;
+}
+
+/*
+ * Fix a quota timer so that we can pass the verifier.  If a nonzero soft
+ * limit has been exceeded but no timer is set, start one that expires
+ * @timelimit seconds from now.  @softlimit, @countnow and *@timer are in
+ * on-disk (big endian) format.
+ */
+STATIC void
+xfs_repair_quota_fix_timer(
+	__be64			softlimit,
+	__be64			countnow,
+	__be32			*timer,
+	time_t			timelimit)
+{
+	uint64_t		limit = be64_to_cpu(softlimit);
+	uint64_t		usage = be64_to_cpu(countnow);
+
+	/* No soft limit, or usage still within it: no timer is needed. */
+	if (!limit || usage <= limit)
+		return;
+
+	/* A timer that is already set is left alone. */
+	if (*timer != 0)
+		return;
+
+	*timer = cpu_to_be32(get_seconds() + timelimit);
+}
+
+/*
+ * Fix anything the verifiers complain about in one buffer of dquot records.
+ *
+ * Every record in @bp gets its magic, version, type and id reset, any grace
+ * timer that should be running is started, and the per-record uuid/crc (or,
+ * on non-CRC filesystems, zeroed uuid/lsn/crc) is regenerated.  The buffer is
+ * then logged in full and the transaction rolled.
+ *
+ * @bp is the buffer that the caller re-read without a verifier; @id is the id
+ * of the first dquot in the buffer.  Returns 0 or the error from rolling the
+ * transaction.
+ */
+STATIC int
+xfs_repair_quota_block(
+	struct xfs_scrub_context	*sc,
+	struct xfs_buf			*bp,
+	uint				dqtype,
+	xfs_dqid_t			id)
+{
+	struct xfs_dqblk		*d = (struct xfs_dqblk *)bp->b_addr;
+	struct xfs_dqblk		*dqblk;
+	struct xfs_disk_dquot		*ddq;
+	struct xfs_quotainfo		*qi = sc->mp->m_quotainfo;
+	enum xfs_blft			buftype = 0;
+	int				i;
+
+	bp->b_ops = &xfs_dquot_buf_ops;
+	for (i = 0; i < qi->qi_dqperchunk; i++) {
+		dqblk = &d[i];
+		ddq = &dqblk->dd_diskdq;
+
+		ddq->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+		ddq->d_version = XFS_DQUOT_VERSION;
+		ddq->d_flags = dqtype;
+		ddq->d_id = cpu_to_be32(id + i);
+
+		xfs_repair_quota_fix_timer(ddq->d_blk_softlimit,
+				ddq->d_bcount, &ddq->d_btimer,
+				qi->qi_btimelimit);
+		xfs_repair_quota_fix_timer(ddq->d_ino_softlimit,
+				ddq->d_icount, &ddq->d_itimer,
+				qi->qi_itimelimit);
+		xfs_repair_quota_fix_timer(ddq->d_rtb_softlimit,
+				ddq->d_rtbcount, &ddq->d_rtbtimer,
+				qi->qi_rtbtimelimit);
+
+		/*
+		 * Each record in the buffer carries its own uuid, lsn and crc,
+		 * so fix them up for record i rather than only for the first
+		 * record in the buffer.
+		 */
+		if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
+			uuid_copy(&dqblk->dd_uuid, &sc->mp->m_sb.sb_meta_uuid);
+			xfs_update_cksum((char *)dqblk, sizeof(*dqblk),
+					 XFS_DQUOT_CRC_OFF);
+		} else {
+			memset(&dqblk->dd_uuid, 0, sizeof(dqblk->dd_uuid));
+			dqblk->dd_lsn = 0;
+			dqblk->dd_crc = 0;
+		}
+	}
+
+	/* Tell the log what kind of dquot buffer this is. */
+	switch (dqtype) {
+	case XFS_DQ_USER:
+		buftype = XFS_BLFT_UDQUOT_BUF;
+		break;
+	case XFS_DQ_GROUP:
+		buftype = XFS_BLFT_GDQUOT_BUF;
+		break;
+	case XFS_DQ_PROJ:
+		buftype = XFS_BLFT_PDQUOT_BUF;
+		break;
+	}
+	xfs_trans_buf_set_type(sc->tp, bp, buftype);
+	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
+	return xfs_trans_roll(&sc->tp);
+}
+
+/*
+ * Repair quota's data fork.
+ *
+ * First repair the generic metadata-inode fork problems, then look for
+ * problems that only apply to quota files: delalloc extents are treated as
+ * unfixable corruption, and mappings beyond the block that holds the highest
+ * possible dquot id cause the fork to be truncated.  Finally, read every
+ * dquot cluster through the verifier and rewrite any cluster that fails it or
+ * whose first dquot id does not match its file offset.
+ *
+ * Returns 0 or a negative errno.
+ */
+STATIC int
+xfs_repair_quota_data_fork(
+	struct xfs_scrub_context	*sc,
+	uint				dqtype)
+{
+	struct xfs_bmbt_irec		irec = { 0 };
+	struct xfs_iext_cursor		icur;
+	struct xfs_scrub_metadata	*real_sm = sc->sm;
+	struct xfs_quotainfo		*qi = sc->mp->m_quotainfo;
+	struct xfs_ifork		*ifp;
+	struct xfs_buf			*bp;
+	struct xfs_dqblk		*d;
+	xfs_dqid_t			id;
+	xfs_fileoff_t			max_dqid_off;
+	xfs_fileoff_t			off;
+	xfs_fsblock_t			fsbno;
+	bool				truncate = false;
+	int				error = 0;
+
+	error = xfs_repair_metadata_inode_forks(sc);
+	if (error)
+		goto out;
+
+	/* Check for data fork problems that apply only to quota files. */
+	max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
+	ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+	for_each_xfs_iext(ifp, &icur, &irec) {
+		if (isnullstartblock(irec.br_startblock)) {
+			error = -EFSCORRUPTED;
+			goto out;
+		}
+
+		if (irec.br_startoff > max_dqid_off ||
+		    irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) {
+			truncate = true;
+			break;
+		}
+	}
+	if (truncate) {
+		/*
+		 * NOTE(review): the check above treats max_dqid_off itself as
+		 * a valid file block; confirm that truncating to
+		 * max_dqid_off blocks does not also remove that block.
+		 */
+		error = xfs_itruncate_extents(&sc->tp, sc->ip, XFS_DATA_FORK,
+				max_dqid_off * sc->mp->m_sb.sb_blocksize);
+		if (error)
+			goto out;
+	}
+
+	/* Now go fix anything that fails the verifiers. */
+	for_each_xfs_iext(ifp, &icur, &irec) {
+		for (fsbno = irec.br_startblock, off = irec.br_startoff;
+		     fsbno < irec.br_startblock + irec.br_blockcount;
+		     fsbno += XFS_DQUOT_CLUSTER_SIZE_FSB,
+				off += XFS_DQUOT_CLUSTER_SIZE_FSB) {
+			id = off * qi->qi_dqperchunk;
+			error = xfs_trans_read_buf(sc->mp, sc->tp,
+					sc->mp->m_ddev_targp,
+					XFS_FSB_TO_DADDR(sc->mp, fsbno),
+					qi->qi_dqchunklen,
+					0, &bp, &xfs_dquot_buf_ops);
+			if (error == 0) {
+				/* Verifier passed; is the id what we expect? */
+				d = (struct xfs_dqblk *)bp->b_addr;
+				if (id == be32_to_cpu(d->dd_diskdq.d_id))
+					continue;
+				error = -EFSCORRUPTED;
+			}
+			if (error != -EFSBADCRC && error != -EFSCORRUPTED)
+				goto out;
+
+			/* Failed verifier, try again. */
+			error = xfs_trans_read_buf(sc->mp, sc->tp,
+					sc->mp->m_ddev_targp,
+					XFS_FSB_TO_DADDR(sc->mp, fsbno),
+					qi->qi_dqchunklen,
+					0, &bp, NULL);
+			if (error)
+				goto out;
+			error = xfs_repair_quota_block(sc, bp, dqtype, id);
+			/*
+			 * Stop on the first failed rewrite; otherwise the
+			 * next iteration would overwrite the error and keep
+			 * going with a transaction that failed to roll.
+			 */
+			if (error)
+				goto out;
+		}
+	}
+
+out:
+	sc->sm = real_sm;
+	return error;
+}
+
+/*
+ * Repair all of a quota type's items. The quota inode's data fork and any
+ * dquot blocks that fail the verifiers are repaired first; then every dquot
+ * of this type is walked and its fields are fixed. If any usage counter had
+ * to be clamped, a quotacheck is forced. Returns 0 or a negative errno.
+ */
+int
+xfs_repair_quota(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_repair_quota_info rqi;
+ struct xfs_mount *mp = sc->mp;
+ uint dqtype;
+ int error = 0;
+
+ dqtype = xfs_scrub_quota_to_dqtype(sc);
+
+ error = xfs_repair_quota_data_fork(sc, dqtype);
+ if (error)
+ goto out;
+
+ /*
+ * Go fix anything in the quota items that we could have been mad
+ * about. Now that we've checked the quota inode data fork we have to
+ * drop ILOCK_EXCL to use the regular dquot functions.
+ */
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = 0;
+ rqi.sc = sc;
+ rqi.need_quotacheck = false;
+ error = xfs_qm_dqiterate(mp, dqtype, xfs_repair_quota_item, &rqi);
+ if (error)
+ goto out_relock;
+
+ /* Make a quotacheck happen. */
+ if (rqi.need_quotacheck)
+ xfs_repair_force_quotacheck(sc, dqtype);
+
+out_relock:
+ /* Retake the ILOCK that was dropped for the dquot walk. */
+ sc->ilock_flags = XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 2b97f54d8e1f..6cd109119692 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -45,6 +45,8 @@
#include "xfs_quota.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_attr.h"
+#include "xfs_reflink.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -1200,3 +1202,59 @@ xfs_repair_grab_all_ag_headers(
return error;
}
+
+/*
+ * Repair the attr/data forks of a metadata inode. The metadata inode must be
+ * pointed to by sc->ip and the ILOCK must be held.
+ *
+ * The forks are scrubbed first and nothing is changed if they are clean.
+ * Otherwise the reflink flag and any attr fork are removed, the data fork
+ * mappings are repaired, and the forks are scrubbed again. Returns 0 on
+ * success, -EFSCORRUPTED if the forks still need repair afterwards, or
+ * another negative errno. The caller's sm_type and sm_flags are restored
+ * before returning.
+ */
+int
+xfs_repair_metadata_inode_forks(
+ struct xfs_scrub_context *sc)
+{
+ __u32 smtype;
+ __u32 smflags;
+ int error;
+
+ /* Save the caller's scrub type and flags; both are borrowed below. */
+ smtype = sc->sm->sm_type;
+ smflags = sc->sm->sm_flags;
+
+ /* Let's see if the forks need repair. */
+ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ error = xfs_scrub_metadata_inode_forks(sc);
+ if (error || !xfs_scrub_needs_repair(sc->sm))
+ goto out;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Clear the reflink flag & attr forks that we shouldn't have. */
+ if (xfs_is_reflink_inode(sc->ip)) {
+ error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
+ if (error)
+ goto out;
+ }
+
+ if (xfs_inode_hasattr(sc->ip)) {
+ error = xfs_repair_xattr_zap(sc);
+ if (error)
+ goto out;
+ }
+
+ /* Repair the data fork. */
+ sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD;
+ error = xfs_repair_bmap_data(sc);
+ sc->sm->sm_type = smtype;
+ if (error)
+ goto out;
+
+ /* Bail out if we still need repairs. */
+ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ error = xfs_scrub_metadata_inode_forks(sc);
+ if (error)
+ goto out;
+ if (xfs_scrub_needs_repair(sc->sm))
+ error = -EFSCORRUPTED;
+out:
+ /* Put back the caller's scrub type and flags on every exit path. */
+ sc->sm->sm_type = smtype;
+ sc->sm->sm_flags = smflags;
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index bac0668b0396..0e87c5371f7c 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -104,6 +104,8 @@ int xfs_repair_fs_thaw(struct xfs_scrub_context *sc);
void xfs_repair_frozen_iput(struct xfs_scrub_context *sc, struct xfs_inode *ip);
int xfs_repair_grab_all_ag_headers(struct xfs_scrub_context *sc);
int xfs_repair_rmapbt_setup(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+int xfs_repair_xattr_zap(struct xfs_scrub_context *sc);
+int xfs_repair_metadata_inode_forks(struct xfs_scrub_context *sc);
/* Metadata repairers */
@@ -121,6 +123,11 @@ int xfs_repair_bmap_data(struct xfs_scrub_context *sc);
int xfs_repair_bmap_attr(struct xfs_scrub_context *sc);
int xfs_repair_symlink(struct xfs_scrub_context *sc);
int xfs_repair_xattr(struct xfs_scrub_context *sc);
+#ifdef CONFIG_XFS_QUOTA
+int xfs_repair_quota(struct xfs_scrub_context *sc);
+#else
+# define xfs_repair_quota xfs_repair_notsupported
+#endif /* CONFIG_XFS_QUOTA */
#else
@@ -182,6 +189,7 @@ static inline int xfs_repair_rmapbt_setup(
#define xfs_repair_bmap_attr xfs_repair_notsupported
#define xfs_repair_symlink xfs_repair_notsupported
#define xfs_repair_xattr xfs_repair_notsupported
+#define xfs_repair_quota xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 4fc0b6a4ae70..baa2ed42fb55 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -351,19 +351,19 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_FS,
.setup = xfs_scrub_setup_quota,
.scrub = xfs_scrub_quota,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_quota,
},
[XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */
.type = ST_FS,
.setup = xfs_scrub_setup_quota,
.scrub = xfs_scrub_quota,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_quota,
},
[XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */
.type = ST_FS,
.setup = xfs_scrub_setup_quota,
.scrub = xfs_scrub_quota,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_quota,
},
};
@@ -559,9 +559,8 @@ xfs_scrub_metadata(
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
- needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
- XFS_SCRUB_OFLAG_XCORRUPT |
- XFS_SCRUB_OFLAG_PREEN));
+ needs_fix = xfs_scrub_needs_repair(sc.sm);
+
/*
* If userspace asked for a repair but it wasn't necessary,
* report that back to userspace.
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 336c3169ae92..f42a6ea5b553 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -158,5 +158,6 @@ void xfs_scrub_xref_is_used_rt_space(struct xfs_scrub_context *sc,
bool xfs_scrub_xattr_set_map(struct xfs_scrub_context *sc, unsigned long *map,
unsigned int start, unsigned int len);
+uint xfs_scrub_quota_to_dqtype(struct xfs_scrub_context *sc);
#endif /* __XFS_SCRUB_SCRUB_H__ */
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 22/22] xfs: implement live quotacheck as part of quota repair
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (20 preceding siblings ...)
2018-05-15 22:35 ` [PATCH 21/22] xfs: repair quotas Darrick J. Wong
@ 2018-05-15 22:36 ` Darrick J. Wong
2018-05-18 3:47 ` [PATCH 0.5/22] xfs: grab the per-ag structure whenever relevant Darrick J. Wong
22 siblings, 0 replies; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-15 22:36 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Use the fs freezing mechanism we developed for the rmapbt repair to
freeze the fs, this time to scan the fs for a live quotacheck. We add a
new dqget variant to use the existing scrub transaction to allocate an
on-disk dquot block if it is missing.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/quota.c | 20 +++
fs/xfs/scrub/quota_repair.c | 286 +++++++++++++++++++++++++++++++++++++++++++
fs/xfs/xfs_dquot.c | 59 ++++++++-
fs/xfs/xfs_dquot.h | 3
4 files changed, 360 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 64776257fd88..596c660ca155 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -41,6 +41,7 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
+#include "scrub/repair.h"
/* Convert a scrub type code to a DQ flag, or return 0 if error. */
uint
@@ -78,12 +79,29 @@ xfs_scrub_setup_quota(
mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
if (!xfs_this_quota_on(sc->mp, dqtype))
return -ENOENT;
+ /*
+ * Freeze out anything that can alter an inode because we reconstruct
+ * the quota counts by iterating all the inodes in the system.
+ */
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ (sc->try_harder || XFS_QM_NEED_QUOTACHECK(sc->mp))) {
+ error = xfs_repair_fs_freeze(sc);
+ if (error)
+ return error;
+ }
error = xfs_scrub_setup_fs(sc, ip);
if (error)
return error;
sc->ip = xfs_quota_inode(sc->mp, dqtype);
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
sc->ilock_flags = XFS_ILOCK_EXCL;
+ /*
+ * Pretend to be an ILOCK parent to shut up lockdep if we're going to
+ * do a full inode scan of the fs. Quota inodes do not count towards
+ * quota accounting, so we shouldn't deadlock on ourselves.
+ */
+ if (sc->fs_frozen)
+ sc->ilock_flags |= XFS_ILOCK_PARENT;
+ xfs_ilock(sc->ip, sc->ilock_flags);
return 0;
}
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
index 68b7082af30a..5865025fe04d 100644
--- a/fs/xfs/scrub/quota_repair.c
+++ b/fs/xfs/scrub/quota_repair.c
@@ -30,13 +30,20 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
+#include "xfs_icache.h"
#include "xfs_inode_fork.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_dquot.h"
#include "xfs_dquot_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -314,6 +321,269 @@ xfs_repair_quota_data_fork(
return error;
}
+/*
+ * Add this inode's resource usage to the dquot.  We adjust the in-core and
+ * the (cached) on-disk copies of the counters and leave the dquot dirty.  A
+ * subsequent pass through the dquots logs them all to disk.  Fortunately we
+ * froze the filesystem before starting so at least we don't have to deal
+ * with chown/chproj races.
+ *
+ * @ip is the inode being counted; the dquot id is derived from it.  Note
+ * that this is not sc->ip, which is the quota inode itself.  @nblks and
+ * @rtblks are the data device and realtime block counts to charge.
+ * Returns 0 or a negative errno.
+ */
+STATIC int
+xfs_repair_quotacheck_dqadjust(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip,
+	uint				type,
+	xfs_qcnt_t			nblks,
+	xfs_qcnt_t			rtblks)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_dquot		*dqp;
+	xfs_dqid_t			id;
+	int				error;
+
+	/* Try to read in the dquot that owns the inode being scanned. */
+	id = xfs_qm_id_for_quotatype(ip, type);
+	error = xfs_qm_dqget(mp, id, type, false, &dqp);
+	if (error == -ENOENT) {
+		/* Allocate a dquot using our special transaction. */
+		error = xfs_qm_dqget_alloc(&sc->tp, id, type, &dqp);
+		if (error)
+			return error;
+		/* The transaction has the quota inode (sc->ip) attached. */
+		error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+	}
+	if (error) {
+		/*
+		 * Shouldn't be able to turn off quotas here.
+		 */
+		ASSERT(error != -ESRCH);
+		ASSERT(error != -ENOENT);
+		return error;
+	}
+
+	/*
+	 * Adjust the inode count and the block count to reflect this inode's
+	 * resource usage.
+	 */
+	be64_add_cpu(&dqp->q_core.d_icount, 1);
+	dqp->q_res_icount++;
+	if (nblks) {
+		be64_add_cpu(&dqp->q_core.d_bcount, nblks);
+		dqp->q_res_bcount += nblks;
+	}
+	if (rtblks) {
+		be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);
+		dqp->q_res_rtbcount += rtblks;
+	}
+
+	/*
+	 * Set default limits, adjust timers (since we changed usages)
+	 *
+	 * There are no timers for the default values set in the root dquot.
+	 */
+	if (dqp->q_core.d_id) {
+		xfs_qm_adjust_dqlimits(mp, dqp);
+		xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
+	}
+
+	dqp->dq_flags |= XFS_DQ_DIRTY;
+	xfs_qm_dqput(dqp);
+	return 0;
+}
+
+/*
+ * Record this inode's quota use.  Realtime inodes have their data fork
+ * extents counted as realtime blocks; everything else in di_nblocks is
+ * charged as regular blocks.  Returns 0 or a negative errno.
+ */
+STATIC int
+xfs_repair_quotacheck_inode(
+	struct xfs_scrub_context	*sc,
+	uint				dqtype,
+	struct xfs_inode		*ip)
+{
+	struct xfs_ifork		*ifp;
+	xfs_filblks_t			rtblks = 0;	/* total rt blks */
+	xfs_qcnt_t			nblks;
+	int				error;
+
+	/* Count the realtime blocks. */
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+
+		/* Load the extent list if it isn't in memory yet. */
+		if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+			error = xfs_iread_extents(sc->tp, ip, XFS_DATA_FORK);
+			if (error)
+				return error;
+		}
+
+		xfs_bmap_count_leaves(ifp, &rtblks);
+	}
+
+	nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
+
+	/* Adjust the dquot that owns this inode, not the quota inode. */
+	return xfs_repair_quotacheck_dqadjust(sc, ip, dqtype, nblks, rtblks);
+}
+
+/* Private data passed to xfs_repair_quotacheck_inobt for each record. */
+struct xfs_repair_quotacheck {
+ /* Scrub context driving the quotacheck. */
+ struct xfs_scrub_context *sc;
+ /* Quota type (as a DQ flag) whose usage is being recounted. */
+ uint dqtype;
+};
+
+/*
+ * Iterate all the inodes in one inode btree record and add the resource
+ * usage of each allocated, non-quota inode to its dquot. This is the
+ * callback that xfs_repair_quotacheck passes to xfs_btree_query_all; @priv
+ * points to a struct xfs_repair_quotacheck. Returns 0, -EDEADLOCK if an
+ * inode was being reclaimed, or another negative errno.
+ */
+STATIC int
+xfs_repair_quotacheck_inobt(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = NULL;
+ struct xfs_repair_quotacheck *rq = priv;
+ xfs_ino_t ino;
+ xfs_agino_t agino;
+ int chunkidx;
+ int error = 0;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+
+ for (chunkidx = 0, agino = irec.ir_startino;
+ chunkidx < XFS_INODES_PER_CHUNK;
+ chunkidx++, agino++) {
+ bool inuse;
+
+ /* Skip if this inode is free */
+ if (XFS_INOBT_MASK(chunkidx) & irec.ir_free)
+ continue;
+ /* Quota inodes are not counted. */
+ ino = XFS_AGINO_TO_INO(mp, cur->bc_private.a.agno, agino);
+ if (xfs_is_quota_inode(&mp->m_sb, ino))
+ continue;
+
+ /*
+ * Back off and try again if an inode is being reclaimed
+ *
+ * NOTE(review): only -EAGAIN is acted on here; any other error
+ * and the inuse result are ignored -- confirm this is intended.
+ */
+ error = xfs_icache_inode_is_allocated(mp, NULL, ino, &inuse);
+ if (error == -EAGAIN)
+ return -EDEADLOCK;
+
+ /*
+ * Grab inode for scanning. We cannot use DONTCACHE here
+ * because we already have a transaction so the iput must not
+ * trigger inode reclaim (which might allocate a transaction
+ * to clean up posteof blocks).
+ */
+ error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
+ if (error)
+ return error;
+
+ /* Always unlock and release the inode before checking the result. */
+ error = xfs_repair_quotacheck_inode(rq->sc, rq->dqtype, ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_repair_frozen_iput(rq->sc, ip);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Zero a dquot prior to regenerating the counts.  The current on-disk usage
+ * is subtracted from the in-core reservation counters, the on-disk usage
+ * counters are cleared, and the dquot is marked dirty.  Always returns 0.
+ */
+static int
+xfs_repair_quotacheck_zero_dquot(
+	struct xfs_dquot	*dq,
+	uint			dqtype,
+	void			*priv)
+{
+	struct xfs_disk_dquot	*ddq = &dq->q_core;
+
+	/* Take the recorded usage back out of the in-core reservations. */
+	dq->q_res_bcount -= be64_to_cpu(ddq->d_bcount);
+	dq->q_res_icount -= be64_to_cpu(ddq->d_icount);
+	dq->q_res_rtbcount -= be64_to_cpu(ddq->d_rtbcount);
+
+	/* Now clear the usage counters themselves. */
+	ddq->d_bcount = 0;
+	ddq->d_icount = 0;
+	ddq->d_rtbcount = 0;
+
+	dq->dq_flags |= XFS_DQ_DIRTY;
+	return 0;
+}
+
+/*
+ * Log a dirty dquot after we regenerated the counters. This is the callback
+ * that xfs_repair_quotacheck passes to xfs_qm_dqiterate; @priv is the scrub
+ * context. The dquot is joined to and logged in sc->tp, which is then rolled.
+ * Returns 0 or the error from rolling the transaction.
+ */
+static int
+xfs_repair_quotacheck_log_dquot(
+ struct xfs_dquot *dq,
+ uint dqtype,
+ void *priv)
+{
+ struct xfs_scrub_context *sc = priv;
+ int error;
+
+ xfs_trans_dqjoin(sc->tp, dq);
+ xfs_trans_log_dquot(sc->tp, dq);
+ error = xfs_trans_roll(&sc->tp);
+ /*
+ * NOTE(review): presumably the roll dropped the dquot lock and the
+ * iterator expects it held on return -- confirm.
+ */
+ xfs_dqlock(dq);
+ return error;
+}
+
+/*
+ * Execute an online quotacheck for one quota type: commit the scrub
+ * transaction, zero the usage counters of every dquot of this type, walk the
+ * inode btree of every AG to add each inode's usage back in, then allocate a
+ * new transaction, log all the dquots, and set the quotachecked flag in the
+ * in-core and superblock quota flags. Returns 0 or a negative errno.
+ */
+STATIC int
+xfs_repair_quotacheck(
+ struct xfs_scrub_context *sc,
+ uint dqtype)
+{
+ struct xfs_repair_quotacheck rq;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_btree_cur *cur;
+ xfs_agnumber_t ag;
+ uint flag;
+ int error;
+
+ /*
+ * Commit the transaction so that we can allocate new quota ip
+ * mappings if we have to. If we crash after this point, the sb
+ * still has the CHKD flags cleared, so mount quotacheck will fix
+ * all of this up.
+ *
+ * NOTE(review): sc->tp stays NULL from here until the
+ * xfs_scrub_trans_alloc call below, yet the inode scan can reach
+ * xfs_qm_dqget_alloc(&sc->tp, ...), which dereferences *tpp --
+ * confirm that path cannot be taken with a NULL transaction.
+ */
+ error = xfs_trans_commit(sc->tp);
+ sc->tp = NULL;
+ if (error)
+ return error;
+
+ /* Zero all the quota items. */
+ error = xfs_qm_dqiterate(mp, dqtype, xfs_repair_quotacheck_zero_dquot,
+ sc);
+ if (error)
+ goto out;
+
+ rq.sc = sc;
+ rq.dqtype = dqtype;
+
+ /* Iterate all AGs for inodes. */
+ for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+ error = xfs_ialloc_read_agi(mp, NULL, ag, &bp);
+ if (error)
+ goto out;
+ cur = xfs_inobt_init_cursor(mp, NULL, bp, ag, XFS_BTNUM_INO);
+ error = xfs_btree_query_all(cur, xfs_repair_quotacheck_inobt,
+ &rq);
+ /* Tear down the cursor and AGI buffer before checking the result. */
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR);
+ xfs_buf_relse(bp);
+ if (error)
+ goto out;
+ }
+
+ /* Log dquots. */
+ error = xfs_scrub_trans_alloc(sc, 0);
+ if (error)
+ goto out;
+ error = xfs_qm_dqiterate(mp, dqtype, xfs_repair_quotacheck_log_dquot,
+ sc);
+ if (error)
+ goto out;
+
+ /* Set quotachecked flag. */
+ flag = xfs_quota_chkd_flag(dqtype);
+ sc->mp->m_qflags |= flag;
+ spin_lock(&sc->mp->m_sb_lock);
+ sc->mp->m_sb.sb_qflags |= flag;
+ spin_unlock(&sc->mp->m_sb_lock);
+ xfs_log_sb(sc->tp);
+out:
+ return error;
+}
+
/* Repair all of a quota type's items. */
int
xfs_repair_quota(
@@ -322,6 +592,7 @@ xfs_repair_quota(
struct xfs_repair_quota_info rqi;
struct xfs_mount *mp = sc->mp;
uint dqtype;
+ uint flag;
int error = 0;
dqtype = xfs_scrub_quota_to_dqtype(sc);
@@ -344,9 +615,22 @@ xfs_repair_quota(
goto out_relock;
/* Make a quotacheck happen. */
- if (rqi.need_quotacheck)
+ if (rqi.need_quotacheck ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
xfs_repair_force_quotacheck(sc, dqtype);
+ /* Do we need a quotacheck? Did we need one? */
+ flag = xfs_quota_chkd_flag(dqtype);
+ if (!(flag & sc->mp->m_qflags)) {
+ /* We need to freeze the fs before we can scan inodes. */
+ if (!sc->fs_frozen) {
+ error = -EDEADLOCK;
+ goto out_relock;
+ }
+
+ error = xfs_repair_quotacheck(sc, dqtype);
+ }
+
out_relock:
sc->ilock_flags = XFS_ILOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 2567391489bd..be0e07f42b17 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -546,6 +546,7 @@ xfs_dquot_from_disk(
static int
xfs_qm_dqread_alloc(
struct xfs_mount *mp,
+ struct xfs_trans **tpp,
struct xfs_dquot *dqp,
struct xfs_buf **bpp)
{
@@ -553,6 +554,18 @@ xfs_qm_dqread_alloc(
struct xfs_buf *bp;
int error;
+	/*
+	 * The caller passed in a transaction which we don't control, so
+	 * release the hold before passing back the buffer.  The buffer must
+	 * be stored in *bpp here because this path returns early.
+	 */
+	if (tpp) {
+		error = xfs_dquot_disk_alloc(tpp, dqp, &bp);
+		if (error)
+			return error;
+		xfs_trans_bhold_release(*tpp, bp);
+		*bpp = bp;
+		return 0;
+	}
+
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
if (error)
@@ -588,6 +601,7 @@ xfs_qm_dqread_alloc(
static int
xfs_qm_dqread(
struct xfs_mount *mp,
+ struct xfs_trans **tpp,
xfs_dqid_t id,
uint type,
bool can_alloc,
@@ -603,7 +617,7 @@ xfs_qm_dqread(
/* Try to read the buffer, allocating if necessary. */
error = xfs_dquot_disk_read(mp, dqp, &bp);
if (error == -ENOENT && can_alloc)
- error = xfs_qm_dqread_alloc(mp, dqp, &bp);
+ error = xfs_qm_dqread_alloc(mp, tpp, dqp, &bp);
if (error)
goto err;
@@ -787,9 +801,10 @@ xfs_qm_dqget_checks(
* Given the file system, id, and type (UDQUOT/GDQUOT), return a a locked
* dquot, doing an allocation (if requested) as needed.
*/
-int
-xfs_qm_dqget(
+static int
+__xfs_qm_dqget(
struct xfs_mount *mp,
+ struct xfs_trans **tpp,
xfs_dqid_t id,
uint type,
bool can_alloc,
@@ -811,7 +826,7 @@ xfs_qm_dqget(
return 0;
}
- error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
+	/* Hand down the caller's transaction (if any) for dquot allocation. */
+	error = xfs_qm_dqread(mp, tpp, id, type, can_alloc, &dqp);
if (error)
return error;
@@ -850,7 +865,39 @@ xfs_qm_dqget_uncached(
if (error)
return error;
- return xfs_qm_dqread(mp, id, type, 0, dqpp);
+ return xfs_qm_dqread(mp, NULL, id, type, 0, dqpp);
+}
+
+/*
+ * Given the file system, id, and type (UDQUOT/GDQUOT), return a locked
+ * dquot, doing an allocation (if requested) as needed. This is a wrapper
+ * around __xfs_qm_dqget that supplies no caller transaction.
+ */
+int
+xfs_qm_dqget(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ bool can_alloc,
+ struct xfs_dquot **O_dqpp)
+{
+ return __xfs_qm_dqget(mp, NULL, id, type, can_alloc, O_dqpp);
+}
+
+/*
+ * Given the file system, id, and type (UDQUOT/GDQUOT) and a hole in the quota
+ * data where the on-disk dquot is supposed to live, return a locked dquot
+ * having allocated blocks with the transaction. This is a corner case
+ * required by online repair, which already has a transaction and has to pass
+ * that into dquot_setup.
+ *
+ * The mount is taken from the transaction, so *tpp must not be NULL.
+ * Allocation is always permitted.
+ */
+int
+xfs_qm_dqget_alloc(
+ struct xfs_trans **tpp,
+ xfs_dqid_t id,
+ uint type,
+ struct xfs_dquot **dqpp)
+{
+ return __xfs_qm_dqget((*tpp)->t_mountp, tpp, id, type, true, dqpp);
}
/* Return the quota id for a given inode and type. */
@@ -914,7 +961,7 @@ xfs_qm_dqget_inode(
* we re-acquire the lock.
*/
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
+ error = xfs_qm_dqread(mp, NULL, id, type, can_alloc, &dqp);
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index bdd6bd921528..27e6df439493 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -180,6 +180,9 @@ extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
extern int xfs_qm_dqget_uncached(struct xfs_mount *mp,
xfs_dqid_t id, uint type,
struct xfs_dquot **dqpp);
+extern int xfs_qm_dqget_alloc(struct xfs_trans **tpp,
+ xfs_dqid_t id, uint type,
+ struct xfs_dquot **dqpp);
extern void xfs_qm_dqput(xfs_dquot_t *);
extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
^ permalink raw reply related [flat|nested] 76+ messages in thread
* [PATCH 0.5/22] xfs: grab the per-ag structure whenever relevant
2018-05-15 22:33 [PATCH v15.1 00/22] xfs-4.18: online repair support Darrick J. Wong
` (21 preceding siblings ...)
2018-05-15 22:36 ` [PATCH 22/22] xfs: implement live quotacheck as part of quota repair Darrick J. Wong
@ 2018-05-18 3:47 ` Darrick J. Wong
2018-05-30 6:44 ` Dave Chinner
22 siblings, 1 reply; 76+ messages in thread
From: Darrick J. Wong @ 2018-05-18 3:47 UTC (permalink / raw)
To: linux-xfs, david
From: Darrick J. Wong <darrick.wong@oracle.com>
Grab and hold the per-AG data across a scrub run whenever relevant.
This helps us avoid repeated trips through rcu in the repair code.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/scrub/common.c | 17 +++++++++++++++++
fs/xfs/scrub/common.h | 1 +
fs/xfs/scrub/scrub.h | 1 +
3 files changed, 19 insertions(+)
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 518bff2be0c9..d3e5adc96411 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -541,6 +541,10 @@ xfs_scrub_ag_free(
xfs_trans_brelse(sc->tp, sa->agi_bp);
sa->agi_bp = NULL;
}
+ if (sa->pag) {
+ xfs_perag_put(sa->pag);
+ sa->pag = NULL;
+ }
sa->agno = NULLAGNUMBER;
}
@@ -568,6 +572,19 @@ xfs_scrub_ag_init(
return xfs_scrub_ag_btcur_init(sc, sa);
}
+/*
+ * Grab the per-ag structure if we haven't already gotten it. Teardown of the
+ * xfs_scrub_ag will release it for us.
+ */
+void
+xfs_scrub_perag_get(
+ struct xfs_mount *mp,
+ struct xfs_scrub_ag *sa)
+{
+ if (!sa->pag)
+ sa->pag = xfs_perag_get(mp, sa->agno);
+}
+
/* Per-scrubber setup functions */
/*
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index a660087b606e..fbb91a7144fd 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -123,6 +123,7 @@ xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip)
void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa);
int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
struct xfs_scrub_ag *sa);
+void xfs_scrub_perag_get(struct xfs_mount *mp, struct xfs_scrub_ag *sa);
int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
struct xfs_buf **agi, struct xfs_buf **agf,
struct xfs_buf **agfl);
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 2f89a84a0e10..636424d5e2ee 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -51,6 +51,7 @@ struct xfs_scrub_meta_ops {
/* Buffer pointers and btree cursors for an entire AG. */
struct xfs_scrub_ag {
xfs_agnumber_t agno;
+ struct xfs_perag *pag;
/* AG btree roots */
struct xfs_buf *agf_bp;
^ permalink raw reply related [flat|nested] 76+ messages in thread