* [PATCH v2 0/3] xfs: prepare repair for bulk loading
@ 2020-01-01  1:02 Darrick J. Wong
  2020-01-01  1:02 ` [PATCH 1/3] xfs: implement block reservation accounting for btrees we're staging Darrick J. Wong
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Darrick J. Wong @ 2020-01-01  1:02 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

Hi all,

Add some new helper functions (and debug knobs) to online repair to
prepare us to stage new btrees.
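
For anyone following along at home, here's a rough sketch of how a
repair function is expected to drive these helpers.  The xrep_newbt_*
and xrep_bload_estimate_slack calls are what this series adds; the
staging cursor, xfs_btree_bload_compute_geometry, xfs_btree_bload, and
the bload.nr_blocks field come from the separate btree bulk loading
series, so treat those parts as assumptions rather than final
interfaces:

	struct xrep_newbt	xnr;
	struct xfs_btree_bload	bload = { 0 };
	int			error;

	/* Track the space we'll reserve; the new blocks belong to the AG. */
	xrep_newbt_init_ag(&xnr, sc, &XFS_RMAP_OINFO_AG,
			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, 0),
			XFS_AG_RESV_NONE);

	/* Decide how tightly to pack the new blocks. */
	xrep_bload_estimate_slack(sc, &bload);

	/*
	 * (Set up a staging cursor 'cur' against xnr.afake here; that part
	 * belongs to the bulk loading series.)
	 */

	/* Compute how many blocks we need, then reserve them all up front. */
	error = xfs_btree_bload_compute_geometry(cur, &bload, nr_records);
	if (!error)
		error = xrep_newbt_alloc_blocks(&xnr, bload.nr_blocks);

	/*
	 * Bulk load the records; the loader claims the reserved blocks one
	 * at a time through xrep_newbt_claim_block.
	 */
	if (!error)
		error = xfs_btree_bload(cur, &bload, &xnr);

	/* Free whatever we didn't use, or everything if the rebuild failed. */
	xrep_newbt_destroy(&xnr, error);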

If you're going to start using this mess, you probably ought to just
pull from my git trees, which are linked below.

This is an extraordinary way to destroy everything.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-prep-for-bulk-loading


* [PATCH 1/3] xfs: implement block reservation accounting for btrees we're staging
  2020-01-01  1:02 [PATCH v2 0/3] xfs: prepare repair for bulk loading Darrick J. Wong
@ 2020-01-01  1:02 ` Darrick J. Wong
  2020-01-01  1:02 ` [PATCH 2/3] xfs: add debug knobs to control btree bulk load slack factors Darrick J. Wong
  2020-01-01  1:02 ` [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree Darrick J. Wong
  2 siblings, 0 replies; 9+ messages in thread
From: Darrick J. Wong @ 2020-01-01  1:02 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Create a new xrep_newbt structure to encapsulate a fake root for
creating a staged btree cursor as well as to track all the blocks that
we need to reserve in order to build that btree.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
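(Not part of the commit message: as a reviewing aid, the claim function
is meant to sit underneath the bulk loader's block-claiming callback,
roughly as below.  The claim_block hook in struct xfs_btree_bload comes
from the separate bulk loading series, so its exact signature is an
assumption.)

/* Hypothetical glue between the bulk loader and our reservations. */
static int
xrep_mybt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_newbt	*xnr = priv;

	return xrep_newbt_claim_block(cur, xnr, ptr);
}

...
	bload.claim_block = xrep_mybt_claim_block;
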
 fs/xfs/scrub/repair.c |  303 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |   55 +++++++++
 fs/xfs/scrub/trace.h  |   36 ++++++
 3 files changed, 394 insertions(+)


diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 088dbd7df096..8c3e13d8a5bd 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -359,6 +359,309 @@ xrep_init_btblock(
 	return 0;
 }
 
+/* Initialize accounting resources for staging a new AG btree. */
+void
+xrep_newbt_init_ag(
+	struct xrep_newbt		*xnr,
+	struct xfs_scrub		*sc,
+	const struct xfs_owner_info	*oinfo,
+	xfs_fsblock_t			alloc_hint,
+	enum xfs_ag_resv_type		resv)
+{
+	memset(xnr, 0, sizeof(struct xrep_newbt));
+	xnr->sc = sc;
+	xnr->oinfo = *oinfo; /* structure copy */
+	xnr->alloc_hint = alloc_hint;
+	xnr->resv = resv;
+	INIT_LIST_HEAD(&xnr->resv_list);
+}
+
+/* Initialize accounting resources for staging a new inode fork btree. */
+void
+xrep_newbt_init_inode(
+	struct xrep_newbt		*xnr,
+	struct xfs_scrub		*sc,
+	int				whichfork,
+	const struct xfs_owner_info	*oinfo)
+{
+	xrep_newbt_init_ag(xnr, sc, oinfo,
+			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
+			XFS_AG_RESV_NONE);
+	xnr->ifake.if_fork = kmem_zone_zalloc(xfs_ifork_zone, 0);
+	xnr->ifake.if_fork_size = XFS_IFORK_SIZE(sc->ip, whichfork);
+}
+
+/*
+ * Initialize accounting resources for staging a new btree.  Callers are
+ * expected to add their own reservations (and clean them up) manually.
+ */
+void
+xrep_newbt_init_bare(
+	struct xrep_newbt		*xnr,
+	struct xfs_scrub		*sc)
+{
+	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
+			XFS_AG_RESV_NONE);
+}
+
+/* Designate specific blocks to be used to build our new btree. */
+int
+xrep_newbt_add_blocks(
+	struct xrep_newbt		*xnr,
+	xfs_fsblock_t			fsbno,
+	xfs_extlen_t			len)
+{
+	struct xrep_newbt_resv	*resv;
+
+	resv = kmem_alloc(sizeof(struct xrep_newbt_resv), KM_MAYFAIL);
+	if (!resv)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&resv->list);
+	resv->fsbno = fsbno;
+	resv->len = len;
+	resv->used = 0;
+	list_add_tail(&resv->list, &xnr->resv_list);
+	return 0;
+}
+
+/* Allocate disk space for our new btree. */
+int
+xrep_newbt_alloc_blocks(
+	struct xrep_newbt	*xnr,
+	uint64_t		nr_blocks)
+{
+	struct xfs_scrub	*sc = xnr->sc;
+	xfs_alloctype_t		type;
+	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
+	int			error = 0;
+
+	/*
+	 * Inode-rooted btrees can allocate from any AG, whereas AG btrees
+	 * require a specific AG mentioned in the alloc hint.
+	 */
+	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
+
+	while (nr_blocks > 0 && !error) {
+		struct xfs_alloc_arg	args = {
+			.tp		= sc->tp,
+			.mp		= sc->mp,
+			.type		= type,
+			.fsbno		= alloc_hint,
+			.oinfo		= xnr->oinfo,
+			.minlen		= 1,
+			.maxlen		= nr_blocks,
+			.prod		= nr_blocks,
+			.resv		= xnr->resv,
+		};
+
+		error = xfs_alloc_vextent(&args);
+		if (error)
+			return error;
+		if (args.fsbno == NULLFSBLOCK)
+			return -ENOSPC;
+
+		trace_xrep_newbt_alloc_blocks(sc->mp,
+				XFS_FSB_TO_AGNO(sc->mp, args.fsbno),
+				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
+				args.len, xnr->oinfo.oi_owner);
+
+		error = xrep_newbt_add_blocks(xnr, args.fsbno, args.len);
+		if (error)
+			break;
+
+		nr_blocks -= args.len;
+		alloc_hint = args.fsbno + args.len - 1;
+
+		if (sc->ip)
+			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+		else
+			error = xrep_roll_ag_trans(sc);
+	}
+
+	return error;
+}
+
+/*
+ * Release blocks that were reserved for a btree repair.  If the repair
+ * succeeded then we log deferred frees for unused blocks.  Otherwise, we try
+ * to free the extents immediately to roll the filesystem back to where it was
+ * before we started.
+ */
+static inline int
+xrep_newbt_destroy_reservation(
+	struct xrep_newbt	*xnr,
+	struct xrep_newbt_resv	*resv,
+	bool			cancel_repair)
+{
+	struct xfs_scrub	*sc = xnr->sc;
+
+	if (cancel_repair) {
+		int		error;
+
+		/* Free the extent then roll the transaction. */
+		error = xfs_free_extent(sc->tp, resv->fsbno, resv->len,
+				&xnr->oinfo, xnr->resv);
+		if (error)
+			return error;
+
+		if (sc->ip)
+			return xfs_trans_roll_inode(&sc->tp, sc->ip);
+		return xrep_roll_ag_trans(sc);
+	}
+
+	/*
+	 * Use the deferred freeing mechanism to schedule for deletion any
+	 * blocks we didn't use to rebuild the tree.  This enables us to log
+	 * them all in the same transaction as the root change.
+	 */
+	resv->fsbno += resv->used;
+	resv->len -= resv->used;
+	resv->used = 0;
+
+	if (resv->len == 0)
+		return 0;
+
+	trace_xrep_newbt_free_blocks(sc->mp,
+			XFS_FSB_TO_AGNO(sc->mp, resv->fsbno),
+			XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno),
+			resv->len, xnr->oinfo.oi_owner);
+
+	__xfs_bmap_add_free(sc->tp, resv->fsbno, resv->len, &xnr->oinfo, true);
+
+	return 0;
+}
+
+/* Free all the accounting info and disk space we reserved for a new btree. */
+void
+xrep_newbt_destroy(
+	struct xrep_newbt	*xnr,
+	int			error)
+{
+	struct xfs_scrub	*sc = xnr->sc;
+	struct xrep_newbt_resv	*resv, *n;
+
+	/*
+	 * If the filesystem already went down, we can't free the blocks.  Skip
+	 * ahead to freeing the incore metadata because we can't fix anything.
+	 */
+	if (XFS_FORCED_SHUTDOWN(sc->mp))
+		goto junkit;
+
+	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+		error = xrep_newbt_destroy_reservation(xnr, resv, error != 0);
+		if (error)
+			goto junkit;
+
+		list_del(&resv->list);
+		kmem_free(resv);
+	}
+
+junkit:
+	/*
+	 * If we still have reservations attached to @xnr, cleanup must have
+	 * failed and the filesystem is about to go down.  Clean up the incore
+	 * reservations.
+	 */
+	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+		list_del(&resv->list);
+		kmem_free(resv);
+	}
+
+	if (sc->ip) {
+		kmem_cache_free(xfs_ifork_zone, xnr->ifake.if_fork);
+		xnr->ifake.if_fork = NULL;
+	}
+}
+
+/* Feed one of the reserved btree blocks to the bulk loader. */
+int
+xrep_newbt_claim_block(
+	struct xfs_btree_cur	*cur,
+	struct xrep_newbt	*xnr,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xrep_newbt_resv	*resv;
+	xfs_fsblock_t		fsb;
+
+	/*
+	 * The first item in the list should always have a free block unless
+	 * we're completely out.
+	 */
+	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
+	if (resv->used == resv->len)
+		return -ENOSPC;
+
+	/*
+	 * Peel off a block from the start of the reservation.  We allocate
+	 * blocks in order to place blocks on disk in increasing record or key
+	 * order.  The block reservations tend to end up on the list in
+	 * decreasing order, which hopefully results in leaf blocks ending up
+	 * together.
+	 */
+	fsb = resv->fsbno + resv->used;
+	resv->used++;
+
+	/* If we used all the blocks in this reservation, move it to the end. */
+	if (resv->used == resv->len)
+		list_move_tail(&resv->list, &xnr->resv_list);
+
+	trace_xrep_newbt_claim_block(cur->bc_mp,
+			XFS_FSB_TO_AGNO(cur->bc_mp, fsb),
+			XFS_FSB_TO_AGBNO(cur->bc_mp, fsb),
+			1, xnr->oinfo.oi_owner);
+
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		ptr->l = cpu_to_be64(fsb);
+	else
+		ptr->s = cpu_to_be32(XFS_FSB_TO_AGBNO(cur->bc_mp, fsb));
+	return 0;
+}
+
+/*
+ * Estimate proper slack values for a btree that's being reloaded.
+ *
+ * Under most circumstances, we'll take whatever default loading value the
+ * btree bulk loading code calculates for us.  However, there are some
+ * exceptions to this rule:
+ *
+ * (1) If someone turned one of the debug knobs.
+ * (2) If this is a per-AG btree and the AG has less than ~9% space free.
+ * (3) If this is an inode btree and the FS has less than ~9% space free.
+ *
+ * Note that we actually use 3/32 for the comparison to avoid division.
+ */
+void
+xrep_bload_estimate_slack(
+	struct xfs_scrub	*sc,
+	struct xfs_btree_bload	*bload)
+{
+	uint64_t		free;
+	uint64_t		sz;
+
+	/* Let the btree code compute the default slack values. */
+	bload->leaf_slack = -1;
+	bload->node_slack = -1;
+
+	if (sc->ops->type == ST_PERAG) {
+		free = sc->sa.pag->pagf_freeblks;
+		sz = xfs_ag_block_count(sc->mp, sc->sa.agno);
+	} else {
+		free = percpu_counter_sum(&sc->mp->m_fdblocks);
+		sz = sc->mp->m_sb.sb_dblocks;
+	}
+
+	/* No further changes if there's more than 3/32 of the space left. */
+	if (free >= ((sz * 3) >> 5))
+		return;
+
+	/* We're low on space; load the btrees as tightly as possible. */
+	if (bload->leaf_slack < 0)
+		bload->leaf_slack = 0;
+	if (bload->node_slack < 0)
+		bload->node_slack = 0;
+}
+
 /*
  * Reconstructing per-AG Btrees
  *
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 479cfe38065e..43eca74b19d0 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -6,6 +6,10 @@
 #ifndef __XFS_SCRUB_REPAIR_H__
 #define __XFS_SCRUB_REPAIR_H__
 
+#include "scrub/bitmap.h"
+#include "xfs_btree.h"
+union xfs_btree_ptr;
+
 static inline int xrep_notsupported(struct xfs_scrub *sc)
 {
 	return -EOPNOTSUPP;
@@ -59,6 +63,57 @@ int xrep_agf(struct xfs_scrub *sc);
 int xrep_agfl(struct xfs_scrub *sc);
 int xrep_agi(struct xfs_scrub *sc);
 
+struct xrep_newbt_resv {
+	/* Link to list of extents that we've reserved. */
+	struct list_head	list;
+
+	/* FSB of the block we reserved. */
+	xfs_fsblock_t		fsbno;
+
+	/* Length of the reservation. */
+	xfs_extlen_t		len;
+
+	/* How much of this reservation has been used. */
+	xfs_extlen_t		used;
+};
+
+struct xrep_newbt {
+	struct xfs_scrub	*sc;
+
+	/* List of extents that we've reserved. */
+	struct list_head	resv_list;
+
+	/* Fake root for new btree. */
+	union {
+		struct xbtree_afakeroot	afake;
+		struct xbtree_ifakeroot	ifake;
+	};
+
+	/* rmap owner of these blocks */
+	struct xfs_owner_info	oinfo;
+
+	/* Allocation hint */
+	xfs_fsblock_t		alloc_hint;
+
+	/* per-ag reservation type */
+	enum xfs_ag_resv_type	resv;
+};
+
+void xrep_newbt_init_bare(struct xrep_newbt *xba, struct xfs_scrub *sc);
+void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
+		const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint,
+		enum xfs_ag_resv_type resv);
+void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
+		int whichfork, const struct xfs_owner_info *oinfo);
+int xrep_newbt_add_blocks(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
+		xfs_extlen_t len);
+int xrep_newbt_alloc_blocks(struct xrep_newbt *xba, uint64_t nr_blocks);
+void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
+int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
+		union xfs_btree_ptr *ptr);
+void xrep_bload_estimate_slack(struct xfs_scrub *sc,
+		struct xfs_btree_bload *bload);
+
 #else
 
 static inline int xrep_attempt(
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 3362bae28b46..651d7b8ee09f 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -904,6 +904,42 @@ TRACE_EVENT(xrep_ialloc_insert,
 		  __entry->freemask)
 )
 
+DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len,
+		 int64_t owner),
+	TP_ARGS(mp, agno, agbno, len, owner),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(int64_t, owner)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+		__entry->owner = owner;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->owner)
+);
+#define DEFINE_NEWBT_EXTENT_EVENT(name) \
+DEFINE_EVENT(xrep_newbt_extent_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 xfs_agblock_t agbno, xfs_extlen_t len, \
+		 int64_t owner), \
+	TP_ARGS(mp, agno, agbno, len, owner))
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block);
+
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
 
 #endif /* _TRACE_XFS_SCRUB_TRACE_H */



* [PATCH 2/3] xfs: add debug knobs to control btree bulk load slack factors
  2020-01-01  1:02 [PATCH v2 0/3] xfs: prepare repair for bulk loading Darrick J. Wong
  2020-01-01  1:02 ` [PATCH 1/3] xfs: implement block reservation accounting for btrees we're staging Darrick J. Wong
@ 2020-01-01  1:02 ` Darrick J. Wong
  2020-01-01  1:02 ` [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree Darrick J. Wong
  2 siblings, 0 replies; 9+ messages in thread
From: Darrick J. Wong @ 2020-01-01  1:02 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Add some debug knobs so that we can control the leaf and node block
slack when rebuilding btrees.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
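(Not part of the commit message: these knobs land in the existing xfs
debug attribute group, so they should appear as
/sys/fs/xfs/debug/bload_leaf_slack and .../bload_node_slack -- that
path is inferred from xfs_dbg_attrs and worth double-checking.  Write 0
to pack new blocks completely full, or -1 to go back to the bulk
loader's defaults.  A throwaway test helper might look like this:)

/* Hypothetical userspace helper for test scripts. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
set_bload_slack(const char *knob, const char *val)
{
	char	path[64];
	int	fd;
	ssize_t	ret;

	/* Assumed location of the xfs debug knobs. */
	snprintf(path, sizeof(path), "/sys/fs/xfs/debug/%s", knob);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = write(fd, val, strlen(val));
	close(fd);
	return ret < 0 ? -1 : 0;
}

/* e.g. set_bload_slack("bload_leaf_slack", "0"); */
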
 fs/xfs/scrub/repair.c |   10 ++++++---
 fs/xfs/xfs_globals.c  |   12 +++++++++++
 fs/xfs/xfs_sysctl.h   |    2 ++
 fs/xfs/xfs_sysfs.c    |   54 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 75 insertions(+), 3 deletions(-)


diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 8c3e13d8a5bd..6fe9cffad5b3 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -639,9 +639,13 @@ xrep_bload_estimate_slack(
 	uint64_t		free;
 	uint64_t		sz;
 
-	/* Let the btree code compute the default slack values. */
-	bload->leaf_slack = -1;
-	bload->node_slack = -1;
+	/*
+	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
+	 * unless someone has set them otherwise, so we just pull the values
+	 * here.
+	 */
+	bload->leaf_slack = xfs_globals.bload_leaf_slack;
+	bload->node_slack = xfs_globals.bload_node_slack;
 
 	if (sc->ops->type == ST_PERAG) {
 		free = sc->sa.pag->pagf_freeblks;
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index fa55ab8b8d80..4e747384ad26 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -43,4 +43,16 @@ struct xfs_globals xfs_globals = {
 #ifdef DEBUG
 	.pwork_threads		=	-1,	/* automatic thread detection */
 #endif
+
+	/*
+	 * Leave this many record slots empty when bulk loading btrees.  By
+	 * default we load new btree leaf blocks 75% full.
+	 */
+	.bload_leaf_slack	=	-1,
+
+	/*
+	 * Leave this many key/ptr slots empty when bulk loading btrees.  By
+	 * default we load new btree node blocks 75% full.
+	 */
+	.bload_node_slack	=	-1,
 };
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 8abf4640f1d5..aecccceee4ca 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -85,6 +85,8 @@ struct xfs_globals {
 #ifdef DEBUG
 	int	pwork_threads;		/* parallel workqueue threads */
 #endif
+	int	bload_leaf_slack;	/* btree bulk load leaf slack */
+	int	bload_node_slack;	/* btree bulk load node slack */
 	int	log_recovery_delay;	/* log recovery delay (secs) */
 	int	mount_delay;		/* mount setup delay (secs) */
 	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index f1bc88f4367c..673ad21a9585 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -228,6 +228,58 @@ pwork_threads_show(
 XFS_SYSFS_ATTR_RW(pwork_threads);
 #endif /* DEBUG */
 
+STATIC ssize_t
+bload_leaf_slack_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	xfs_globals.bload_leaf_slack = val;
+	return count;
+}
+
+STATIC ssize_t
+bload_leaf_slack_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_leaf_slack);
+
+STATIC ssize_t
+bload_node_slack_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	xfs_globals.bload_node_slack = val;
+	return count;
+}
+
+STATIC ssize_t
+bload_node_slack_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_node_slack);
+
 static struct attribute *xfs_dbg_attrs[] = {
 	ATTR_LIST(bug_on_assert),
 	ATTR_LIST(log_recovery_delay),
@@ -236,6 +288,8 @@ static struct attribute *xfs_dbg_attrs[] = {
 #ifdef DEBUG
 	ATTR_LIST(pwork_threads),
 #endif
+	ATTR_LIST(bload_leaf_slack),
+	ATTR_LIST(bload_node_slack),
 	NULL,
 };
 



* [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree
  2020-01-01  1:02 [PATCH v2 0/3] xfs: prepare repair for bulk loading Darrick J. Wong
  2020-01-01  1:02 ` [PATCH 1/3] xfs: implement block reservation accounting for btrees we're staging Darrick J. Wong
  2020-01-01  1:02 ` [PATCH 2/3] xfs: add debug knobs to control btree bulk load slack factors Darrick J. Wong
@ 2020-01-01  1:02 ` Darrick J. Wong
  2 siblings, 0 replies; 9+ messages in thread
From: Darrick J. Wong @ 2020-01-01  1:02 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

We need to log EFIs for every extent that we allocate for the purpose of
staging a new btree so that if we fail then the blocks will be freed
during log recovery.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
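(Not part of the commit message: the intended lifecycle of each
reserved extent, pieced together from this patch.)

/*
 * xrep_newbt_alloc_blocks()
 *   xfs_alloc_vextent()           allocate an extent for the new btree
 *   xrep_newbt_schedule_reap()    log an EFI covering that extent
 *
 * If the rebuild succeeds, xrep_newbt_destroy() logs the matching EFD
 * via xrep_newbt_cancel_reap() and defers freeing of only the unused
 * tail of each reservation.
 *
 * If we crash before the EFD is logged, log recovery finds the
 * unmatched EFI and frees the whole extent, so staged blocks are not
 * leaked.
 */
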
 fs/xfs/scrub/repair.c     |   63 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/scrub/repair.h     |    4 ++-
 fs/xfs/xfs_extfree_item.c |    2 -
 3 files changed, 64 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 6fe9cffad5b3..d9d09ae356be 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -25,6 +25,8 @@
 #include "xfs_ag_resv.h"
 #include "xfs_quota.h"
 #include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_extfree_item.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -409,7 +411,8 @@ int
 xrep_newbt_add_blocks(
 	struct xrep_newbt		*xnr,
 	xfs_fsblock_t			fsbno,
-	xfs_extlen_t			len)
+	xfs_extlen_t			len,
+	void				*priv)
 {
 	struct xrep_newbt_resv	*resv;
 
@@ -421,10 +424,55 @@ xrep_newbt_add_blocks(
 	resv->fsbno = fsbno;
 	resv->len = len;
 	resv->used = 0;
+	resv->priv = priv;
 	list_add_tail(&resv->list, &xnr->resv_list);
 	return 0;
 }
 
+/*
+ * Set up automatic reaping of the blocks reserved for btree reconstruction in
+ * case we crash by logging a deferred free item for each extent we allocate so
+ * that we can get all of the space back if we crash before we can commit the
+ * new btree.  This function returns a token that can be used to cancel
+ * automatic reaping if repair is successful.
+ */
+static void *
+xrep_newbt_schedule_reap(
+	struct xfs_trans		*tp,
+	struct xfs_owner_info		*oinfo,
+	xfs_fsblock_t			fsbno,
+	xfs_extlen_t			len)
+{
+	struct xfs_extent_free_item	efi_item = {
+		.xefi_startblock	= fsbno,
+		.xefi_blockcount	= len,
+		.xefi_oinfo		= *oinfo, /* struct copy */
+		.xefi_skip_discard	= true,
+	};
+	struct xfs_efi_log_item		*efi;
+
+	INIT_LIST_HEAD(&efi_item.xefi_list);
+	efi = xfs_extent_free_defer_type.create_intent(tp, 1);
+	xfs_extent_free_defer_type.log_item(tp, efi, &efi_item.xefi_list);
+	return efi;
+}
+
+/*
+ * Cancel a previously scheduled automatic reap (see above) by logging a
+ * deferred free done for each extent we allocated.  We cheat since we know
+ * that log recovery has never looked at the extents attached to an EFD.
+ */
+static void
+xrep_newbt_cancel_reap(
+	struct xfs_trans	*tp,
+	void			*token)
+{
+	struct xfs_efd_log_item	*efd;
+
+	efd = xfs_extent_free_defer_type.create_done(tp, token, 0);
+	set_bit(XFS_LI_DIRTY, &efd->efd_item.li_flags);
+}
+
 /* Allocate disk space for our new btree. */
 int
 xrep_newbt_alloc_blocks(
@@ -454,6 +502,7 @@ xrep_newbt_alloc_blocks(
 			.prod		= nr_blocks,
 			.resv		= xnr->resv,
 		};
+		void			*token;
 
 		error = xfs_alloc_vextent(&args);
 		if (error)
@@ -466,7 +515,9 @@ xrep_newbt_alloc_blocks(
 				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
 				args.len, xnr->oinfo.oi_owner);
 
-		error = xrep_newbt_add_blocks(xnr, args.fsbno, args.len);
+		token = xrep_newbt_schedule_reap(sc->tp, &xnr->oinfo,
+				args.fsbno, args.len);
+		error = xrep_newbt_add_blocks(xnr, args.fsbno, args.len, token);
 		if (error)
 			break;
 
@@ -510,6 +561,13 @@ xrep_newbt_destroy_reservation(
 		return xrep_roll_ag_trans(sc);
 	}
 
+	/*
+	 * Since we succeeded in rebuilding the btree, we need to log an EFD
+	 * for every extent we reserved to prevent log recovery from freeing
+	 * them mistakenly.
+	 */
+	xrep_newbt_cancel_reap(sc->tp, resv->priv);
+
 	/*
 	 * Use the deferred freeing mechanism to schedule for deletion any
 	 * blocks we didn't use to rebuild the tree.  This enables us to log
@@ -564,6 +622,7 @@ xrep_newbt_destroy(
 	 * reservations.
 	 */
 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+		xfs_extent_free_defer_type.abort_intent(resv->priv);
 		list_del(&resv->list);
 		kmem_free(resv);
 	}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 43eca74b19d0..6ca5dc8dfb2d 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -67,6 +67,8 @@ struct xrep_newbt_resv {
 	/* Link to list of extents that we've reserved. */
 	struct list_head	list;
 
+	void			*priv;
+
 	/* FSB of the block we reserved. */
 	xfs_fsblock_t		fsbno;
 
@@ -106,7 +108,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
 void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
 		int whichfork, const struct xfs_owner_info *oinfo);
 int xrep_newbt_add_blocks(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
-		xfs_extlen_t len);
+		xfs_extlen_t len, void *priv);
 int xrep_newbt_alloc_blocks(struct xrep_newbt *xba, uint64_t nr_blocks);
 void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
 int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6ea847f6e298..51540f565ec3 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -329,8 +329,6 @@ xfs_trans_get_efd(
 {
 	struct xfs_efd_log_item		*efdp;
 
-	ASSERT(nextents > 0);
-
 	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
 		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
 				(nextents - 1) * sizeof(struct xfs_extent),



* [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree
  2019-10-29 23:31 [PATCH v2 0/3] xfs: prepare repair for bulk loading Darrick J. Wong
@ 2019-10-29 23:31 ` Darrick J. Wong
  0 siblings, 0 replies; 9+ messages in thread
From: Darrick J. Wong @ 2019-10-29 23:31 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

We need to log EFIs for every extent that we allocate for the purpose of
staging a new btree so that if we fail then the blocks will be freed
during log recovery.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/scrub/repair.c     |   63 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/scrub/repair.h     |    4 ++-
 fs/xfs/xfs_extfree_item.c |    2 -
 3 files changed, 64 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index c38ff20b8fa9..e66596d0f21a 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -25,6 +25,8 @@
 #include "xfs_ag_resv.h"
 #include "xfs_quota.h"
 #include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_extfree_item.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -409,7 +411,8 @@ int
 xrep_newbt_add_blocks(
 	struct xrep_newbt		*xnr,
 	xfs_fsblock_t			fsbno,
-	xfs_extlen_t			len)
+	xfs_extlen_t			len,
+	void				*priv)
 {
 	struct xrep_newbt_resv	*resv;
 
@@ -421,10 +424,55 @@ xrep_newbt_add_blocks(
 	resv->fsbno = fsbno;
 	resv->len = len;
 	resv->used = 0;
+	resv->priv = priv;
 	list_add_tail(&resv->list, &xnr->resv_list);
 	return 0;
 }
 
+/*
+ * Set up automatic reaping of the blocks reserved for btree reconstruction in
+ * case we crash by logging a deferred free item for each extent we allocate so
+ * that we can get all of the space back if we crash before we can commit the
+ * new btree.  This function returns a token that can be used to cancel
+ * automatic reaping if repair is successful.
+ */
+static void *
+xrep_newbt_schedule_reap(
+	struct xfs_trans		*tp,
+	struct xfs_owner_info		*oinfo,
+	xfs_fsblock_t			fsbno,
+	xfs_extlen_t			len)
+{
+	struct xfs_extent_free_item	efi_item = {
+		.xefi_startblock	= fsbno,
+		.xefi_blockcount	= len,
+		.xefi_oinfo		= *oinfo, /* struct copy */
+		.xefi_skip_discard	= true,
+	};
+	struct xfs_efi_log_item		*efi;
+
+	INIT_LIST_HEAD(&efi_item.xefi_list);
+	efi = xfs_extent_free_defer_type.create_intent(tp, 1);
+	xfs_extent_free_defer_type.log_item(tp, efi, &efi_item.xefi_list);
+	return efi;
+}
+
+/*
+ * Cancel a previously scheduled automatic reap (see above) by logging a
+ * deferred free done for each extent we allocated.  We cheat since we know
+ * that log recovery has never looked at the extents attached to an EFD.
+ */
+static void
+xrep_newbt_cancel_reap(
+	struct xfs_trans	*tp,
+	void			*token)
+{
+	struct xfs_efd_log_item	*efd;
+
+	efd = xfs_extent_free_defer_type.create_done(tp, token, 0);
+	set_bit(XFS_LI_DIRTY, &efd->efd_item.li_flags);
+}
+
 /* Allocate disk space for our new btree. */
 int
 xrep_newbt_alloc_blocks(
@@ -454,6 +502,7 @@ xrep_newbt_alloc_blocks(
 			.prod		= nr_blocks,
 			.resv		= xnr->resv,
 		};
+		void			*token;
 
 		error = xfs_alloc_vextent(&args);
 		if (error)
@@ -466,7 +515,9 @@ xrep_newbt_alloc_blocks(
 				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
 				args.len, xnr->oinfo.oi_owner);
 
-		error = xrep_newbt_add_blocks(xnr, args.fsbno, args.len);
+		token = xrep_newbt_schedule_reap(sc->tp, &xnr->oinfo,
+				args.fsbno, args.len);
+		error = xrep_newbt_add_blocks(xnr, args.fsbno, args.len, token);
 		if (error)
 			break;
 
@@ -510,6 +561,13 @@ xrep_newbt_destroy_reservation(
 		return xrep_roll_ag_trans(sc);
 	}
 
+	/*
+	 * Since we succeeded in rebuilding the btree, we need to log an EFD
+	 * for every extent we reserved to prevent log recovery from freeing
+	 * them mistakenly.
+	 */
+	xrep_newbt_cancel_reap(sc->tp, resv->priv);
+
 	/*
 	 * Use the deferred freeing mechanism to schedule for deletion any
 	 * blocks we didn't use to rebuild the tree.  This enables us to log
@@ -564,6 +622,7 @@ xrep_newbt_destroy(
 	 * reservations.
 	 */
 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+		xfs_extent_free_defer_type.abort_intent(resv->priv);
 		list_del(&resv->list);
 		kmem_free(resv);
 	}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 43eca74b19d0..6ca5dc8dfb2d 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -67,6 +67,8 @@ struct xrep_newbt_resv {
 	/* Link to list of extents that we've reserved. */
 	struct list_head	list;
 
+	void			*priv;
+
 	/* FSB of the block we reserved. */
 	xfs_fsblock_t		fsbno;
 
@@ -106,7 +108,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
 void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
 		int whichfork, const struct xfs_owner_info *oinfo);
 int xrep_newbt_add_blocks(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
-		xfs_extlen_t len);
+		xfs_extlen_t len, void *priv);
 int xrep_newbt_alloc_blocks(struct xrep_newbt *xba, uint64_t nr_blocks);
 void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
 int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index e44efc41a041..1e49936afbfb 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -328,8 +328,6 @@ xfs_trans_get_efd(
 {
 	struct xfs_efd_log_item		*efdp;
 
-	ASSERT(nextents > 0);
-
 	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
 		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
 				(nextents - 1) * sizeof(struct xfs_extent),



* Re: [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree
  2019-10-25 16:22     ` Darrick J. Wong
@ 2019-10-25 17:36       ` Brian Foster
  0 siblings, 0 replies; 9+ messages in thread
From: Brian Foster @ 2019-10-25 17:36 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Fri, Oct 25, 2019 at 09:22:25AM -0700, Darrick J. Wong wrote:
> On Fri, Oct 25, 2019 at 10:24:01AM -0400, Brian Foster wrote:
> > On Wed, Oct 09, 2019 at 09:50:06AM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > We need to log EFIs for every extent that we allocate for the purpose of
> > > staging a new btree so that if we fail then the blocks will be freed
> > > during log recovery.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > 
> > Ok, so now I'm getting to the stuff we discussed earlier around pinning
> > the log tail and whatnot. I have no issue with getting this in as is in
> > the meantime, so long as we eventually account for those kind of issues
> > before we consider this non-experimental.
> 
> <nod> I also wondered in the past if it would be smarter just to freeze
> the whole filesystem (like I currently do for rmapbt rebuilding) so that
> repair is the only thing that gets to touch the log, but I bet that will
> be unpopular. :)
> 
> > >  fs/xfs/scrub/repair.c     |   37 +++++++++++++++++++++++++++++++++++--
> > >  fs/xfs/scrub/repair.h     |    4 +++-
> > >  fs/xfs/xfs_extfree_item.c |    2 --
> > >  3 files changed, 38 insertions(+), 5 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > index beebd484c5f3..49cea124148b 100644
> > > --- a/fs/xfs/scrub/repair.c
> > > +++ b/fs/xfs/scrub/repair.c
> > > @@ -25,6 +25,8 @@
> > >  #include "xfs_ag_resv.h"
> > >  #include "xfs_quota.h"
> > >  #include "xfs_bmap.h"
> > > +#include "xfs_defer.h"
> > > +#include "xfs_extfree_item.h"
> > >  #include "scrub/scrub.h"
> > >  #include "scrub/common.h"
> > >  #include "scrub/trace.h"
> > > @@ -412,7 +414,8 @@ int
> > >  xrep_newbt_add_reservation(
> > >  	struct xrep_newbt		*xnr,
> > >  	xfs_fsblock_t			fsbno,
> > > -	xfs_extlen_t			len)
> > > +	xfs_extlen_t			len,
> > > +	void				*priv)
> > >  {
> > >  	struct xrep_newbt_resv	*resv;
> > >  
> > > @@ -424,6 +427,7 @@ xrep_newbt_add_reservation(
> > >  	resv->fsbno = fsbno;
> > >  	resv->len = len;
> > >  	resv->used = 0;
> > > +	resv->priv = priv;
> > >  	list_add_tail(&resv->list, &xnr->reservations);
> > >  	return 0;
> > >  }
> > > @@ -434,6 +438,7 @@ xrep_newbt_reserve_space(
> > >  	struct xrep_newbt	*xnr,
> > >  	uint64_t		nr_blocks)
> > >  {
> > > +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
> > 
> > Heh. I feel like we should be able to do something a little cleaner
> > here, but I'm not sure what off the top of my head. Maybe a helper to
> > look up a generic "intent type" in the defer_op_types table and somewhat
> > abstract away the dfops-specific nature of the current interfaces..?
> > Maybe we should consider renaming xfs_defer_op_type to something more
> > generic too (xfs_intent_type ?). (This could all be a separate patch
> > btw.)
> 
> I dunno.  I also feel like this is borderline misuse of the defer ops
> code since we're overriding the default behavior to walk an EFI through
> the state machine manually... so perhaps it's ok to wait until we have a
> second reasonable user to abstract some of this away?
> 

I think that's reasonable. I was also wondering whether some of the
dfops functionality might be worth refactoring for this, but I'm not
totally sure what that would look like.

> > >  	struct xfs_scrub	*sc = xnr->sc;
> > >  	xfs_alloctype_t		type;
> > >  	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
> > 
> > Also variable names look misaligned with the above (here and in the
> > destroy function below).
> 
> Yeah, I'll see if I can fix that.
> 
> > > @@ -442,6 +447,7 @@ xrep_newbt_reserve_space(
> > >  	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
> > >  
> > >  	while (nr_blocks > 0 && !error) {
> > > +		struct xfs_extent_free_item	efi_item;
> > >  		struct xfs_alloc_arg	args = {
> > >  			.tp		= sc->tp,
> > >  			.mp		= sc->mp,
> > > @@ -453,6 +459,7 @@ xrep_newbt_reserve_space(
> > >  			.prod		= nr_blocks,
> > >  			.resv		= xnr->resv,
> > >  		};
> > > +		void			*efi;
> > >  
> > >  		error = xfs_alloc_vextent(&args);
> > >  		if (error)
> > > @@ -465,7 +472,20 @@ xrep_newbt_reserve_space(
> > >  				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
> > >  				args.len, xnr->oinfo.oi_owner);
> > >  
> > > -		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
> > > +		/*
> > > +		 * Log a deferred free item for each extent we allocate so that
> > > +		 * we can get all of the space back if we crash before we can
> > > +		 * commit the new btree.
> > > +		 */
> > > +		efi_item.xefi_startblock = args.fsbno;
> > > +		efi_item.xefi_blockcount = args.len;
> > > +		efi_item.xefi_oinfo = xnr->oinfo;
> > > +		efi_item.xefi_skip_discard = true;
> > > +		efi = efi_type->create_intent(sc->tp, 1);
> > > +		efi_type->log_item(sc->tp, efi, &efi_item.xefi_list);
> > > +
> > > +		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len,
> > > +				efi);
> > >  		if (error)
> > >  			break;
> > >  
> > > @@ -487,6 +507,7 @@ xrep_newbt_destroy(
> > >  	struct xrep_newbt	*xnr,
> > >  	int			error)
> > >  {
> > > +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
> > >  	struct xfs_scrub	*sc = xnr->sc;
> > >  	struct xrep_newbt_resv	*resv, *n;
> > >  
> > > @@ -494,6 +515,17 @@ xrep_newbt_destroy(
> > >  		goto junkit;
> > >  
> > >  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > > +		struct xfs_efd_log_item *efd;
> > > +
> > > +		/*
> > > +		 * Log a deferred free done for each extent we allocated now
> > > +		 * that we've linked the block into the filesystem.  We cheat
> > > +		 * since we know that log recovery has never looked at the
> > > +		 * extents attached to an EFD.
> > > +		 */
> > > +		efd = efi_type->create_done(sc->tp, resv->priv, 0);
> > > +		set_bit(XFS_LI_DIRTY, &efd->efd_item.li_flags);
> > > +
> > 
> > So here we've presumably succeeded so we drop the intent and actually
> > free any blocks that we didn't happen to use.
> 
> Correct.
> 
> > >  		/* Free every block we didn't use. */
> > >  		resv->fsbno += resv->used;
> > >  		resv->len -= resv->used;
> > > @@ -515,6 +547,7 @@ xrep_newbt_destroy(
> > >  
> > >  junkit:
> > >  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > > +		efi_type->abort_intent(resv->priv);
> > 
> > And in this case we've failed so we abort the intent.. but what actually
> > happens to the blocks we might have allocated? Do we need to free those
> > somewhere or is that also handled by the above path somehow?
> 
> Hmm, my assumption here was that the error code would be passed all the
> way back to xchk_teardown, which in cancelling the dirty transaction
> would shut down the filesystem, and log recovery would take care of it
> for us.
> 

Ok.. that seems perfectly reasonable to me in a general sense. I'd just
prefer to see a "shutdown cleans up the mess" comment somewhere so the
expected error handling behavior is clear in the code. From a scrub
perspective, it sounds like we are saying a failed repair attempt is
going to shutdown and leave around a dirty log. I wonder how that would
play out in practice given that the filesystem is presumably corrupted,
so if we can't mount the fs after that point for whatever reason we've
basically added to the mess since repair will need to zero the log. We
could also crash at any point too if the fs is corrupted so I dunno..

> However, I suppose we could at least try to "recover" the intent to free
> the EFIs and clear the EFI; and only fall back to aborting if that also
> fails since then we'll know everything's dead in the water.
> 

That could be something for a follow on patch too depending on the
above..

Brian

> --D
> 
> > Brian
> > 
> > >  		list_del(&resv->list);
> > >  		kmem_free(resv);
> > >  	}
> > > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > > index ab6c1199ecc0..cb86281de28b 100644
> > > --- a/fs/xfs/scrub/repair.h
> > > +++ b/fs/xfs/scrub/repair.h
> > > @@ -67,6 +67,8 @@ struct xrep_newbt_resv {
> > >  	/* Link to list of extents that we've reserved. */
> > >  	struct list_head	list;
> > >  
> > > +	void			*priv;
> > > +
> > >  	/* FSB of the block we reserved. */
> > >  	xfs_fsblock_t		fsbno;
> > >  
> > > @@ -112,7 +114,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
> > >  void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
> > >  		int whichfork, const struct xfs_owner_info *oinfo);
> > >  int xrep_newbt_add_reservation(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
> > > -		xfs_extlen_t len);
> > > +		xfs_extlen_t len, void *priv);
> > >  int xrep_newbt_reserve_space(struct xrep_newbt *xba, uint64_t nr_blocks);
> > >  void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
> > >  int xrep_newbt_alloc_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
> > > diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
> > > index e44efc41a041..1e49936afbfb 100644
> > > --- a/fs/xfs/xfs_extfree_item.c
> > > +++ b/fs/xfs/xfs_extfree_item.c
> > > @@ -328,8 +328,6 @@ xfs_trans_get_efd(
> > >  {
> > >  	struct xfs_efd_log_item		*efdp;
> > >  
> > > -	ASSERT(nextents > 0);
> > > -
> > >  	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
> > >  		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
> > >  				(nextents - 1) * sizeof(struct xfs_extent),
> > > 
> > 



* Re: [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree
  2019-10-25 14:24   ` Brian Foster
@ 2019-10-25 16:22     ` Darrick J. Wong
  2019-10-25 17:36       ` Brian Foster
  0 siblings, 1 reply; 9+ messages in thread
From: Darrick J. Wong @ 2019-10-25 16:22 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Fri, Oct 25, 2019 at 10:24:01AM -0400, Brian Foster wrote:
> On Wed, Oct 09, 2019 at 09:50:06AM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > We need to log EFIs for every extent that we allocate for the purpose of
> > staging a new btree so that if we fail then the blocks will be freed
> > during log recovery.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> Ok, so now I'm getting to the stuff we discussed earlier around pinning
> the log tail and whatnot. I have no issue with getting this in as is in
> the meantime, so long as we eventually account for those kind of issues
> before we consider this non-experimental.

<nod> I also wondered in the past if it would be smarter just to freeze
the whole filesystem (like I currently do for rmapbt rebuilding) so that
repair is the only thing that gets to touch the log, but I bet that will
be unpopular. :)

> >  fs/xfs/scrub/repair.c     |   37 +++++++++++++++++++++++++++++++++++--
> >  fs/xfs/scrub/repair.h     |    4 +++-
> >  fs/xfs/xfs_extfree_item.c |    2 --
> >  3 files changed, 38 insertions(+), 5 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > index beebd484c5f3..49cea124148b 100644
> > --- a/fs/xfs/scrub/repair.c
> > +++ b/fs/xfs/scrub/repair.c
> > @@ -25,6 +25,8 @@
> >  #include "xfs_ag_resv.h"
> >  #include "xfs_quota.h"
> >  #include "xfs_bmap.h"
> > +#include "xfs_defer.h"
> > +#include "xfs_extfree_item.h"
> >  #include "scrub/scrub.h"
> >  #include "scrub/common.h"
> >  #include "scrub/trace.h"
> > @@ -412,7 +414,8 @@ int
> >  xrep_newbt_add_reservation(
> >  	struct xrep_newbt		*xnr,
> >  	xfs_fsblock_t			fsbno,
> > -	xfs_extlen_t			len)
> > +	xfs_extlen_t			len,
> > +	void				*priv)
> >  {
> >  	struct xrep_newbt_resv	*resv;
> >  
> > @@ -424,6 +427,7 @@ xrep_newbt_add_reservation(
> >  	resv->fsbno = fsbno;
> >  	resv->len = len;
> >  	resv->used = 0;
> > +	resv->priv = priv;
> >  	list_add_tail(&resv->list, &xnr->reservations);
> >  	return 0;
> >  }
> > @@ -434,6 +438,7 @@ xrep_newbt_reserve_space(
> >  	struct xrep_newbt	*xnr,
> >  	uint64_t		nr_blocks)
> >  {
> > +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
> 
> Heh. I feel like we should be able to do something a little cleaner
> here, but I'm not sure what off the top of my head. Maybe a helper to
> look up a generic "intent type" in the defer_op_types table and somewhat
> abstract away the dfops-specific nature of the current interfaces..?
> Maybe we should consider renaming xfs_defer_op_type to something more
> generic too (xfs_intent_type ?). (This could all be a separate patch
> btw.)

I dunno.  I also feel like this is borderline misuse of the defer ops
code since we're overriding the default behavior to walk an EFI through
the state machine manually... so perhaps it's ok to wait until we have a
second reasonable user to abstract some of this away?

> >  	struct xfs_scrub	*sc = xnr->sc;
> >  	xfs_alloctype_t		type;
> >  	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
> 
> Also variable names look misaligned with the above (here and in the
> destroy function below).

Yeah, I'll see if I can fix that.

> > @@ -442,6 +447,7 @@ xrep_newbt_reserve_space(
> >  	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
> >  
> >  	while (nr_blocks > 0 && !error) {
> > +		struct xfs_extent_free_item	efi_item;
> >  		struct xfs_alloc_arg	args = {
> >  			.tp		= sc->tp,
> >  			.mp		= sc->mp,
> > @@ -453,6 +459,7 @@ xrep_newbt_reserve_space(
> >  			.prod		= nr_blocks,
> >  			.resv		= xnr->resv,
> >  		};
> > +		void			*efi;
> >  
> >  		error = xfs_alloc_vextent(&args);
> >  		if (error)
> > @@ -465,7 +472,20 @@ xrep_newbt_reserve_space(
> >  				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
> >  				args.len, xnr->oinfo.oi_owner);
> >  
> > -		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
> > +		/*
> > +		 * Log a deferred free item for each extent we allocate so that
> > +		 * we can get all of the space back if we crash before we can
> > +		 * commit the new btree.
> > +		 */
> > +		efi_item.xefi_startblock = args.fsbno;
> > +		efi_item.xefi_blockcount = args.len;
> > +		efi_item.xefi_oinfo = xnr->oinfo;
> > +		efi_item.xefi_skip_discard = true;
> > +		efi = efi_type->create_intent(sc->tp, 1);
> > +		efi_type->log_item(sc->tp, efi, &efi_item.xefi_list);
> > +
> > +		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len,
> > +				efi);
> >  		if (error)
> >  			break;
> >  
> > @@ -487,6 +507,7 @@ xrep_newbt_destroy(
> >  	struct xrep_newbt	*xnr,
> >  	int			error)
> >  {
> > +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
> >  	struct xfs_scrub	*sc = xnr->sc;
> >  	struct xrep_newbt_resv	*resv, *n;
> >  
> > @@ -494,6 +515,17 @@ xrep_newbt_destroy(
> >  		goto junkit;
> >  
> >  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > +		struct xfs_efd_log_item *efd;
> > +
> > +		/*
> > +		 * Log a deferred free done for each extent we allocated now
> > +		 * that we've linked the block into the filesystem.  We cheat
> > +		 * since we know that log recovery has never looked at the
> > +		 * extents attached to an EFD.
> > +		 */
> > +		efd = efi_type->create_done(sc->tp, resv->priv, 0);
> > +		set_bit(XFS_LI_DIRTY, &efd->efd_item.li_flags);
> > +
> 
> So here we've presumably succeeded so we drop the intent and actually
> free any blocks that we didn't happen to use.

Correct.

> >  		/* Free every block we didn't use. */
> >  		resv->fsbno += resv->used;
> >  		resv->len -= resv->used;
> > @@ -515,6 +547,7 @@ xrep_newbt_destroy(
> >  
> >  junkit:
> >  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > +		efi_type->abort_intent(resv->priv);
> 
> And in this case we've failed so we abort the intent.. but what actually
> happens to the blocks we might have allocated? Do we need to free those
> somewhere or is that also handled by the above path somehow?

Hmm, my assumption here was that the error code would be passed all the
way back to xchk_teardown, which in cancelling the dirty transaction
would shut down the filesystem, and log recovery would take care of it
for us.

However, I suppose we could at least try to "recover" the intent to free
the EFIs and clear the EFI; and only fall back to aborting if that also
fails since then we'll know everything's dead in the water.

--D

> Brian
> 
> >  		list_del(&resv->list);
> >  		kmem_free(resv);
> >  	}
> > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > index ab6c1199ecc0..cb86281de28b 100644
> > --- a/fs/xfs/scrub/repair.h
> > +++ b/fs/xfs/scrub/repair.h
> > @@ -67,6 +67,8 @@ struct xrep_newbt_resv {
> >  	/* Link to list of extents that we've reserved. */
> >  	struct list_head	list;
> >  
> > +	void			*priv;
> > +
> >  	/* FSB of the block we reserved. */
> >  	xfs_fsblock_t		fsbno;
> >  
> > @@ -112,7 +114,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
> >  void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
> >  		int whichfork, const struct xfs_owner_info *oinfo);
> >  int xrep_newbt_add_reservation(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
> > -		xfs_extlen_t len);
> > +		xfs_extlen_t len, void *priv);
> >  int xrep_newbt_reserve_space(struct xrep_newbt *xba, uint64_t nr_blocks);
> >  void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
> >  int xrep_newbt_alloc_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
> > diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
> > index e44efc41a041..1e49936afbfb 100644
> > --- a/fs/xfs/xfs_extfree_item.c
> > +++ b/fs/xfs/xfs_extfree_item.c
> > @@ -328,8 +328,6 @@ xfs_trans_get_efd(
> >  {
> >  	struct xfs_efd_log_item		*efdp;
> >  
> > -	ASSERT(nextents > 0);
> > -
> >  	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
> >  		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
> >  				(nextents - 1) * sizeof(struct xfs_extent),
> > 
> 


* Re: [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree
  2019-10-09 16:50 ` [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree Darrick J. Wong
@ 2019-10-25 14:24   ` Brian Foster
  2019-10-25 16:22     ` Darrick J. Wong
  0 siblings, 1 reply; 9+ messages in thread
From: Brian Foster @ 2019-10-25 14:24 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Wed, Oct 09, 2019 at 09:50:06AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> We need to log EFIs for every extent that we allocate for the purpose of
> staging a new btree so that if we fail then the blocks will be freed
> during log recovery.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

Ok, so now I'm getting to the stuff we discussed earlier around pinning
the log tail and whatnot. I have no issue with getting this in as is in
the meantime, so long as we eventually account for those kind of issues
before we consider this non-experimental.

>  fs/xfs/scrub/repair.c     |   37 +++++++++++++++++++++++++++++++++++--
>  fs/xfs/scrub/repair.h     |    4 +++-
>  fs/xfs/xfs_extfree_item.c |    2 --
>  3 files changed, 38 insertions(+), 5 deletions(-)
> 
> 
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index beebd484c5f3..49cea124148b 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -25,6 +25,8 @@
>  #include "xfs_ag_resv.h"
>  #include "xfs_quota.h"
>  #include "xfs_bmap.h"
> +#include "xfs_defer.h"
> +#include "xfs_extfree_item.h"
>  #include "scrub/scrub.h"
>  #include "scrub/common.h"
>  #include "scrub/trace.h"
> @@ -412,7 +414,8 @@ int
>  xrep_newbt_add_reservation(
>  	struct xrep_newbt		*xnr,
>  	xfs_fsblock_t			fsbno,
> -	xfs_extlen_t			len)
> +	xfs_extlen_t			len,
> +	void				*priv)
>  {
>  	struct xrep_newbt_resv	*resv;
>  
> @@ -424,6 +427,7 @@ xrep_newbt_add_reservation(
>  	resv->fsbno = fsbno;
>  	resv->len = len;
>  	resv->used = 0;
> +	resv->priv = priv;
>  	list_add_tail(&resv->list, &xnr->reservations);
>  	return 0;
>  }
> @@ -434,6 +438,7 @@ xrep_newbt_reserve_space(
>  	struct xrep_newbt	*xnr,
>  	uint64_t		nr_blocks)
>  {
> +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;

Heh. I feel like we should be able to do something a little cleaner
here, but I'm not sure what off the top of my head. Maybe a helper to
look up a generic "intent type" in the defer_op_types table and somewhat
abstract away the dfops-specific nature of the current interfaces..?
Maybe we should consider renaming xfs_defer_op_type to something more
generic too (xfs_intent_type ?). (This could all be a separate patch
btw.)
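
Something like this is what I'm picturing, maybe (totally hypothetical
and untested; assumes the lookup lives in xfs_defer.c next to the
table):

/* Fetch the intent ops for a given deferred op type. */
const struct xfs_defer_op_type *
xfs_defer_intent_type(
	enum xfs_defer_ops_type	type)
{
	return defer_op_types[type];
}

and then the repair code becomes:

	const struct xfs_defer_op_type	*efi_type =
			xfs_defer_intent_type(XFS_DEFER_OPS_TYPE_FREE);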

>  	struct xfs_scrub	*sc = xnr->sc;
>  	xfs_alloctype_t		type;
>  	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;

Also variable names look misaligned with the above (here and in the
destroy function below).

> @@ -442,6 +447,7 @@ xrep_newbt_reserve_space(
>  	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
>  
>  	while (nr_blocks > 0 && !error) {
> +		struct xfs_extent_free_item	efi_item;
>  		struct xfs_alloc_arg	args = {
>  			.tp		= sc->tp,
>  			.mp		= sc->mp,
> @@ -453,6 +459,7 @@ xrep_newbt_reserve_space(
>  			.prod		= nr_blocks,
>  			.resv		= xnr->resv,
>  		};
> +		void			*efi;
>  
>  		error = xfs_alloc_vextent(&args);
>  		if (error)
> @@ -465,7 +472,20 @@ xrep_newbt_reserve_space(
>  				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
>  				args.len, xnr->oinfo.oi_owner);
>  
> -		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
> +		/*
> +		 * Log a deferred free item for each extent we allocate so that
> +		 * we can get all of the space back if we crash before we can
> +		 * commit the new btree.
> +		 */
> +		efi_item.xefi_startblock = args.fsbno;
> +		efi_item.xefi_blockcount = args.len;
> +		efi_item.xefi_oinfo = xnr->oinfo;
> +		efi_item.xefi_skip_discard = true;
> +		efi = efi_type->create_intent(sc->tp, 1);
> +		efi_type->log_item(sc->tp, efi, &efi_item.xefi_list);
> +
> +		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len,
> +				efi);
>  		if (error)
>  			break;
>  
> @@ -487,6 +507,7 @@ xrep_newbt_destroy(
>  	struct xrep_newbt	*xnr,
>  	int			error)
>  {
> +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
>  	struct xfs_scrub	*sc = xnr->sc;
>  	struct xrep_newbt_resv	*resv, *n;
>  
> @@ -494,6 +515,17 @@ xrep_newbt_destroy(
>  		goto junkit;
>  
>  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> +		struct xfs_efd_log_item *efd;
> +
> +		/*
> +		 * Log a deferred free done for each extent we allocated now
> +		 * that we've linked the block into the filesystem.  We cheat
> +		 * since we know that log recovery has never looked at the
> +		 * extents attached to an EFD.
> +		 */
> +		efd = efi_type->create_done(sc->tp, resv->priv, 0);
> +		set_bit(XFS_LI_DIRTY, &efd->efd_item.li_flags);
> +

So here we've presumably succeeded so we drop the intent and actually
free any blocks that we didn't happen to use.

>  		/* Free every block we didn't use. */
>  		resv->fsbno += resv->used;
>  		resv->len -= resv->used;
> @@ -515,6 +547,7 @@ xrep_newbt_destroy(
>  
>  junkit:
>  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> +		efi_type->abort_intent(resv->priv);

And in this case we've failed so we abort the intent.. but what actually
happens to the blocks we might have allocated? Do we need to free those
somewhere or is that also handled by the above path somehow?

Brian

>  		list_del(&resv->list);
>  		kmem_free(resv);
>  	}
> diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> index ab6c1199ecc0..cb86281de28b 100644
> --- a/fs/xfs/scrub/repair.h
> +++ b/fs/xfs/scrub/repair.h
> @@ -67,6 +67,8 @@ struct xrep_newbt_resv {
>  	/* Link to list of extents that we've reserved. */
>  	struct list_head	list;
>  
> +	void			*priv;
> +
>  	/* FSB of the block we reserved. */
>  	xfs_fsblock_t		fsbno;
>  
> @@ -112,7 +114,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
>  void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
>  		int whichfork, const struct xfs_owner_info *oinfo);
>  int xrep_newbt_add_reservation(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
> -		xfs_extlen_t len);
> +		xfs_extlen_t len, void *priv);
>  int xrep_newbt_reserve_space(struct xrep_newbt *xba, uint64_t nr_blocks);
>  void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
>  int xrep_newbt_alloc_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
> diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
> index e44efc41a041..1e49936afbfb 100644
> --- a/fs/xfs/xfs_extfree_item.c
> +++ b/fs/xfs/xfs_extfree_item.c
> @@ -328,8 +328,6 @@ xfs_trans_get_efd(
>  {
>  	struct xfs_efd_log_item		*efdp;
>  
> -	ASSERT(nextents > 0);
> -
>  	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
>  		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
>  				(nextents - 1) * sizeof(struct xfs_extent),
> 



* [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree
  2019-10-09 16:49 [PATCH 0/3] xfs: prepare repair for bulk loading Darrick J. Wong
@ 2019-10-09 16:50 ` Darrick J. Wong
  2019-10-25 14:24   ` Brian Foster
  0 siblings, 1 reply; 9+ messages in thread
From: Darrick J. Wong @ 2019-10-09 16:50 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

We need to log EFIs for every extent that we allocate for the purpose of
staging a new btree so that if we fail then the blocks will be freed
during log recovery.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/scrub/repair.c     |   37 +++++++++++++++++++++++++++++++++++--
 fs/xfs/scrub/repair.h     |    4 +++-
 fs/xfs/xfs_extfree_item.c |    2 --
 3 files changed, 38 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index beebd484c5f3..49cea124148b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -25,6 +25,8 @@
 #include "xfs_ag_resv.h"
 #include "xfs_quota.h"
 #include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_extfree_item.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -412,7 +414,8 @@ int
 xrep_newbt_add_reservation(
 	struct xrep_newbt		*xnr,
 	xfs_fsblock_t			fsbno,
-	xfs_extlen_t			len)
+	xfs_extlen_t			len,
+	void				*priv)
 {
 	struct xrep_newbt_resv	*resv;
 
@@ -424,6 +427,7 @@ xrep_newbt_add_reservation(
 	resv->fsbno = fsbno;
 	resv->len = len;
 	resv->used = 0;
+	resv->priv = priv;
 	list_add_tail(&resv->list, &xnr->reservations);
 	return 0;
 }
@@ -434,6 +438,7 @@ xrep_newbt_reserve_space(
 	struct xrep_newbt	*xnr,
 	uint64_t		nr_blocks)
 {
+	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
 	struct xfs_scrub	*sc = xnr->sc;
 	xfs_alloctype_t		type;
 	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
@@ -442,6 +447,7 @@ xrep_newbt_reserve_space(
 	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
 
 	while (nr_blocks > 0 && !error) {
+		struct xfs_extent_free_item	efi_item;
 		struct xfs_alloc_arg	args = {
 			.tp		= sc->tp,
 			.mp		= sc->mp,
@@ -453,6 +459,7 @@ xrep_newbt_reserve_space(
 			.prod		= nr_blocks,
 			.resv		= xnr->resv,
 		};
+		void			*efi;
 
 		error = xfs_alloc_vextent(&args);
 		if (error)
@@ -465,7 +472,20 @@ xrep_newbt_reserve_space(
 				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
 				args.len, xnr->oinfo.oi_owner);
 
-		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
+		/*
+		 * Log a deferred free item for each extent we allocate so that
+		 * we can get all of the space back if we crash before we can
+		 * commit the new btree.
+		 */
+		efi_item.xefi_startblock = args.fsbno;
+		efi_item.xefi_blockcount = args.len;
+		efi_item.xefi_oinfo = xnr->oinfo;
+		efi_item.xefi_skip_discard = true;
+		efi = efi_type->create_intent(sc->tp, 1);
+		efi_type->log_item(sc->tp, efi, &efi_item.xefi_list);
+
+		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len,
+				efi);
 		if (error)
 			break;
 
@@ -487,6 +507,7 @@ xrep_newbt_destroy(
 	struct xrep_newbt	*xnr,
 	int			error)
 {
+	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
 	struct xfs_scrub	*sc = xnr->sc;
 	struct xrep_newbt_resv	*resv, *n;
 
@@ -494,6 +515,17 @@ xrep_newbt_destroy(
 		goto junkit;
 
 	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
+		struct xfs_efd_log_item *efd;
+
+		/*
+		 * Log a deferred free done for each extent we allocated now
+		 * that we've linked the block into the filesystem.  We cheat
+		 * since we know that log recovery has never looked at the
+		 * extents attached to an EFD.
+		 */
+		efd = efi_type->create_done(sc->tp, resv->priv, 0);
+		set_bit(XFS_LI_DIRTY, &efd->efd_item.li_flags);
+
 		/* Free every block we didn't use. */
 		resv->fsbno += resv->used;
 		resv->len -= resv->used;
@@ -515,6 +547,7 @@ xrep_newbt_destroy(
 
 junkit:
 	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
+		efi_type->abort_intent(resv->priv);
 		list_del(&resv->list);
 		kmem_free(resv);
 	}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index ab6c1199ecc0..cb86281de28b 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -67,6 +67,8 @@ struct xrep_newbt_resv {
 	/* Link to list of extents that we've reserved. */
 	struct list_head	list;
 
+	void			*priv;
+
 	/* FSB of the block we reserved. */
 	xfs_fsblock_t		fsbno;
 
@@ -112,7 +114,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
 void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
 		int whichfork, const struct xfs_owner_info *oinfo);
 int xrep_newbt_add_reservation(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
-		xfs_extlen_t len);
+		xfs_extlen_t len, void *priv);
 int xrep_newbt_reserve_space(struct xrep_newbt *xba, uint64_t nr_blocks);
 void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
 int xrep_newbt_alloc_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index e44efc41a041..1e49936afbfb 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -328,8 +328,6 @@ xfs_trans_get_efd(
 {
 	struct xfs_efd_log_item		*efdp;
 
-	ASSERT(nextents > 0);
-
 	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
 		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
 				(nextents - 1) * sizeof(struct xfs_extent),



Thread overview: 9+ messages
2020-01-01  1:02 [PATCH v2 0/3] xfs: prepare repair for bulk loading Darrick J. Wong
2020-01-01  1:02 ` [PATCH 1/3] xfs: implement block reservation accounting for btrees we're staging Darrick J. Wong
2020-01-01  1:02 ` [PATCH 2/3] xfs: add debug knobs to control btree bulk load slack factors Darrick J. Wong
2020-01-01  1:02 ` [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree Darrick J. Wong
  -- strict thread matches above, loose matches on Subject: below --
2019-10-29 23:31 [PATCH v2 0/3] xfs: prepare repair for bulk loading Darrick J. Wong
2019-10-29 23:31 ` [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree Darrick J. Wong
2019-10-09 16:49 [PATCH 0/3] xfs: prepare repair for bulk loading Darrick J. Wong
2019-10-09 16:50 ` [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree Darrick J. Wong
2019-10-25 14:24   ` Brian Foster
2019-10-25 16:22     ` Darrick J. Wong
2019-10-25 17:36       ` Brian Foster
