* [PATCH 0/3] xfs: prepare repair for bulk loading
From: Darrick J. Wong @ 2019-10-09 16:49 UTC
  To: darrick.wong; +Cc: linux-xfs

Hi all,

Add some new helper functions (and debug knobs) to online repair to
prepare us to stage new btrees.

If you're going to start using this mess, you probably ought to just
pull from my git trees, which are linked below.

This is an extraordinary way to destroy everything.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-prep-for-bulk-loading


* [PATCH 1/3] xfs: add debug knobs to control btree bulk load slack factors
From: Darrick J. Wong @ 2019-10-09 16:49 UTC
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Add some debug knobs so that we can control the leaf and node block
slack when rebuilding btrees.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_globals.c |    6 ++++++
 fs/xfs/xfs_sysctl.h  |    2 ++
 fs/xfs/xfs_sysfs.c   |   54 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+)


diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index fa55ab8b8d80..8f67027c144b 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -43,4 +43,10 @@ struct xfs_globals xfs_globals = {
 #ifdef DEBUG
 	.pwork_threads		=	-1,	/* automatic thread detection */
 #endif
+
+	/* Bulk load new btree leaf blocks to 75% full. */
+	.bload_leaf_slack	=	-1,
+
+	/* Bulk load new btree node blocks to 75% full. */
+	.bload_node_slack	=	-1,
 };
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 8abf4640f1d5..aecccceee4ca 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -85,6 +85,8 @@ struct xfs_globals {
 #ifdef DEBUG
 	int	pwork_threads;		/* parallel workqueue threads */
 #endif
+	int	bload_leaf_slack;	/* btree bulk load leaf slack */
+	int	bload_node_slack;	/* btree bulk load node slack */
 	int	log_recovery_delay;	/* log recovery delay (secs) */
 	int	mount_delay;		/* mount setup delay (secs) */
 	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index f1bc88f4367c..673ad21a9585 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -228,6 +228,58 @@ pwork_threads_show(
 XFS_SYSFS_ATTR_RW(pwork_threads);
 #endif /* DEBUG */
 
+STATIC ssize_t
+bload_leaf_slack_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	xfs_globals.bload_leaf_slack = val;
+	return count;
+}
+
+STATIC ssize_t
+bload_leaf_slack_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_leaf_slack);
+
+STATIC ssize_t
+bload_node_slack_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	xfs_globals.bload_node_slack = val;
+	return count;
+}
+
+STATIC ssize_t
+bload_node_slack_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_node_slack);
+
 static struct attribute *xfs_dbg_attrs[] = {
 	ATTR_LIST(bug_on_assert),
 	ATTR_LIST(log_recovery_delay),
@@ -236,6 +288,8 @@ static struct attribute *xfs_dbg_attrs[] = {
 #ifdef DEBUG
 	ATTR_LIST(pwork_threads),
 #endif
+	ATTR_LIST(bload_leaf_slack),
+	ATTR_LIST(bload_node_slack),
 	NULL,
 };
 



* [PATCH 2/3] xfs: implement block reservation accounting for btrees we're staging
From: Darrick J. Wong @ 2019-10-09 16:49 UTC
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Create a new xrep_newbt structure to encapsulate a fake root for
creating a staged btree cursor as well as to track all the blocks that
we need to reserve in order to build that btree.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/scrub/repair.c |  260 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |   61 +++++++++++
 fs/xfs/scrub/trace.h  |   58 +++++++++++
 3 files changed, 379 insertions(+)


diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 588bc054db5c..beebd484c5f3 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -359,6 +359,266 @@ xrep_init_btblock(
 	return 0;
 }
 
+/* Initialize accounting resources for staging a new AG btree. */
+void
+xrep_newbt_init_ag(
+	struct xrep_newbt		*xnr,
+	struct xfs_scrub		*sc,
+	const struct xfs_owner_info	*oinfo,
+	xfs_fsblock_t			alloc_hint,
+	enum xfs_ag_resv_type		resv)
+{
+	memset(xnr, 0, sizeof(struct xrep_newbt));
+	xnr->sc = sc;
+	xnr->oinfo = *oinfo; /* structure copy */
+	xnr->alloc_hint = alloc_hint;
+	xnr->resv = resv;
+	INIT_LIST_HEAD(&xnr->reservations);
+}
+
+/* Initialize accounting resources for staging a new inode fork btree. */
+void
+xrep_newbt_init_inode(
+	struct xrep_newbt		*xnr,
+	struct xfs_scrub		*sc,
+	int				whichfork,
+	const struct xfs_owner_info	*oinfo)
+{
+	memset(xnr, 0, sizeof(struct xrep_newbt));
+	xnr->sc = sc;
+	xnr->oinfo = *oinfo; /* structure copy */
+	xnr->alloc_hint = XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino);
+	xnr->resv = XFS_AG_RESV_NONE;
+	xnr->ifake.if_fork = kmem_zone_zalloc(xfs_ifork_zone, 0);
+	xnr->ifake.if_fork_size = XFS_IFORK_SIZE(sc->ip, whichfork);
+	INIT_LIST_HEAD(&xnr->reservations);
+}
+
+/*
+ * Initialize accounting resources for staging a new btree.  Callers are
+ * expected to add their own reservations (and clean them up) manually.
+ */
+void
+xrep_newbt_init_bare(
+	struct xrep_newbt		*xnr,
+	struct xfs_scrub		*sc)
+{
+	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
+			XFS_AG_RESV_NONE);
+}
+
+/* Add a space reservation manually. */
+int
+xrep_newbt_add_reservation(
+	struct xrep_newbt		*xnr,
+	xfs_fsblock_t			fsbno,
+	xfs_extlen_t			len)
+{
+	struct xrep_newbt_resv	*resv;
+
+	resv = kmem_alloc(sizeof(struct xrep_newbt_resv), KM_MAYFAIL);
+	if (!resv)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&resv->list);
+	resv->fsbno = fsbno;
+	resv->len = len;
+	resv->used = 0;
+	list_add_tail(&resv->list, &xnr->reservations);
+	return 0;
+}
+
+/* Reserve disk space for our new btree. */
+int
+xrep_newbt_reserve_space(
+	struct xrep_newbt	*xnr,
+	uint64_t		nr_blocks)
+{
+	struct xfs_scrub	*sc = xnr->sc;
+	xfs_alloctype_t		type;
+	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
+	int			error = 0;
+
+	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
+
+	while (nr_blocks > 0 && !error) {
+		struct xfs_alloc_arg	args = {
+			.tp		= sc->tp,
+			.mp		= sc->mp,
+			.type		= type,
+			.fsbno		= alloc_hint,
+			.oinfo		= xnr->oinfo,
+			.minlen		= 1,
+			.maxlen		= nr_blocks,
+			.prod		= nr_blocks,
+			.resv		= xnr->resv,
+		};
+
+		error = xfs_alloc_vextent(&args);
+		if (error)
+			return error;
+		if (args.fsbno == NULLFSBLOCK)
+			return -ENOSPC;
+
+		trace_xrep_newbt_reserve_space(sc->mp,
+				XFS_FSB_TO_AGNO(sc->mp, args.fsbno),
+				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
+				args.len, xnr->oinfo.oi_owner);
+
+		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
+		if (error)
+			break;
+
+		nr_blocks -= args.len;
+		alloc_hint = args.fsbno + args.len - 1;
+
+		if (sc->ip)
+			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+		else
+			error = xrep_roll_ag_trans(sc);
+	}
+
+	return error;
+}
+
+/* Free all the accounting info and disk space we reserved for a new btree. */
+void
+xrep_newbt_destroy(
+	struct xrep_newbt	*xnr,
+	int			error)
+{
+	struct xfs_scrub	*sc = xnr->sc;
+	struct xrep_newbt_resv	*resv, *n;
+
+	if (error)
+		goto junkit;
+
+	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
+		/* Free every block we didn't use. */
+		resv->fsbno += resv->used;
+		resv->len -= resv->used;
+		resv->used = 0;
+
+		if (resv->len > 0) {
+			trace_xrep_newbt_unreserve_space(sc->mp,
+					XFS_FSB_TO_AGNO(sc->mp, resv->fsbno),
+					XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno),
+					resv->len, xnr->oinfo.oi_owner);
+
+			__xfs_bmap_add_free(sc->tp, resv->fsbno, resv->len,
+					&xnr->oinfo, true);
+		}
+
+		list_del(&resv->list);
+		kmem_free(resv);
+	}
+
+junkit:
+	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
+		list_del(&resv->list);
+		kmem_free(resv);
+	}
+
+	if (sc->ip) {
+		kmem_zone_free(xfs_ifork_zone, xnr->ifake.if_fork);
+		xnr->ifake.if_fork = NULL;
+	}
+}
+
+/* Feed one of the reserved btree blocks to the bulk loader. */
+int
+xrep_newbt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	struct xrep_newbt	*xnr,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xrep_newbt_resv	*resv;
+	xfs_fsblock_t		fsb;
+
+	/*
+	 * If last_resv doesn't have a block for us, move forward until we find
+	 * one that does (or run out of reservations).
+	 */
+	if (xnr->last_resv == NULL) {
+		list_for_each_entry(resv, &xnr->reservations, list) {
+			if (resv->used < resv->len) {
+				xnr->last_resv = resv;
+				break;
+			}
+		}
+		if (xnr->last_resv == NULL)
+			return -ENOSPC;
+	} else if (xnr->last_resv->used == xnr->last_resv->len) {
+		if (xnr->last_resv->list.next == &xnr->reservations)
+			return -ENOSPC;
+		xnr->last_resv = list_entry(xnr->last_resv->list.next,
+				struct xrep_newbt_resv, list);
+	}
+
+	/* Nab the block. */
+	fsb = xnr->last_resv->fsbno + xnr->last_resv->used;
+	xnr->last_resv->used++;
+
+	trace_xrep_newbt_alloc_block(cur->bc_mp,
+			XFS_FSB_TO_AGNO(cur->bc_mp, fsb),
+			XFS_FSB_TO_AGBNO(cur->bc_mp, fsb),
+			xnr->oinfo.oi_owner);
+
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		ptr->l = cpu_to_be64(fsb);
+	else
+		ptr->s = cpu_to_be32(XFS_FSB_TO_AGBNO(cur->bc_mp, fsb));
+	return 0;
+}
+
+/*
+ * Estimate proper slack values for a btree that's being reloaded.
+ *
+ * Under most circumstances, we'll take whatever default loading value the
+ * btree bulk loading code calculates for us.  However, there are some
+ * exceptions to this rule:
+ *
+ * (1) If someone turned one of the debug knobs.
+ * (2) If this is a per-AG btree and the AG has less than ~9% space free.
+ * (3) If this is an inode btree and the FS has less than ~9% space free.
+ *
+ * Note that we actually use 3/32 for the comparison to avoid division.
+ */
+void
+xrep_bload_estimate_slack(
+	struct xfs_scrub	*sc,
+	struct xfs_btree_bload	*bload)
+{
+	uint64_t		free;
+	uint64_t		sz;
+
+	/*
+	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
+	 * unless someone has set them otherwise, so we just pull the values
+	 * here.
+	 */
+	bload->leaf_slack = xfs_globals.bload_leaf_slack;
+	bload->node_slack = xfs_globals.bload_node_slack;
+
+	if (sc->ops->type == ST_PERAG) {
+		free = sc->sa.pag->pagf_freeblks;
+		sz = xfs_ag_block_count(sc->mp, sc->sa.agno);
+	} else {
+		free = percpu_counter_sum(&sc->mp->m_fdblocks);
+		sz = sc->mp->m_sb.sb_dblocks;
+	}
+
+	/* No further changes if there's more than 3/32ths space left. */
+	if (free >= ((sz * 3) >> 5))
+		return;
+
+	/* We're low on space; load the btrees as tightly as possible. */
+	if (bload->leaf_slack < 0)
+		bload->leaf_slack = 0;
+	if (bload->node_slack < 0)
+		bload->node_slack = 0;
+}
+
 /*
  * Reconstructing per-AG Btrees
  *
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 479cfe38065e..ab6c1199ecc0 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -6,6 +6,10 @@
 #ifndef __XFS_SCRUB_REPAIR_H__
 #define __XFS_SCRUB_REPAIR_H__
 
+#include "scrub/bitmap.h"
+#include "xfs_btree.h"
+union xfs_btree_ptr;
+
 static inline int xrep_notsupported(struct xfs_scrub *sc)
 {
 	return -EOPNOTSUPP;
@@ -59,6 +63,63 @@ int xrep_agf(struct xfs_scrub *sc);
 int xrep_agfl(struct xfs_scrub *sc);
 int xrep_agi(struct xfs_scrub *sc);
 
+struct xrep_newbt_resv {
+	/* Link to list of extents that we've reserved. */
+	struct list_head	list;
+
+	/* FSB of the block we reserved. */
+	xfs_fsblock_t		fsbno;
+
+	/* Length of the reservation. */
+	xfs_extlen_t		len;
+
+	/* How much of this reservation we've used. */
+	xfs_extlen_t		used;
+};
+
+struct xrep_newbt {
+	struct xfs_scrub	*sc;
+
+	/* List of extents that we've reserved. */
+	struct list_head	reservations;
+
+	/* Fake root for new btree. */
+	union {
+		struct xbtree_afakeroot	afake;
+		struct xbtree_ifakeroot	ifake;
+	};
+
+	/* rmap owner of these blocks */
+	struct xfs_owner_info	oinfo;
+
+	/* The last reservation we allocated from. */
+	struct xrep_newbt_resv	*last_resv;
+
+	/* Allocation hint */
+	xfs_fsblock_t		alloc_hint;
+
+	/* per-ag reservation type */
+	enum xfs_ag_resv_type	resv;
+};
+
+#define for_each_xrep_newbt_reservation(xnr, resv, n)	\
+	list_for_each_entry_safe((resv), (n), &(xnr)->reservations, list)
+
+void xrep_newbt_init_bare(struct xrep_newbt *xba, struct xfs_scrub *sc);
+void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
+		const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint,
+		enum xfs_ag_resv_type resv);
+void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
+		int whichfork, const struct xfs_owner_info *oinfo);
+int xrep_newbt_add_reservation(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
+		xfs_extlen_t len);
+int xrep_newbt_reserve_space(struct xrep_newbt *xba, uint64_t nr_blocks);
+void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
+int xrep_newbt_alloc_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
+		union xfs_btree_ptr *ptr);
+void xrep_bload_estimate_slack(struct xfs_scrub *sc,
+		struct xfs_btree_bload *bload);
+
 #else
 
 static inline int xrep_attempt(
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 3362bae28b46..deb177abf652 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -904,6 +904,64 @@ TRACE_EVENT(xrep_ialloc_insert,
 		  __entry->freemask)
 )
 
+DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len,
+		 int64_t owner),
+	TP_ARGS(mp, agno, agbno, len, owner),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(int64_t, owner)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+		__entry->owner = owner;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->owner)
+);
+#define DEFINE_NEWBT_EXTENT_EVENT(name) \
+DEFINE_EVENT(xrep_newbt_extent_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 xfs_agblock_t agbno, xfs_extlen_t len, \
+		 int64_t owner), \
+	TP_ARGS(mp, agno, agbno, len, owner))
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_reserve_space);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_unreserve_space);
+
+TRACE_EVENT(xrep_newbt_alloc_block,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, int64_t owner),
+	TP_ARGS(mp, agno, agbno, owner),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(int64_t, owner)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->owner = owner;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u owner %lld",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->owner)
+);
+
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
 
 #endif /* _TRACE_XFS_SCRUB_TRACE_H */



* [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree
From: Darrick J. Wong @ 2019-10-09 16:50 UTC
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

We need to log EFIs for every extent that we allocate for the purpose of
staging a new btree so that if we fail then the blocks will be freed
during log recovery.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/scrub/repair.c     |   37 +++++++++++++++++++++++++++++++++++--
 fs/xfs/scrub/repair.h     |    4 +++-
 fs/xfs/xfs_extfree_item.c |    2 --
 3 files changed, 38 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index beebd484c5f3..49cea124148b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -25,6 +25,8 @@
 #include "xfs_ag_resv.h"
 #include "xfs_quota.h"
 #include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_extfree_item.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -412,7 +414,8 @@ int
 xrep_newbt_add_reservation(
 	struct xrep_newbt		*xnr,
 	xfs_fsblock_t			fsbno,
-	xfs_extlen_t			len)
+	xfs_extlen_t			len,
+	void				*priv)
 {
 	struct xrep_newbt_resv	*resv;
 
@@ -424,6 +427,7 @@ xrep_newbt_add_reservation(
 	resv->fsbno = fsbno;
 	resv->len = len;
 	resv->used = 0;
+	resv->priv = priv;
 	list_add_tail(&resv->list, &xnr->reservations);
 	return 0;
 }
@@ -434,6 +438,7 @@ xrep_newbt_reserve_space(
 	struct xrep_newbt	*xnr,
 	uint64_t		nr_blocks)
 {
+	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
 	struct xfs_scrub	*sc = xnr->sc;
 	xfs_alloctype_t		type;
 	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
@@ -442,6 +447,7 @@ xrep_newbt_reserve_space(
 	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
 
 	while (nr_blocks > 0 && !error) {
+		struct xfs_extent_free_item	efi_item;
 		struct xfs_alloc_arg	args = {
 			.tp		= sc->tp,
 			.mp		= sc->mp,
@@ -453,6 +459,7 @@ xrep_newbt_reserve_space(
 			.prod		= nr_blocks,
 			.resv		= xnr->resv,
 		};
+		void			*efi;
 
 		error = xfs_alloc_vextent(&args);
 		if (error)
@@ -465,7 +472,20 @@ xrep_newbt_reserve_space(
 				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
 				args.len, xnr->oinfo.oi_owner);
 
-		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
+		/*
+		 * Log a deferred free item for each extent we allocate so that
+		 * we can get all of the space back if we crash before we can
+		 * commit the new btree.
+		 */
+		efi_item.xefi_startblock = args.fsbno;
+		efi_item.xefi_blockcount = args.len;
+		efi_item.xefi_oinfo = xnr->oinfo;
+		efi_item.xefi_skip_discard = true;
+		efi = efi_type->create_intent(sc->tp, 1);
+		efi_type->log_item(sc->tp, efi, &efi_item.xefi_list);
+
+		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len,
+				efi);
 		if (error)
 			break;
 
@@ -487,6 +507,7 @@ xrep_newbt_destroy(
 	struct xrep_newbt	*xnr,
 	int			error)
 {
+	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
 	struct xfs_scrub	*sc = xnr->sc;
 	struct xrep_newbt_resv	*resv, *n;
 
@@ -494,6 +515,17 @@ xrep_newbt_destroy(
 		goto junkit;
 
 	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
+		struct xfs_efd_log_item *efd;
+
+		/*
+		 * Log a deferred free done for each extent we allocated now
+		 * that we've linked the block into the filesystem.  We cheat
+		 * since we know that log recovery has never looked at the
+		 * extents attached to an EFD.
+		 */
+		efd = efi_type->create_done(sc->tp, resv->priv, 0);
+		set_bit(XFS_LI_DIRTY, &efd->efd_item.li_flags);
+
 		/* Free every block we didn't use. */
 		resv->fsbno += resv->used;
 		resv->len -= resv->used;
@@ -515,6 +547,7 @@ xrep_newbt_destroy(
 
 junkit:
 	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
+		efi_type->abort_intent(resv->priv);
 		list_del(&resv->list);
 		kmem_free(resv);
 	}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index ab6c1199ecc0..cb86281de28b 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -67,6 +67,8 @@ struct xrep_newbt_resv {
 	/* Link to list of extents that we've reserved. */
 	struct list_head	list;
 
+	void			*priv;
+
 	/* FSB of the block we reserved. */
 	xfs_fsblock_t		fsbno;
 
@@ -112,7 +114,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
 void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
 		int whichfork, const struct xfs_owner_info *oinfo);
 int xrep_newbt_add_reservation(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
-		xfs_extlen_t len);
+		xfs_extlen_t len, void *priv);
 int xrep_newbt_reserve_space(struct xrep_newbt *xba, uint64_t nr_blocks);
 void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
 int xrep_newbt_alloc_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index e44efc41a041..1e49936afbfb 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -328,8 +328,6 @@ xfs_trans_get_efd(
 {
 	struct xfs_efd_log_item		*efdp;
 
-	ASSERT(nextents > 0);
-
 	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
 		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
 				(nextents - 1) * sizeof(struct xfs_extent),



* Re: [PATCH 1/3] xfs: add debug knobs to control btree bulk load slack factors
From: Brian Foster @ 2019-10-25 14:19 UTC
  To: Darrick J. Wong; +Cc: linux-xfs

On Wed, Oct 09, 2019 at 09:49:53AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add some debug knobs so that we can control the leaf and node block
> slack when rebuilding btrees.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_globals.c |    6 ++++++
>  fs/xfs/xfs_sysctl.h  |    2 ++
>  fs/xfs/xfs_sysfs.c   |   54 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 62 insertions(+)
> 
> 
> diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
> index fa55ab8b8d80..8f67027c144b 100644
> --- a/fs/xfs/xfs_globals.c
> +++ b/fs/xfs/xfs_globals.c
> @@ -43,4 +43,10 @@ struct xfs_globals xfs_globals = {
>  #ifdef DEBUG
>  	.pwork_threads		=	-1,	/* automatic thread detection */
>  #endif
> +
> +	/* Bulk load new btree leaf blocks to 75% full. */
> +	.bload_leaf_slack	=	-1,
> +
> +	/* Bulk load new btree node blocks to 75% full. */
> +	.bload_node_slack	=	-1,

What are the units for these fields? Seems reasonable outside of that,
though I'd probably reorder this patch to come after the related code so
that it also includes whatever bits actually use the fields.

Brian

>  };
> diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
> index 8abf4640f1d5..aecccceee4ca 100644
> --- a/fs/xfs/xfs_sysctl.h
> +++ b/fs/xfs/xfs_sysctl.h
> @@ -85,6 +85,8 @@ struct xfs_globals {
>  #ifdef DEBUG
>  	int	pwork_threads;		/* parallel workqueue threads */
>  #endif
> +	int	bload_leaf_slack;	/* btree bulk load leaf slack */
> +	int	bload_node_slack;	/* btree bulk load node slack */
>  	int	log_recovery_delay;	/* log recovery delay (secs) */
>  	int	mount_delay;		/* mount setup delay (secs) */
>  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> index f1bc88f4367c..673ad21a9585 100644
> --- a/fs/xfs/xfs_sysfs.c
> +++ b/fs/xfs/xfs_sysfs.c
> @@ -228,6 +228,58 @@ pwork_threads_show(
>  XFS_SYSFS_ATTR_RW(pwork_threads);
>  #endif /* DEBUG */
>  
> +STATIC ssize_t
> +bload_leaf_slack_store(
> +	struct kobject	*kobject,
> +	const char	*buf,
> +	size_t		count)
> +{
> +	int		ret;
> +	int		val;
> +
> +	ret = kstrtoint(buf, 0, &val);
> +	if (ret)
> +		return ret;
> +
> +	xfs_globals.bload_leaf_slack = val;
> +	return count;
> +}
> +
> +STATIC ssize_t
> +bload_leaf_slack_show(
> +	struct kobject	*kobject,
> +	char		*buf)
> +{
> +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack);
> +}
> +XFS_SYSFS_ATTR_RW(bload_leaf_slack);
> +
> +STATIC ssize_t
> +bload_node_slack_store(
> +	struct kobject	*kobject,
> +	const char	*buf,
> +	size_t		count)
> +{
> +	int		ret;
> +	int		val;
> +
> +	ret = kstrtoint(buf, 0, &val);
> +	if (ret)
> +		return ret;
> +
> +	xfs_globals.bload_node_slack = val;
> +	return count;
> +}
> +
> +STATIC ssize_t
> +bload_node_slack_show(
> +	struct kobject	*kobject,
> +	char		*buf)
> +{
> +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack);
> +}
> +XFS_SYSFS_ATTR_RW(bload_node_slack);
> +
>  static struct attribute *xfs_dbg_attrs[] = {
>  	ATTR_LIST(bug_on_assert),
>  	ATTR_LIST(log_recovery_delay),
> @@ -236,6 +288,8 @@ static struct attribute *xfs_dbg_attrs[] = {
>  #ifdef DEBUG
>  	ATTR_LIST(pwork_threads),
>  #endif
> +	ATTR_LIST(bload_leaf_slack),
> +	ATTR_LIST(bload_node_slack),
>  	NULL,
>  };
>  
> 



* Re: [PATCH 2/3] xfs: implement block reservation accounting for btrees we're staging
From: Brian Foster @ 2019-10-25 14:22 UTC
  To: Darrick J. Wong; +Cc: linux-xfs

On Wed, Oct 09, 2019 at 09:49:59AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Create a new xrep_newbt structure to encapsulate a fake root for
> creating a staged btree cursor as well as to track all the blocks that
> we need to reserve in order to build that btree.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/scrub/repair.c |  260 +++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/scrub/repair.h |   61 +++++++++++
>  fs/xfs/scrub/trace.h  |   58 +++++++++++
>  3 files changed, 379 insertions(+)
> 
> 
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index 588bc054db5c..beebd484c5f3 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -359,6 +359,266 @@ xrep_init_btblock(
>  	return 0;
>  }
>  
...
> +/* Initialize accounting resources for staging a new inode fork btree. */
> +void
> +xrep_newbt_init_inode(
> +	struct xrep_newbt		*xnr,
> +	struct xfs_scrub		*sc,
> +	int				whichfork,
> +	const struct xfs_owner_info	*oinfo)
> +{
> +	memset(xnr, 0, sizeof(struct xrep_newbt));
> +	xnr->sc = sc;
> +	xnr->oinfo = *oinfo; /* structure copy */
> +	xnr->alloc_hint = XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino);
> +	xnr->resv = XFS_AG_RESV_NONE;
> +	xnr->ifake.if_fork = kmem_zone_zalloc(xfs_ifork_zone, 0);
> +	xnr->ifake.if_fork_size = XFS_IFORK_SIZE(sc->ip, whichfork);
> +	INIT_LIST_HEAD(&xnr->reservations);

Looks like this could reuse the above function for everything outside of
the fake root bits.
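
Untested, but I'd think something like this would do it (init_ag already
covers everything except the ifake setup):

	void
	xrep_newbt_init_inode(
		struct xrep_newbt		*xnr,
		struct xfs_scrub		*sc,
		int				whichfork,
		const struct xfs_owner_info	*oinfo)
	{
		xrep_newbt_init_ag(xnr, sc, oinfo,
				XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
				XFS_AG_RESV_NONE);
		xnr->ifake.if_fork = kmem_zone_zalloc(xfs_ifork_zone, 0);
		xnr->ifake.if_fork_size = XFS_IFORK_SIZE(sc->ip, whichfork);
	}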

> +}
> +
> +/*
> + * Initialize accounting resources for staging a new btree.  Callers are
> + * expected to add their own reservations (and clean them up) manually.
> + */
> +void
> +xrep_newbt_init_bare(
> +	struct xrep_newbt		*xnr,
> +	struct xfs_scrub		*sc)
> +{
> +	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
> +			XFS_AG_RESV_NONE);
> +}
> +
> +/* Add a space reservation manually. */
> +int
> +xrep_newbt_add_reservation(
> +	struct xrep_newbt		*xnr,
> +	xfs_fsblock_t			fsbno,
> +	xfs_extlen_t			len)
> +{

FWIW the "reservation" terminology sounds a bit funny to me. Perhaps
it's just because I've had log reservation on my mind :P, but something
that "reserves blocks" as opposed to "adds reservation" might be a bit
more clear from a naming perspective.

> +	struct xrep_newbt_resv	*resv;
> +
> +	resv = kmem_alloc(sizeof(struct xrep_newbt_resv), KM_MAYFAIL);
> +	if (!resv)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&resv->list);
> +	resv->fsbno = fsbno;
> +	resv->len = len;
> +	resv->used = 0;

Is ->used purely a count or does it also serve as a pointer to the next
"unused" block?

> +	list_add_tail(&resv->list, &xnr->reservations);
> +	return 0;
> +}
> +
> +/* Reserve disk space for our new btree. */
> +int
> +xrep_newbt_reserve_space(
> +	struct xrep_newbt	*xnr,
> +	uint64_t		nr_blocks)
> +{
> +	struct xfs_scrub	*sc = xnr->sc;
> +	xfs_alloctype_t		type;
> +	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
> +	int			error = 0;
> +
> +	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
> +

So I take it this distinguishes between reconstruction of a bmapbt
where we can allocate across AGs vs an AG tree..? If so, a one liner
comment wouldn't hurt here.
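
Maybe just something like:

	/*
	 * Inode fork btree blocks can be put anywhere in the filesystem;
	 * everything else has to stay inside the AG we're repairing.
	 */
	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;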

> +	while (nr_blocks > 0 && !error) {
> +		struct xfs_alloc_arg	args = {
> +			.tp		= sc->tp,
> +			.mp		= sc->mp,
> +			.type		= type,
> +			.fsbno		= alloc_hint,
> +			.oinfo		= xnr->oinfo,
> +			.minlen		= 1,
> +			.maxlen		= nr_blocks,
> +			.prod		= nr_blocks,

Why .prod? Is this relevant if .mod isn't set?

> +			.resv		= xnr->resv,
> +		};
> +
> +		error = xfs_alloc_vextent(&args);
> +		if (error)
> +			return error;
> +		if (args.fsbno == NULLFSBLOCK)
> +			return -ENOSPC;
> +
> +		trace_xrep_newbt_reserve_space(sc->mp,
> +				XFS_FSB_TO_AGNO(sc->mp, args.fsbno),
> +				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
> +				args.len, xnr->oinfo.oi_owner);
> +
> +		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
> +		if (error)
> +			break;
> +
> +		nr_blocks -= args.len;
> +		alloc_hint = args.fsbno + args.len - 1;
> +
> +		if (sc->ip)
> +			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
> +		else
> +			error = xrep_roll_ag_trans(sc);
> +	}
> +
> +	return error;
> +}
> +
> +/* Free all the accounting info and disk space we reserved for a new btree. */
> +void
> +xrep_newbt_destroy(
> +	struct xrep_newbt	*xnr,
> +	int			error)
> +{
> +	struct xfs_scrub	*sc = xnr->sc;
> +	struct xrep_newbt_resv	*resv, *n;
> +
> +	if (error)
> +		goto junkit;
> +
> +	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> +		/* Free every block we didn't use. */
> +		resv->fsbno += resv->used;
> +		resv->len -= resv->used;
> +		resv->used = 0;

That answers my count/pointer question. :)

> +
> +		if (resv->len > 0) {
> +			trace_xrep_newbt_unreserve_space(sc->mp,
> +					XFS_FSB_TO_AGNO(sc->mp, resv->fsbno),
> +					XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno),
> +					resv->len, xnr->oinfo.oi_owner);
> +
> +			__xfs_bmap_add_free(sc->tp, resv->fsbno, resv->len,
> +					&xnr->oinfo, true);
> +		}
> +
> +		list_del(&resv->list);
> +		kmem_free(resv);
> +	}
> +
> +junkit:
> +	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> +		list_del(&resv->list);
> +		kmem_free(resv);
> +	}

Seems like this could be folded into the above loop by just checking
error and skipping the free logic appropriately.
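
E.g. (untested sketch, should be equivalent to the two loops):

	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
		if (!error) {
			/* Free every block we didn't use. */
			resv->fsbno += resv->used;
			resv->len -= resv->used;
			resv->used = 0;

			if (resv->len > 0) {
				trace_xrep_newbt_unreserve_space(...);
				__xfs_bmap_add_free(sc->tp, resv->fsbno,
						resv->len, &xnr->oinfo, true);
			}
		}
		list_del(&resv->list);
		kmem_free(resv);
	}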

> +
> +	if (sc->ip) {
> +		kmem_zone_free(xfs_ifork_zone, xnr->ifake.if_fork);
> +		xnr->ifake.if_fork = NULL;
> +	}
> +}
> +
> +/* Feed one of the reserved btree blocks to the bulk loader. */
> +int
> +xrep_newbt_alloc_block(
> +	struct xfs_btree_cur	*cur,
> +	struct xrep_newbt	*xnr,
> +	union xfs_btree_ptr	*ptr)
> +{
> +	struct xrep_newbt_resv	*resv;
> +	xfs_fsblock_t		fsb;
> +
> +	/*
> +	 * If last_resv doesn't have a block for us, move forward until we find
> +	 * one that does (or run out of reservations).
> +	 */
> +	if (xnr->last_resv == NULL) {
> +		list_for_each_entry(resv, &xnr->reservations, list) {
> +			if (resv->used < resv->len) {
> +				xnr->last_resv = resv;
> +				break;
> +			}
> +		}

Not a big deal right now, but it might be worth eventually considering
something more efficient. For example, perhaps we could rotate depleted
entries to the end of the list and if we rotate and find nothing in the
next entry at the head, we know we've run out of space.
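
Something like the following, perhaps (completely untested):

	struct xrep_newbt_resv	*resv, *first = NULL;

	if (list_empty(&xnr->reservations))
		return -ENOSPC;

	while (1) {
		resv = list_first_entry(&xnr->reservations,
				struct xrep_newbt_resv, list);
		if (resv->used < resv->len)
			break;		/* head still has free blocks */
		if (resv == first)
			return -ENOSPC;	/* wrapped around; all depleted */
		if (first == NULL)
			first = resv;
		/* Depleted; rotate it to the back of the list. */
		list_move_tail(&resv->list, &xnr->reservations);
	}

That would also drop the need for ->last_resv, since the head of the
list is always the active reservation.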

> +		if (xnr->last_resv == NULL)
> +			return -ENOSPC;
> +	} else if (xnr->last_resv->used == xnr->last_resv->len) {
> +		if (xnr->last_resv->list.next == &xnr->reservations)
> +			return -ENOSPC;
> +		xnr->last_resv = list_entry(xnr->last_resv->list.next,
> +				struct xrep_newbt_resv, list);
> +	}
> +
> +	/* Nab the block. */
> +	fsb = xnr->last_resv->fsbno + xnr->last_resv->used;
> +	xnr->last_resv->used++;
> +
> +	trace_xrep_newbt_alloc_block(cur->bc_mp,
> +			XFS_FSB_TO_AGNO(cur->bc_mp, fsb),
> +			XFS_FSB_TO_AGBNO(cur->bc_mp, fsb),
> +			xnr->oinfo.oi_owner);
> +
> +	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
> +		ptr->l = cpu_to_be64(fsb);
> +	else
> +		ptr->s = cpu_to_be32(XFS_FSB_TO_AGBNO(cur->bc_mp, fsb));
> +	return 0;
> +}
> +
...
> diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> index 479cfe38065e..ab6c1199ecc0 100644
> --- a/fs/xfs/scrub/repair.h
> +++ b/fs/xfs/scrub/repair.h
...
> @@ -59,6 +63,63 @@ int xrep_agf(struct xfs_scrub *sc);
>  int xrep_agfl(struct xfs_scrub *sc);
>  int xrep_agi(struct xfs_scrub *sc);
>  
...
> +
> +#define for_each_xrep_newbt_reservation(xnr, resv, n)	\
> +	list_for_each_entry_safe((resv), (n), &(xnr)->reservations, list)

This is unused (and seems unnecessary for a simple list).

...
> diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
> index 3362bae28b46..deb177abf652 100644
> --- a/fs/xfs/scrub/trace.h
> +++ b/fs/xfs/scrub/trace.h
> @@ -904,6 +904,64 @@ TRACE_EVENT(xrep_ialloc_insert,
>  		  __entry->freemask)
>  )
>  
...
> +#define DEFINE_NEWBT_EXTENT_EVENT(name) \
> +DEFINE_EVENT(xrep_newbt_extent_class, name, \
> +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
> +		 xfs_agblock_t agbno, xfs_extlen_t len, \
> +		 int64_t owner), \
> +	TP_ARGS(mp, agno, agbno, len, owner))
> +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_reserve_space);
> +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_unreserve_space);
> +
> +TRACE_EVENT(xrep_newbt_alloc_block,
> +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> +		 xfs_agblock_t agbno, int64_t owner),

This could be folded into the above class if we just passed 1 for the
length, eh?
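
I.e. just:

	DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_block);

...with 1 passed for the len argument at the callsite.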

Brian

> +	TP_ARGS(mp, agno, agbno, owner),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_agnumber_t, agno)
> +		__field(xfs_agblock_t, agbno)
> +		__field(int64_t, owner)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->agno = agno;
> +		__entry->agbno = agbno;
> +		__entry->owner = owner;
> +	),
> +	TP_printk("dev %d:%d agno %u agbno %u owner %lld",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->agno,
> +		  __entry->agbno,
> +		  __entry->owner)
> +);
> +
>  #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
>  
>  #endif /* _TRACE_XFS_SCRUB_TRACE_H */
> 



* Re: [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree
From: Brian Foster @ 2019-10-25 14:24 UTC
  To: Darrick J. Wong; +Cc: linux-xfs

On Wed, Oct 09, 2019 at 09:50:06AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> We need to log EFIs for every extent that we allocate for the purpose of
> staging a new btree so that if we fail then the blocks will be freed
> during log recovery.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

Ok, so now I'm getting to the stuff we discussed earlier around pinning
the log tail and whatnot. I have no issue with getting this in as is in
the meantime, so long as we eventually account for those kind of issues
before we consider this non-experimental.

>  fs/xfs/scrub/repair.c     |   37 +++++++++++++++++++++++++++++++++++--
>  fs/xfs/scrub/repair.h     |    4 +++-
>  fs/xfs/xfs_extfree_item.c |    2 --
>  3 files changed, 38 insertions(+), 5 deletions(-)
> 
> 
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index beebd484c5f3..49cea124148b 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -25,6 +25,8 @@
>  #include "xfs_ag_resv.h"
>  #include "xfs_quota.h"
>  #include "xfs_bmap.h"
> +#include "xfs_defer.h"
> +#include "xfs_extfree_item.h"
>  #include "scrub/scrub.h"
>  #include "scrub/common.h"
>  #include "scrub/trace.h"
> @@ -412,7 +414,8 @@ int
>  xrep_newbt_add_reservation(
>  	struct xrep_newbt		*xnr,
>  	xfs_fsblock_t			fsbno,
> -	xfs_extlen_t			len)
> +	xfs_extlen_t			len,
> +	void				*priv)
>  {
>  	struct xrep_newbt_resv	*resv;
>  
> @@ -424,6 +427,7 @@ xrep_newbt_add_reservation(
>  	resv->fsbno = fsbno;
>  	resv->len = len;
>  	resv->used = 0;
> +	resv->priv = priv;
>  	list_add_tail(&resv->list, &xnr->reservations);
>  	return 0;
>  }
> @@ -434,6 +438,7 @@ xrep_newbt_reserve_space(
>  	struct xrep_newbt	*xnr,
>  	uint64_t		nr_blocks)
>  {
> +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;

Heh. I feel like we should be able to do something a little cleaner
here, but I'm not sure what off the top of my head. Maybe a helper to
look up a generic "intent type" in the defer_op_types table and somewhat
abstract away the dfops-specific nature of the current interfaces..?
Maybe we should consider renaming xfs_defer_op_type to something more
generic too (xfs_intent_type ?). (This could all be a separate patch
btw.)
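
Something like this, maybe (made-up names, and defer_op_types is static
to xfs_defer.c so the helper would have to live there):

	/* In xfs_defer.c: look up the ops for an intent type. */
	const struct xfs_defer_op_type *
	xfs_defer_op_type(enum xfs_defer_ops_type type)
	{
		return defer_op_types[type];
	}

...which would let the repair code do:

	const struct xfs_defer_op_type *efi_type =
			xfs_defer_op_type(XFS_DEFER_OPS_TYPE_FREE);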

>  	struct xfs_scrub	*sc = xnr->sc;
>  	xfs_alloctype_t		type;
>  	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;

Also variable names look misaligned with the above (here and in the
destroy function below).

> @@ -442,6 +447,7 @@ xrep_newbt_reserve_space(
>  	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
>  
>  	while (nr_blocks > 0 && !error) {
> +		struct xfs_extent_free_item	efi_item;
>  		struct xfs_alloc_arg	args = {
>  			.tp		= sc->tp,
>  			.mp		= sc->mp,
> @@ -453,6 +459,7 @@ xrep_newbt_reserve_space(
>  			.prod		= nr_blocks,
>  			.resv		= xnr->resv,
>  		};
> +		void			*efi;
>  
>  		error = xfs_alloc_vextent(&args);
>  		if (error)
> @@ -465,7 +472,20 @@ xrep_newbt_reserve_space(
>  				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
>  				args.len, xnr->oinfo.oi_owner);
>  
> -		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
> +		/*
> +		 * Log a deferred free item for each extent we allocate so that
> +		 * we can get all of the space back if we crash before we can
> +		 * commit the new btree.
> +		 */
> +		efi_item.xefi_startblock = args.fsbno;
> +		efi_item.xefi_blockcount = args.len;
> +		efi_item.xefi_oinfo = xnr->oinfo;
> +		efi_item.xefi_skip_discard = true;
> +		efi = efi_type->create_intent(sc->tp, 1);
> +		efi_type->log_item(sc->tp, efi, &efi_item.xefi_list);
> +
> +		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len,
> +				efi);
>  		if (error)
>  			break;
>  
> @@ -487,6 +507,7 @@ xrep_newbt_destroy(
>  	struct xrep_newbt	*xnr,
>  	int			error)
>  {
> +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
>  	struct xfs_scrub	*sc = xnr->sc;
>  	struct xrep_newbt_resv	*resv, *n;
>  
> @@ -494,6 +515,17 @@ xrep_newbt_destroy(
>  		goto junkit;
>  
>  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> +		struct xfs_efd_log_item *efd;
> +
> +		/*
> +		 * Log a deferred free done for each extent we allocated now
> +		 * that we've linked the block into the filesystem.  We cheat
> +		 * since we know that log recovery has never looked at the
> +		 * extents attached to an EFD.
> +		 */
> +		efd = efi_type->create_done(sc->tp, resv->priv, 0);
> +		set_bit(XFS_LI_DIRTY, &efd->efd_item.li_flags);
> +

So here we've presumably succeeded so we drop the intent and actually
free any blocks that we didn't happen to use.

>  		/* Free every block we didn't use. */
>  		resv->fsbno += resv->used;
>  		resv->len -= resv->used;
> @@ -515,6 +547,7 @@ xrep_newbt_destroy(
>  
>  junkit:
>  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> +		efi_type->abort_intent(resv->priv);

And in this case we've failed so we abort the intent.. but what actually
happens to the blocks we might have allocated? Do we need to free those
somewhere or is that also handled by the above path somehow?

Brian

>  		list_del(&resv->list);
>  		kmem_free(resv);
>  	}
> diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> index ab6c1199ecc0..cb86281de28b 100644
> --- a/fs/xfs/scrub/repair.h
> +++ b/fs/xfs/scrub/repair.h
> @@ -67,6 +67,8 @@ struct xrep_newbt_resv {
>  	/* Link to list of extents that we've reserved. */
>  	struct list_head	list;
>  
> +	void			*priv;
> +
>  	/* FSB of the block we reserved. */
>  	xfs_fsblock_t		fsbno;
>  
> @@ -112,7 +114,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
>  void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
>  		int whichfork, const struct xfs_owner_info *oinfo);
>  int xrep_newbt_add_reservation(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
> -		xfs_extlen_t len);
> +		xfs_extlen_t len, void *priv);
>  int xrep_newbt_reserve_space(struct xrep_newbt *xba, uint64_t nr_blocks);
>  void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
>  int xrep_newbt_alloc_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
> diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
> index e44efc41a041..1e49936afbfb 100644
> --- a/fs/xfs/xfs_extfree_item.c
> +++ b/fs/xfs/xfs_extfree_item.c
> @@ -328,8 +328,6 @@ xfs_trans_get_efd(
>  {
>  	struct xfs_efd_log_item		*efdp;
>  
> -	ASSERT(nextents > 0);
> -
>  	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
>  		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
>  				(nextents - 1) * sizeof(struct xfs_extent),
> 



* Re: [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree
From: Darrick J. Wong @ 2019-10-25 16:22 UTC
  To: Brian Foster; +Cc: linux-xfs

On Fri, Oct 25, 2019 at 10:24:01AM -0400, Brian Foster wrote:
> On Wed, Oct 09, 2019 at 09:50:06AM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > We need to log EFIs for every extent that we allocate for the purpose of
> > staging a new btree so that if we fail then the blocks will be freed
> > during log recovery.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> Ok, so now I'm getting to the stuff we discussed earlier around pinning
> the log tail and whatnot. I have no issue with getting this in as is in
> the meantime, so long as we eventually account for those kind of issues
> before we consider this non-experimental.

<nod> I also wondered in the past if it would be smarter just to freeze
the whole filesystem (like I currently do for rmapbt rebuilding) so that
repair is the only thing that gets to touch the log, but I bet that will
be unpopular. :)

> >  fs/xfs/scrub/repair.c     |   37 +++++++++++++++++++++++++++++++++++--
> >  fs/xfs/scrub/repair.h     |    4 +++-
> >  fs/xfs/xfs_extfree_item.c |    2 --
> >  3 files changed, 38 insertions(+), 5 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > index beebd484c5f3..49cea124148b 100644
> > --- a/fs/xfs/scrub/repair.c
> > +++ b/fs/xfs/scrub/repair.c
> > @@ -25,6 +25,8 @@
> >  #include "xfs_ag_resv.h"
> >  #include "xfs_quota.h"
> >  #include "xfs_bmap.h"
> > +#include "xfs_defer.h"
> > +#include "xfs_extfree_item.h"
> >  #include "scrub/scrub.h"
> >  #include "scrub/common.h"
> >  #include "scrub/trace.h"
> > @@ -412,7 +414,8 @@ int
> >  xrep_newbt_add_reservation(
> >  	struct xrep_newbt		*xnr,
> >  	xfs_fsblock_t			fsbno,
> > -	xfs_extlen_t			len)
> > +	xfs_extlen_t			len,
> > +	void				*priv)
> >  {
> >  	struct xrep_newbt_resv	*resv;
> >  
> > @@ -424,6 +427,7 @@ xrep_newbt_add_reservation(
> >  	resv->fsbno = fsbno;
> >  	resv->len = len;
> >  	resv->used = 0;
> > +	resv->priv = priv;
> >  	list_add_tail(&resv->list, &xnr->reservations);
> >  	return 0;
> >  }
> > @@ -434,6 +438,7 @@ xrep_newbt_reserve_space(
> >  	struct xrep_newbt	*xnr,
> >  	uint64_t		nr_blocks)
> >  {
> > +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
> 
> Heh. I feel like we should be able to do something a little cleaner
> here, but I'm not sure what off the top of my head. Maybe a helper to
> look up a generic "intent type" in the defer_op_types table and somewhat
> abstract away the dfops-specific nature of the current interfaces..?
> Maybe we should consider renaming xfs_defer_op_type to something more
> generic too (xfs_intent_type ?). (This could all be a separate patch
> btw.)

I dunno.  I also feel like this is borderline misuse of the defer ops
code since we're overriding the default behavior to walk an EFI through
the state machine manually... so perhaps it's ok to wait until we have a
second reasonable user to abstract some of this away?

> >  	struct xfs_scrub	*sc = xnr->sc;
> >  	xfs_alloctype_t		type;
> >  	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
> 
> Also variable names look misaligned with the above (here and in the
> destroy function below).

Yeah, I'll see if I can fix that.

> > @@ -442,6 +447,7 @@ xrep_newbt_reserve_space(
> >  	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
> >  
> >  	while (nr_blocks > 0 && !error) {
> > +		struct xfs_extent_free_item	efi_item;
> >  		struct xfs_alloc_arg	args = {
> >  			.tp		= sc->tp,
> >  			.mp		= sc->mp,
> > @@ -453,6 +459,7 @@ xrep_newbt_reserve_space(
> >  			.prod		= nr_blocks,
> >  			.resv		= xnr->resv,
> >  		};
> > +		void			*efi;
> >  
> >  		error = xfs_alloc_vextent(&args);
> >  		if (error)
> > @@ -465,7 +472,20 @@ xrep_newbt_reserve_space(
> >  				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
> >  				args.len, xnr->oinfo.oi_owner);
> >  
> > -		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
> > +		/*
> > +		 * Log a deferred free item for each extent we allocate so that
> > +		 * we can get all of the space back if we crash before we can
> > +		 * commit the new btree.
> > +		 */
> > +		efi_item.xefi_startblock = args.fsbno;
> > +		efi_item.xefi_blockcount = args.len;
> > +		efi_item.xefi_oinfo = xnr->oinfo;
> > +		efi_item.xefi_skip_discard = true;
> > +		efi = efi_type->create_intent(sc->tp, 1);
> > +		efi_type->log_item(sc->tp, efi, &efi_item.xefi_list);
> > +
> > +		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len,
> > +				efi);
> >  		if (error)
> >  			break;
> >  
> > @@ -487,6 +507,7 @@ xrep_newbt_destroy(
> >  	struct xrep_newbt	*xnr,
> >  	int			error)
> >  {
> > +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
> >  	struct xfs_scrub	*sc = xnr->sc;
> >  	struct xrep_newbt_resv	*resv, *n;
> >  
> > @@ -494,6 +515,17 @@ xrep_newbt_destroy(
> >  		goto junkit;
> >  
> >  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > +		struct xfs_efd_log_item *efd;
> > +
> > +		/*
> > +		 * Log a deferred free done for each extent we allocated now
> > +		 * that we've linked the block into the filesystem.  We cheat
> > +		 * since we know that log recovery has never looked at the
> > +		 * extents attached to an EFD.
> > +		 */
> > +		efd = efi_type->create_done(sc->tp, resv->priv, 0);
> > +		set_bit(XFS_LI_DIRTY, &efd->efd_item.li_flags);
> > +
> 
> So here we've presumably succeeded so we drop the intent and actually
> free any blocks that we didn't happen to use.

Correct.

> >  		/* Free every block we didn't use. */
> >  		resv->fsbno += resv->used;
> >  		resv->len -= resv->used;
> > @@ -515,6 +547,7 @@ xrep_newbt_destroy(
> >  
> >  junkit:
> >  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > +		efi_type->abort_intent(resv->priv);
> 
> And in this case we've failed so we abort the intent.. but what actually
> happens to the blocks we might have allocated? Do we need to free those
> somewhere or is that also handled by the above path somehow?

Hmm, my assumption here was that the error code would be passed all the
way back to xchk_teardown, which in cancelling the dirty transaction
would shut down the filesystem, and log recovery would take care of it
for us.

However, I suppose we could at least try to "recover" the intent by
freeing the extents and logging an EFD to clear the EFI; and only fall
back to aborting if that also fails, since then we'll know everything's
dead in the water.

--D

> Brian
> 
> >  		list_del(&resv->list);
> >  		kmem_free(resv);
> >  	}
> > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > index ab6c1199ecc0..cb86281de28b 100644
> > --- a/fs/xfs/scrub/repair.h
> > +++ b/fs/xfs/scrub/repair.h
> > @@ -67,6 +67,8 @@ struct xrep_newbt_resv {
> >  	/* Link to list of extents that we've reserved. */
> >  	struct list_head	list;
> >  
> > +	void			*priv;
> > +
> >  	/* FSB of the block we reserved. */
> >  	xfs_fsblock_t		fsbno;
> >  
> > @@ -112,7 +114,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
> >  void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
> >  		int whichfork, const struct xfs_owner_info *oinfo);
> >  int xrep_newbt_add_reservation(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
> > -		xfs_extlen_t len);
> > +		xfs_extlen_t len, void *priv);
> >  int xrep_newbt_reserve_space(struct xrep_newbt *xba, uint64_t nr_blocks);
> >  void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
> >  int xrep_newbt_alloc_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
> > diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
> > index e44efc41a041..1e49936afbfb 100644
> > --- a/fs/xfs/xfs_extfree_item.c
> > +++ b/fs/xfs/xfs_extfree_item.c
> > @@ -328,8 +328,6 @@ xfs_trans_get_efd(
> >  {
> >  	struct xfs_efd_log_item		*efdp;
> >  
> > -	ASSERT(nextents > 0);
> > -
> >  	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
> >  		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
> >  				(nextents - 1) * sizeof(struct xfs_extent),
> > 
> 


* Re: [PATCH 2/3] xfs: implement block reservation accounting for btrees we're staging
From: Darrick J. Wong @ 2019-10-25 16:35 UTC
  To: Brian Foster; +Cc: linux-xfs

On Fri, Oct 25, 2019 at 10:22:27AM -0400, Brian Foster wrote:
> On Wed, Oct 09, 2019 at 09:49:59AM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Create a new xrep_newbt structure to encapsulate a fake root for
> > creating a staged btree cursor as well as to track all the blocks that
> > we need to reserve in order to build that btree.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/scrub/repair.c |  260 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/scrub/repair.h |   61 +++++++++++
> >  fs/xfs/scrub/trace.h  |   58 +++++++++++
> >  3 files changed, 379 insertions(+)
> > 
> > 
> > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > index 588bc054db5c..beebd484c5f3 100644
> > --- a/fs/xfs/scrub/repair.c
> > +++ b/fs/xfs/scrub/repair.c
> > @@ -359,6 +359,266 @@ xrep_init_btblock(
> >  	return 0;
> >  }
> >  
> ...
> > +/* Initialize accounting resources for staging a new inode fork btree. */
> > +void
> > +xrep_newbt_init_inode(
> > +	struct xrep_newbt		*xnr,
> > +	struct xfs_scrub		*sc,
> > +	int				whichfork,
> > +	const struct xfs_owner_info	*oinfo)
> > +{
> > +	memset(xnr, 0, sizeof(struct xrep_newbt));
> > +	xnr->sc = sc;
> > +	xnr->oinfo = *oinfo; /* structure copy */
> > +	xnr->alloc_hint = XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino);
> > +	xnr->resv = XFS_AG_RESV_NONE;
> > +	xnr->ifake.if_fork = kmem_zone_zalloc(xfs_ifork_zone, 0);
> > +	xnr->ifake.if_fork_size = XFS_IFORK_SIZE(sc->ip, whichfork);
> > +	INIT_LIST_HEAD(&xnr->reservations);
> 
> Looks like this could reuse the above function for everything outside of
> the fake root bits.

Ok.

> > +}
> > +
> > +/*
> > + * Initialize accounting resources for staging a new btree.  Callers are
> > + * expected to add their own reservations (and clean them up) manually.
> > + */
> > +void
> > +xrep_newbt_init_bare(
> > +	struct xrep_newbt		*xnr,
> > +	struct xfs_scrub		*sc)
> > +{
> > +	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
> > +			XFS_AG_RESV_NONE);
> > +}
> > +
> > +/* Add a space reservation manually. */
> > +int
> > +xrep_newbt_add_reservation(
> > +	struct xrep_newbt		*xnr,
> > +	xfs_fsblock_t			fsbno,
> > +	xfs_extlen_t			len)
> > +{
> 
> FWIW the "reservation" terminology sounds a bit funny to me. Perhaps
> it's just because I've had log reservation on my mind :P, but something
> that "reserves blocks" as opposed to "adds reservation" might be a bit
> more clear from a naming perspective.

xrep_newbt_reserve_space() ?

I feel that's a little awkward since it's not actually reserving
anything; all it's doing is some accounting work for some space that the
caller already allocated.  But it's probably no worse than the current
name. :)

> > +	struct xrep_newbt_resv	*resv;
> > +
> > +	resv = kmem_alloc(sizeof(struct xrep_newbt_resv), KM_MAYFAIL);
> > +	if (!resv)
> > +		return -ENOMEM;
> > +
> > +	INIT_LIST_HEAD(&resv->list);
> > +	resv->fsbno = fsbno;
> > +	resv->len = len;
> > +	resv->used = 0;
> 
> Is ->used purely a count or does it also serve as a pointer to the next
> "unused" block?

It's a counter, as documented in the struct declaration.

> > +	list_add_tail(&resv->list, &xnr->reservations);
> > +	return 0;
> > +}
> > +
> > +/* Reserve disk space for our new btree. */
> > +int
> > +xrep_newbt_reserve_space(
> > +	struct xrep_newbt	*xnr,
> > +	uint64_t		nr_blocks)
> > +{
> > +	struct xfs_scrub	*sc = xnr->sc;
> > +	xfs_alloctype_t		type;
> > +	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
> > +	int			error = 0;
> > +
> > +	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
> > +
> 
> So I take it this distinguishes between reconstruction of a bmapbt
> where we can allocate across AGs vs an AG tree..? If so, a one liner
> comment wouldn't hurt here.

Ok.

> > +	while (nr_blocks > 0 && !error) {
> > +		struct xfs_alloc_arg	args = {
> > +			.tp		= sc->tp,
> > +			.mp		= sc->mp,
> > +			.type		= type,
> > +			.fsbno		= alloc_hint,
> > +			.oinfo		= xnr->oinfo,
> > +			.minlen		= 1,
> > +			.maxlen		= nr_blocks,
> > +			.prod		= nr_blocks,
> 
> Why .prod? Is this relevant if .mod isn't set?

Not sure why that's even in there. :/

> > +			.resv		= xnr->resv,
> > +		};
> > +
> > +		error = xfs_alloc_vextent(&args);
> > +		if (error)
> > +			return error;
> > +		if (args.fsbno == NULLFSBLOCK)
> > +			return -ENOSPC;
> > +
> > +		trace_xrep_newbt_reserve_space(sc->mp,
> > +				XFS_FSB_TO_AGNO(sc->mp, args.fsbno),
> > +				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
> > +				args.len, xnr->oinfo.oi_owner);
> > +
> > +		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
> > +		if (error)
> > +			break;
> > +
> > +		nr_blocks -= args.len;
> > +		alloc_hint = args.fsbno + args.len - 1;
> > +
> > +		if (sc->ip)
> > +			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
> > +		else
> > +			error = xrep_roll_ag_trans(sc);
> > +	}
> > +
> > +	return error;
> > +}
> > +
> > +/* Free all the accounting info and disk space we reserved for a new btree. */
> > +void
> > +xrep_newbt_destroy(
> > +	struct xrep_newbt	*xnr,
> > +	int			error)
> > +{
> > +	struct xfs_scrub	*sc = xnr->sc;
> > +	struct xrep_newbt_resv	*resv, *n;
> > +
> > +	if (error)
> > +		goto junkit;
> > +
> > +	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > +		/* Free every block we didn't use. */
> > +		resv->fsbno += resv->used;
> > +		resv->len -= resv->used;
> > +		resv->used = 0;
> 
> That answers my count/pointer question. :)

> > +
> > +		if (resv->len > 0) {
> > +			trace_xrep_newbt_unreserve_space(sc->mp,
> > +					XFS_FSB_TO_AGNO(sc->mp, resv->fsbno),
> > +					XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno),
> > +					resv->len, xnr->oinfo.oi_owner);
> > +
> > +			__xfs_bmap_add_free(sc->tp, resv->fsbno, resv->len,
> > +					&xnr->oinfo, true);
> > +		}
> > +
> > +		list_del(&resv->list);
> > +		kmem_free(resv);
> > +	}
> > +
> > +junkit:
> > +	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > +		list_del(&resv->list);
> > +		kmem_free(resv);
> > +	}
> 
> Seems like this could be folded into the above loop by just checking
> error and skipping the free logic appropriately.
> 
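
E.g., one pass that skips the free logic on error — a rough sketch,
minus the tracepoint:

	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
		if (!error && resv->len > resv->used) {
			/* Free every block we didn't use. */
			__xfs_bmap_add_free(sc->tp,
					resv->fsbno + resv->used,
					resv->len - resv->used,
					&xnr->oinfo, true);
		}
		list_del(&resv->list);
		kmem_free(resv);
	}
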
> > +
> > +	if (sc->ip) {
> > +		kmem_zone_free(xfs_ifork_zone, xnr->ifake.if_fork);
> > +		xnr->ifake.if_fork = NULL;
> > +	}
> > +}
> > +
> > +/* Feed one of the reserved btree blocks to the bulk loader. */
> > +int
> > +xrep_newbt_alloc_block(
> > +	struct xfs_btree_cur	*cur,
> > +	struct xrep_newbt	*xnr,
> > +	union xfs_btree_ptr	*ptr)
> > +{
> > +	struct xrep_newbt_resv	*resv;
> > +	xfs_fsblock_t		fsb;
> > +
> > +	/*
> > +	 * If last_resv doesn't have a block for us, move forward until we find
> > +	 * one that does (or run out of reservations).
> > +	 */
> > +	if (xnr->last_resv == NULL) {
> > +		list_for_each_entry(resv, &xnr->reservations, list) {
> > +			if (resv->used < resv->len) {
> > +				xnr->last_resv = resv;
> > +				break;
> > +			}
> > +		}
> 
> Not a big deal right now, but it might be worth eventually considering
> something more efficient. For example, perhaps we could rotate depleted
> entries to the end of the list and if we rotate and find nothing in the
> next entry at the head, we know we've run out of space.

Hm, yeah, this part would be much simpler if all we had to do was latch
on to the head element and rotate them to the tail when we're done.
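
Something like this, maybe (completely untested):

	/*
	 * Blocks are only ever consumed from the head reservation, so
	 * depleted entries can be rotated to the tail.  If the head is
	 * still depleted after one rotation, we've run out of space.
	 */
	if (list_empty(&xnr->reservations))
		return -ENOSPC;
	resv = list_first_entry(&xnr->reservations,
			struct xrep_newbt_resv, list);
	if (resv->used == resv->len) {
		list_move_tail(&resv->list, &xnr->reservations);
		resv = list_first_entry(&xnr->reservations,
				struct xrep_newbt_resv, list);
		if (resv->used == resv->len)
			return -ENOSPC;
	}

	/* Nab the block. */
	fsb = resv->fsbno + resv->used;
	resv->used++;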

> 
> > +		if (xnr->last_resv == NULL)
> > +			return -ENOSPC;
> > +	} else if (xnr->last_resv->used == xnr->last_resv->len) {
> > +		if (xnr->last_resv->list.next == &xnr->reservations)
> > +			return -ENOSPC;
> > +		xnr->last_resv = list_entry(xnr->last_resv->list.next,
> > +				struct xrep_newbt_resv, list);
> > +	}
> > +
> > +	/* Nab the block. */
> > +	fsb = xnr->last_resv->fsbno + xnr->last_resv->used;
> > +	xnr->last_resv->used++;
> > +
> > +	trace_xrep_newbt_alloc_block(cur->bc_mp,
> > +			XFS_FSB_TO_AGNO(cur->bc_mp, fsb),
> > +			XFS_FSB_TO_AGBNO(cur->bc_mp, fsb),
> > +			xnr->oinfo.oi_owner);
> > +
> > +	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
> > +		ptr->l = cpu_to_be64(fsb);
> > +	else
> > +		ptr->s = cpu_to_be32(XFS_FSB_TO_AGBNO(cur->bc_mp, fsb));
> > +	return 0;
> > +}
> > +
> ...
> > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > index 479cfe38065e..ab6c1199ecc0 100644
> > --- a/fs/xfs/scrub/repair.h
> > +++ b/fs/xfs/scrub/repair.h
> ...
> > @@ -59,6 +63,63 @@ int xrep_agf(struct xfs_scrub *sc);
> >  int xrep_agfl(struct xfs_scrub *sc);
> >  int xrep_agi(struct xfs_scrub *sc);
> >  
> ...
> > +
> > +#define for_each_xrep_newbt_reservation(xnr, resv, n)	\
> > +	list_for_each_entry_safe((resv), (n), &(xnr)->reservations, list)
> 
> This is unused (and seems unnecessary for a simple list).

It's used by the free space rebuilder in the next patch; I suppose I
could move it down.  That said, I've been trying to keep the common code
out of that particular patch so that the repair patches can be merged in
any order.

> ...
> > diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
> > index 3362bae28b46..deb177abf652 100644
> > --- a/fs/xfs/scrub/trace.h
> > +++ b/fs/xfs/scrub/trace.h
> > @@ -904,6 +904,64 @@ TRACE_EVENT(xrep_ialloc_insert,
> >  		  __entry->freemask)
> >  )
> >  
> ...
> > +#define DEFINE_NEWBT_EXTENT_EVENT(name) \
> > +DEFINE_EVENT(xrep_newbt_extent_class, name, \
> > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
> > +		 xfs_agblock_t agbno, xfs_extlen_t len, \
> > +		 int64_t owner), \
> > +	TP_ARGS(mp, agno, agbno, len, owner))
> > +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_reserve_space);
> > +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_unreserve_space);
> > +
> > +TRACE_EVENT(xrep_newbt_alloc_block,
> > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> > +		 xfs_agblock_t agbno, int64_t owner),
> 
> This could be folded into the above class if we just passed 1 for the
> length, eh?

Er, yes.  Fixed.
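
The event just becomes another instance of the extent class:

DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_block);

...with the call site passing a length of 1:

	trace_xrep_newbt_alloc_block(cur->bc_mp,
			XFS_FSB_TO_AGNO(cur->bc_mp, fsb),
			XFS_FSB_TO_AGBNO(cur->bc_mp, fsb),
			1, xnr->oinfo.oi_owner);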

--D

> Brian
> 
> > +	TP_ARGS(mp, agno, agbno, owner),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(xfs_agnumber_t, agno)
> > +		__field(xfs_agblock_t, agbno)
> > +		__field(int64_t, owner)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = mp->m_super->s_dev;
> > +		__entry->agno = agno;
> > +		__entry->agbno = agbno;
> > +		__entry->owner = owner;
> > +	),
> > +	TP_printk("dev %d:%d agno %u agbno %u owner %lld",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > +		  __entry->agno,
> > +		  __entry->agbno,
> > +		  __entry->owner)
> > +);
> > +
> >  #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
> >  
> >  #endif /* _TRACE_XFS_SCRUB_TRACE_H */
> > 
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] xfs: add debug knobs to control btree bulk load slack factors
  2019-10-25 14:19   ` Brian Foster
@ 2019-10-25 16:39     ` Darrick J. Wong
  0 siblings, 0 replies; 14+ messages in thread
From: Darrick J. Wong @ 2019-10-25 16:39 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Fri, Oct 25, 2019 at 10:19:47AM -0400, Brian Foster wrote:
> On Wed, Oct 09, 2019 at 09:49:53AM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Add some debug knobs so that we can control the leaf and node block
> > slack when rebuilding btrees.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_globals.c |    6 ++++++
> >  fs/xfs/xfs_sysctl.h  |    2 ++
> >  fs/xfs/xfs_sysfs.c   |   54 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 62 insertions(+)
> > 
> > 
> > diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
> > index fa55ab8b8d80..8f67027c144b 100644
> > --- a/fs/xfs/xfs_globals.c
> > +++ b/fs/xfs/xfs_globals.c
> > @@ -43,4 +43,10 @@ struct xfs_globals xfs_globals = {
> >  #ifdef DEBUG
> >  	.pwork_threads		=	-1,	/* automatic thread detection */
> >  #endif
> > +
> > +	/* Bulk load new btree leaf blocks to 75% full. */
> > +	.bload_leaf_slack	=	-1,
> > +
> > +	/* Bulk load new btree node blocks to 75% full. */
> > +	.bload_node_slack	=	-1,
> 
> What are the units for these fields?

Records (or key/ptr pairs).
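
(So, for example, writing 0 to /sys/fs/xfs/debug/bload_leaf_slack packs
new leaf blocks completely full, and -1 restores the default fill
factor.)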

> Seems reasonable outside of that, though I'd probably reorder it to
> after the related code such that this patch also includes whatever
> bits that actually use the fields.

It's the next patch.  Will reorder.

Thanks for reviewing!  I totally replied to these in reverse redro! :)

--D

> Brian
> 
> >  };
> > diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
> > index 8abf4640f1d5..aecccceee4ca 100644
> > --- a/fs/xfs/xfs_sysctl.h
> > +++ b/fs/xfs/xfs_sysctl.h
> > @@ -85,6 +85,8 @@ struct xfs_globals {
> >  #ifdef DEBUG
> >  	int	pwork_threads;		/* parallel workqueue threads */
> >  #endif
> > +	int	bload_leaf_slack;	/* btree bulk load leaf slack */
> > +	int	bload_node_slack;	/* btree bulk load node slack */
> >  	int	log_recovery_delay;	/* log recovery delay (secs) */
> >  	int	mount_delay;		/* mount setup delay (secs) */
> >  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> > diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> > index f1bc88f4367c..673ad21a9585 100644
> > --- a/fs/xfs/xfs_sysfs.c
> > +++ b/fs/xfs/xfs_sysfs.c
> > @@ -228,6 +228,58 @@ pwork_threads_show(
> >  XFS_SYSFS_ATTR_RW(pwork_threads);
> >  #endif /* DEBUG */
> >  
> > +STATIC ssize_t
> > +bload_leaf_slack_store(
> > +	struct kobject	*kobject,
> > +	const char	*buf,
> > +	size_t		count)
> > +{
> > +	int		ret;
> > +	int		val;
> > +
> > +	ret = kstrtoint(buf, 0, &val);
> > +	if (ret)
> > +		return ret;
> > +
> > +	xfs_globals.bload_leaf_slack = val;
> > +	return count;
> > +}
> > +
> > +STATIC ssize_t
> > +bload_leaf_slack_show(
> > +	struct kobject	*kobject,
> > +	char		*buf)
> > +{
> > +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack);
> > +}
> > +XFS_SYSFS_ATTR_RW(bload_leaf_slack);
> > +
> > +STATIC ssize_t
> > +bload_node_slack_store(
> > +	struct kobject	*kobject,
> > +	const char	*buf,
> > +	size_t		count)
> > +{
> > +	int		ret;
> > +	int		val;
> > +
> > +	ret = kstrtoint(buf, 0, &val);
> > +	if (ret)
> > +		return ret;
> > +
> > +	xfs_globals.bload_node_slack = val;
> > +	return count;
> > +}
> > +
> > +STATIC ssize_t
> > +bload_node_slack_show(
> > +	struct kobject	*kobject,
> > +	char		*buf)
> > +{
> > +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack);
> > +}
> > +XFS_SYSFS_ATTR_RW(bload_node_slack);
> > +
> >  static struct attribute *xfs_dbg_attrs[] = {
> >  	ATTR_LIST(bug_on_assert),
> >  	ATTR_LIST(log_recovery_delay),
> > @@ -236,6 +288,8 @@ static struct attribute *xfs_dbg_attrs[] = {
> >  #ifdef DEBUG
> >  	ATTR_LIST(pwork_threads),
> >  #endif
> > +	ATTR_LIST(bload_leaf_slack),
> > +	ATTR_LIST(bload_node_slack),
> >  	NULL,
> >  };
> >  
> > 
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/3] xfs: implement block reservation accounting for btrees we're staging
  2019-10-25 16:35     ` Darrick J. Wong
@ 2019-10-25 17:35       ` Brian Foster
  2019-10-25 20:52         ` Darrick J. Wong
  0 siblings, 1 reply; 14+ messages in thread
From: Brian Foster @ 2019-10-25 17:35 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Fri, Oct 25, 2019 at 09:35:17AM -0700, Darrick J. Wong wrote:
> On Fri, Oct 25, 2019 at 10:22:27AM -0400, Brian Foster wrote:
> > On Wed, Oct 09, 2019 at 09:49:59AM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Create a new xrep_newbt structure to encapsulate a fake root for
> > > creating a staged btree cursor as well as to track all the blocks that
> > > we need to reserve in order to build that btree.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/xfs/scrub/repair.c |  260 +++++++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/scrub/repair.h |   61 +++++++++++
> > >  fs/xfs/scrub/trace.h  |   58 +++++++++++
> > >  3 files changed, 379 insertions(+)
> > > 
> > > 
> > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > index 588bc054db5c..beebd484c5f3 100644
> > > --- a/fs/xfs/scrub/repair.c
> > > +++ b/fs/xfs/scrub/repair.c
> > > @@ -359,6 +359,266 @@ xrep_init_btblock(
> > >  	return 0;
> > >  }
> > >  
> > ...
> > > +/* Initialize accounting resources for staging a new inode fork btree. */
> > > +void
> > > +xrep_newbt_init_inode(
> > > +	struct xrep_newbt		*xnr,
> > > +	struct xfs_scrub		*sc,
> > > +	int				whichfork,
> > > +	const struct xfs_owner_info	*oinfo)
> > > +{
> > > +	memset(xnr, 0, sizeof(struct xrep_newbt));
> > > +	xnr->sc = sc;
> > > +	xnr->oinfo = *oinfo; /* structure copy */
> > > +	xnr->alloc_hint = XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino);
> > > +	xnr->resv = XFS_AG_RESV_NONE;
> > > +	xnr->ifake.if_fork = kmem_zone_zalloc(xfs_ifork_zone, 0);
> > > +	xnr->ifake.if_fork_size = XFS_IFORK_SIZE(sc->ip, whichfork);
> > > +	INIT_LIST_HEAD(&xnr->reservations);
> > 
> > Looks like this could reuse the above function for everything outside of
> > the fake root bits.
> 
> Ok.
> 
> > > +}
> > > +
> > > +/*
> > > + * Initialize accounting resources for staging a new btree.  Callers are
> > > + * expected to add their own reservations (and clean them up) manually.
> > > + */
> > > +void
> > > +xrep_newbt_init_bare(
> > > +	struct xrep_newbt		*xnr,
> > > +	struct xfs_scrub		*sc)
> > > +{
> > > +	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
> > > +			XFS_AG_RESV_NONE);
> > > +}
> > > +
> > > +/* Add a space reservation manually. */
> > > +int
> > > +xrep_newbt_add_reservation(
> > > +	struct xrep_newbt		*xnr,
> > > +	xfs_fsblock_t			fsbno,
> > > +	xfs_extlen_t			len)
> > > +{
> > 
> > FWIW the "reservation" terminology sounds a bit funny to me. Perhaps
> > it's just because I've had log reservation on my mind :P, but something
> > that "reserves blocks" as opposed to "adds reservation" might be a bit
> > more clear from a naming perspective.
> 
> xrep_newbt_reserve_space() ?
> 
> I feel that's a little awkward since it's not actually reserving
> anything; all it's doing is some accounting work for some space that the
> caller already allocated.  But it's probably no worse than the current
> name. :)
> 

Maybe _add_blocks() and _alloc_blocks() for these two and something like
_[get|use]_block() in the later helper that populates the btree..? That
seems more descriptive to me than "reservation" and "space," but that's
just my .02.
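
I.e., roughly:

	int xrep_newbt_add_blocks(struct xrep_newbt *xnr, xfs_fsblock_t fsbno,
			xfs_extlen_t len);
	int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
	int xrep_newbt_get_block(struct xfs_btree_cur *cur,
			struct xrep_newbt *xnr, union xfs_btree_ptr *ptr);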

Brian

> > > +	struct xrep_newbt_resv	*resv;
> > > +
> > > +	resv = kmem_alloc(sizeof(struct xrep_newbt_resv), KM_MAYFAIL);
> > > +	if (!resv)
> > > +		return -ENOMEM;
> > > +
> > > +	INIT_LIST_HEAD(&resv->list);
> > > +	resv->fsbno = fsbno;
> > > +	resv->len = len;
> > > +	resv->used = 0;
> > 
> > Is ->used purely a count or does it also serve as a pointer to the next
> > "unused" block?
> 
> It's a counter, as documented in the struct declaration.
> 
> > > +	list_add_tail(&resv->list, &xnr->reservations);
> > > +	return 0;
> > > +}
> > > +
> > > +/* Reserve disk space for our new btree. */
> > > +int
> > > +xrep_newbt_reserve_space(
> > > +	struct xrep_newbt	*xnr,
> > > +	uint64_t		nr_blocks)
> > > +{
> > > +	struct xfs_scrub	*sc = xnr->sc;
> > > +	xfs_alloctype_t		type;
> > > +	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
> > > +	int			error = 0;
> > > +
> > > +	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
> > > +
> > 
> > So I take it this distinguishes between reconstruction of a bmapbt
> > where we can allocate across AGs vs an AG tree..? If so, a one liner
> > comment wouldn't hurt here.
> 
> Ok.
> 
> > > +	while (nr_blocks > 0 && !error) {
> > > +		struct xfs_alloc_arg	args = {
> > > +			.tp		= sc->tp,
> > > +			.mp		= sc->mp,
> > > +			.type		= type,
> > > +			.fsbno		= alloc_hint,
> > > +			.oinfo		= xnr->oinfo,
> > > +			.minlen		= 1,
> > > +			.maxlen		= nr_blocks,
> > > +			.prod		= nr_blocks,
> > 
> > Why .prod? Is this relevant if .mod isn't set?
> 
> Not sure why that's even in there. :/
> 
> > > +			.resv		= xnr->resv,
> > > +		};
> > > +
> > > +		error = xfs_alloc_vextent(&args);
> > > +		if (error)
> > > +			return error;
> > > +		if (args.fsbno == NULLFSBLOCK)
> > > +			return -ENOSPC;
> > > +
> > > +		trace_xrep_newbt_reserve_space(sc->mp,
> > > +				XFS_FSB_TO_AGNO(sc->mp, args.fsbno),
> > > +				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
> > > +				args.len, xnr->oinfo.oi_owner);
> > > +
> > > +		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
> > > +		if (error)
> > > +			break;
> > > +
> > > +		nr_blocks -= args.len;
> > > +		alloc_hint = args.fsbno + args.len - 1;
> > > +
> > > +		if (sc->ip)
> > > +			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
> > > +		else
> > > +			error = xrep_roll_ag_trans(sc);
> > > +	}
> > > +
> > > +	return error;
> > > +}
> > > +
> > > +/* Free all the accounting info and disk space we reserved for a new btree. */
> > > +void
> > > +xrep_newbt_destroy(
> > > +	struct xrep_newbt	*xnr,
> > > +	int			error)
> > > +{
> > > +	struct xfs_scrub	*sc = xnr->sc;
> > > +	struct xrep_newbt_resv	*resv, *n;
> > > +
> > > +	if (error)
> > > +		goto junkit;
> > > +
> > > +	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > > +		/* Free every block we didn't use. */
> > > +		resv->fsbno += resv->used;
> > > +		resv->len -= resv->used;
> > > +		resv->used = 0;
> > 
> > That answers my count/pointer question. :)
> 
> > > +
> > > +		if (resv->len > 0) {
> > > +			trace_xrep_newbt_unreserve_space(sc->mp,
> > > +					XFS_FSB_TO_AGNO(sc->mp, resv->fsbno),
> > > +					XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno),
> > > +					resv->len, xnr->oinfo.oi_owner);
> > > +
> > > +			__xfs_bmap_add_free(sc->tp, resv->fsbno, resv->len,
> > > +					&xnr->oinfo, true);
> > > +		}
> > > +
> > > +		list_del(&resv->list);
> > > +		kmem_free(resv);
> > > +	}
> > > +
> > > +junkit:
> > > +	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > > +		list_del(&resv->list);
> > > +		kmem_free(resv);
> > > +	}
> > 
> > Seems like this could be folded into the above loop by just checking
> > error and skipping the free logic appropriately.
> > 
> > > +
> > > +	if (sc->ip) {
> > > +		kmem_zone_free(xfs_ifork_zone, xnr->ifake.if_fork);
> > > +		xnr->ifake.if_fork = NULL;
> > > +	}
> > > +}
> > > +
> > > +/* Feed one of the reserved btree blocks to the bulk loader. */
> > > +int
> > > +xrep_newbt_alloc_block(
> > > +	struct xfs_btree_cur	*cur,
> > > +	struct xrep_newbt	*xnr,
> > > +	union xfs_btree_ptr	*ptr)
> > > +{
> > > +	struct xrep_newbt_resv	*resv;
> > > +	xfs_fsblock_t		fsb;
> > > +
> > > +	/*
> > > +	 * If last_resv doesn't have a block for us, move forward until we find
> > > +	 * one that does (or run out of reservations).
> > > +	 */
> > > +	if (xnr->last_resv == NULL) {
> > > +		list_for_each_entry(resv, &xnr->reservations, list) {
> > > +			if (resv->used < resv->len) {
> > > +				xnr->last_resv = resv;
> > > +				break;
> > > +			}
> > > +		}
> > 
> > Not a big deal right now, but it might be worth eventually considering
> > something more efficient. For example, perhaps we could rotate depleted
> > entries to the end of the list and if we rotate and find nothing in the
> > next entry at the head, we know we've run out of space.
> 
> Hm, yeah, this part would be much simpler if all we had to do was latch
> on to the head element and rotate them to the tail when we're done.
> 
> > 
> > > +		if (xnr->last_resv == NULL)
> > > +			return -ENOSPC;
> > > +	} else if (xnr->last_resv->used == xnr->last_resv->len) {
> > > +		if (xnr->last_resv->list.next == &xnr->reservations)
> > > +			return -ENOSPC;
> > > +		xnr->last_resv = list_entry(xnr->last_resv->list.next,
> > > +				struct xrep_newbt_resv, list);
> > > +	}
> > > +
> > > +	/* Nab the block. */
> > > +	fsb = xnr->last_resv->fsbno + xnr->last_resv->used;
> > > +	xnr->last_resv->used++;
> > > +
> > > +	trace_xrep_newbt_alloc_block(cur->bc_mp,
> > > +			XFS_FSB_TO_AGNO(cur->bc_mp, fsb),
> > > +			XFS_FSB_TO_AGBNO(cur->bc_mp, fsb),
> > > +			xnr->oinfo.oi_owner);
> > > +
> > > +	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
> > > +		ptr->l = cpu_to_be64(fsb);
> > > +	else
> > > +		ptr->s = cpu_to_be32(XFS_FSB_TO_AGBNO(cur->bc_mp, fsb));
> > > +	return 0;
> > > +}
> > > +
> > ...
> > > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > > index 479cfe38065e..ab6c1199ecc0 100644
> > > --- a/fs/xfs/scrub/repair.h
> > > +++ b/fs/xfs/scrub/repair.h
> > ...
> > > @@ -59,6 +63,63 @@ int xrep_agf(struct xfs_scrub *sc);
> > >  int xrep_agfl(struct xfs_scrub *sc);
> > >  int xrep_agi(struct xfs_scrub *sc);
> > >  
> > ...
> > > +
> > > +#define for_each_xrep_newbt_reservation(xnr, resv, n)	\
> > > +	list_for_each_entry_safe((resv), (n), &(xnr)->reservations, list)
> > 
> > This is unused (and seems unnecessary for a simple list).
> 
> It's used by the free space rebuilder in the next patch; I suppose I
> could move it down.  That said, I've been trying to keep the common code
> out of that particular patch so that the repair patches can be merged in
> any order.
> 
> > ...
> > > diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
> > > index 3362bae28b46..deb177abf652 100644
> > > --- a/fs/xfs/scrub/trace.h
> > > +++ b/fs/xfs/scrub/trace.h
> > > @@ -904,6 +904,64 @@ TRACE_EVENT(xrep_ialloc_insert,
> > >  		  __entry->freemask)
> > >  )
> > >  
> > ...
> > > +#define DEFINE_NEWBT_EXTENT_EVENT(name) \
> > > +DEFINE_EVENT(xrep_newbt_extent_class, name, \
> > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
> > > +		 xfs_agblock_t agbno, xfs_extlen_t len, \
> > > +		 int64_t owner), \
> > > +	TP_ARGS(mp, agno, agbno, len, owner))
> > > +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_reserve_space);
> > > +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_unreserve_space);
> > > +
> > > +TRACE_EVENT(xrep_newbt_alloc_block,
> > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> > > +		 xfs_agblock_t agbno, int64_t owner),
> > 
> > This could be folded into the above class if we just passed 1 for the
> > length, eh?
> 
> Er, yes.  Fixed.
> 
> --D
> 
> > Brian
> > 
> > > +	TP_ARGS(mp, agno, agbno, owner),
> > > +	TP_STRUCT__entry(
> > > +		__field(dev_t, dev)
> > > +		__field(xfs_agnumber_t, agno)
> > > +		__field(xfs_agblock_t, agbno)
> > > +		__field(int64_t, owner)
> > > +	),
> > > +	TP_fast_assign(
> > > +		__entry->dev = mp->m_super->s_dev;
> > > +		__entry->agno = agno;
> > > +		__entry->agbno = agbno;
> > > +		__entry->owner = owner;
> > > +	),
> > > +	TP_printk("dev %d:%d agno %u agbno %u owner %lld",
> > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > +		  __entry->agno,
> > > +		  __entry->agbno,
> > > +		  __entry->owner)
> > > +);
> > > +
> > >  #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
> > >  
> > >  #endif /* _TRACE_XFS_SCRUB_TRACE_H */
> > > 
> > 


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree
  2019-10-25 16:22     ` Darrick J. Wong
@ 2019-10-25 17:36       ` Brian Foster
  0 siblings, 0 replies; 14+ messages in thread
From: Brian Foster @ 2019-10-25 17:36 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Fri, Oct 25, 2019 at 09:22:25AM -0700, Darrick J. Wong wrote:
> On Fri, Oct 25, 2019 at 10:24:01AM -0400, Brian Foster wrote:
> > On Wed, Oct 09, 2019 at 09:50:06AM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > We need to log EFIs for every extent that we allocate for the purpose of
> > > staging a new btree so that if we fail then the blocks will be freed
> > > during log recovery.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > 
> > Ok, so now I'm getting to the stuff we discussed earlier around pinning
> > the log tail and whatnot. I have no issue with getting this in as is in
> > the meantime, so long as we eventually account for those kinds of issues
> > before we consider this non-experimental.
> 
> <nod> I also wondered in the past if it would be smarter just to freeze
> the whole filesystem (like I currently do for rmapbt rebuilding) so that
> repair is the only thing that gets to touch the log, but I bet that will
> be unpopular. :)
> 
> > >  fs/xfs/scrub/repair.c     |   37 +++++++++++++++++++++++++++++++++++--
> > >  fs/xfs/scrub/repair.h     |    4 +++-
> > >  fs/xfs/xfs_extfree_item.c |    2 --
> > >  3 files changed, 38 insertions(+), 5 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > index beebd484c5f3..49cea124148b 100644
> > > --- a/fs/xfs/scrub/repair.c
> > > +++ b/fs/xfs/scrub/repair.c
> > > @@ -25,6 +25,8 @@
> > >  #include "xfs_ag_resv.h"
> > >  #include "xfs_quota.h"
> > >  #include "xfs_bmap.h"
> > > +#include "xfs_defer.h"
> > > +#include "xfs_extfree_item.h"
> > >  #include "scrub/scrub.h"
> > >  #include "scrub/common.h"
> > >  #include "scrub/trace.h"
> > > @@ -412,7 +414,8 @@ int
> > >  xrep_newbt_add_reservation(
> > >  	struct xrep_newbt		*xnr,
> > >  	xfs_fsblock_t			fsbno,
> > > -	xfs_extlen_t			len)
> > > +	xfs_extlen_t			len,
> > > +	void				*priv)
> > >  {
> > >  	struct xrep_newbt_resv	*resv;
> > >  
> > > @@ -424,6 +427,7 @@ xrep_newbt_add_reservation(
> > >  	resv->fsbno = fsbno;
> > >  	resv->len = len;
> > >  	resv->used = 0;
> > > +	resv->priv = priv;
> > >  	list_add_tail(&resv->list, &xnr->reservations);
> > >  	return 0;
> > >  }
> > > @@ -434,6 +438,7 @@ xrep_newbt_reserve_space(
> > >  	struct xrep_newbt	*xnr,
> > >  	uint64_t		nr_blocks)
> > >  {
> > > +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
> > 
> > Heh. I feel like we should be able to do something a little cleaner
> > here, but I'm not sure what off the top of my head. Maybe a helper to
> > look up a generic "intent type" in the defer_op_types table and somewhat
> > abstract away the dfops-specific nature of the current interfaces..?
> > Maybe we should consider renaming xfs_defer_op_type to something more
> > generic too (xfs_intent_type ?). (This could all be a separate patch
> > btw.)
> 
> I dunno.  I also feel like this is borderline misuse of the defer ops
> code since we're overriding the default behavior to walk an EFI through
> the state machine manually... so perhaps it's ok to wait until we have a
> second reasonable user to abstract some of this away?
> 

I think that's reasonable. I was also wondering whether some of the
dfops functionality might be worth refactoring for this, but I'm not
totally sure what that would look like.

> > >  	struct xfs_scrub	*sc = xnr->sc;
> > >  	xfs_alloctype_t		type;
> > >  	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
> > 
> > Also variable names look misaligned with the above (here and in the
> > destroy function below).
> 
> Yeah, I'll see if I can fix that.
> 
> > > @@ -442,6 +447,7 @@ xrep_newbt_reserve_space(
> > >  	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
> > >  
> > >  	while (nr_blocks > 0 && !error) {
> > > +		struct xfs_extent_free_item	efi_item;
> > >  		struct xfs_alloc_arg	args = {
> > >  			.tp		= sc->tp,
> > >  			.mp		= sc->mp,
> > > @@ -453,6 +459,7 @@ xrep_newbt_reserve_space(
> > >  			.prod		= nr_blocks,
> > >  			.resv		= xnr->resv,
> > >  		};
> > > +		void			*efi;
> > >  
> > >  		error = xfs_alloc_vextent(&args);
> > >  		if (error)
> > > @@ -465,7 +472,20 @@ xrep_newbt_reserve_space(
> > >  				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
> > >  				args.len, xnr->oinfo.oi_owner);
> > >  
> > > -		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
> > > +		/*
> > > +		 * Log a deferred free item for each extent we allocate so that
> > > +		 * we can get all of the space back if we crash before we can
> > > +		 * commit the new btree.
> > > +		 */
> > > +		efi_item.xefi_startblock = args.fsbno;
> > > +		efi_item.xefi_blockcount = args.len;
> > > +		efi_item.xefi_oinfo = xnr->oinfo;
> > > +		efi_item.xefi_skip_discard = true;
> > > +		efi = efi_type->create_intent(sc->tp, 1);
> > > +		efi_type->log_item(sc->tp, efi, &efi_item.xefi_list);
> > > +
> > > +		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len,
> > > +				efi);
> > >  		if (error)
> > >  			break;
> > >  
> > > @@ -487,6 +507,7 @@ xrep_newbt_destroy(
> > >  	struct xrep_newbt	*xnr,
> > >  	int			error)
> > >  {
> > > +	const struct xfs_defer_op_type *efi_type = &xfs_extent_free_defer_type;
> > >  	struct xfs_scrub	*sc = xnr->sc;
> > >  	struct xrep_newbt_resv	*resv, *n;
> > >  
> > > @@ -494,6 +515,17 @@ xrep_newbt_destroy(
> > >  		goto junkit;
> > >  
> > >  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > > +		struct xfs_efd_log_item *efd;
> > > +
> > > +		/*
> > > +		 * Log a deferred free done for each extent we allocated now
> > > +		 * that we've linked the block into the filesystem.  We cheat
> > > +		 * since we know that log recovery has never looked at the
> > > +		 * extents attached to an EFD.
> > > +		 */
> > > +		efd = efi_type->create_done(sc->tp, resv->priv, 0);
> > > +		set_bit(XFS_LI_DIRTY, &efd->efd_item.li_flags);
> > > +
> > 
> > So here we've presumably succeeded so we drop the intent and actually
> > free any blocks that we didn't happen to use.
> 
> Correct.
> 
> > >  		/* Free every block we didn't use. */
> > >  		resv->fsbno += resv->used;
> > >  		resv->len -= resv->used;
> > > @@ -515,6 +547,7 @@ xrep_newbt_destroy(
> > >  
> > >  junkit:
> > >  	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > > +		efi_type->abort_intent(resv->priv);
> > 
> > And in this case we've failed so we abort the intent.. but what actually
> > happens to the blocks we might have allocated? Do we need to free those
> > somewhere or is that also handled by the above path somehow?
> 
> Hmm, my assumption here was that the error code would be passed all the
> way back to xchk_teardown, which in cancelling the dirty transaction
> would shut down the filesystem, and log recovery would take care of it
> for us.
> 

Ok.. that seems perfectly reasonable to me in a general sense. I'd just
prefer to see a "shutdown cleans up the mess" comment somewhere so the
expected error handling behavior is clear in the code. From a scrub
perspective, it sounds like we are saying a failed repair attempt is
going to shut down and leave around a dirty log. I wonder how that would
play out in practice given that the filesystem is presumably corrupted,
so if we can't mount the fs after that point for whatever reason we've
basically added to the mess since repair will need to zero the log. We
could also crash at any point if the fs is corrupted, so I dunno..
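
Even a blurb along these lines in the destroy path would make the
expectation explicit (wording approximate):

	/*
	 * On error the caller cancels a dirty transaction, which shuts
	 * down the filesystem; log recovery then replays the EFIs we
	 * logged above and frees these blocks for us.
	 */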

> However, I suppose we could at least try to "recover" the intent to free
> the EFIs and clear the EFI; and only fall back to aborting if that also
> fails since then we'll know everything's dead in the water.
> 

That could be something for a follow on patch too depending on the
above..

Brian

> --D
> 
> > Brian
> > 
> > >  		list_del(&resv->list);
> > >  		kmem_free(resv);
> > >  	}
> > > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > > index ab6c1199ecc0..cb86281de28b 100644
> > > --- a/fs/xfs/scrub/repair.h
> > > +++ b/fs/xfs/scrub/repair.h
> > > @@ -67,6 +67,8 @@ struct xrep_newbt_resv {
> > >  	/* Link to list of extents that we've reserved. */
> > >  	struct list_head	list;
> > >  
> > > +	void			*priv;
> > > +
> > >  	/* FSB of the block we reserved. */
> > >  	xfs_fsblock_t		fsbno;
> > >  
> > > @@ -112,7 +114,7 @@ void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc,
> > >  void xrep_newbt_init_inode(struct xrep_newbt *xba, struct xfs_scrub *sc,
> > >  		int whichfork, const struct xfs_owner_info *oinfo);
> > >  int xrep_newbt_add_reservation(struct xrep_newbt *xba, xfs_fsblock_t fsbno,
> > > -		xfs_extlen_t len);
> > > +		xfs_extlen_t len, void *priv);
> > >  int xrep_newbt_reserve_space(struct xrep_newbt *xba, uint64_t nr_blocks);
> > >  void xrep_newbt_destroy(struct xrep_newbt *xba, int error);
> > >  int xrep_newbt_alloc_block(struct xfs_btree_cur *cur, struct xrep_newbt *xba,
> > > diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
> > > index e44efc41a041..1e49936afbfb 100644
> > > --- a/fs/xfs/xfs_extfree_item.c
> > > +++ b/fs/xfs/xfs_extfree_item.c
> > > @@ -328,8 +328,6 @@ xfs_trans_get_efd(
> > >  {
> > >  	struct xfs_efd_log_item		*efdp;
> > >  
> > > -	ASSERT(nextents > 0);
> > > -
> > >  	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
> > >  		efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
> > >  				(nextents - 1) * sizeof(struct xfs_extent),
> > > 
> > 


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/3] xfs: implement block reservation accounting for btrees we're staging
  2019-10-25 17:35       ` Brian Foster
@ 2019-10-25 20:52         ` Darrick J. Wong
  2020-02-27 15:51           ` Darrick J. Wong
  0 siblings, 1 reply; 14+ messages in thread
From: Darrick J. Wong @ 2019-10-25 20:52 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Fri, Oct 25, 2019 at 01:35:54PM -0400, Brian Foster wrote:
> On Fri, Oct 25, 2019 at 09:35:17AM -0700, Darrick J. Wong wrote:
> > On Fri, Oct 25, 2019 at 10:22:27AM -0400, Brian Foster wrote:
> > > On Wed, Oct 09, 2019 at 09:49:59AM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Create a new xrep_newbt structure to encapsulate a fake root for
> > > > creating a staged btree cursor as well as to track all the blocks that
> > > > we need to reserve in order to build that btree.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > >  fs/xfs/scrub/repair.c |  260 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/scrub/repair.h |   61 +++++++++++
> > > >  fs/xfs/scrub/trace.h  |   58 +++++++++++
> > > >  3 files changed, 379 insertions(+)
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > > index 588bc054db5c..beebd484c5f3 100644
> > > > --- a/fs/xfs/scrub/repair.c
> > > > +++ b/fs/xfs/scrub/repair.c
> > > > @@ -359,6 +359,266 @@ xrep_init_btblock(
> > > >  	return 0;
> > > >  }
> > > >  
> > > ...
> > > > +/* Initialize accounting resources for staging a new inode fork btree. */
> > > > +void
> > > > +xrep_newbt_init_inode(
> > > > +	struct xrep_newbt		*xnr,
> > > > +	struct xfs_scrub		*sc,
> > > > +	int				whichfork,
> > > > +	const struct xfs_owner_info	*oinfo)
> > > > +{
> > > > +	memset(xnr, 0, sizeof(struct xrep_newbt));
> > > > +	xnr->sc = sc;
> > > > +	xnr->oinfo = *oinfo; /* structure copy */
> > > > +	xnr->alloc_hint = XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino);
> > > > +	xnr->resv = XFS_AG_RESV_NONE;
> > > > +	xnr->ifake.if_fork = kmem_zone_zalloc(xfs_ifork_zone, 0);
> > > > +	xnr->ifake.if_fork_size = XFS_IFORK_SIZE(sc->ip, whichfork);
> > > > +	INIT_LIST_HEAD(&xnr->reservations);
> > > 
> > > Looks like this could reuse the above function for everything outside of
> > > the fake root bits.
> > 
> > Ok.
> > 
> > > > +}
> > > > +
> > > > +/*
> > > > + * Initialize accounting resources for staging a new btree.  Callers are
> > > > + * expected to add their own reservations (and clean them up) manually.
> > > > + */
> > > > +void
> > > > +xrep_newbt_init_bare(
> > > > +	struct xrep_newbt		*xnr,
> > > > +	struct xfs_scrub		*sc)
> > > > +{
> > > > +	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
> > > > +			XFS_AG_RESV_NONE);
> > > > +}
> > > > +
> > > > +/* Add a space reservation manually. */
> > > > +int
> > > > +xrep_newbt_add_reservation(
> > > > +	struct xrep_newbt		*xnr,
> > > > +	xfs_fsblock_t			fsbno,
> > > > +	xfs_extlen_t			len)
> > > > +{
> > > 
> > > FWIW the "reservation" terminology sounds a bit funny to me. Perhaps
> > > it's just because I've had log reservation on my mind :P, but something
> > > that "reserves blocks" as opposed to "adds reservation" might be a bit
> > > more clear from a naming perspective.
> > 
> > xrep_newbt_reserve_space() ?
> > 
> > I feel that's a little awkward since it's not actually reserving
> > anything; all it's doing is some accounting work for some space that the
> > caller already allocated.  But it's probably no worse than the current
> > name. :)
> > 
> 
> Maybe _add_blocks() and _alloc_blocks() for these two and something like
> _[get|use]_block() in the later helper that populates the btree..? That
> seems more descriptive to me than "reservation" and "space," but that's
> just my .02.

Yeah, that works.  Fixed.

> Brian
> 
> > > > +	struct xrep_newbt_resv	*resv;
> > > > +
> > > > +	resv = kmem_alloc(sizeof(struct xrep_newbt_resv), KM_MAYFAIL);
> > > > +	if (!resv)
> > > > +		return -ENOMEM;
> > > > +
> > > > +	INIT_LIST_HEAD(&resv->list);
> > > > +	resv->fsbno = fsbno;
> > > > +	resv->len = len;
> > > > +	resv->used = 0;
> > > 
> > > Is ->used purely a count or does it also serve as a pointer to the next
> > > "unused" block?
> > 
> > It's a counter, as documented in the struct declaration.
> > 
> > > > +	list_add_tail(&resv->list, &xnr->reservations);
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/* Reserve disk space for our new btree. */
> > > > +int
> > > > +xrep_newbt_reserve_space(
> > > > +	struct xrep_newbt	*xnr,
> > > > +	uint64_t		nr_blocks)
> > > > +{
> > > > +	struct xfs_scrub	*sc = xnr->sc;
> > > > +	xfs_alloctype_t		type;
> > > > +	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
> > > > +	int			error = 0;
> > > > +
> > > > +	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
> > > > +
> > > 
> > > So I take it this distinguishes between reconstruction of a bmapbt
> > > where we can allocate across AGs vs an AG tree..? If so, a one liner
> > > comment wouldn't hurt here.
> > 
> > Ok.
> > 
> > > > +	while (nr_blocks > 0 && !error) {
> > > > +		struct xfs_alloc_arg	args = {
> > > > +			.tp		= sc->tp,
> > > > +			.mp		= sc->mp,
> > > > +			.type		= type,
> > > > +			.fsbno		= alloc_hint,
> > > > +			.oinfo		= xnr->oinfo,
> > > > +			.minlen		= 1,
> > > > +			.maxlen		= nr_blocks,
> > > > +			.prod		= nr_blocks,
> > > 
> > > Why .prod? Is this relevant if .mod isn't set?
> > 
> > Not sure why that's even in there. :/
> > 
> > > > +			.resv		= xnr->resv,
> > > > +		};
> > > > +
> > > > +		error = xfs_alloc_vextent(&args);
> > > > +		if (error)
> > > > +			return error;
> > > > +		if (args.fsbno == NULLFSBLOCK)
> > > > +			return -ENOSPC;
> > > > +
> > > > +		trace_xrep_newbt_reserve_space(sc->mp,
> > > > +				XFS_FSB_TO_AGNO(sc->mp, args.fsbno),
> > > > +				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
> > > > +				args.len, xnr->oinfo.oi_owner);
> > > > +
> > > > +		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
> > > > +		if (error)
> > > > +			break;
> > > > +
> > > > +		nr_blocks -= args.len;
> > > > +		alloc_hint = args.fsbno + args.len - 1;
> > > > +
> > > > +		if (sc->ip)
> > > > +			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
> > > > +		else
> > > > +			error = xrep_roll_ag_trans(sc);
> > > > +	}
> > > > +
> > > > +	return error;
> > > > +}
> > > > +
> > > > +/* Free all the accounting info and disk space we reserved for a new btree. */
> > > > +void
> > > > +xrep_newbt_destroy(
> > > > +	struct xrep_newbt	*xnr,
> > > > +	int			error)
> > > > +{
> > > > +	struct xfs_scrub	*sc = xnr->sc;
> > > > +	struct xrep_newbt_resv	*resv, *n;
> > > > +
> > > > +	if (error)
> > > > +		goto junkit;
> > > > +
> > > > +	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > > > +		/* Free every block we didn't use. */
> > > > +		resv->fsbno += resv->used;
> > > > +		resv->len -= resv->used;
> > > > +		resv->used = 0;
> > > 
> > > That answers my count/pointer question. :)
> > 
> > > > +
> > > > +		if (resv->len > 0) {
> > > > +			trace_xrep_newbt_unreserve_space(sc->mp,
> > > > +					XFS_FSB_TO_AGNO(sc->mp, resv->fsbno),
> > > > +					XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno),
> > > > +					resv->len, xnr->oinfo.oi_owner);
> > > > +
> > > > +			__xfs_bmap_add_free(sc->tp, resv->fsbno, resv->len,
> > > > +					&xnr->oinfo, true);
> > > > +		}
> > > > +
> > > > +		list_del(&resv->list);
> > > > +		kmem_free(resv);
> > > > +	}
> > > > +
> > > > +junkit:
> > > > +	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > > > +		list_del(&resv->list);
> > > > +		kmem_free(resv);
> > > > +	}
> > > 
> > > Seems like this could be folded into the above loop by just checking
> > > error and skipping the free logic appropriately.
> > > 
> > > > +
> > > > +	if (sc->ip) {
> > > > +		kmem_zone_free(xfs_ifork_zone, xnr->ifake.if_fork);
> > > > +		xnr->ifake.if_fork = NULL;
> > > > +	}
> > > > +}
> > > > +
> > > > +/* Feed one of the reserved btree blocks to the bulk loader. */
> > > > +int
> > > > +xrep_newbt_alloc_block(
> > > > +	struct xfs_btree_cur	*cur,
> > > > +	struct xrep_newbt	*xnr,
> > > > +	union xfs_btree_ptr	*ptr)
> > > > +{
> > > > +	struct xrep_newbt_resv	*resv;
> > > > +	xfs_fsblock_t		fsb;
> > > > +
> > > > +	/*
> > > > +	 * If last_resv doesn't have a block for us, move forward until we find
> > > > +	 * one that does (or run out of reservations).
> > > > +	 */
> > > > +	if (xnr->last_resv == NULL) {
> > > > +		list_for_each_entry(resv, &xnr->reservations, list) {
> > > > +			if (resv->used < resv->len) {
> > > > +				xnr->last_resv = resv;
> > > > +				break;
> > > > +			}
> > > > +		}
> > > 
> > > Not a big deal right now, but it might be worth eventually considering
> > > something more efficient. For example, perhaps we could rotate depleted
> > > entries to the end of the list and if we rotate and find nothing in the
> > > next entry at the head, we know we've run out of space.
> > 
> > Hm, yeah, this part would be much simpler if all we had to do was latch
> > on to the head element and rotate them to the tail when we're done.
> > 
> > > 
> > > > +		if (xnr->last_resv == NULL)
> > > > +			return -ENOSPC;
> > > > +	} else if (xnr->last_resv->used == xnr->last_resv->len) {
> > > > +		if (xnr->last_resv->list.next == &xnr->reservations)
> > > > +			return -ENOSPC;
> > > > +		xnr->last_resv = list_entry(xnr->last_resv->list.next,
> > > > +				struct xrep_newbt_resv, list);
> > > > +	}
> > > > +
> > > > +	/* Nab the block. */
> > > > +	fsb = xnr->last_resv->fsbno + xnr->last_resv->used;
> > > > +	xnr->last_resv->used++;
> > > > +
> > > > +	trace_xrep_newbt_alloc_block(cur->bc_mp,
> > > > +			XFS_FSB_TO_AGNO(cur->bc_mp, fsb),
> > > > +			XFS_FSB_TO_AGBNO(cur->bc_mp, fsb),
> > > > +			xnr->oinfo.oi_owner);
> > > > +
> > > > +	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
> > > > +		ptr->l = cpu_to_be64(fsb);
> > > > +	else
> > > > +		ptr->s = cpu_to_be32(XFS_FSB_TO_AGBNO(cur->bc_mp, fsb));
> > > > +	return 0;
> > > > +}
> > > > +
> > > ...
> > > > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > > > index 479cfe38065e..ab6c1199ecc0 100644
> > > > --- a/fs/xfs/scrub/repair.h
> > > > +++ b/fs/xfs/scrub/repair.h
> > > ...
> > > > @@ -59,6 +63,63 @@ int xrep_agf(struct xfs_scrub *sc);
> > > >  int xrep_agfl(struct xfs_scrub *sc);
> > > >  int xrep_agi(struct xfs_scrub *sc);
> > > >  
> > > ...
> > > > +
> > > > +#define for_each_xrep_newbt_reservation(xnr, resv, n)	\
> > > > +	list_for_each_entry_safe((resv), (n), &(xnr)->reservations, list)
> > > 
> > > This is unused (and seems unnecessary for a simple list).
> > 
> > It's used by the free space rebuilder in the next patch; I suppose I
> > could move it down.  That said, I've been trying to keep the common code
> > out of that particular patch so that the repair patches can be merged in
> > any order.
> > 
> > > ...
> > > > diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
> > > > index 3362bae28b46..deb177abf652 100644
> > > > --- a/fs/xfs/scrub/trace.h
> > > > +++ b/fs/xfs/scrub/trace.h
> > > > @@ -904,6 +904,64 @@ TRACE_EVENT(xrep_ialloc_insert,
> > > >  		  __entry->freemask)
> > > >  )
> > > >  
> > > ...
> > > > +#define DEFINE_NEWBT_EXTENT_EVENT(name) \
> > > > +DEFINE_EVENT(xrep_newbt_extent_class, name, \
> > > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
> > > > +		 xfs_agblock_t agbno, xfs_extlen_t len, \
> > > > +		 int64_t owner), \
> > > > +	TP_ARGS(mp, agno, agbno, len, owner))
> > > > +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_reserve_space);
> > > > +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_unreserve_space);
> > > > +
> > > > +TRACE_EVENT(xrep_newbt_alloc_block,
> > > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> > > > +		 xfs_agblock_t agbno, int64_t owner),
> > > 
> > > This could be folded into the above class if we just passed 1 for the
> > > length, eh?
> > 
> > Er, yes.  Fixed.
> > 
> > --D
> > 
> > > Brian
> > > 
> > > > +	TP_ARGS(mp, agno, agbno, owner),
> > > > +	TP_STRUCT__entry(
> > > > +		__field(dev_t, dev)
> > > > +		__field(xfs_agnumber_t, agno)
> > > > +		__field(xfs_agblock_t, agbno)
> > > > +		__field(int64_t, owner)
> > > > +	),
> > > > +	TP_fast_assign(
> > > > +		__entry->dev = mp->m_super->s_dev;
> > > > +		__entry->agno = agno;
> > > > +		__entry->agbno = agbno;
> > > > +		__entry->owner = owner;
> > > > +	),
> > > > +	TP_printk("dev %d:%d agno %u agbno %u owner %lld",
> > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > > +		  __entry->agno,
> > > > +		  __entry->agbno,
> > > > +		  __entry->owner)
> > > > +);
> > > > +
> > > >  #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
> > > >  
> > > >  #endif /* _TRACE_XFS_SCRUB_TRACE_H */
> > > > 
> > > 
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/3] xfs: implement block reservation accounting for btrees we're staging
  2019-10-25 20:52         ` Darrick J. Wong
@ 2020-02-27 15:51           ` Darrick J. Wong
  0 siblings, 0 replies; 14+ messages in thread
From: Darrick J. Wong @ 2020-02-27 15:51 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Fri, Oct 25, 2019 at 01:52:41PM -0700, Darrick J. Wong wrote:
> On Fri, Oct 25, 2019 at 01:35:54PM -0400, Brian Foster wrote:
> > On Fri, Oct 25, 2019 at 09:35:17AM -0700, Darrick J. Wong wrote:
> > > On Fri, Oct 25, 2019 at 10:22:27AM -0400, Brian Foster wrote:
> > > > On Wed, Oct 09, 2019 at 09:49:59AM -0700, Darrick J. Wong wrote:
> > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > 
> > > > > Create a new xrep_newbt structure to encapsulate a fake root for
> > > > > creating a staged btree cursor as well as to track all the blocks that
> > > > > we need to reserve in order to build that btree.
> > > > > 
> > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > ---
> > > > >  fs/xfs/scrub/repair.c |  260 +++++++++++++++++++++++++++++++++++++++++++++++++
> > > > >  fs/xfs/scrub/repair.h |   61 +++++++++++
> > > > >  fs/xfs/scrub/trace.h  |   58 +++++++++++
> > > > >  3 files changed, 379 insertions(+)
> > > > > 
> > > > > 
> > > > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > > > index 588bc054db5c..beebd484c5f3 100644
> > > > > --- a/fs/xfs/scrub/repair.c
> > > > > +++ b/fs/xfs/scrub/repair.c
> > > > > @@ -359,6 +359,266 @@ xrep_init_btblock(
> > > > >  	return 0;
> > > > >  }
> > > > >  
> > > > ...
> > > > > +/* Initialize accounting resources for staging a new inode fork btree. */
> > > > > +void
> > > > > +xrep_newbt_init_inode(
> > > > > +	struct xrep_newbt		*xnr,
> > > > > +	struct xfs_scrub		*sc,
> > > > > +	int				whichfork,
> > > > > +	const struct xfs_owner_info	*oinfo)
> > > > > +{
> > > > > +	memset(xnr, 0, sizeof(struct xrep_newbt));
> > > > > +	xnr->sc = sc;
> > > > > +	xnr->oinfo = *oinfo; /* structure copy */
> > > > > +	xnr->alloc_hint = XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino);
> > > > > +	xnr->resv = XFS_AG_RESV_NONE;
> > > > > +	xnr->ifake.if_fork = kmem_zone_zalloc(xfs_ifork_zone, 0);
> > > > > +	xnr->ifake.if_fork_size = XFS_IFORK_SIZE(sc->ip, whichfork);
> > > > > +	INIT_LIST_HEAD(&xnr->reservations);
> > > > 
> > > > Looks like this could reuse the above function for everything outside of
> > > > the fake root bits.
> > > 
> > > Ok.
> > > 
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Initialize accounting resources for staging a new btree.  Callers are
> > > > > + * expected to add their own reservations (and clean them up) manually.
> > > > > + */
> > > > > +void
> > > > > +xrep_newbt_init_bare(
> > > > > +	struct xrep_newbt		*xnr,
> > > > > +	struct xfs_scrub		*sc)
> > > > > +{
> > > > > +	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
> > > > > +			XFS_AG_RESV_NONE);
> > > > > +}
> > > > > +
> > > > > +/* Add a space reservation manually. */
> > > > > +int
> > > > > +xrep_newbt_add_reservation(
> > > > > +	struct xrep_newbt		*xnr,
> > > > > +	xfs_fsblock_t			fsbno,
> > > > > +	xfs_extlen_t			len)
> > > > > +{
> > > > 
> > > > FWIW the "reservation" terminology sounds a bit funny to me. Perhaps
> > > > it's just because I've had log reservation on my mind :P, but something
> > > > that "reserves blocks" as opposed to "adds reservation" might be a bit
> > > > more clear from a naming perspective.
> > > 
> > > xrep_newbt_reserve_space() ?
> > > 
> > > I feel that's a little awkward since it's not actually reserving
> > > anything; all it's doing is some accounting work for some space that the
> > > caller already allocated.  But it's probably no worse than the current
> > > name. :)
> > > 
> > 
> > Maybe _add_blocks() and _alloc_blocks() for these two and something like
> > _[get|use]_block() in the later helper that populates the btree..? That
> > seems more descriptive to me than "reservation" and "space," but that's
> > just my .02.
> 
> Yeah, that works.  Fixed.
> 
> > Brian
> > 
> > > > > +	struct xrep_newbt_resv	*resv;
> > > > > +
> > > > > +	resv = kmem_alloc(sizeof(struct xrep_newbt_resv), KM_MAYFAIL);
> > > > > +	if (!resv)
> > > > > +		return -ENOMEM;
> > > > > +
> > > > > +	INIT_LIST_HEAD(&resv->list);
> > > > > +	resv->fsbno = fsbno;
> > > > > +	resv->len = len;
> > > > > +	resv->used = 0;
> > > > 
> > > > Is ->used purely a count or does it also serve as a pointer to the next
> > > > "unused" block?
> > > 
> > > It's a counter, as documented in the struct declaration.
> > > 
> > > > > +	list_add_tail(&resv->list, &xnr->reservations);
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/* Reserve disk space for our new btree. */
> > > > > +int
> > > > > +xrep_newbt_reserve_space(
> > > > > +	struct xrep_newbt	*xnr,
> > > > > +	uint64_t		nr_blocks)
> > > > > +{
> > > > > +	struct xfs_scrub	*sc = xnr->sc;
> > > > > +	xfs_alloctype_t		type;
> > > > > +	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
> > > > > +	int			error = 0;
> > > > > +
> > > > > +	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
> > > > > +
> > > > 
> > > > So I take it this distinguishes between reconstruction of a bmapbt
> > > > where we can allocate across AGs vs an AG tree..? If so, a one liner
> > > > comment wouldn't hurt here.
> > > 
> > > Ok.
> > > 
> > > > > +	while (nr_blocks > 0 && !error) {
> > > > > +		struct xfs_alloc_arg	args = {
> > > > > +			.tp		= sc->tp,
> > > > > +			.mp		= sc->mp,
> > > > > +			.type		= type,
> > > > > +			.fsbno		= alloc_hint,
> > > > > +			.oinfo		= xnr->oinfo,
> > > > > +			.minlen		= 1,
> > > > > +			.maxlen		= nr_blocks,
> > > > > +			.prod		= nr_blocks,
> > > > 
> > > > Why .prod? Is this relevant if .mod isn't set?
> > > 
> > > Not sure why that's even in there. :/

Oh, dumb copy pasta error.  That ought to be .prod = 1 since we don't
care about alignment.
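
I.e.:

-			.prod		= nr_blocks,
+			.prod		= 1,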

--D

> > > > > +			.resv		= xnr->resv,
> > > > > +		};
> > > > > +
> > > > > +		error = xfs_alloc_vextent(&args);
> > > > > +		if (error)
> > > > > +			return error;
> > > > > +		if (args.fsbno == NULLFSBLOCK)
> > > > > +			return -ENOSPC;
> > > > > +
> > > > > +		trace_xrep_newbt_reserve_space(sc->mp,
> > > > > +				XFS_FSB_TO_AGNO(sc->mp, args.fsbno),
> > > > > +				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
> > > > > +				args.len, xnr->oinfo.oi_owner);
> > > > > +
> > > > > +		error = xrep_newbt_add_reservation(xnr, args.fsbno, args.len);
> > > > > +		if (error)
> > > > > +			break;
> > > > > +
> > > > > +		nr_blocks -= args.len;
> > > > > +		alloc_hint = args.fsbno + args.len - 1;
> > > > > +
> > > > > +		if (sc->ip)
> > > > > +			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
> > > > > +		else
> > > > > +			error = xrep_roll_ag_trans(sc);
> > > > > +	}
> > > > > +
> > > > > +	return error;
> > > > > +}
> > > > > +
> > > > > +/* Free all the accounting info and disk space we reserved for a new btree. */
> > > > > +void
> > > > > +xrep_newbt_destroy(
> > > > > +	struct xrep_newbt	*xnr,
> > > > > +	int			error)
> > > > > +{
> > > > > +	struct xfs_scrub	*sc = xnr->sc;
> > > > > +	struct xrep_newbt_resv	*resv, *n;
> > > > > +
> > > > > +	if (error)
> > > > > +		goto junkit;
> > > > > +
> > > > > +	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > > > > +		/* Free every block we didn't use. */
> > > > > +		resv->fsbno += resv->used;
> > > > > +		resv->len -= resv->used;
> > > > > +		resv->used = 0;
> > > > 
> > > > That answers my count/pointer question. :)
> > > 
> > > > > +
> > > > > +		if (resv->len > 0) {
> > > > > +			trace_xrep_newbt_unreserve_space(sc->mp,
> > > > > +					XFS_FSB_TO_AGNO(sc->mp, resv->fsbno),
> > > > > +					XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno),
> > > > > +					resv->len, xnr->oinfo.oi_owner);
> > > > > +
> > > > > +			__xfs_bmap_add_free(sc->tp, resv->fsbno, resv->len,
> > > > > +					&xnr->oinfo, true);
> > > > > +		}
> > > > > +
> > > > > +		list_del(&resv->list);
> > > > > +		kmem_free(resv);
> > > > > +	}
> > > > > +
> > > > > +junkit:
> > > > > +	list_for_each_entry_safe(resv, n, &xnr->reservations, list) {
> > > > > +		list_del(&resv->list);
> > > > > +		kmem_free(resv);
> > > > > +	}
> > > > 
> > > > Seems like this could be folded into the above loop by just checking
> > > > error and skipping the free logic appropriately.
> > > > 
> > > > > +
> > > > > +	if (sc->ip) {
> > > > > +		kmem_zone_free(xfs_ifork_zone, xnr->ifake.if_fork);
> > > > > +		xnr->ifake.if_fork = NULL;
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +/* Feed one of the reserved btree blocks to the bulk loader. */
> > > > > +int
> > > > > +xrep_newbt_alloc_block(
> > > > > +	struct xfs_btree_cur	*cur,
> > > > > +	struct xrep_newbt	*xnr,
> > > > > +	union xfs_btree_ptr	*ptr)
> > > > > +{
> > > > > +	struct xrep_newbt_resv	*resv;
> > > > > +	xfs_fsblock_t		fsb;
> > > > > +
> > > > > +	/*
> > > > > +	 * If last_resv doesn't have a block for us, move forward until we find
> > > > > +	 * one that does (or run out of reservations).
> > > > > +	 */
> > > > > +	if (xnr->last_resv == NULL) {
> > > > > +		list_for_each_entry(resv, &xnr->reservations, list) {
> > > > > +			if (resv->used < resv->len) {
> > > > > +				xnr->last_resv = resv;
> > > > > +				break;
> > > > > +			}
> > > > > +		}
> > > > 
> > > > Not a big deal right now, but it might be worth eventually considering
> > > > something more efficient. For example, perhaps we could rotate depleted
> > > > entries to the end of the list and if we rotate and find nothing in the
> > > > next entry at the head, we know we've run out of space.
> > > 
> > > Hm, yeah, this part would be much simpler if all we had to do was latch
> > > on to the head element and rotate them to the tail when we're done.
> > > 
> > > > 
> > > > > +		if (xnr->last_resv == NULL)
> > > > > +			return -ENOSPC;
> > > > > +	} else if (xnr->last_resv->used == xnr->last_resv->len) {
> > > > > +		if (xnr->last_resv->list.next == &xnr->reservations)
> > > > > +			return -ENOSPC;
> > > > > +		xnr->last_resv = list_entry(xnr->last_resv->list.next,
> > > > > +				struct xrep_newbt_resv, list);
> > > > > +	}
> > > > > +
> > > > > +	/* Nab the block. */
> > > > > +	fsb = xnr->last_resv->fsbno + xnr->last_resv->used;
> > > > > +	xnr->last_resv->used++;
> > > > > +
> > > > > +	trace_xrep_newbt_alloc_block(cur->bc_mp,
> > > > > +			XFS_FSB_TO_AGNO(cur->bc_mp, fsb),
> > > > > +			XFS_FSB_TO_AGBNO(cur->bc_mp, fsb),
> > > > > +			xnr->oinfo.oi_owner);
> > > > > +
> > > > > +	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
> > > > > +		ptr->l = cpu_to_be64(fsb);
> > > > > +	else
> > > > > +		ptr->s = cpu_to_be32(XFS_FSB_TO_AGBNO(cur->bc_mp, fsb));
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > ...
> > > > > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > > > > index 479cfe38065e..ab6c1199ecc0 100644
> > > > > --- a/fs/xfs/scrub/repair.h
> > > > > +++ b/fs/xfs/scrub/repair.h
> > > > ...
> > > > > @@ -59,6 +63,63 @@ int xrep_agf(struct xfs_scrub *sc);
> > > > >  int xrep_agfl(struct xfs_scrub *sc);
> > > > >  int xrep_agi(struct xfs_scrub *sc);
> > > > >  
> > > > ...
> > > > > +
> > > > > +#define for_each_xrep_newbt_reservation(xnr, resv, n)	\
> > > > > +	list_for_each_entry_safe((resv), (n), &(xnr)->reservations, list)
> > > > 
> > > > This is unused (and seems unnecessary for a simple list).
> > > 
> > > It's used by the free space rebuilder in the next patch; I suppose I
> > > could move it down.  That said, I've been trying to keep the common code
> > > out of that particular patch so that the repair patches can be merged in
> > > any order.
> > > 
> > > > ...
> > > > > diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
> > > > > index 3362bae28b46..deb177abf652 100644
> > > > > --- a/fs/xfs/scrub/trace.h
> > > > > +++ b/fs/xfs/scrub/trace.h
> > > > > @@ -904,6 +904,64 @@ TRACE_EVENT(xrep_ialloc_insert,
> > > > >  		  __entry->freemask)
> > > > >  )
> > > > >  
> > > > ...
> > > > > +#define DEFINE_NEWBT_EXTENT_EVENT(name) \
> > > > > +DEFINE_EVENT(xrep_newbt_extent_class, name, \
> > > > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
> > > > > +		 xfs_agblock_t agbno, xfs_extlen_t len, \
> > > > > +		 int64_t owner), \
> > > > > +	TP_ARGS(mp, agno, agbno, len, owner))
> > > > > +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_reserve_space);
> > > > > +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_unreserve_space);
> > > > > +
> > > > > +TRACE_EVENT(xrep_newbt_alloc_block,
> > > > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> > > > > +		 xfs_agblock_t agbno, int64_t owner),
> > > > 
> > > > This could be folded into the above class if we just passed 1 for the
> > > > length, eh?
> > > 
> > > Er, yes.  Fixed.
> > > 
> > > --D
> > > 
> > > > Brian
> > > > 
> > > > > +	TP_ARGS(mp, agno, agbno, owner),
> > > > > +	TP_STRUCT__entry(
> > > > > +		__field(dev_t, dev)
> > > > > +		__field(xfs_agnumber_t, agno)
> > > > > +		__field(xfs_agblock_t, agbno)
> > > > > +		__field(int64_t, owner)
> > > > > +	),
> > > > > +	TP_fast_assign(
> > > > > +		__entry->dev = mp->m_super->s_dev;
> > > > > +		__entry->agno = agno;
> > > > > +		__entry->agbno = agbno;
> > > > > +		__entry->owner = owner;
> > > > > +	),
> > > > > +	TP_printk("dev %d:%d agno %u agbno %u owner %lld",
> > > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > > > +		  __entry->agno,
> > > > > +		  __entry->agbno,
> > > > > +		  __entry->owner)
> > > > > +);
> > > > > +
> > > > >  #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
> > > > >  
> > > > >  #endif /* _TRACE_XFS_SCRUB_TRACE_H */
> > > > > 
> > > > 
> > 

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2020-02-27 15:52 UTC | newest]

Thread overview: 14+ messages
2019-10-09 16:49 [PATCH 0/3] xfs: prepare repair for bulk loading Darrick J. Wong
2019-10-09 16:49 ` [PATCH 1/3] xfs: add debug knobs to control btree bulk load slack factors Darrick J. Wong
2019-10-25 14:19   ` Brian Foster
2019-10-25 16:39     ` Darrick J. Wong
2019-10-09 16:49 ` [PATCH 2/3] xfs: implement block reservation accounting for btrees we're staging Darrick J. Wong
2019-10-25 14:22   ` Brian Foster
2019-10-25 16:35     ` Darrick J. Wong
2019-10-25 17:35       ` Brian Foster
2019-10-25 20:52         ` Darrick J. Wong
2020-02-27 15:51           ` Darrick J. Wong
2019-10-09 16:50 ` [PATCH 3/3] xfs: log EFIs for all btree blocks being used to stage a btree Darrick J. Wong
2019-10-25 14:24   ` Brian Foster
2019-10-25 16:22     ` Darrick J. Wong
2019-10-25 17:36       ` Brian Foster
