All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/4] xfs: btree bulk loading
@ 2020-01-01  1:00 Darrick J. Wong
  2020-01-01  1:00 ` [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees Darrick J. Wong
                   ` (3 more replies)
  0 siblings, 4 replies; 20+ messages in thread
From: Darrick J. Wong @ 2020-01-01  1:00 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

Hi all,

This series creates a bulk loading function for metadata btree cursors.

We start by creating the idea of a "fake root" for each of the btree
root types (AG header and inode) so that we can use a special btree
cursor to stage a new btree without altering anything that might already
exist.

Next, we add utility functions to compute the desired btree shape for a
given number of records, load records into new leaf blocks, compute the
node blocks from that, and present the new root ready for commit.

Finally we extend all four per-AG btree cursor types to support staging
cursors and therefore bulk loading.  This will be used by upcoming patch
series to implement online repair and refactor offline repair.

If you're going to start using this mess, you probably ought to just
pull from my git trees, which are linked below.

This is an extraordinary way to destroy everything.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=btree-bulk-loading

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=btree-bulk-loading

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2020-01-01  1:00 [PATCH v2 0/4] xfs: btree bulk loading Darrick J. Wong
@ 2020-01-01  1:00 ` Darrick J. Wong
  2020-01-01  1:01 ` [PATCH 2/4] xfs: introduce fake roots for inode-rooted btrees Darrick J. Wong
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2020-01-01  1:00 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Create an in-core fake root for AG-rooted btree types so that callers
can generate a whole new btree using the upcoming btree bulk load
function without making the new tree accessible from the rest of the
filesystem.  It is up to the individual btree type to provide a function
to create a staged cursor (presumably with the appropriate callouts to
update the fakeroot) and then commit the staged root back into the
filesystem.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_btree.c |  117 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_btree.h |   42 ++++++++++++++--
 fs/xfs/xfs_trace.h        |   28 +++++++++++
 3 files changed, 182 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index e2cc98931552..6d3d2a301f7d 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -382,6 +382,8 @@ xfs_btree_del_cursor(
 	/*
 	 * Free the cursor.
 	 */
+	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
+		kmem_free((void *)cur->bc_ops);
 	kmem_cache_free(xfs_btree_cur_zone, cur);
 }
 
@@ -4947,3 +4949,118 @@ xfs_btree_has_more_records(
 	else
 		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
 }
+
+/* We don't allow staging cursors to be duplicated. */
+STATIC struct xfs_btree_cur *
+xfs_btree_fakeroot_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	ASSERT(0);
+	return NULL;
+}
+
+/* Refuse to allow regular block allocation for a staging cursor. */
+STATIC int
+xfs_btree_fakeroot_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start_bno,
+	union xfs_btree_ptr	*new_bno,
+	int			*stat)
+{
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+/* Refuse to allow block freeing for a staging cursor. */
+STATIC int
+xfs_btree_fakeroot_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+/* Initialize a pointer to the root block from the fakeroot. */
+STATIC void
+xfs_btree_fakeroot_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xbtree_afakeroot	*afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	afake = cur->bc_private.a.afake;
+	ptr->s = cpu_to_be32(afake->af_root);
+}
+
+/* Set the root block when our tree has a fakeroot. */
+STATIC void
+xfs_btree_afakeroot_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			inc)
+{
+	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+	afake->af_root = be32_to_cpu(ptr->s);
+	afake->af_levels += inc;
+}
+
+/*
+ * Initialize a AG-rooted btree cursor with the given AG btree fake root.  The
+ * btree cursor's @bc_ops will be overridden as needed to make the staging
+ * functionality work.  If @new_ops is not NULL, these new ops will be passed
+ * out to the caller for further overriding.
+ */
+void
+xfs_btree_stage_afakeroot(
+	struct xfs_btree_cur		*cur,
+	struct xbtree_afakeroot		*afake,
+	struct xfs_btree_ops		**new_ops)
+{
+	struct xfs_btree_ops		*nops;
+
+	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+
+	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+	nops->free_block = xfs_btree_fakeroot_free_block;
+	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+	nops->set_root = xfs_btree_afakeroot_set_root;
+	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+	cur->bc_private.a.afake = afake;
+	cur->bc_nlevels = afake->af_levels;
+	cur->bc_ops = nops;
+	cur->bc_flags |= XFS_BTREE_STAGING;
+
+	if (new_ops)
+		*new_ops = nops;
+}
+
+/*
+ * Transform an AG-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops.  The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_afakeroot(
+	struct xfs_btree_cur		*cur,
+	struct xfs_buf			*agbp,
+	const struct xfs_btree_ops	*ops)
+{
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	trace_xfs_btree_commit_afakeroot(cur);
+
+	kmem_free((void *)cur->bc_ops);
+	cur->bc_private.a.agbp = agbp;
+	cur->bc_ops = ops;
+	cur->bc_flags &= ~XFS_BTREE_STAGING;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index fb9b2121c628..524f9ef89d1d 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -188,6 +188,16 @@ union xfs_btree_cur_private {
 	} abt;
 };
 
+/* Private information for a AG-rooted btree. */
+struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
+	union {
+		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
+		struct xbtree_afakeroot	*afake;	/* fake ag header root */
+	};
+	xfs_agnumber_t			agno;	/* ag number */
+	union xfs_btree_cur_private	priv;
+};
+
 /*
  * Btree cursor structure.
  * This collects all information needed by the btree code in one place.
@@ -209,11 +219,7 @@ typedef struct xfs_btree_cur
 	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
 	int		bc_statoff;	/* offset of btre stats array */
 	union {
-		struct {			/* needed for BNO, CNT, INO */
-			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
-			xfs_agnumber_t	agno;	/* ag number */
-			union xfs_btree_cur_private	priv;
-		} a;
+		struct xfs_btree_priv_ag a;
 		struct {			/* needed for BMAP */
 			struct xfs_inode *ip;	/* pointer to our inode */
 			int		allocated;	/* count of alloced */
@@ -232,6 +238,12 @@ typedef struct xfs_btree_cur
 #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
 #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
 #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
+/*
+ * The root of this btree is a fakeroot structure so that we can stage a btree
+ * rebuild without leaving it accessible via primary metadata.  The ops struct
+ * is dynamically allocated and must be freed when the cursor is deleted.
+ */
+#define XFS_BTREE_STAGING		(1<<5)
 
 
 #define	XFS_BTREE_NOERROR	0
@@ -533,4 +545,24 @@ xfs_btree_islastblock(
 	return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
 }
 
+/* Fake root for an AG-rooted btree. */
+struct xbtree_afakeroot {
+	/* AG block number of the new btree root. */
+	xfs_agblock_t		af_root;
+
+	/* Height of the new btree. */
+	unsigned int		af_levels;
+
+	/* Number of blocks used by the btree. */
+	unsigned int		af_blocks;
+};
+
+/* Cursor interactions with with fake roots for AG-rooted btrees. */
+void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
+		struct xbtree_afakeroot *afake,
+		struct xfs_btree_ops **new_ops);
+void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
+		struct xfs_buf *agbp,
+		const struct xfs_btree_ops *ops);
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a86be7f807ee..0bc245988501 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3594,6 +3594,34 @@ TRACE_EVENT(xfs_check_new_dalign,
 		  __entry->calc_rootino)
 )
 
+TRACE_EVENT(xfs_btree_commit_afakeroot,
+	TP_PROTO(struct xfs_btree_cur *cur),
+	TP_ARGS(cur),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_btnum_t, btnum)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(unsigned int, levels)
+		__field(unsigned int, blocks)
+	),
+	TP_fast_assign(
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->btnum = cur->bc_btnum;
+		__entry->agno = cur->bc_private.a.agno;
+		__entry->agbno = cur->bc_private.a.afake->af_root;
+		__entry->levels = cur->bc_private.a.afake->af_levels;
+		__entry->blocks = cur->bc_private.a.afake->af_blocks;
+	),
+	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __entry->agno,
+		  __entry->levels,
+		  __entry->blocks,
+		  __entry->agbno)
+)
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 2/4] xfs: introduce fake roots for inode-rooted btrees
  2020-01-01  1:00 [PATCH v2 0/4] xfs: btree bulk loading Darrick J. Wong
  2020-01-01  1:00 ` [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees Darrick J. Wong
@ 2020-01-01  1:01 ` Darrick J. Wong
  2020-01-01  1:01 ` [PATCH 3/4] xfs: support bulk loading of staged btrees Darrick J. Wong
  2020-01-01  1:01 ` [PATCH 4/4] xfs: support staging cursors for per-AG btree types Darrick J. Wong
  3 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2020-01-01  1:01 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Create an in-core fake root for inode-rooted btree types so that callers
can generate a whole new btree using the upcoming btree bulk load
function without making the new tree accessible from the rest of the
filesystem.  It is up to the individual btree type to provide a function
to create a staged cursor (presumably with the appropriate callouts to
update the fakeroot) and then commit the staged root back into the
filesystem.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_btree.c |   82 +++++++++++++++++++++++++++++++++++++++++++--
 fs/xfs/libxfs/xfs_btree.h |   52 ++++++++++++++++++++++++-----
 fs/xfs/xfs_trace.h        |   33 ++++++++++++++++++
 3 files changed, 154 insertions(+), 13 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 6d3d2a301f7d..ea4aa22d0b89 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -644,6 +644,17 @@ xfs_btree_ptr_addr(
 		((char *)block + xfs_btree_ptr_offset(cur, n, level));
 }
 
+struct xfs_ifork *
+xfs_btree_ifork_ptr(
+	struct xfs_btree_cur	*cur)
+{
+	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+	if (cur->bc_flags & XFS_BTREE_STAGING)
+		return cur->bc_private.b.ifake->if_fork;
+	return XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+}
+
 /*
  * Get the root block which is stored in the inode.
  *
@@ -654,9 +665,8 @@ STATIC struct xfs_btree_block *
 xfs_btree_get_iroot(
 	struct xfs_btree_cur	*cur)
 {
-	struct xfs_ifork	*ifp;
+	struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
 
-	ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
 	return (struct xfs_btree_block *)ifp->if_broot;
 }
 
@@ -4991,8 +5001,17 @@ xfs_btree_fakeroot_init_ptr_from_cur(
 
 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 
-	afake = cur->bc_private.a.afake;
-	ptr->s = cpu_to_be32(afake->af_root);
+	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+		/*
+		 * The root block lives in the inode core, so we zero the
+		 * pointer (like the bmbt code does) to make it obvious if
+		 * anyone ever tries to use this pointer.
+		 */
+		ptr->l = cpu_to_be64(0);
+	} else {
+		afake = cur->bc_private.a.afake;
+		ptr->s = cpu_to_be32(afake->af_root);
+	}
 }
 
 /* Set the root block when our tree has a fakeroot. */
@@ -5064,3 +5083,58 @@ xfs_btree_commit_afakeroot(
 	cur->bc_ops = ops;
 	cur->bc_flags &= ~XFS_BTREE_STAGING;
 }
+/*
+ * Initialize an inode-rooted btree cursor with the given inode btree fake
+ * root.  The btree cursor's @bc_ops will be overridden as needed to make the
+ * staging functionality work.  If @new_ops is not NULL, these new ops will be
+ * passed out to the caller for further overriding.
+ */
+void
+xfs_btree_stage_ifakeroot(
+	struct xfs_btree_cur		*cur,
+	struct xbtree_ifakeroot		*ifake,
+	struct xfs_btree_ops		**new_ops)
+{
+	struct xfs_btree_ops		*nops;
+
+	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+	nops->free_block = xfs_btree_fakeroot_free_block;
+	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+	cur->bc_private.b.ifake = ifake;
+	cur->bc_nlevels = ifake->if_levels;
+	cur->bc_ops = nops;
+	cur->bc_flags |= XFS_BTREE_STAGING;
+
+	if (new_ops)
+		*new_ops = nops;
+}
+
+/*
+ * Transform an inode-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops.  The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_ifakeroot(
+	struct xfs_btree_cur		*cur,
+	int				whichfork,
+	const struct xfs_btree_ops	*ops)
+{
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	trace_xfs_btree_commit_ifakeroot(cur);
+
+	kmem_free((void *)cur->bc_ops);
+	cur->bc_private.b.ifake = NULL;
+	cur->bc_private.b.whichfork = whichfork;
+	cur->bc_ops = ops;
+	cur->bc_flags &= ~XFS_BTREE_STAGING;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 524f9ef89d1d..086619a373ff 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -10,6 +10,7 @@ struct xfs_buf;
 struct xfs_inode;
 struct xfs_mount;
 struct xfs_trans;
+struct xfs_ifork;
 
 extern kmem_zone_t	*xfs_btree_cur_zone;
 
@@ -198,6 +199,18 @@ struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
 	union xfs_btree_cur_private	priv;
 };
 
+/* Private information for an inode-rooted btree. */
+struct xfs_btree_priv_inode {			/* needed for BMAP */
+	struct xfs_inode	*ip;		/* pointer to our inode */
+	struct xbtree_ifakeroot	*ifake;		/* fake inode fork */
+	int			allocated;	/* count of alloced */
+	short			forksize;	/* fork's inode space */
+	char			whichfork;	/* data or attr fork */
+	char			flags;		/* flags */
+#define	XFS_BTCUR_BPRV_WASDEL		(1<<0)	/* was delayed */
+#define	XFS_BTCUR_BPRV_INVALID_OWNER	(1<<1)	/* for ext swap */
+};
+
 /*
  * Btree cursor structure.
  * This collects all information needed by the btree code in one place.
@@ -220,15 +233,7 @@ typedef struct xfs_btree_cur
 	int		bc_statoff;	/* offset of btre stats array */
 	union {
 		struct xfs_btree_priv_ag a;
-		struct {			/* needed for BMAP */
-			struct xfs_inode *ip;	/* pointer to our inode */
-			int		allocated;	/* count of alloced */
-			short		forksize;	/* fork's inode space */
-			char		whichfork;	/* data or attr fork */
-			char		flags;		/* flags */
-#define	XFS_BTCUR_BPRV_WASDEL		(1<<0)		/* was delayed */
-#define	XFS_BTCUR_BPRV_INVALID_OWNER	(1<<1)		/* for ext swap */
-		} b;
+		struct xfs_btree_priv_inode b;
 	}		bc_private;	/* per-btree type data */
 } xfs_btree_cur_t;
 
@@ -527,6 +532,7 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
 int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
 		union xfs_btree_irec *high, bool *exists);
 bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
+struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
 
 /* Does this cursor point to the last block in the given level? */
 static inline bool
@@ -565,4 +571,32 @@ void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
 		struct xfs_buf *agbp,
 		const struct xfs_btree_ops *ops);
 
+/* Fake root for an inode-rooted btree. */
+struct xbtree_ifakeroot {
+	/* Fake inode fork. */
+	struct xfs_ifork	*if_fork;
+
+	/* Number of blocks used by the btree. */
+	int64_t			if_blocks;
+
+	/* Height of the new btree. */
+	unsigned int		if_levels;
+
+	/* Number of bytes available for this fork in the inode. */
+	unsigned int		if_fork_size;
+
+	/* Fork format. */
+	unsigned int		if_format;
+
+	/* Number of records. */
+	unsigned int		if_extents;
+};
+
+/* Cursor interactions with with fake roots for inode-rooted btrees. */
+void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
+		struct xbtree_ifakeroot *ifake,
+		struct xfs_btree_ops **new_ops);
+void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, int whichfork,
+		const struct xfs_btree_ops *ops);
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0bc245988501..174df5a74f8c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3622,6 +3622,39 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
 		  __entry->agbno)
 )
 
+TRACE_EVENT(xfs_btree_commit_ifakeroot,
+	TP_PROTO(struct xfs_btree_cur *cur),
+	TP_ARGS(cur),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_btnum_t, btnum)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+		__field(unsigned int, levels)
+		__field(unsigned int, blocks)
+		__field(int, whichfork)
+	),
+	TP_fast_assign(
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->btnum = cur->bc_btnum;
+		__entry->agno = XFS_INO_TO_AGNO(cur->bc_mp,
+					cur->bc_private.b.ip->i_ino);
+		__entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
+					cur->bc_private.b.ip->i_ino);
+		__entry->levels = cur->bc_private.b.ifake->if_levels;
+		__entry->blocks = cur->bc_private.b.ifake->if_blocks;
+		__entry->whichfork = cur->bc_private.b.whichfork;
+	),
+	TP_printk("dev %d:%d btree %s ag %u agino %u whichfork %s levels %u blocks %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __entry->agno,
+		  __entry->agino,
+		  __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data",
+		  __entry->levels,
+		  __entry->blocks)
+)
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 3/4] xfs: support bulk loading of staged btrees
  2020-01-01  1:00 [PATCH v2 0/4] xfs: btree bulk loading Darrick J. Wong
  2020-01-01  1:00 ` [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees Darrick J. Wong
  2020-01-01  1:01 ` [PATCH 2/4] xfs: introduce fake roots for inode-rooted btrees Darrick J. Wong
@ 2020-01-01  1:01 ` Darrick J. Wong
  2020-01-01  1:01 ` [PATCH 4/4] xfs: support staging cursors for per-AG btree types Darrick J. Wong
  3 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2020-01-01  1:01 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Add a new btree function that enables us to bulk load a btree cursor.
This will be used by the upcoming online repair patches to generate new
btrees.  This avoids the programmatic inefficiency of calling
xfs_btree_insert in a loop (which generates a lot of log traffic) in
favor of stamping out new btree blocks with ordered buffers, and then
committing both the new root and scheduling the removal of the old btree
blocks in a single transaction commit.

The design of this new generic code is based off the btree rebuilding
code in xfs_repair's phase 5 code, with the explicit goal of enabling us
to share that code between scrub and repair.  It has the additional
feature of being able to control btree block loading factors.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_btree.c |  581 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_btree.h |   46 ++++
 fs/xfs/xfs_trace.c        |    1 
 fs/xfs/xfs_trace.h        |   85 +++++++
 4 files changed, 712 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index ea4aa22d0b89..7e5d9668c8d9 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1361,7 +1361,7 @@ STATIC void
 xfs_btree_copy_ptrs(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_ptr	*dst_ptr,
-	union xfs_btree_ptr	*src_ptr,
+	const union xfs_btree_ptr *src_ptr,
 	int			numptrs)
 {
 	ASSERT(numptrs >= 0);
@@ -5138,3 +5138,582 @@ xfs_btree_commit_ifakeroot(
 	cur->bc_ops = ops;
 	cur->bc_flags &= ~XFS_BTREE_STAGING;
 }
+
+/*
+ * Bulk Loading of Staged Btrees
+ * =============================
+ *
+ * This interface is used with a staged btree cursor to create a totally new
+ * btree with a large number of records (i.e. more than what would fit in a
+ * single block).  When the creation is complete, the new root can be linked
+ * atomically into the filesystem by committing the staged cursor.
+ *
+ * The first step for the caller is to construct a fake btree root structure
+ * and a staged btree cursor.  A staging cursor contains all the geometry
+ * information for the btree type but will fail all operations that could have
+ * side effects in the filesystem (e.g. btree shape changes).  Regular
+ * operations will not work unless the staging cursor is committed and becomes
+ * a regular cursor.
+ *
+ * For a btree rooted in an AG header, use an xbtree_afakeroot structure.
+ * This should be initialized to zero.  For a btree rooted in an inode fork,
+ * use an xbtree_ifakeroot structure.  @if_fork_size field should be set to
+ * the number of bytes available to the fork in the inode; @if_fork should
+ * point to a freshly allocated xfs_inode_fork; and @if_format should be set
+ * to the appropriate fork type (e.g. XFS_DINODE_FMT_BTREE).
+ *
+ * The next step for the caller is to initialize a struct xfs_btree_bload
+ * context.  The @nr_records field is the number of records that are to be
+ * loaded into the btree.  The @leaf_slack and @node_slack fields are the
+ * number of records (or key/ptr) slots to leave empty in new btree blocks.
+ * If a caller sets a slack value to -1, the slack value will be computed to
+ * fill the block halfway between minrecs and maxrecs items per block.
+ *
+ * The number of items placed in each btree block is computed via the following
+ * algorithm: For leaf levels, the number of items for the level is nr_records.
+ * For node levels, the number of items for the level is the number of blocks
+ * in the next lower level of the tree.  For each level, the desired number of
+ * items per block is defined as:
+ *
+ * desired = max(minrecs, maxrecs - slack factor)
+ *
+ * The number of blocks for the level is defined to be:
+ *
+ * blocks = nr_items / desired
+ *
+ * Note this is rounded down so that the npb calculation below will never fall
+ * below minrecs.  The number of items that will actually be loaded into each
+ * btree block is defined as:
+ *
+ * npb =  nr_items / blocks
+ *
+ * Some of the leftmost blocks in the level will contain one extra record as
+ * needed to handle uneven division.  If the number of records in any block
+ * would exceed maxrecs for that level, blocks is incremented and npb is
+ * recalculated.
+ *
+ * In other words, we compute the number of blocks needed to satisfy a given
+ * loading level, then spread the items as evenly as possible.
+ *
+ * To complete this step, call xfs_btree_bload_compute_geometry, which uses
+ * those settings to compute the height of the btree and the number of blocks
+ * that will be needed to construct the btree.  These values are stored in the
+ * @btree_height and @nr_blocks fields.
+ *
+ * At this point, the caller must allocate @nr_blocks blocks and save them for
+ * later.  If space is to be allocated transactionally, the staging cursor
+ * must be deleted before and recreated after, which is why computing the
+ * geometry is a separate step.
+ *
+ * The fourth step in the bulk loading process is to set the function pointers
+ * in the bload context structure.  @get_data will be called for each record
+ * that will be loaded into the btree; it should set the cursor's bc_rec
+ * field, which will be converted to on-disk format and copied into the
+ * appropriate record slot.  @alloc_block should supply one of the blocks
+ * allocated in the previous step.  For btrees which are rooted in an inode
+ * fork, @iroot_size is called to compute the size of the incore btree root
+ * block.  Call xfs_btree_bload to start constructing the btree.
+ *
+ * The final step is to commit the staging cursor, which logs the new btree
+ * root and turns the btree into a regular btree cursor, and free the fake
+ * roots.
+ */
+
+/*
+ * Put a btree block that we're loading onto the ordered list and release it.
+ * The btree blocks will be written when the final transaction swapping the
+ * btree roots is committed.
+ */
+static void
+xfs_btree_bload_drop_buf(
+	struct xfs_btree_bload	*bbl,
+	struct xfs_trans	*tp,
+	struct xfs_buf		**bpp)
+{
+	if (*bpp == NULL)
+		return;
+
+	xfs_buf_delwri_queue(*bpp, &bbl->buffers_list);
+	xfs_trans_brelse(tp, *bpp);
+	*bpp = NULL;
+}
+
+/* Allocate and initialize one btree block for bulk loading. */
+STATIC int
+xfs_btree_bload_prep_block(
+	struct xfs_btree_cur		*cur,
+	struct xfs_btree_bload		*bbl,
+	unsigned int			level,
+	unsigned int			nr_this_block,
+	union xfs_btree_ptr		*ptrp,
+	struct xfs_buf			**bpp,
+	struct xfs_btree_block		**blockp,
+	void				*priv)
+{
+	union xfs_btree_ptr		new_ptr;
+	struct xfs_buf			*new_bp;
+	struct xfs_btree_block		*new_block;
+	int				ret;
+
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    level == cur->bc_nlevels - 1) {
+		struct xfs_ifork	*ifp = cur->bc_private.b.ifake->if_fork;
+		size_t			new_size;
+
+		/* Allocate a new incore btree root block. */
+		new_size = bbl->iroot_size(cur, nr_this_block, priv);
+		ifp->if_broot = kmem_zalloc(new_size, 0);
+		ifp->if_broot_bytes = (int)new_size;
+		ifp->if_flags |= XFS_IFBROOT;
+
+		/* Initialize it and send it out. */
+		xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
+				XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
+				nr_this_block, cur->bc_private.b.ip->i_ino,
+				cur->bc_flags);
+
+		*bpp = NULL;
+		*blockp = ifp->if_broot;
+		xfs_btree_set_ptr_null(cur, ptrp);
+		return 0;
+	}
+
+	/* Allocate a new leaf block. */
+	ret = bbl->alloc_block(cur, &new_ptr, priv);
+	if (ret)
+		return ret;
+
+	ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));
+
+	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
+	if (ret)
+		return ret;
+
+	/* Initialize the btree block. */
+	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
+	if (*blockp)
+		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
+	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);
+	xfs_btree_set_numrecs(new_block, nr_this_block);
+
+	/* Release the old block and set the out parameters. */
+	xfs_btree_bload_drop_buf(bbl, cur->bc_tp, bpp);
+	*blockp = new_block;
+	*bpp = new_bp;
+	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
+	return 0;
+}
+
+/* Load one leaf block. */
+STATIC int
+xfs_btree_bload_leaf(
+	struct xfs_btree_cur		*cur,
+	unsigned int			recs_this_block,
+	xfs_btree_bload_get_fn		get_data,
+	struct xfs_btree_block		*block,
+	void				*priv)
+{
+	unsigned int			j;
+	int				ret;
+
+	/* Fill the leaf block with records. */
+	for (j = 1; j <= recs_this_block; j++) {
+		union xfs_btree_rec	*block_recs;
+
+		ret = get_data(cur, priv);
+		if (ret)
+			return ret;
+		block_recs = xfs_btree_rec_addr(cur, j, block);
+		cur->bc_ops->init_rec_from_cur(cur, block_recs);
+	}
+
+	return 0;
+}
+
+/* Load one node block. */
+STATIC int
+xfs_btree_bload_node(
+	struct xfs_btree_cur	*cur,
+	unsigned int		recs_this_block,
+	union xfs_btree_ptr	*child_ptr,
+	struct xfs_btree_block	*block)
+{
+	unsigned int		j;
+	int			ret;
+
+	/* Fill the node block with keys and pointers. */
+	for (j = 1; j <= recs_this_block; j++) {
+		union xfs_btree_key	child_key;
+		union xfs_btree_ptr	*block_ptr;
+		union xfs_btree_key	*block_key;
+		struct xfs_btree_block	*child_block;
+		struct xfs_buf		*child_bp;
+
+		ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
+
+		ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
+				&child_bp);
+		if (ret)
+			return ret;
+
+		xfs_btree_get_keys(cur, child_block, &child_key);
+
+		block_ptr = xfs_btree_ptr_addr(cur, j, block);
+		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);
+
+		block_key = xfs_btree_key_addr(cur, j, block);
+		xfs_btree_copy_keys(cur, block_key, &child_key, 1);
+
+		xfs_btree_get_sibling(cur, child_block, child_ptr,
+				XFS_BB_RIGHTSIB);
+		xfs_trans_brelse(cur->bc_tp, child_bp);
+	}
+
+	return 0;
+}
+
+/*
+ * Compute the maximum number of records (or keyptrs) per block that we want to
+ * install at this level in the btree.  Caller is responsible for having set
+ * @cur->bc_private.b.forksize to the desired fork size, if appropriate.
+ */
+STATIC unsigned int
+xfs_btree_bload_max_npb(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_bload	*bbl,
+	unsigned int		level)
+{
+	unsigned int		ret;
+
+	if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
+		return cur->bc_ops->get_dmaxrecs(cur, level);
+
+	ret = cur->bc_ops->get_maxrecs(cur, level);
+	if (level == 0)
+		ret -= bbl->leaf_slack;
+	else
+		ret -= bbl->node_slack;
+	return ret;
+}
+
+/*
+ * Compute the desired number of records (or keyptrs) per block that we want to
+ * install at this level in the btree, which must be somewhere between minrecs
+ * and max_npb.  The caller is free to install fewer records per block.
+ */
+STATIC unsigned int
+xfs_btree_bload_desired_npb(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_bload	*bbl,
+	unsigned int		level)
+{
+	unsigned int		npb = xfs_btree_bload_max_npb(cur, bbl, level);
+
+	/* Root blocks are not subject to minrecs rules. */
+	if (level == cur->bc_nlevels - 1)
+		return max(1U, npb);
+
+	return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
+}
+
+/*
+ * Compute the number of records to be stored in each block at this level and
+ * the number of blocks for this level.  For leaf levels, we must populate an
+ * empty root block even if there are no records, so we have to have at least
+ * one block.
+ */
+STATIC void
+xfs_btree_bload_level_geometry(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_bload	*bbl,
+	unsigned int		level,
+	uint64_t		nr_this_level,
+	unsigned int		*avg_per_block,
+	uint64_t		*blocks,
+	uint64_t		*blocks_with_extra)
+{
+	uint64_t		npb;
+	uint64_t		dontcare;
+	unsigned int		desired_npb;
+	unsigned int		maxnr;
+
+	maxnr = cur->bc_ops->get_maxrecs(cur, level);
+
+	/*
+	 * Compute the number of blocks we need to fill each block with the
+	 * desired number of records/keyptrs per block.  Because desired_npb
+	 * could be minrecs, we use regular integer division (which rounds
+	 * the block count down) so that in the next step the effective # of
+	 * items per block will never be less than desired_npb.
+	 */
+	desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
+	*blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
+	*blocks = max(1ULL, *blocks);
+
+	/*
+	 * Compute the number of records that we will actually put in each
+	 * block, assuming that we want to spread the records evenly between
+	 * the blocks.  Take care that the effective # of items per block (npb)
+	 * won't exceed maxrecs even for the blocks that get an extra record,
+	 * since desired_npb could be maxrecs, and in the previous step we
+	 * rounded the block count down.
+	 */
+	npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+	if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
+		(*blocks)++;
+		npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+	}
+
+	*avg_per_block = min_t(uint64_t, npb, nr_this_level);
+
+	trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
+			*avg_per_block, desired_npb, *blocks,
+			*blocks_with_extra);
+}
+
+/*
+ * Ensure a slack value is appropriate for the btree.
+ *
+ * If the slack value is negative, set slack so that we fill the block to
+ * halfway between minrecs and maxrecs.  Make sure the slack is never so large
+ * that we can underflow minrecs.
+ */
+static void
+xfs_btree_bload_ensure_slack(
+	struct xfs_btree_cur	*cur,
+	int			*slack,
+	int			level)
+{
+	int			maxr;
+	int			minr;
+
+	/*
+	 * We only care about slack for btree blocks, so set the btree nlevels
+	 * to 3 so that level 0 is a leaf block and level 1 is a node block.
+	 * Avoid straying into inode roots, since we don't do slack there.
+	 */
+	cur->bc_nlevels = 3;
+	maxr = cur->bc_ops->get_maxrecs(cur, level);
+	minr = cur->bc_ops->get_minrecs(cur, level);
+
+	/*
+	 * If slack is negative, automatically set slack so that we load the
+	 * btree block approximately halfway between minrecs and maxrecs.
+	 * Generally, this will net us 75% loading.
+	 */
+	if (*slack < 0)
+		*slack = maxr - ((maxr + minr) >> 1);
+
+	*slack = min(*slack, maxr - minr);
+}
+
+/*
+ * Prepare a btree cursor for a bulk load operation by computing the geometry
+ * fields in @bbl.  Caller must ensure that the btree cursor is a staging
+ * cursor.  This function can be called multiple times.
+ */
+int
+xfs_btree_bload_compute_geometry(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_bload	*bbl,
+	uint64_t		nr_records)
+{
+	uint64_t		nr_blocks = 0;
+	uint64_t		nr_this_level;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
+	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
+
+	bbl->nr_records = nr_this_level = nr_records;
+	for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) {
+		uint64_t	level_blocks;
+		uint64_t	dontcare64;
+		unsigned int	level = cur->bc_nlevels - 1;
+		unsigned int	avg_per_block;
+
+		/*
+		 * If all the things we want to store at this level would fit
+		 * in a single root block, then we have our btree root and are
+		 * done.  Note that bmap btrees do not allow records in the
+		 * root.
+		 */
+		if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level != 0) {
+			xfs_btree_bload_level_geometry(cur, bbl, level,
+					nr_this_level, &avg_per_block,
+					&level_blocks, &dontcare64);
+			if (nr_this_level <= avg_per_block) {
+				nr_blocks++;
+				break;
+			}
+		}
+
+		/*
+		 * Otherwise, we have to store all the records for this level
+		 * in blocks and therefore need another level of btree to point
+		 * to those blocks.  Increase the number of levels and
+		 * recompute the number of records we can store at this level
+		 * because that can change depending on whether or not a level
+		 * is the root level.
+		 */
+		cur->bc_nlevels++;
+		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+				&avg_per_block, &level_blocks, &dontcare64);
+		nr_blocks += level_blocks;
+		nr_this_level = level_blocks;
+	}
+
+	if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS)
+		return -EOVERFLOW;
+
+	bbl->btree_height = cur->bc_nlevels;
+	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+		bbl->nr_blocks = nr_blocks - 1;
+	else
+		bbl->nr_blocks = nr_blocks;
+	return 0;
+}
+
+/*
+ * Bulk load a btree.
+ *
+ * Load @bbl->nr_records quantity of records into a btree using the supplied
+ * empty and staging btree cursor @cur and a @bbl that has been filled out by
+ * the xfs_btree_bload_compute_geometry function.
+ *
+ * The @bbl->get_data function must populate the cursor's bc_rec every time it
+ * is called.  The @bbl->alloc_block function will be used to allocate new
+ * btree blocks.  @priv is passed to both functions.
+ *
+ * Caller must ensure that @cur is a staging cursor.  Any existing btree rooted
+ * in the fakeroot will be lost, so do not call this function twice.
+ */
+int
+xfs_btree_bload(
+	struct xfs_btree_cur		*cur,
+	struct xfs_btree_bload		*bbl,
+	void				*priv)
+{
+	union xfs_btree_ptr		child_ptr;
+	union xfs_btree_ptr		ptr;
+	struct xfs_buf			*bp = NULL;
+	struct xfs_btree_block		*block = NULL;
+	uint64_t			nr_this_level = bbl->nr_records;
+	uint64_t			blocks;
+	uint64_t			i;
+	uint64_t			blocks_with_extra;
+	uint64_t			total_blocks = 0;
+	unsigned int			avg_per_block;
+	unsigned int			level = 0;
+	int				ret;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	INIT_LIST_HEAD(&bbl->buffers_list);
+	cur->bc_nlevels = bbl->btree_height;
+	xfs_btree_set_ptr_null(cur, &child_ptr);
+	xfs_btree_set_ptr_null(cur, &ptr);
+
+	xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+			&avg_per_block, &blocks, &blocks_with_extra);
+
+	/* Load each leaf block. */
+	for (i = 0; i < blocks; i++) {
+		unsigned int		nr_this_block = avg_per_block;
+
+		if (i < blocks_with_extra)
+			nr_this_block++;
+
+		ret = xfs_btree_bload_prep_block(cur, bbl, level,
+				nr_this_block, &ptr, &bp, &block, priv);
+		if (ret)
+			return ret;
+
+		trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
+				nr_this_block);
+
+		ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_data,
+				block, priv);
+		if (ret)
+			goto out;
+
+		/* Record the leftmost pointer to start the next level. */
+		if (i == 0)
+			xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
+	}
+	total_blocks += blocks;
+	xfs_btree_bload_drop_buf(bbl, cur->bc_tp, &bp);
+
+	/* Populate the internal btree nodes. */
+	for (level = 1; level < cur->bc_nlevels; level++) {
+		union xfs_btree_ptr	first_ptr;
+
+		nr_this_level = blocks;
+		block = NULL;
+		xfs_btree_set_ptr_null(cur, &ptr);
+
+		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+				&avg_per_block, &blocks, &blocks_with_extra);
+
+		/* Load each node block. */
+		for (i = 0; i < blocks; i++) {
+			unsigned int	nr_this_block = avg_per_block;
+
+			if (i < blocks_with_extra)
+				nr_this_block++;
+
+			ret = xfs_btree_bload_prep_block(cur, bbl, level,
+					nr_this_block, &ptr, &bp, &block,
+					priv);
+			if (ret)
+				return ret;
+
+			trace_xfs_btree_bload_block(cur, level, i, blocks,
+					&ptr, nr_this_block);
+
+			ret = xfs_btree_bload_node(cur, nr_this_block,
+					&child_ptr, block);
+			if (ret)
+				goto out;
+
+			/*
+			 * Record the leftmost pointer to start the next level.
+			 */
+			if (i == 0)
+				xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
+		}
+		total_blocks += blocks;
+		xfs_btree_bload_drop_buf(bbl, cur->bc_tp, &bp);
+		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
+	}
+
+	/* Initialize the new root. */
+	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+		ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+		cur->bc_private.b.ifake->if_levels = cur->bc_nlevels;
+		cur->bc_private.b.ifake->if_blocks = total_blocks - 1;
+	} else {
+		cur->bc_private.a.afake->af_root = be32_to_cpu(ptr.s);
+		cur->bc_private.a.afake->af_levels = cur->bc_nlevels;
+		cur->bc_private.a.afake->af_blocks = total_blocks;
+	}
+
+	/*
+	 * Write the new blocks to disk.  If the ordered list isn't empty after
+	 * that, then something went wrong and we have to fail.  This should
+	 * never happen, but we'll check anyway.
+	 */
+	ret = xfs_buf_delwri_submit(&bbl->buffers_list);
+	if (ret)
+		goto out;
+	if (!list_empty(&bbl->buffers_list)) {
+		ASSERT(list_empty(&bbl->buffers_list));
+		ret = -EIO;
+	}
+out:
+	xfs_buf_delwri_cancel(&bbl->buffers_list);
+	if (bp)
+		xfs_trans_brelse(cur->bc_tp, bp);
+	return ret;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 086619a373ff..6708ab2433cf 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -599,4 +599,50 @@ void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
 void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, int whichfork,
 		const struct xfs_btree_ops *ops);
 
+typedef int (*xfs_btree_bload_get_fn)(struct xfs_btree_cur *cur, void *priv);
+typedef int (*xfs_btree_bload_alloc_block_fn)(struct xfs_btree_cur *cur,
+		union xfs_btree_ptr *ptr, void *priv);
+typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur,
+		unsigned int nr_this_level, void *priv);
+
+/* Bulk loading of staged btrees. */
+struct xfs_btree_bload {
+	/* Buffer list for delwri_queue. */
+	struct list_head		buffers_list;
+
+	/* Function to store a record in the cursor. */
+	xfs_btree_bload_get_fn		get_data;
+
+	/* Function to allocate a block for the btree. */
+	xfs_btree_bload_alloc_block_fn	alloc_block;
+
+	/* Function to compute the size of the in-core btree root block. */
+	xfs_btree_bload_iroot_size_fn	iroot_size;
+
+	/* Number of records the caller wants to store. */
+	uint64_t			nr_records;
+
+	/* Number of btree blocks needed to store those records. */
+	uint64_t			nr_blocks;
+
+	/*
+	 * Number of free records to leave in each leaf block.  If this (or
+	 * any of the slack values) are negative, this will be computed to
+	 * be halfway between maxrecs and minrecs.  This typically leaves the
+	 * block 75% full.
+	 */
+	int				leaf_slack;
+
+	/* Number of free keyptrs to leave in each node block. */
+	int				node_slack;
+
+	/* Computed btree height. */
+	unsigned int			btree_height;
+};
+
+int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur,
+		struct xfs_btree_bload *bbl, uint64_t nr_records);
+int xfs_btree_bload(struct xfs_btree_cur *cur, struct xfs_btree_bload *bbl,
+		void *priv);
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index bc85b89f88ca..9b5e58a92381 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -6,6 +6,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
+#include "xfs_bit.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 174df5a74f8c..222b313f61b0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -35,6 +35,7 @@ struct xfs_icreate_log;
 struct xfs_owner_info;
 struct xfs_trans_res;
 struct xfs_inobt_rec_incore;
+union xfs_btree_ptr;
 
 DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -3655,6 +3656,90 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
 		  __entry->blocks)
 )
 
+TRACE_EVENT(xfs_btree_bload_level_geometry,
+	TP_PROTO(struct xfs_btree_cur *cur, unsigned int level,
+		 uint64_t nr_this_level, unsigned int nr_per_block,
+		 unsigned int desired_npb, uint64_t blocks,
+		 uint64_t blocks_with_extra),
+	TP_ARGS(cur, level, nr_this_level, nr_per_block, desired_npb, blocks,
+		blocks_with_extra),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_btnum_t, btnum)
+		__field(unsigned int, level)
+		__field(unsigned int, nlevels)
+		__field(uint64_t, nr_this_level)
+		__field(unsigned int, nr_per_block)
+		__field(unsigned int, desired_npb)
+		__field(unsigned long long, blocks)
+		__field(unsigned long long, blocks_with_extra)
+	),
+	TP_fast_assign(
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->btnum = cur->bc_btnum;
+		__entry->level = level;
+		__entry->nlevels = cur->bc_nlevels;
+		__entry->nr_this_level = nr_this_level;
+		__entry->nr_per_block = nr_per_block;
+		__entry->desired_npb = desired_npb;
+		__entry->blocks = blocks;
+		__entry->blocks_with_extra = blocks_with_extra;
+	),
+	TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __entry->level,
+		  __entry->nlevels,
+		  __entry->nr_this_level,
+		  __entry->nr_per_block,
+		  __entry->desired_npb,
+		  __entry->blocks,
+		  __entry->blocks_with_extra)
+)
+
+TRACE_EVENT(xfs_btree_bload_block,
+	TP_PROTO(struct xfs_btree_cur *cur, unsigned int level,
+		 uint64_t block_idx, uint64_t nr_blocks,
+		 union xfs_btree_ptr *ptr, unsigned int nr_records),
+	TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_btnum_t, btnum)
+		__field(unsigned int, level)
+		__field(unsigned long long, block_idx)
+		__field(unsigned long long, nr_blocks)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(unsigned int, nr_records)
+	),
+	TP_fast_assign(
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->btnum = cur->bc_btnum;
+		__entry->level = level;
+		__entry->block_idx = block_idx;
+		__entry->nr_blocks = nr_blocks;
+		if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+			xfs_fsblock_t	fsb = be64_to_cpu(ptr->l);
+
+			__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb);
+			__entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb);
+		} else {
+			__entry->agno = cur->bc_private.a.agno;
+			__entry->agbno = be32_to_cpu(ptr->s);
+		}
+		__entry->nr_records = nr_records;
+	),
+	TP_printk("dev %d:%d btree %s level %u block %llu/%llu fsb (%u/%u) recs %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __entry->level,
+		  __entry->block_idx,
+		  __entry->nr_blocks,
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->nr_records)
+)
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 4/4] xfs: support staging cursors for per-AG btree types
  2020-01-01  1:00 [PATCH v2 0/4] xfs: btree bulk loading Darrick J. Wong
                   ` (2 preceding siblings ...)
  2020-01-01  1:01 ` [PATCH 3/4] xfs: support bulk loading of staged btrees Darrick J. Wong
@ 2020-01-01  1:01 ` Darrick J. Wong
  3 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2020-01-01  1:01 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Add support for btree staging cursors for the per-AG btree types.  This
is needed both for online repair and also to convert xfs_repair to use
btree bulk loading.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_alloc_btree.c    |   99 +++++++++++++++++++++++++++++-------
 fs/xfs/libxfs/xfs_alloc_btree.h    |    7 +++
 fs/xfs/libxfs/xfs_ialloc_btree.c   |   84 +++++++++++++++++++++++++++----
 fs/xfs/libxfs/xfs_ialloc_btree.h   |    6 ++
 fs/xfs/libxfs/xfs_refcount_btree.c |   69 +++++++++++++++++++++----
 fs/xfs/libxfs/xfs_refcount_btree.h |    7 +++
 fs/xfs/libxfs/xfs_rmap_btree.c     |   66 ++++++++++++++++++++----
 fs/xfs/libxfs/xfs_rmap_btree.h     |    6 ++
 8 files changed, 295 insertions(+), 49 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 279694d73e4e..94dc18c8f9bc 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -471,6 +471,41 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
 	.recs_inorder		= xfs_cntbt_recs_inorder,
 };
 
+/* Allocate most of a new allocation btree cursor. */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_init_common(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_btnum_t		btnum)
+{
+	struct xfs_btree_cur	*cur;
+
+	ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
+
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+
+	cur->bc_tp = tp;
+	cur->bc_mp = mp;
+	cur->bc_btnum = btnum;
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+	cur->bc_private.a.agno = agno;
+	cur->bc_private.a.priv.abt.active = false;
+
+	if (btnum == XFS_BTNUM_CNT) {
+		cur->bc_ops = &xfs_cntbt_ops;
+		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
+	} else {
+		cur->bc_ops = &xfs_bnobt_ops;
+		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+	}
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+	return cur;
+}
+
 /*
  * Allocate a new allocation btree cursor.
  */
@@ -485,36 +520,64 @@ xfs_allocbt_init_cursor(
 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
 	struct xfs_btree_cur	*cur;
 
-	ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
-
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
-
-	cur->bc_tp = tp;
-	cur->bc_mp = mp;
-	cur->bc_btnum = btnum;
-	cur->bc_blocklog = mp->m_sb.sb_blocklog;
-
+	cur = xfs_allocbt_init_common(mp, tp, agno, btnum);
 	if (btnum == XFS_BTNUM_CNT) {
-		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
-		cur->bc_ops = &xfs_cntbt_ops;
 		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
-		cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+		cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
 	} else {
-		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
-		cur->bc_ops = &xfs_bnobt_ops;
 		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
 	}
 
 	cur->bc_private.a.agbp = agbp;
-	cur->bc_private.a.agno = agno;
-	cur->bc_private.a.priv.abt.active = false;
 
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+	return cur;
+}
 
+/* Create a free space btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_allocbt_stage_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xbtree_afakeroot	*afake,
+	xfs_agnumber_t		agno,
+	xfs_btnum_t		btnum)
+{
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_allocbt_init_common(mp, tp, agno, btnum);
+	if (btnum == XFS_BTNUM_BNO)
+		xfs_btree_stage_afakeroot(cur, afake, NULL);
+	else
+		xfs_btree_stage_afakeroot(cur, afake, NULL);
 	return cur;
 }
 
+/*
+ * Install a new free space btree root.  Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_allocbt_commit_staged_btree(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*agbp)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+	agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+
+	if (cur->bc_btnum == XFS_BTNUM_BNO) {
+		xfs_btree_commit_afakeroot(cur, agbp, &xfs_bnobt_ops);
+	} else {
+		cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
+		xfs_btree_commit_afakeroot(cur, agbp, &xfs_cntbt_ops);
+	}
+}
+
 /*
  * Calculate number of records in an alloc btree block.
  */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index c9305ebb69f6..dde324609a89 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -13,6 +13,7 @@
 struct xfs_buf;
 struct xfs_btree_cur;
 struct xfs_mount;
+struct xbtree_afakeroot;
 
 /*
  * Btree block header size depends on a superblock flag.
@@ -48,8 +49,14 @@ struct xfs_mount;
 extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_buf *,
 		xfs_agnumber_t, xfs_btnum_t);
+struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xbtree_afakeroot *afake,
+		xfs_agnumber_t agno, xfs_btnum_t btnum);
 extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
 
+void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur,
+		struct xfs_buf *agbp);
+
 #endif	/* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index b82992f795aa..15d8ec692a6e 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -400,32 +400,27 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
 };
 
 /*
- * Allocate a new inode btree cursor.
+ * Initialize a new inode btree cursor.
  */
-struct xfs_btree_cur *				/* new inode btree cursor */
-xfs_inobt_init_cursor(
+static struct xfs_btree_cur *
+xfs_inobt_init_common(
 	struct xfs_mount	*mp,		/* file system mount point */
 	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_buf		*agbp,		/* buffer for agi structure */
 	xfs_agnumber_t		agno,		/* allocation group number */
 	xfs_btnum_t		btnum)		/* ialloc or free ino btree */
 {
-	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
 	struct xfs_btree_cur	*cur;
 
 	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
-
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
 	cur->bc_btnum = btnum;
 	if (btnum == XFS_BTNUM_INO) {
-		cur->bc_nlevels = be32_to_cpu(agi->agi_level);
-		cur->bc_ops = &xfs_inobt_ops;
 		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
+		cur->bc_ops = &xfs_inobt_ops;
 	} else {
-		cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
-		cur->bc_ops = &xfs_finobt_ops;
 		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
+		cur->bc_ops = &xfs_finobt_ops;
 	}
 
 	cur->bc_blocklog = mp->m_sb.sb_blocklog;
@@ -433,12 +428,79 @@ xfs_inobt_init_cursor(
 	if (xfs_sb_version_hascrc(&mp->m_sb))
 		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
 
-	cur->bc_private.a.agbp = agbp;
 	cur->bc_private.a.agno = agno;
+	return cur;
+}
 
+/* Create an inode btree cursor. */
+struct xfs_btree_cur *
+xfs_inobt_init_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_agnumber_t		agno,
+	xfs_btnum_t		btnum)
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+
+	cur = xfs_inobt_init_common(mp, tp, agno, btnum);
+	if (btnum == XFS_BTNUM_INO)
+		cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+	else
+		cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+	cur->bc_private.a.agbp = agbp;
 	return cur;
 }
 
+/* Create an inode btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_inobt_stage_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xbtree_afakeroot	*afake,
+	xfs_agnumber_t		agno,
+	xfs_btnum_t		btnum)
+{
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_inobt_init_common(mp, tp, agno, btnum);
+	if (btnum == XFS_BTNUM_INO)
+		xfs_btree_stage_afakeroot(cur, afake, NULL);
+	else
+		xfs_btree_stage_afakeroot(cur, afake, NULL);
+	return cur;
+}
+
+/*
+ * Install a new inobt btree root.  Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_inobt_commit_staged_btree(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*agbp)
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	if (cur->bc_btnum == XFS_BTNUM_INO) {
+		agi->agi_root = cpu_to_be32(afake->af_root);
+		agi->agi_level = cpu_to_be32(afake->af_levels);
+		xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT |
+						     XFS_AGI_LEVEL);
+		xfs_btree_commit_afakeroot(cur, agbp, &xfs_inobt_ops);
+	} else {
+		agi->agi_free_root = cpu_to_be32(afake->af_root);
+		agi->agi_free_level = cpu_to_be32(afake->af_levels);
+		xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_FREE_ROOT |
+						     XFS_AGI_FREE_LEVEL);
+		xfs_btree_commit_afakeroot(cur, agbp, &xfs_finobt_ops);
+	}
+}
+
 /*
  * Calculate number of records in an inobt btree block.
  */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 951305ecaae1..9265b3e08c69 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -48,6 +48,9 @@ struct xfs_mount;
 extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
 		xfs_btnum_t);
+struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xbtree_afakeroot *afake,
+		xfs_agnumber_t agno, xfs_btnum_t btnum);
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
 /* ir_holemask to inode allocation bitmap conversion */
@@ -68,4 +71,7 @@ int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
 		xfs_agnumber_t agno, xfs_btnum_t btnum,
 		struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp);
 
+void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
+		struct xfs_buf *agbp);
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 38529dbacd55..9034b40bd5cf 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -311,41 +311,90 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = {
 };
 
 /*
- * Allocate a new refcount btree cursor.
+ * Initialize a new refcount btree cursor.
  */
-struct xfs_btree_cur *
-xfs_refcountbt_init_cursor(
+static struct xfs_btree_cur *
+xfs_refcountbt_init_common(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
-	struct xfs_buf		*agbp,
 	xfs_agnumber_t		agno)
 {
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
 	struct xfs_btree_cur	*cur;
 
 	ASSERT(agno != NULLAGNUMBER);
 	ASSERT(agno < mp->m_sb.sb_agcount);
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
 
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
 	cur->bc_btnum = XFS_BTNUM_REFC;
 	cur->bc_blocklog = mp->m_sb.sb_blocklog;
-	cur->bc_ops = &xfs_refcountbt_ops;
 	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
 
-	cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
-
-	cur->bc_private.a.agbp = agbp;
 	cur->bc_private.a.agno = agno;
 	cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
 
 	cur->bc_private.a.priv.refc.nr_ops = 0;
 	cur->bc_private.a.priv.refc.shape_changes = 0;
+	cur->bc_ops = &xfs_refcountbt_ops;
+	return cur;
+}
+
+/* Create a btree cursor. */
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_agnumber_t		agno)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_btree_cur	*cur;
 
+	cur = xfs_refcountbt_init_common(mp, tp, agno);
+	cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+	cur->bc_private.a.agbp = agbp;
 	return cur;
 }
 
+/* Create a btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_refcountbt_stage_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xbtree_afakeroot	*afake,
+	xfs_agnumber_t		agno)
+{
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_refcountbt_init_common(mp, tp, agno);
+	xfs_btree_stage_afakeroot(cur, afake, NULL);
+	return cur;
+}
+
+/*
+ * Swap in the new btree root.  Once we pass this point the newly rebuilt btree
+ * is in place and we have to kill off all the old btree blocks.
+ */
+void
+xfs_refcountbt_commit_staged_btree(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*agbp)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	agf->agf_refcount_root = cpu_to_be32(afake->af_root);
+	agf->agf_refcount_level = cpu_to_be32(afake->af_levels);
+	agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks);
+	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS |
+					    XFS_AGF_REFCOUNT_ROOT |
+					    XFS_AGF_REFCOUNT_LEVEL);
+	xfs_btree_commit_afakeroot(cur, agbp, &xfs_refcountbt_ops);
+}
+
 /*
  * Calculate the number of records in a refcount btree block.
  */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index ba416f71c824..978b714be9f4 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -13,6 +13,7 @@
 struct xfs_buf;
 struct xfs_btree_cur;
 struct xfs_mount;
+struct xbtree_afakeroot;
 
 /*
  * Btree block header size
@@ -46,6 +47,9 @@ struct xfs_mount;
 extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
 		struct xfs_trans *tp, struct xfs_buf *agbp,
 		xfs_agnumber_t agno);
+struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xbtree_afakeroot *afake,
+		xfs_agnumber_t agno);
 extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
 extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
 
@@ -58,4 +62,7 @@ extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
 		struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask,
 		xfs_extlen_t *used);
 
+void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
+		struct xfs_buf *agbp);
+
 #endif	/* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index fc78efa52c94..062aeaaa7a8c 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -448,17 +448,12 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = {
 	.recs_inorder		= xfs_rmapbt_recs_inorder,
 };
 
-/*
- * Allocate a new allocation btree cursor.
- */
-struct xfs_btree_cur *
-xfs_rmapbt_init_cursor(
+static struct xfs_btree_cur *
+xfs_rmapbt_init_common(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
-	struct xfs_buf		*agbp,
 	xfs_agnumber_t		agno)
 {
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
 	struct xfs_btree_cur	*cur;
 
 	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
@@ -468,16 +463,67 @@ xfs_rmapbt_init_cursor(
 	cur->bc_btnum = XFS_BTNUM_RMAP;
 	cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
 	cur->bc_blocklog = mp->m_sb.sb_blocklog;
-	cur->bc_ops = &xfs_rmapbt_ops;
-	cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
 	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
+	cur->bc_private.a.agno = agno;
+	cur->bc_ops = &xfs_rmapbt_ops;
 
+	return cur;
+}
+
+/* Create a new reverse mapping btree cursor. */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_agnumber_t		agno)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_rmapbt_init_common(mp, tp, agno);
+	cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
 	cur->bc_private.a.agbp = agbp;
-	cur->bc_private.a.agno = agno;
+	return cur;
+}
+
+/* Create a new reverse mapping btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_rmapbt_stage_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xbtree_afakeroot	*afake,
+	xfs_agnumber_t		agno)
+{
+	struct xfs_btree_cur	*cur;
 
+	cur = xfs_rmapbt_init_common(mp, tp, agno);
+	xfs_btree_stage_afakeroot(cur, afake, NULL);
 	return cur;
 }
 
+/*
+ * Install a new reverse mapping btree root.  Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rmapbt_commit_staged_btree(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*agbp)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+	agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+	agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
+	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
+					    XFS_AGF_RMAP_BLOCKS);
+	xfs_btree_commit_afakeroot(cur, agbp, &xfs_rmapbt_ops);
+}
+
 /*
  * Calculate number of records in an rmap btree block.
  */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 820d668b063d..c6785c7851a8 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -9,6 +9,7 @@
 struct xfs_buf;
 struct xfs_btree_cur;
 struct xfs_mount;
+struct xbtree_afakeroot;
 
 /* rmaps only exist on crc enabled filesystems */
 #define XFS_RMAP_BLOCK_LEN	XFS_BTREE_SBLOCK_CRC_LEN
@@ -43,6 +44,11 @@ struct xfs_mount;
 struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
 				struct xfs_trans *tp, struct xfs_buf *bp,
 				xfs_agnumber_t agno);
+struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xbtree_afakeroot *afake,
+		xfs_agnumber_t agno);
+void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
+		struct xfs_buf *agbp);
 int xfs_rmapbt_maxrecs(int blocklen, int leaf);
 extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
 


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2020-03-05 14:30       ` Brian Foster
@ 2020-03-05 17:39         ` Darrick J. Wong
  0 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2020-03-05 17:39 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Thu, Mar 05, 2020 at 09:30:00AM -0500, Brian Foster wrote:
> On Wed, Mar 04, 2020 at 03:34:59PM -0800, Darrick J. Wong wrote:
> > On Wed, Mar 04, 2020 at 01:21:03PM -0500, Brian Foster wrote:
> > > On Tue, Mar 03, 2020 at 07:28:28PM -0800, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Create an in-core fake root for AG-rooted btree types so that callers
> > > > can generate a whole new btree using the upcoming btree bulk load
> > > > function without making the new tree accessible from the rest of the
> > > > filesystem.  It is up to the individual btree type to provide a function
> > > > to create a staged cursor (presumably with the appropriate callouts to
> > > > update the fakeroot) and then commit the staged root back into the
> > > > filesystem.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > 
> > > The code all seems reasonable, mostly infrastructure. Just a few high
> > > level comments..
> > > 
> > > It would be helpful if the commit log (or code comments) explained more
> > > about the callouts that are replaced for a staging tree (and why).
> > 
> > Ok.  I have two block comments to add.
> > 
> > > >  fs/xfs/libxfs/xfs_btree.c |  117 +++++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/libxfs/xfs_btree.h |   42 ++++++++++++++--
> > > >  fs/xfs/xfs_trace.h        |   28 +++++++++++
> > > >  3 files changed, 182 insertions(+), 5 deletions(-)
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > > > index e6f898bf3174..9a7c1a4d0423 100644
> > > > --- a/fs/xfs/libxfs/xfs_btree.c
> > > > +++ b/fs/xfs/libxfs/xfs_btree.c
> > > > @@ -382,6 +382,8 @@ xfs_btree_del_cursor(
> > > >  	/*
> > > >  	 * Free the cursor.
> > > >  	 */
> > > > +	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
> > > > +		kmem_free((void *)cur->bc_ops);
> > > >  	kmem_cache_free(xfs_btree_cur_zone, cur);
> > > >  }
> > > >  
> > > > @@ -4908,3 +4910,118 @@ xfs_btree_has_more_records(
> > > >  	else
> > > >  		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
> > > >  }
> > 
> > Add here a new comment:
> > 
> > /*
> >  * Staging Cursors and Fake Roots for Btrees
> >  * =========================================
> >  *
> >  * A staging btree cursor is a special type of btree cursor that callers
> >  * must use to construct a new btree index using the btree bulk loader
> >  * code.  The bulk loading code uses the staging btree cursor to
> >  * abstract the details of initializing new btree blocks and filling
> >  * them with records or key/ptr pairs.  Regular btree operations (e.g.
> >  * queries and modifications) are not supported with staging cursors,
> >  * and callers must not invoke them.
> >  *
> >  * Fake root structures contain all the information about a btree that
> >  * is under construction by the bulk loading code.  Staging btree
> >  * cursors point to fake root structures instead of the usual AG header
> >  * or inode structure.
> >  *
> >  * Callers are expected to initialize a fake root structure and pass it
> >  * into the _stage_cursor function for a specific btree type.  When bulk
> >  * loading is complete, callers should call the _commit_staged_btree
> >  * function for that specific btree type to commit the new btree into
> >  * the filesystem.
> >  */
> > 
> 
> Looks good.
> 
> > 
> > > > +
> > > > +/* We don't allow staging cursors to be duplicated. */
> > 
> > /*
> >  * Don't allow staging cursors to be duplicated because they're supposed
> >  * to be kept private to a single thread.
> >  */
> > 
> > 
> > > > +STATIC struct xfs_btree_cur *
> > > > +xfs_btree_fakeroot_dup_cursor(
> > > > +	struct xfs_btree_cur	*cur)
> > > > +{
> > > > +	ASSERT(0);
> > > > +	return NULL;
> > > > +}
> > > > +
> > > > +/* Refuse to allow regular block allocation for a staging cursor. */
> > 
> > /*
> >  * Don't allow block allocation for a staging cursor.  Bulk loading
> >  * requires all the blocks to be allocated ahead of time to prevent
> >  * ENOSPC failures.
> >  */
> > 
> > > > +STATIC int
> > > > +xfs_btree_fakeroot_alloc_block(
> > > > +	struct xfs_btree_cur	*cur,
> > > > +	union xfs_btree_ptr	*start_bno,
> > > > +	union xfs_btree_ptr	*new_bno,
> > > > +	int			*stat)
> > > > +{
> > > > +	ASSERT(0);
> > > > +	return -EFSCORRUPTED;
> > > 
> > > Calling these is a runtime bug as opposed to corruption, right?
> > 
> > Correct.  These functions should never be called, because doing so
> > implies either a bug in the btree code or a caller is misusing a staging
> > cursor.
> > 
> > I'm not sure what's a good error code for this.  I hope that "Structure
> > needs cleaning" will cause admins to run xfs_repair like they would for
> > any other "structure needs cleaning" error, though that's not so helpful
> > if it's xfs_repair itself doing that.
> > 
> > I also thought about "ENOSR" (as in, "No, sir!") but whinging about
> > streams resources is likely to cause more confusion than it clears up.
> > 
> 
> Ok. I was expecting -EINVAL or something more generic like that, but
> it's not that important.
> 
> > > > +}
> > > > +
> > > > +/* Refuse to allow block freeing for a staging cursor. */
> > 
> > /*
> >  * Don't allow block freeing for a staging cursor, because staging
> >  * cursors do not support regular btree modifications.
> >  */
> > 
> > > > +STATIC int
> > > > +xfs_btree_fakeroot_free_block(
> > > > +	struct xfs_btree_cur	*cur,
> > > > +	struct xfs_buf		*bp)
> > > > +{
> > > > +	ASSERT(0);
> > > > +	return -EFSCORRUPTED;
> > > > +}
> > > > +
> > > 
> > > For example, why do we not allow alloc/frees of blocks into a staging
> > > tree? Is this something related to how staging trees will be constructed
> > > vs. normal trees, or is this just stubbed in and to be implemented
> > > later?
> > 
> > The only user of staging cursors is the bulk loading code, and the bulk
> > loader requires the caller to allocate all the blocks they'll need ahead
> > of time.  We don't allow any of the regular btree functions on a staging
> > cursor, and in fact we're really only using it to abstract the details
> > of writing records, keys, pointers, and btree block headers.
> > 
> 
> Sure, but what's not clear at this point in the series is how those
> blocks are fed into the bulk loader. Presumably we need some mechanism
> to do that, and that appears in the later patches via a separate
> ->alloc_block() hook in the struct xfs_btree_bload. IOW, I'd find it
> more clear if one the comments above was a bit more explicit and said
> something like: "Disable block allocation because bulk loading uses a
> separate callback ..."

Ok, I'll state that explicitly.  I'm ... somewhat hesitant to add that
here because staging btree cursors and bulk loading are separate
concepts, but OTOH the one user of staging cursors is bulk loading, and
bulk loading must use staging cursors, so until we discover another use
of staging cursors, maybe this slight conflation is not so bad...

--D

> Brian
> 
> > > > +/* Initialize a pointer to the root block from the fakeroot. */
> > > > +STATIC void
> > > > +xfs_btree_fakeroot_init_ptr_from_cur(
> > > > +	struct xfs_btree_cur	*cur,
> > > > +	union xfs_btree_ptr	*ptr)
> > > > +{
> > > > +	struct xbtree_afakeroot	*afake;
> > > > +
> > > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > > +
> > > > +	afake = cur->bc_private.a.afake;
> > > > +	ptr->s = cpu_to_be32(afake->af_root);
> > > > +}
> > 
> > Add here another block comment:
> > 
> > /*
> >  * Bulk Loading for AG Btrees
> >  * ==========================
> >  *
> >  * For a btree rooted in an AG header, pass a xbtree_afakeroot structure
> >  * to the staging cursor.  Callers should initialize this to zero.
> >  *
> >  * The _stage_cursor() function for a specific btree type should call
> >  * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
> >  * cursor.  The corresponding _commit_staged_btree() function should log
> >  * the new root and call xfs_btree_commit_afakeroot() to transform the
> >  * staging cursor into a regular btree cursor.
> >  */
> > 
> > > > +/* Set the root block when our tree has a fakeroot. */
> > > > +STATIC void
> > > > +xfs_btree_afakeroot_set_root(
> > > > +	struct xfs_btree_cur	*cur,
> > > > +	union xfs_btree_ptr	*ptr,
> > > > +	int			inc)
> > > > +{
> > > > +	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
> > > > +
> > > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > > +	afake->af_root = be32_to_cpu(ptr->s);
> > > > +	afake->af_levels += inc;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Initialize a AG-rooted btree cursor with the given AG btree fake root.  The
> > > > + * btree cursor's @bc_ops will be overridden as needed to make the staging
> > > > + * functionality work.  If @new_ops is not NULL, these new ops will be passed
> > > > + * out to the caller for further overriding.
> > > > + */
> > > > +void
> > > > +xfs_btree_stage_afakeroot(
> > > > +	struct xfs_btree_cur		*cur,
> > > > +	struct xbtree_afakeroot		*afake,
> > > > +	struct xfs_btree_ops		**new_ops)
> > > > +{
> > > > +	struct xfs_btree_ops		*nops;
> > > > +
> > > > +	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
> > > > +	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
> > > > +
> > > > +	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
> > > > +	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
> > > > +	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
> > > > +	nops->free_block = xfs_btree_fakeroot_free_block;
> > > > +	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
> > > > +	nops->set_root = xfs_btree_afakeroot_set_root;
> > > > +	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
> > > > +
> > > > +	cur->bc_private.a.afake = afake;
> > > > +	cur->bc_nlevels = afake->af_levels;
> > > > +	cur->bc_ops = nops;
> > > > +	cur->bc_flags |= XFS_BTREE_STAGING;
> > > > +
> > > > +	if (new_ops)
> > > > +		*new_ops = nops;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Transform an AG-rooted staging btree cursor back into a regular cursor by
> > > > + * substituting a real btree root for the fake one and restoring normal btree
> > > > + * cursor ops.  The caller must log the btree root change prior to calling
> > > > + * this.
> > > > + */
> > > > +void
> > > > +xfs_btree_commit_afakeroot(
> > > > +	struct xfs_btree_cur		*cur,
> > > > +	struct xfs_buf			*agbp,
> > > > +	const struct xfs_btree_ops	*ops)
> > > > +{
> > > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > > +
> > > > +	trace_xfs_btree_commit_afakeroot(cur);
> > > > +
> > > > +	kmem_free((void *)cur->bc_ops);
> > > > +	cur->bc_private.a.agbp = agbp;
> > > > +	cur->bc_ops = ops;
> > > > +	cur->bc_flags &= ~XFS_BTREE_STAGING;
> > > > +}
> > > 
> > > Any reason this new code isn't off in a new xfs_staging_btree.c or some
> > > such instead of xfs_btree.c?
> > 
> > <shrug> It could be.  I tried it and it looks like that would only
> > require exporting six more symbols:
> > 
> > xfs_btree_set_ptr_null
> > xfs_btree_get_buf_block
> > xfs_btree_init_block_cur
> > xfs_btree_set_sibling
> > xfs_btree_copy_ptrs
> > xfs_btree_copy_keys
> > 
> > > > diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
> > > > index 3eff7c321d43..3ada085609a8 100644
> > > > --- a/fs/xfs/libxfs/xfs_btree.h
> > > > +++ b/fs/xfs/libxfs/xfs_btree.h
> > > > @@ -188,6 +188,16 @@ union xfs_btree_cur_private {
> > > >  	} abt;
> > > >  };
> > > >  
> > > > +/* Private information for a AG-rooted btree. */
> > > > +struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
> > > > +	union {
> > > > +		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
> > > > +		struct xbtree_afakeroot	*afake;	/* fake ag header root */
> > > > +	};
> > > > +	xfs_agnumber_t			agno;	/* ag number */
> > > > +	union xfs_btree_cur_private	priv;
> > > > +};
> > > > +
> > > 
> > > Ideally refactoring this would be a separate patch from adding a new
> > > field.
> > 
> > Ok, I'll break that out.
> > 
> > --D
> > 
> > > Brian
> > > 
> > > >  /*
> > > >   * Btree cursor structure.
> > > >   * This collects all information needed by the btree code in one place.
> > > > @@ -209,11 +219,7 @@ typedef struct xfs_btree_cur
> > > >  	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
> > > >  	int		bc_statoff;	/* offset of btre stats array */
> > > >  	union {
> > > > -		struct {			/* needed for BNO, CNT, INO */
> > > > -			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
> > > > -			xfs_agnumber_t	agno;	/* ag number */
> > > > -			union xfs_btree_cur_private	priv;
> > > > -		} a;
> > > > +		struct xfs_btree_priv_ag a;
> > > >  		struct {			/* needed for BMAP */
> > > >  			struct xfs_inode *ip;	/* pointer to our inode */
> > > >  			int		allocated;	/* count of alloced */
> > > > @@ -232,6 +238,12 @@ typedef struct xfs_btree_cur
> > > >  #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
> > > >  #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
> > > >  #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
> > > > +/*
> > > > + * The root of this btree is a fakeroot structure so that we can stage a btree
> > > > + * rebuild without leaving it accessible via primary metadata.  The ops struct
> > > > + * is dynamically allocated and must be freed when the cursor is deleted.
> > > > + */
> > > > +#define XFS_BTREE_STAGING		(1<<5)
> > > >  
> > > >  
> > > >  #define	XFS_BTREE_NOERROR	0
> > > > @@ -512,4 +524,24 @@ xfs_btree_islastblock(
> > > >  	return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
> > > >  }
> > > >  
> > > > +/* Fake root for an AG-rooted btree. */
> > > > +struct xbtree_afakeroot {
> > > > +	/* AG block number of the new btree root. */
> > > > +	xfs_agblock_t		af_root;
> > > > +
> > > > +	/* Height of the new btree. */
> > > > +	unsigned int		af_levels;
> > > > +
> > > > +	/* Number of blocks used by the btree. */
> > > > +	unsigned int		af_blocks;
> > > > +};
> > > > +
> > > > +/* Cursor interactions with with fake roots for AG-rooted btrees. */
> > > > +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
> > > > +		struct xbtree_afakeroot *afake,
> > > > +		struct xfs_btree_ops **new_ops);
> > > > +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
> > > > +		struct xfs_buf *agbp,
> > > > +		const struct xfs_btree_ops *ops);
> > > > +
> > > >  #endif	/* __XFS_BTREE_H__ */
> > > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > > index e242988f57fb..57ff9f583b5f 100644
> > > > --- a/fs/xfs/xfs_trace.h
> > > > +++ b/fs/xfs/xfs_trace.h
> > > > @@ -3594,6 +3594,34 @@ TRACE_EVENT(xfs_check_new_dalign,
> > > >  		  __entry->calc_rootino)
> > > >  )
> > > >  
> > > > +TRACE_EVENT(xfs_btree_commit_afakeroot,
> > > > +	TP_PROTO(struct xfs_btree_cur *cur),
> > > > +	TP_ARGS(cur),
> > > > +	TP_STRUCT__entry(
> > > > +		__field(dev_t, dev)
> > > > +		__field(xfs_btnum_t, btnum)
> > > > +		__field(xfs_agnumber_t, agno)
> > > > +		__field(xfs_agblock_t, agbno)
> > > > +		__field(unsigned int, levels)
> > > > +		__field(unsigned int, blocks)
> > > > +	),
> > > > +	TP_fast_assign(
> > > > +		__entry->dev = cur->bc_mp->m_super->s_dev;
> > > > +		__entry->btnum = cur->bc_btnum;
> > > > +		__entry->agno = cur->bc_private.a.agno;
> > > > +		__entry->agbno = cur->bc_private.a.afake->af_root;
> > > > +		__entry->levels = cur->bc_private.a.afake->af_levels;
> > > > +		__entry->blocks = cur->bc_private.a.afake->af_blocks;
> > > > +	),
> > > > +	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
> > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > > +		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
> > > > +		  __entry->agno,
> > > > +		  __entry->levels,
> > > > +		  __entry->blocks,
> > > > +		  __entry->agbno)
> > > > +)
> > > > +
> > > >  #endif /* _TRACE_XFS_H */
> > > >  
> > > >  #undef TRACE_INCLUDE_PATH
> > > > 
> > > 
> > 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2020-03-04 23:34     ` Darrick J. Wong
  2020-03-04 23:53       ` Dave Chinner
@ 2020-03-05 14:30       ` Brian Foster
  2020-03-05 17:39         ` Darrick J. Wong
  1 sibling, 1 reply; 20+ messages in thread
From: Brian Foster @ 2020-03-05 14:30 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Wed, Mar 04, 2020 at 03:34:59PM -0800, Darrick J. Wong wrote:
> On Wed, Mar 04, 2020 at 01:21:03PM -0500, Brian Foster wrote:
> > On Tue, Mar 03, 2020 at 07:28:28PM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Create an in-core fake root for AG-rooted btree types so that callers
> > > can generate a whole new btree using the upcoming btree bulk load
> > > function without making the new tree accessible from the rest of the
> > > filesystem.  It is up to the individual btree type to provide a function
> > > to create a staged cursor (presumably with the appropriate callouts to
> > > update the fakeroot) and then commit the staged root back into the
> > > filesystem.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > 
> > The code all seems reasonable, mostly infrastructure. Just a few high
> > level comments..
> > 
> > It would be helpful if the commit log (or code comments) explained more
> > about the callouts that are replaced for a staging tree (and why).
> 
> Ok.  I have two block comments to add.
> 
> > >  fs/xfs/libxfs/xfs_btree.c |  117 +++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/libxfs/xfs_btree.h |   42 ++++++++++++++--
> > >  fs/xfs/xfs_trace.h        |   28 +++++++++++
> > >  3 files changed, 182 insertions(+), 5 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > > index e6f898bf3174..9a7c1a4d0423 100644
> > > --- a/fs/xfs/libxfs/xfs_btree.c
> > > +++ b/fs/xfs/libxfs/xfs_btree.c
> > > @@ -382,6 +382,8 @@ xfs_btree_del_cursor(
> > >  	/*
> > >  	 * Free the cursor.
> > >  	 */
> > > +	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
> > > +		kmem_free((void *)cur->bc_ops);
> > >  	kmem_cache_free(xfs_btree_cur_zone, cur);
> > >  }
> > >  
> > > @@ -4908,3 +4910,118 @@ xfs_btree_has_more_records(
> > >  	else
> > >  		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
> > >  }
> 
> Add here a new comment:
> 
> /*
>  * Staging Cursors and Fake Roots for Btrees
>  * =========================================
>  *
>  * A staging btree cursor is a special type of btree cursor that callers
>  * must use to construct a new btree index using the btree bulk loader
>  * code.  The bulk loading code uses the staging btree cursor to
>  * abstract the details of initializing new btree blocks and filling
>  * them with records or key/ptr pairs.  Regular btree operations (e.g.
>  * queries and modifications) are not supported with staging cursors,
>  * and callers must not invoke them.
>  *
>  * Fake root structures contain all the information about a btree that
>  * is under construction by the bulk loading code.  Staging btree
>  * cursors point to fake root structures instead of the usual AG header
>  * or inode structure.
>  *
>  * Callers are expected to initialize a fake root structure and pass it
>  * into the _stage_cursor function for a specific btree type.  When bulk
>  * loading is complete, callers should call the _commit_staged_btree
>  * function for that specific btree type to commit the new btree into
>  * the filesystem.
>  */
> 

Looks good.

> 
> > > +
> > > +/* We don't allow staging cursors to be duplicated. */
> 
> /*
>  * Don't allow staging cursors to be duplicated because they're supposed
>  * to be kept private to a single thread.
>  */
> 
> 
> > > +STATIC struct xfs_btree_cur *
> > > +xfs_btree_fakeroot_dup_cursor(
> > > +	struct xfs_btree_cur	*cur)
> > > +{
> > > +	ASSERT(0);
> > > +	return NULL;
> > > +}
> > > +
> > > +/* Refuse to allow regular block allocation for a staging cursor. */
> 
> /*
>  * Don't allow block allocation for a staging cursor.  Bulk loading
>  * requires all the blocks to be allocated ahead of time to prevent
>  * ENOSPC failures.
>  */
> 
> > > +STATIC int
> > > +xfs_btree_fakeroot_alloc_block(
> > > +	struct xfs_btree_cur	*cur,
> > > +	union xfs_btree_ptr	*start_bno,
> > > +	union xfs_btree_ptr	*new_bno,
> > > +	int			*stat)
> > > +{
> > > +	ASSERT(0);
> > > +	return -EFSCORRUPTED;
> > 
> > Calling these is a runtime bug as opposed to corruption, right?
> 
> Correct.  These functions should never be called, because doing so
> implies either a bug in the btree code or a caller is misusing a staging
> cursor.
> 
> I'm not sure what's a good error code for this.  I hope that "Structure
> needs cleaning" will cause admins to run xfs_repair like they would for
> any other "structure needs cleaning" error, though that's not so helpful
> if it's xfs_repair itself doing that.
> 
> I also thought about "ENOSR" (as in, "No, sir!") but whinging about
> streams resources is likely to cause more confusion than it clears up.
> 

Ok. I was expecting -EINVAL or something more generic like that, but
it's not that important.

> > > +}
> > > +
> > > +/* Refuse to allow block freeing for a staging cursor. */
> 
> /*
>  * Don't allow block freeing for a staging cursor, because staging
>  * cursors do not support regular btree modifications.
>  */
> 
> > > +STATIC int
> > > +xfs_btree_fakeroot_free_block(
> > > +	struct xfs_btree_cur	*cur,
> > > +	struct xfs_buf		*bp)
> > > +{
> > > +	ASSERT(0);
> > > +	return -EFSCORRUPTED;
> > > +}
> > > +
> > 
> > For example, why do we not allow alloc/frees of blocks into a staging
> > tree? Is this something related to how staging trees will be constructed
> > vs. normal trees, or is this just stubbed in and to be implemented
> > later?
> 
> The only user of staging cursors is the bulk loading code, and the bulk
> loader requires the caller to allocate all the blocks they'll need ahead
> of time.  We don't allow any of the regular btree functions on a staging
> cursor, and in fact we're really only using it to abstract the details
> of writing records, keys, pointers, and btree block headers.
> 

Sure, but what's not clear at this point in the series is how those
blocks are fed into the bulk loader. Presumably we need some mechanism
to do that, and that appears in the later patches via a separate
->alloc_block() hook in the struct xfs_btree_bload. IOW, I'd find it
more clear if one the comments above was a bit more explicit and said
something like: "Disable block allocation because bulk loading uses a
separate callback ..."

Brian

> > > +/* Initialize a pointer to the root block from the fakeroot. */
> > > +STATIC void
> > > +xfs_btree_fakeroot_init_ptr_from_cur(
> > > +	struct xfs_btree_cur	*cur,
> > > +	union xfs_btree_ptr	*ptr)
> > > +{
> > > +	struct xbtree_afakeroot	*afake;
> > > +
> > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > +
> > > +	afake = cur->bc_private.a.afake;
> > > +	ptr->s = cpu_to_be32(afake->af_root);
> > > +}
> 
> Add here another block comment:
> 
> /*
>  * Bulk Loading for AG Btrees
>  * ==========================
>  *
>  * For a btree rooted in an AG header, pass a xbtree_afakeroot structure
>  * to the staging cursor.  Callers should initialize this to zero.
>  *
>  * The _stage_cursor() function for a specific btree type should call
>  * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
>  * cursor.  The corresponding _commit_staged_btree() function should log
>  * the new root and call xfs_btree_commit_afakeroot() to transform the
>  * staging cursor into a regular btree cursor.
>  */
> 
> > > +/* Set the root block when our tree has a fakeroot. */
> > > +STATIC void
> > > +xfs_btree_afakeroot_set_root(
> > > +	struct xfs_btree_cur	*cur,
> > > +	union xfs_btree_ptr	*ptr,
> > > +	int			inc)
> > > +{
> > > +	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
> > > +
> > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > +	afake->af_root = be32_to_cpu(ptr->s);
> > > +	afake->af_levels += inc;
> > > +}
> > > +
> > > +/*
> > > + * Initialize a AG-rooted btree cursor with the given AG btree fake root.  The
> > > + * btree cursor's @bc_ops will be overridden as needed to make the staging
> > > + * functionality work.  If @new_ops is not NULL, these new ops will be passed
> > > + * out to the caller for further overriding.
> > > + */
> > > +void
> > > +xfs_btree_stage_afakeroot(
> > > +	struct xfs_btree_cur		*cur,
> > > +	struct xbtree_afakeroot		*afake,
> > > +	struct xfs_btree_ops		**new_ops)
> > > +{
> > > +	struct xfs_btree_ops		*nops;
> > > +
> > > +	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
> > > +	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
> > > +
> > > +	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
> > > +	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
> > > +	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
> > > +	nops->free_block = xfs_btree_fakeroot_free_block;
> > > +	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
> > > +	nops->set_root = xfs_btree_afakeroot_set_root;
> > > +	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
> > > +
> > > +	cur->bc_private.a.afake = afake;
> > > +	cur->bc_nlevels = afake->af_levels;
> > > +	cur->bc_ops = nops;
> > > +	cur->bc_flags |= XFS_BTREE_STAGING;
> > > +
> > > +	if (new_ops)
> > > +		*new_ops = nops;
> > > +}
> > > +
> > > +/*
> > > + * Transform an AG-rooted staging btree cursor back into a regular cursor by
> > > + * substituting a real btree root for the fake one and restoring normal btree
> > > + * cursor ops.  The caller must log the btree root change prior to calling
> > > + * this.
> > > + */
> > > +void
> > > +xfs_btree_commit_afakeroot(
> > > +	struct xfs_btree_cur		*cur,
> > > +	struct xfs_buf			*agbp,
> > > +	const struct xfs_btree_ops	*ops)
> > > +{
> > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > +
> > > +	trace_xfs_btree_commit_afakeroot(cur);
> > > +
> > > +	kmem_free((void *)cur->bc_ops);
> > > +	cur->bc_private.a.agbp = agbp;
> > > +	cur->bc_ops = ops;
> > > +	cur->bc_flags &= ~XFS_BTREE_STAGING;
> > > +}
> > 
> > Any reason this new code isn't off in a new xfs_staging_btree.c or some
> > such instead of xfs_btree.c?
> 
> <shrug> It could be.  I tried it and it looks like that would only
> require exporting six more symbols:
> 
> xfs_btree_set_ptr_null
> xfs_btree_get_buf_block
> xfs_btree_init_block_cur
> xfs_btree_set_sibling
> xfs_btree_copy_ptrs
> xfs_btree_copy_keys
> 
> > > diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
> > > index 3eff7c321d43..3ada085609a8 100644
> > > --- a/fs/xfs/libxfs/xfs_btree.h
> > > +++ b/fs/xfs/libxfs/xfs_btree.h
> > > @@ -188,6 +188,16 @@ union xfs_btree_cur_private {
> > >  	} abt;
> > >  };
> > >  
> > > +/* Private information for a AG-rooted btree. */
> > > +struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
> > > +	union {
> > > +		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
> > > +		struct xbtree_afakeroot	*afake;	/* fake ag header root */
> > > +	};
> > > +	xfs_agnumber_t			agno;	/* ag number */
> > > +	union xfs_btree_cur_private	priv;
> > > +};
> > > +
> > 
> > Ideally refactoring this would be a separate patch from adding a new
> > field.
> 
> Ok, I'll break that out.
> 
> --D
> 
> > Brian
> > 
> > >  /*
> > >   * Btree cursor structure.
> > >   * This collects all information needed by the btree code in one place.
> > > @@ -209,11 +219,7 @@ typedef struct xfs_btree_cur
> > >  	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
> > >  	int		bc_statoff;	/* offset of btre stats array */
> > >  	union {
> > > -		struct {			/* needed for BNO, CNT, INO */
> > > -			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
> > > -			xfs_agnumber_t	agno;	/* ag number */
> > > -			union xfs_btree_cur_private	priv;
> > > -		} a;
> > > +		struct xfs_btree_priv_ag a;
> > >  		struct {			/* needed for BMAP */
> > >  			struct xfs_inode *ip;	/* pointer to our inode */
> > >  			int		allocated;	/* count of alloced */
> > > @@ -232,6 +238,12 @@ typedef struct xfs_btree_cur
> > >  #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
> > >  #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
> > >  #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
> > > +/*
> > > + * The root of this btree is a fakeroot structure so that we can stage a btree
> > > + * rebuild without leaving it accessible via primary metadata.  The ops struct
> > > + * is dynamically allocated and must be freed when the cursor is deleted.
> > > + */
> > > +#define XFS_BTREE_STAGING		(1<<5)
> > >  
> > >  
> > >  #define	XFS_BTREE_NOERROR	0
> > > @@ -512,4 +524,24 @@ xfs_btree_islastblock(
> > >  	return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
> > >  }
> > >  
> > > +/* Fake root for an AG-rooted btree. */
> > > +struct xbtree_afakeroot {
> > > +	/* AG block number of the new btree root. */
> > > +	xfs_agblock_t		af_root;
> > > +
> > > +	/* Height of the new btree. */
> > > +	unsigned int		af_levels;
> > > +
> > > +	/* Number of blocks used by the btree. */
> > > +	unsigned int		af_blocks;
> > > +};
> > > +
> > > +/* Cursor interactions with with fake roots for AG-rooted btrees. */
> > > +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
> > > +		struct xbtree_afakeroot *afake,
> > > +		struct xfs_btree_ops **new_ops);
> > > +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
> > > +		struct xfs_buf *agbp,
> > > +		const struct xfs_btree_ops *ops);
> > > +
> > >  #endif	/* __XFS_BTREE_H__ */
> > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > index e242988f57fb..57ff9f583b5f 100644
> > > --- a/fs/xfs/xfs_trace.h
> > > +++ b/fs/xfs/xfs_trace.h
> > > @@ -3594,6 +3594,34 @@ TRACE_EVENT(xfs_check_new_dalign,
> > >  		  __entry->calc_rootino)
> > >  )
> > >  
> > > +TRACE_EVENT(xfs_btree_commit_afakeroot,
> > > +	TP_PROTO(struct xfs_btree_cur *cur),
> > > +	TP_ARGS(cur),
> > > +	TP_STRUCT__entry(
> > > +		__field(dev_t, dev)
> > > +		__field(xfs_btnum_t, btnum)
> > > +		__field(xfs_agnumber_t, agno)
> > > +		__field(xfs_agblock_t, agbno)
> > > +		__field(unsigned int, levels)
> > > +		__field(unsigned int, blocks)
> > > +	),
> > > +	TP_fast_assign(
> > > +		__entry->dev = cur->bc_mp->m_super->s_dev;
> > > +		__entry->btnum = cur->bc_btnum;
> > > +		__entry->agno = cur->bc_private.a.agno;
> > > +		__entry->agbno = cur->bc_private.a.afake->af_root;
> > > +		__entry->levels = cur->bc_private.a.afake->af_levels;
> > > +		__entry->blocks = cur->bc_private.a.afake->af_blocks;
> > > +	),
> > > +	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
> > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > +		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
> > > +		  __entry->agno,
> > > +		  __entry->levels,
> > > +		  __entry->blocks,
> > > +		  __entry->agbno)
> > > +)
> > > +
> > >  #endif /* _TRACE_XFS_H */
> > >  
> > >  #undef TRACE_INCLUDE_PATH
> > > 
> > 
> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2020-03-05  1:20   ` Dave Chinner
@ 2020-03-05  1:23     ` Darrick J. Wong
  0 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2020-03-05  1:23 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, bfoster

On Thu, Mar 05, 2020 at 12:20:54PM +1100, Dave Chinner wrote:
> On Tue, Mar 03, 2020 at 07:28:28PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Create an in-core fake root for AG-rooted btree types so that callers
> > can generate a whole new btree using the upcoming btree bulk load
> > function without making the new tree accessible from the rest of the
> > filesystem.  It is up to the individual btree type to provide a function
> > to create a staged cursor (presumably with the appropriate callouts to
> > update the fakeroot) and then commit the staged root back into the
> > filesystem.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> .....
> > @@ -188,6 +188,16 @@ union xfs_btree_cur_private {
> >  	} abt;
> >  };
> >  
> > +/* Private information for a AG-rooted btree. */
> > +struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
> > +	union {
> > +		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
> > +		struct xbtree_afakeroot	*afake;	/* fake ag header root */
> > +	};
> > +	xfs_agnumber_t			agno;	/* ag number */
> > +	union xfs_btree_cur_private	priv;
> > +};
> > +
> >  /*
> >   * Btree cursor structure.
> >   * This collects all information needed by the btree code in one place.
> > @@ -209,11 +219,7 @@ typedef struct xfs_btree_cur
> >  	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
> >  	int		bc_statoff;	/* offset of btre stats array */
> >  	union {
> > -		struct {			/* needed for BNO, CNT, INO */
> > -			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
> > -			xfs_agnumber_t	agno;	/* ag number */
> > -			union xfs_btree_cur_private	priv;
> > -		} a;
> > +		struct xfs_btree_priv_ag a;
> >  		struct {			/* needed for BMAP */
> >  			struct xfs_inode *ip;	/* pointer to our inode */
> >  			int		allocated;	/* count of alloced */
> 
> I don't really like the mess this is turning into. I'll write a
> quick cleanup patch set for this union to make it much neater and
> the code much less verbose before we make the code even more
> unreadable. :/

Ok, thank you!

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2020-03-04 23:53       ` Dave Chinner
@ 2020-03-05  1:23         ` Darrick J. Wong
  0 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2020-03-05  1:23 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, linux-xfs

On Thu, Mar 05, 2020 at 10:53:35AM +1100, Dave Chinner wrote:
> On Wed, Mar 04, 2020 at 03:34:59PM -0800, Darrick J. Wong wrote:
> > On Wed, Mar 04, 2020 at 01:21:03PM -0500, Brian Foster wrote:
> > > On Tue, Mar 03, 2020 at 07:28:28PM -0800, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Create an in-core fake root for AG-rooted btree types so that callers
> > > > can generate a whole new btree using the upcoming btree bulk load
> > > > function without making the new tree accessible from the rest of the
> > > > filesystem.  It is up to the individual btree type to provide a function
> > > > to create a staged cursor (presumably with the appropriate callouts to
> > > > update the fakeroot) and then commit the staged root back into the
> > > > filesystem.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > 
> > > The code all seems reasonable, mostly infrastructure. Just a few high
> > > level comments..
> > > 
> > > It would be helpful if the commit log (or code comments) explained more
> > > about the callouts that are replaced for a staging tree (and why).
> > 
> > Ok.  I have two block comments to add.
> > 
> > > >  fs/xfs/libxfs/xfs_btree.c |  117 +++++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/libxfs/xfs_btree.h |   42 ++++++++++++++--
> > > >  fs/xfs/xfs_trace.h        |   28 +++++++++++
> > > >  3 files changed, 182 insertions(+), 5 deletions(-)
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > > > index e6f898bf3174..9a7c1a4d0423 100644
> > > > --- a/fs/xfs/libxfs/xfs_btree.c
> > > > +++ b/fs/xfs/libxfs/xfs_btree.c
> > > > @@ -382,6 +382,8 @@ xfs_btree_del_cursor(
> > > >  	/*
> > > >  	 * Free the cursor.
> > > >  	 */
> > > > +	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
> > > > +		kmem_free((void *)cur->bc_ops);
> > > >  	kmem_cache_free(xfs_btree_cur_zone, cur);
> > > >  }
> > > >  
> > > > @@ -4908,3 +4910,118 @@ xfs_btree_has_more_records(
> > > >  	else
> > > >  		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
> > > >  }
> > 
> > Add here a new comment:
> > 
> > /*
> >  * Staging Cursors and Fake Roots for Btrees
> >  * =========================================
> >  *
> >  * A staging btree cursor is a special type of btree cursor that callers
> >  * must use to construct a new btree index using the btree bulk loader
> >  * code.  The bulk loading code uses the staging btree cursor to
> >  * abstract the details of initializing new btree blocks and filling
> >  * them with records or key/ptr pairs.  Regular btree operations (e.g.
> >  * queries and modifications) are not supported with staging cursors,
> >  * and callers must not invoke them.
> >  *
> >  * Fake root structures contain all the information about a btree that
> >  * is under construction by the bulk loading code.  Staging btree
> >  * cursors point to fake root structures instead of the usual AG header
> >  * or inode structure.
> >  *
> >  * Callers are expected to initialize a fake root structure and pass it
> >  * into the _stage_cursor function for a specific btree type.  When bulk
> >  * loading is complete, callers should call the _commit_staged_btree
> >  * function for that specific btree type to commit the new btree into
> >  * the filesystem.
> >  */
> > 
> > 
> > > > +
> > > > +/* We don't allow staging cursors to be duplicated. */
> > 
> > /*
> >  * Don't allow staging cursors to be duplicated because they're supposed
> >  * to be kept private to a single thread.
> >  */
> 
> private to a single -thread- or a single -btree modification
> context-?

Private to a single btree rebuilding context, really. :)

ATM btree rebuilding contexts are single-threaded.

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2020-03-04  3:28 ` [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees Darrick J. Wong
  2020-03-04 18:21   ` Brian Foster
@ 2020-03-05  1:20   ` Dave Chinner
  2020-03-05  1:23     ` Darrick J. Wong
  1 sibling, 1 reply; 20+ messages in thread
From: Dave Chinner @ 2020-03-05  1:20 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs, bfoster

On Tue, Mar 03, 2020 at 07:28:28PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Create an in-core fake root for AG-rooted btree types so that callers
> can generate a whole new btree using the upcoming btree bulk load
> function without making the new tree accessible from the rest of the
> filesystem.  It is up to the individual btree type to provide a function
> to create a staged cursor (presumably with the appropriate callouts to
> update the fakeroot) and then commit the staged root back into the
> filesystem.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
.....
> @@ -188,6 +188,16 @@ union xfs_btree_cur_private {
>  	} abt;
>  };
>  
> +/* Private information for a AG-rooted btree. */
> +struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
> +	union {
> +		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
> +		struct xbtree_afakeroot	*afake;	/* fake ag header root */
> +	};
> +	xfs_agnumber_t			agno;	/* ag number */
> +	union xfs_btree_cur_private	priv;
> +};
> +
>  /*
>   * Btree cursor structure.
>   * This collects all information needed by the btree code in one place.
> @@ -209,11 +219,7 @@ typedef struct xfs_btree_cur
>  	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
>  	int		bc_statoff;	/* offset of btre stats array */
>  	union {
> -		struct {			/* needed for BNO, CNT, INO */
> -			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
> -			xfs_agnumber_t	agno;	/* ag number */
> -			union xfs_btree_cur_private	priv;
> -		} a;
> +		struct xfs_btree_priv_ag a;
>  		struct {			/* needed for BMAP */
>  			struct xfs_inode *ip;	/* pointer to our inode */
>  			int		allocated;	/* count of alloced */

I don't really like the mess this is turning into. I'll write a
quick cleanup patch set for this union to make it much neater and
the code much less verbose before we make the code even more
unreadable. :/

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2020-03-04 23:34     ` Darrick J. Wong
@ 2020-03-04 23:53       ` Dave Chinner
  2020-03-05  1:23         ` Darrick J. Wong
  2020-03-05 14:30       ` Brian Foster
  1 sibling, 1 reply; 20+ messages in thread
From: Dave Chinner @ 2020-03-04 23:53 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Brian Foster, linux-xfs

On Wed, Mar 04, 2020 at 03:34:59PM -0800, Darrick J. Wong wrote:
> On Wed, Mar 04, 2020 at 01:21:03PM -0500, Brian Foster wrote:
> > On Tue, Mar 03, 2020 at 07:28:28PM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Create an in-core fake root for AG-rooted btree types so that callers
> > > can generate a whole new btree using the upcoming btree bulk load
> > > function without making the new tree accessible from the rest of the
> > > filesystem.  It is up to the individual btree type to provide a function
> > > to create a staged cursor (presumably with the appropriate callouts to
> > > update the fakeroot) and then commit the staged root back into the
> > > filesystem.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > 
> > The code all seems reasonable, mostly infrastructure. Just a few high
> > level comments..
> > 
> > It would be helpful if the commit log (or code comments) explained more
> > about the callouts that are replaced for a staging tree (and why).
> 
> Ok.  I have two block comments to add.
> 
> > >  fs/xfs/libxfs/xfs_btree.c |  117 +++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/libxfs/xfs_btree.h |   42 ++++++++++++++--
> > >  fs/xfs/xfs_trace.h        |   28 +++++++++++
> > >  3 files changed, 182 insertions(+), 5 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > > index e6f898bf3174..9a7c1a4d0423 100644
> > > --- a/fs/xfs/libxfs/xfs_btree.c
> > > +++ b/fs/xfs/libxfs/xfs_btree.c
> > > @@ -382,6 +382,8 @@ xfs_btree_del_cursor(
> > >  	/*
> > >  	 * Free the cursor.
> > >  	 */
> > > +	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
> > > +		kmem_free((void *)cur->bc_ops);
> > >  	kmem_cache_free(xfs_btree_cur_zone, cur);
> > >  }
> > >  
> > > @@ -4908,3 +4910,118 @@ xfs_btree_has_more_records(
> > >  	else
> > >  		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
> > >  }
> 
> Add here a new comment:
> 
> /*
>  * Staging Cursors and Fake Roots for Btrees
>  * =========================================
>  *
>  * A staging btree cursor is a special type of btree cursor that callers
>  * must use to construct a new btree index using the btree bulk loader
>  * code.  The bulk loading code uses the staging btree cursor to
>  * abstract the details of initializing new btree blocks and filling
>  * them with records or key/ptr pairs.  Regular btree operations (e.g.
>  * queries and modifications) are not supported with staging cursors,
>  * and callers must not invoke them.
>  *
>  * Fake root structures contain all the information about a btree that
>  * is under construction by the bulk loading code.  Staging btree
>  * cursors point to fake root structures instead of the usual AG header
>  * or inode structure.
>  *
>  * Callers are expected to initialize a fake root structure and pass it
>  * into the _stage_cursor function for a specific btree type.  When bulk
>  * loading is complete, callers should call the _commit_staged_btree
>  * function for that specific btree type to commit the new btree into
>  * the filesystem.
>  */
> 
> 
> > > +
> > > +/* We don't allow staging cursors to be duplicated. */
> 
> /*
>  * Don't allow staging cursors to be duplicated because they're supposed
>  * to be kept private to a single thread.
>  */

private to a single -thread- or a single -btree modification
context-?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2020-03-04 18:21   ` Brian Foster
@ 2020-03-04 23:34     ` Darrick J. Wong
  2020-03-04 23:53       ` Dave Chinner
  2020-03-05 14:30       ` Brian Foster
  0 siblings, 2 replies; 20+ messages in thread
From: Darrick J. Wong @ 2020-03-04 23:34 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Wed, Mar 04, 2020 at 01:21:03PM -0500, Brian Foster wrote:
> On Tue, Mar 03, 2020 at 07:28:28PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Create an in-core fake root for AG-rooted btree types so that callers
> > can generate a whole new btree using the upcoming btree bulk load
> > function without making the new tree accessible from the rest of the
> > filesystem.  It is up to the individual btree type to provide a function
> > to create a staged cursor (presumably with the appropriate callouts to
> > update the fakeroot) and then commit the staged root back into the
> > filesystem.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> The code all seems reasonable, mostly infrastructure. Just a few high
> level comments..
> 
> It would be helpful if the commit log (or code comments) explained more
> about the callouts that are replaced for a staging tree (and why).

Ok.  I have two block comments to add.

> >  fs/xfs/libxfs/xfs_btree.c |  117 +++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/libxfs/xfs_btree.h |   42 ++++++++++++++--
> >  fs/xfs/xfs_trace.h        |   28 +++++++++++
> >  3 files changed, 182 insertions(+), 5 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > index e6f898bf3174..9a7c1a4d0423 100644
> > --- a/fs/xfs/libxfs/xfs_btree.c
> > +++ b/fs/xfs/libxfs/xfs_btree.c
> > @@ -382,6 +382,8 @@ xfs_btree_del_cursor(
> >  	/*
> >  	 * Free the cursor.
> >  	 */
> > +	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
> > +		kmem_free((void *)cur->bc_ops);
> >  	kmem_cache_free(xfs_btree_cur_zone, cur);
> >  }
> >  
> > @@ -4908,3 +4910,118 @@ xfs_btree_has_more_records(
> >  	else
> >  		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
> >  }

Add here a new comment:

/*
 * Staging Cursors and Fake Roots for Btrees
 * =========================================
 *
 * A staging btree cursor is a special type of btree cursor that callers
 * must use to construct a new btree index using the btree bulk loader
 * code.  The bulk loading code uses the staging btree cursor to
 * abstract the details of initializing new btree blocks and filling
 * them with records or key/ptr pairs.  Regular btree operations (e.g.
 * queries and modifications) are not supported with staging cursors,
 * and callers must not invoke them.
 *
 * Fake root structures contain all the information about a btree that
 * is under construction by the bulk loading code.  Staging btree
 * cursors point to fake root structures instead of the usual AG header
 * or inode structure.
 *
 * Callers are expected to initialize a fake root structure and pass it
 * into the _stage_cursor function for a specific btree type.  When bulk
 * loading is complete, callers should call the _commit_staged_btree
 * function for that specific btree type to commit the new btree into
 * the filesystem.
 */


> > +
> > +/* We don't allow staging cursors to be duplicated. */

/*
 * Don't allow staging cursors to be duplicated because they're supposed
 * to be kept private to a single thread.
 */


> > +STATIC struct xfs_btree_cur *
> > +xfs_btree_fakeroot_dup_cursor(
> > +	struct xfs_btree_cur	*cur)
> > +{
> > +	ASSERT(0);
> > +	return NULL;
> > +}
> > +
> > +/* Refuse to allow regular block allocation for a staging cursor. */

/*
 * Don't allow block allocation for a staging cursor.  Bulk loading
 * requires all the blocks to be allocated ahead of time to prevent
 * ENOSPC failures.
 */

> > +STATIC int
> > +xfs_btree_fakeroot_alloc_block(
> > +	struct xfs_btree_cur	*cur,
> > +	union xfs_btree_ptr	*start_bno,
> > +	union xfs_btree_ptr	*new_bno,
> > +	int			*stat)
> > +{
> > +	ASSERT(0);
> > +	return -EFSCORRUPTED;
> 
> Calling these is a runtime bug as opposed to corruption, right?

Correct.  These functions should never be called, because doing so
implies either a bug in the btree code or a caller is misusing a staging
cursor.

I'm not sure what's a good error code for this.  I hope that "Structure
needs cleaning" will cause admins to run xfs_repair like they would for
any other "structure needs cleaning" error, though that's not so helpful
if it's xfs_repair itself doing that.

I also thought about "ENOSR" (as in, "No, sir!") but whinging about
streams resources is likely to cause more confusion than it clears up.

> > +}
> > +
> > +/* Refuse to allow block freeing for a staging cursor. */

/*
 * Don't allow block freeing for a staging cursor, because staging
 * cursors do not support regular btree modifications.
 */

> > +STATIC int
> > +xfs_btree_fakeroot_free_block(
> > +	struct xfs_btree_cur	*cur,
> > +	struct xfs_buf		*bp)
> > +{
> > +	ASSERT(0);
> > +	return -EFSCORRUPTED;
> > +}
> > +
> 
> For example, why do we not allow alloc/frees of blocks into a staging
> tree? Is this something related to how staging trees will be constructed
> vs. normal trees, or is this just stubbed in and to be implemented
> later?

The only user of staging cursors is the bulk loading code, and the bulk
loader requires the caller to allocate all the blocks they'll need ahead
of time.  We don't allow any of the regular btree functions on a staging
cursor, and in fact we're really only using it to abstract the details
of writing records, keys, pointers, and btree block headers.

> > +/* Initialize a pointer to the root block from the fakeroot. */
> > +STATIC void
> > +xfs_btree_fakeroot_init_ptr_from_cur(
> > +	struct xfs_btree_cur	*cur,
> > +	union xfs_btree_ptr	*ptr)
> > +{
> > +	struct xbtree_afakeroot	*afake;
> > +
> > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > +
> > +	afake = cur->bc_private.a.afake;
> > +	ptr->s = cpu_to_be32(afake->af_root);
> > +}

Add here another block comment:

/*
 * Bulk Loading for AG Btrees
 * ==========================
 *
 * For a btree rooted in an AG header, pass a xbtree_afakeroot structure
 * to the staging cursor.  Callers should initialize this to zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 * cursor.  The corresponding _commit_staged_btree() function should log
 * the new root and call xfs_btree_commit_afakeroot() to transform the
 * staging cursor into a regular btree cursor.
 */

> > +/* Set the root block when our tree has a fakeroot. */
> > +STATIC void
> > +xfs_btree_afakeroot_set_root(
> > +	struct xfs_btree_cur	*cur,
> > +	union xfs_btree_ptr	*ptr,
> > +	int			inc)
> > +{
> > +	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
> > +
> > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > +	afake->af_root = be32_to_cpu(ptr->s);
> > +	afake->af_levels += inc;
> > +}
> > +
> > +/*
> > + * Initialize a AG-rooted btree cursor with the given AG btree fake root.  The
> > + * btree cursor's @bc_ops will be overridden as needed to make the staging
> > + * functionality work.  If @new_ops is not NULL, these new ops will be passed
> > + * out to the caller for further overriding.
> > + */
> > +void
> > +xfs_btree_stage_afakeroot(
> > +	struct xfs_btree_cur		*cur,
> > +	struct xbtree_afakeroot		*afake,
> > +	struct xfs_btree_ops		**new_ops)
> > +{
> > +	struct xfs_btree_ops		*nops;
> > +
> > +	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
> > +	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
> > +
> > +	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
> > +	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
> > +	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
> > +	nops->free_block = xfs_btree_fakeroot_free_block;
> > +	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
> > +	nops->set_root = xfs_btree_afakeroot_set_root;
> > +	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
> > +
> > +	cur->bc_private.a.afake = afake;
> > +	cur->bc_nlevels = afake->af_levels;
> > +	cur->bc_ops = nops;
> > +	cur->bc_flags |= XFS_BTREE_STAGING;
> > +
> > +	if (new_ops)
> > +		*new_ops = nops;
> > +}
> > +
> > +/*
> > + * Transform an AG-rooted staging btree cursor back into a regular cursor by
> > + * substituting a real btree root for the fake one and restoring normal btree
> > + * cursor ops.  The caller must log the btree root change prior to calling
> > + * this.
> > + */
> > +void
> > +xfs_btree_commit_afakeroot(
> > +	struct xfs_btree_cur		*cur,
> > +	struct xfs_buf			*agbp,
> > +	const struct xfs_btree_ops	*ops)
> > +{
> > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > +
> > +	trace_xfs_btree_commit_afakeroot(cur);
> > +
> > +	kmem_free((void *)cur->bc_ops);
> > +	cur->bc_private.a.agbp = agbp;
> > +	cur->bc_ops = ops;
> > +	cur->bc_flags &= ~XFS_BTREE_STAGING;
> > +}
> 
> Any reason this new code isn't off in a new xfs_staging_btree.c or some
> such instead of xfs_btree.c?

<shrug> It could be.  I tried it and it looks like that would only
require exporting six more symbols:

xfs_btree_set_ptr_null
xfs_btree_get_buf_block
xfs_btree_init_block_cur
xfs_btree_set_sibling
xfs_btree_copy_ptrs
xfs_btree_copy_keys

> > diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
> > index 3eff7c321d43..3ada085609a8 100644
> > --- a/fs/xfs/libxfs/xfs_btree.h
> > +++ b/fs/xfs/libxfs/xfs_btree.h
> > @@ -188,6 +188,16 @@ union xfs_btree_cur_private {
> >  	} abt;
> >  };
> >  
> > +/* Private information for a AG-rooted btree. */
> > +struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
> > +	union {
> > +		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
> > +		struct xbtree_afakeroot	*afake;	/* fake ag header root */
> > +	};
> > +	xfs_agnumber_t			agno;	/* ag number */
> > +	union xfs_btree_cur_private	priv;
> > +};
> > +
> 
> Ideally refactoring this would be a separate patch from adding a new
> field.

Ok, I'll break that out.

--D

> Brian
> 
> >  /*
> >   * Btree cursor structure.
> >   * This collects all information needed by the btree code in one place.
> > @@ -209,11 +219,7 @@ typedef struct xfs_btree_cur
> >  	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
> >  	int		bc_statoff;	/* offset of btre stats array */
> >  	union {
> > -		struct {			/* needed for BNO, CNT, INO */
> > -			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
> > -			xfs_agnumber_t	agno;	/* ag number */
> > -			union xfs_btree_cur_private	priv;
> > -		} a;
> > +		struct xfs_btree_priv_ag a;
> >  		struct {			/* needed for BMAP */
> >  			struct xfs_inode *ip;	/* pointer to our inode */
> >  			int		allocated;	/* count of alloced */
> > @@ -232,6 +238,12 @@ typedef struct xfs_btree_cur
> >  #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
> >  #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
> >  #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
> > +/*
> > + * The root of this btree is a fakeroot structure so that we can stage a btree
> > + * rebuild without leaving it accessible via primary metadata.  The ops struct
> > + * is dynamically allocated and must be freed when the cursor is deleted.
> > + */
> > +#define XFS_BTREE_STAGING		(1<<5)
> >  
> >  
> >  #define	XFS_BTREE_NOERROR	0
> > @@ -512,4 +524,24 @@ xfs_btree_islastblock(
> >  	return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
> >  }
> >  
> > +/* Fake root for an AG-rooted btree. */
> > +struct xbtree_afakeroot {
> > +	/* AG block number of the new btree root. */
> > +	xfs_agblock_t		af_root;
> > +
> > +	/* Height of the new btree. */
> > +	unsigned int		af_levels;
> > +
> > +	/* Number of blocks used by the btree. */
> > +	unsigned int		af_blocks;
> > +};
> > +
> > +/* Cursor interactions with with fake roots for AG-rooted btrees. */
> > +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
> > +		struct xbtree_afakeroot *afake,
> > +		struct xfs_btree_ops **new_ops);
> > +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
> > +		struct xfs_buf *agbp,
> > +		const struct xfs_btree_ops *ops);
> > +
> >  #endif	/* __XFS_BTREE_H__ */
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index e242988f57fb..57ff9f583b5f 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3594,6 +3594,34 @@ TRACE_EVENT(xfs_check_new_dalign,
> >  		  __entry->calc_rootino)
> >  )
> >  
> > +TRACE_EVENT(xfs_btree_commit_afakeroot,
> > +	TP_PROTO(struct xfs_btree_cur *cur),
> > +	TP_ARGS(cur),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(xfs_btnum_t, btnum)
> > +		__field(xfs_agnumber_t, agno)
> > +		__field(xfs_agblock_t, agbno)
> > +		__field(unsigned int, levels)
> > +		__field(unsigned int, blocks)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = cur->bc_mp->m_super->s_dev;
> > +		__entry->btnum = cur->bc_btnum;
> > +		__entry->agno = cur->bc_private.a.agno;
> > +		__entry->agbno = cur->bc_private.a.afake->af_root;
> > +		__entry->levels = cur->bc_private.a.afake->af_levels;
> > +		__entry->blocks = cur->bc_private.a.afake->af_blocks;
> > +	),
> > +	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > +		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
> > +		  __entry->agno,
> > +		  __entry->levels,
> > +		  __entry->blocks,
> > +		  __entry->agbno)
> > +)
> > +
> >  #endif /* _TRACE_XFS_H */
> >  
> >  #undef TRACE_INCLUDE_PATH
> > 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2020-03-04  3:28 ` [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees Darrick J. Wong
@ 2020-03-04 18:21   ` Brian Foster
  2020-03-04 23:34     ` Darrick J. Wong
  2020-03-05  1:20   ` Dave Chinner
  1 sibling, 1 reply; 20+ messages in thread
From: Brian Foster @ 2020-03-04 18:21 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Tue, Mar 03, 2020 at 07:28:28PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Create an in-core fake root for AG-rooted btree types so that callers
> can generate a whole new btree using the upcoming btree bulk load
> function without making the new tree accessible from the rest of the
> filesystem.  It is up to the individual btree type to provide a function
> to create a staged cursor (presumably with the appropriate callouts to
> update the fakeroot) and then commit the staged root back into the
> filesystem.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

The code all seems reasonable, mostly infrastructure. Just a few high
level comments..

It would be helpful if the commit log (or code comments) explained more
about the callouts that are replaced for a staging tree (and why).

>  fs/xfs/libxfs/xfs_btree.c |  117 +++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_btree.h |   42 ++++++++++++++--
>  fs/xfs/xfs_trace.h        |   28 +++++++++++
>  3 files changed, 182 insertions(+), 5 deletions(-)
> 
> 
> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> index e6f898bf3174..9a7c1a4d0423 100644
> --- a/fs/xfs/libxfs/xfs_btree.c
> +++ b/fs/xfs/libxfs/xfs_btree.c
> @@ -382,6 +382,8 @@ xfs_btree_del_cursor(
>  	/*
>  	 * Free the cursor.
>  	 */
> +	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
> +		kmem_free((void *)cur->bc_ops);
>  	kmem_cache_free(xfs_btree_cur_zone, cur);
>  }
>  
> @@ -4908,3 +4910,118 @@ xfs_btree_has_more_records(
>  	else
>  		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
>  }
> +
> +/* We don't allow staging cursors to be duplicated. */
> +STATIC struct xfs_btree_cur *
> +xfs_btree_fakeroot_dup_cursor(
> +	struct xfs_btree_cur	*cur)
> +{
> +	ASSERT(0);
> +	return NULL;
> +}
> +
> +/* Refuse to allow regular block allocation for a staging cursor. */
> +STATIC int
> +xfs_btree_fakeroot_alloc_block(
> +	struct xfs_btree_cur	*cur,
> +	union xfs_btree_ptr	*start_bno,
> +	union xfs_btree_ptr	*new_bno,
> +	int			*stat)
> +{
> +	ASSERT(0);
> +	return -EFSCORRUPTED;

Calling these is a runtime bug as opposed to corruption, right?

> +}
> +
> +/* Refuse to allow block freeing for a staging cursor. */
> +STATIC int
> +xfs_btree_fakeroot_free_block(
> +	struct xfs_btree_cur	*cur,
> +	struct xfs_buf		*bp)
> +{
> +	ASSERT(0);
> +	return -EFSCORRUPTED;
> +}
> +

For example, why do we not allow alloc/frees of blocks into a staging
tree? Is this something related to how staging trees will be constructed
vs. normal trees, or is this just stubbed in and to be implemented
later?

> +/* Initialize a pointer to the root block from the fakeroot. */
> +STATIC void
> +xfs_btree_fakeroot_init_ptr_from_cur(
> +	struct xfs_btree_cur	*cur,
> +	union xfs_btree_ptr	*ptr)
> +{
> +	struct xbtree_afakeroot	*afake;
> +
> +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> +
> +	afake = cur->bc_private.a.afake;
> +	ptr->s = cpu_to_be32(afake->af_root);
> +}
> +
> +/* Set the root block when our tree has a fakeroot. */
> +STATIC void
> +xfs_btree_afakeroot_set_root(
> +	struct xfs_btree_cur	*cur,
> +	union xfs_btree_ptr	*ptr,
> +	int			inc)
> +{
> +	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
> +
> +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> +	afake->af_root = be32_to_cpu(ptr->s);
> +	afake->af_levels += inc;
> +}
> +
> +/*
> + * Initialize a AG-rooted btree cursor with the given AG btree fake root.  The
> + * btree cursor's @bc_ops will be overridden as needed to make the staging
> + * functionality work.  If @new_ops is not NULL, these new ops will be passed
> + * out to the caller for further overriding.
> + */
> +void
> +xfs_btree_stage_afakeroot(
> +	struct xfs_btree_cur		*cur,
> +	struct xbtree_afakeroot		*afake,
> +	struct xfs_btree_ops		**new_ops)
> +{
> +	struct xfs_btree_ops		*nops;
> +
> +	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
> +	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
> +
> +	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
> +	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
> +	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
> +	nops->free_block = xfs_btree_fakeroot_free_block;
> +	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
> +	nops->set_root = xfs_btree_afakeroot_set_root;
> +	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
> +
> +	cur->bc_private.a.afake = afake;
> +	cur->bc_nlevels = afake->af_levels;
> +	cur->bc_ops = nops;
> +	cur->bc_flags |= XFS_BTREE_STAGING;
> +
> +	if (new_ops)
> +		*new_ops = nops;
> +}
> +
> +/*
> + * Transform an AG-rooted staging btree cursor back into a regular cursor by
> + * substituting a real btree root for the fake one and restoring normal btree
> + * cursor ops.  The caller must log the btree root change prior to calling
> + * this.
> + */
> +void
> +xfs_btree_commit_afakeroot(
> +	struct xfs_btree_cur		*cur,
> +	struct xfs_buf			*agbp,
> +	const struct xfs_btree_ops	*ops)
> +{
> +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> +
> +	trace_xfs_btree_commit_afakeroot(cur);
> +
> +	kmem_free((void *)cur->bc_ops);
> +	cur->bc_private.a.agbp = agbp;
> +	cur->bc_ops = ops;
> +	cur->bc_flags &= ~XFS_BTREE_STAGING;
> +}

Any reason this new code isn't off in a new xfs_staging_btree.c or some
such instead of xfs_btree.c?

> diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
> index 3eff7c321d43..3ada085609a8 100644
> --- a/fs/xfs/libxfs/xfs_btree.h
> +++ b/fs/xfs/libxfs/xfs_btree.h
> @@ -188,6 +188,16 @@ union xfs_btree_cur_private {
>  	} abt;
>  };
>  
> +/* Private information for a AG-rooted btree. */
> +struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
> +	union {
> +		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
> +		struct xbtree_afakeroot	*afake;	/* fake ag header root */
> +	};
> +	xfs_agnumber_t			agno;	/* ag number */
> +	union xfs_btree_cur_private	priv;
> +};
> +

Ideally refactoring this would be a separate patch from adding a new
field.

Brian

>  /*
>   * Btree cursor structure.
>   * This collects all information needed by the btree code in one place.
> @@ -209,11 +219,7 @@ typedef struct xfs_btree_cur
>  	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
>  	int		bc_statoff;	/* offset of btre stats array */
>  	union {
> -		struct {			/* needed for BNO, CNT, INO */
> -			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
> -			xfs_agnumber_t	agno;	/* ag number */
> -			union xfs_btree_cur_private	priv;
> -		} a;
> +		struct xfs_btree_priv_ag a;
>  		struct {			/* needed for BMAP */
>  			struct xfs_inode *ip;	/* pointer to our inode */
>  			int		allocated;	/* count of alloced */
> @@ -232,6 +238,12 @@ typedef struct xfs_btree_cur
>  #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
>  #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
>  #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
> +/*
> + * The root of this btree is a fakeroot structure so that we can stage a btree
> + * rebuild without leaving it accessible via primary metadata.  The ops struct
> + * is dynamically allocated and must be freed when the cursor is deleted.
> + */
> +#define XFS_BTREE_STAGING		(1<<5)
>  
>  
>  #define	XFS_BTREE_NOERROR	0
> @@ -512,4 +524,24 @@ xfs_btree_islastblock(
>  	return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
>  }
>  
> +/* Fake root for an AG-rooted btree. */
> +struct xbtree_afakeroot {
> +	/* AG block number of the new btree root. */
> +	xfs_agblock_t		af_root;
> +
> +	/* Height of the new btree. */
> +	unsigned int		af_levels;
> +
> +	/* Number of blocks used by the btree. */
> +	unsigned int		af_blocks;
> +};
> +
> +/* Cursor interactions with with fake roots for AG-rooted btrees. */
> +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
> +		struct xbtree_afakeroot *afake,
> +		struct xfs_btree_ops **new_ops);
> +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
> +		struct xfs_buf *agbp,
> +		const struct xfs_btree_ops *ops);
> +
>  #endif	/* __XFS_BTREE_H__ */
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index e242988f57fb..57ff9f583b5f 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3594,6 +3594,34 @@ TRACE_EVENT(xfs_check_new_dalign,
>  		  __entry->calc_rootino)
>  )
>  
> +TRACE_EVENT(xfs_btree_commit_afakeroot,
> +	TP_PROTO(struct xfs_btree_cur *cur),
> +	TP_ARGS(cur),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_btnum_t, btnum)
> +		__field(xfs_agnumber_t, agno)
> +		__field(xfs_agblock_t, agbno)
> +		__field(unsigned int, levels)
> +		__field(unsigned int, blocks)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = cur->bc_mp->m_super->s_dev;
> +		__entry->btnum = cur->bc_btnum;
> +		__entry->agno = cur->bc_private.a.agno;
> +		__entry->agbno = cur->bc_private.a.afake->af_root;
> +		__entry->levels = cur->bc_private.a.afake->af_levels;
> +		__entry->blocks = cur->bc_private.a.afake->af_blocks;
> +	),
> +	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
> +		  __entry->agno,
> +		  __entry->levels,
> +		  __entry->blocks,
> +		  __entry->agbno)
> +)
> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2020-03-04  3:28 [PATCH v3 0/4] xfs: btree bulk loading Darrick J. Wong
@ 2020-03-04  3:28 ` Darrick J. Wong
  2020-03-04 18:21   ` Brian Foster
  2020-03-05  1:20   ` Dave Chinner
  0 siblings, 2 replies; 20+ messages in thread
From: Darrick J. Wong @ 2020-03-04  3:28 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs, bfoster

From: Darrick J. Wong <darrick.wong@oracle.com>

Create an in-core fake root for AG-rooted btree types so that callers
can generate a whole new btree using the upcoming btree bulk load
function without making the new tree accessible from the rest of the
filesystem.  It is up to the individual btree type to provide a function
to create a staged cursor (presumably with the appropriate callouts to
update the fakeroot) and then commit the staged root back into the
filesystem.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_btree.c |  117 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_btree.h |   42 ++++++++++++++--
 fs/xfs/xfs_trace.h        |   28 +++++++++++
 3 files changed, 182 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index e6f898bf3174..9a7c1a4d0423 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -382,6 +382,8 @@ xfs_btree_del_cursor(
 	/*
 	 * Free the cursor.
 	 */
+	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
+		kmem_free((void *)cur->bc_ops);
 	kmem_cache_free(xfs_btree_cur_zone, cur);
 }
 
@@ -4908,3 +4910,118 @@ xfs_btree_has_more_records(
 	else
 		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
 }
+
+/* We don't allow staging cursors to be duplicated. */
+STATIC struct xfs_btree_cur *
+xfs_btree_fakeroot_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	ASSERT(0);
+	return NULL;
+}
+
+/* Refuse to allow regular block allocation for a staging cursor. */
+STATIC int
+xfs_btree_fakeroot_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start_bno,
+	union xfs_btree_ptr	*new_bno,
+	int			*stat)
+{
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+/* Refuse to allow block freeing for a staging cursor. */
+STATIC int
+xfs_btree_fakeroot_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+/* Initialize a pointer to the root block from the fakeroot. */
+STATIC void
+xfs_btree_fakeroot_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xbtree_afakeroot	*afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	afake = cur->bc_private.a.afake;
+	ptr->s = cpu_to_be32(afake->af_root);
+}
+
+/* Set the root block when our tree has a fakeroot. */
+STATIC void
+xfs_btree_afakeroot_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			inc)
+{
+	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+	afake->af_root = be32_to_cpu(ptr->s);
+	afake->af_levels += inc;
+}
+
+/*
+ * Initialize a AG-rooted btree cursor with the given AG btree fake root.  The
+ * btree cursor's @bc_ops will be overridden as needed to make the staging
+ * functionality work.  If @new_ops is not NULL, these new ops will be passed
+ * out to the caller for further overriding.
+ */
+void
+xfs_btree_stage_afakeroot(
+	struct xfs_btree_cur		*cur,
+	struct xbtree_afakeroot		*afake,
+	struct xfs_btree_ops		**new_ops)
+{
+	struct xfs_btree_ops		*nops;
+
+	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+
+	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+	nops->free_block = xfs_btree_fakeroot_free_block;
+	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+	nops->set_root = xfs_btree_afakeroot_set_root;
+	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+	cur->bc_private.a.afake = afake;
+	cur->bc_nlevels = afake->af_levels;
+	cur->bc_ops = nops;
+	cur->bc_flags |= XFS_BTREE_STAGING;
+
+	if (new_ops)
+		*new_ops = nops;
+}
+
+/*
+ * Transform an AG-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops.  The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_afakeroot(
+	struct xfs_btree_cur		*cur,
+	struct xfs_buf			*agbp,
+	const struct xfs_btree_ops	*ops)
+{
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	trace_xfs_btree_commit_afakeroot(cur);
+
+	kmem_free((void *)cur->bc_ops);
+	cur->bc_private.a.agbp = agbp;
+	cur->bc_ops = ops;
+	cur->bc_flags &= ~XFS_BTREE_STAGING;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 3eff7c321d43..3ada085609a8 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -188,6 +188,16 @@ union xfs_btree_cur_private {
 	} abt;
 };
 
+/* Private information for a AG-rooted btree. */
+struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
+	union {
+		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
+		struct xbtree_afakeroot	*afake;	/* fake ag header root */
+	};
+	xfs_agnumber_t			agno;	/* ag number */
+	union xfs_btree_cur_private	priv;
+};
+
 /*
  * Btree cursor structure.
  * This collects all information needed by the btree code in one place.
@@ -209,11 +219,7 @@ typedef struct xfs_btree_cur
 	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
 	int		bc_statoff;	/* offset of btre stats array */
 	union {
-		struct {			/* needed for BNO, CNT, INO */
-			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
-			xfs_agnumber_t	agno;	/* ag number */
-			union xfs_btree_cur_private	priv;
-		} a;
+		struct xfs_btree_priv_ag a;
 		struct {			/* needed for BMAP */
 			struct xfs_inode *ip;	/* pointer to our inode */
 			int		allocated;	/* count of alloced */
@@ -232,6 +238,12 @@ typedef struct xfs_btree_cur
 #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
 #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
 #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
+/*
+ * The root of this btree is a fakeroot structure so that we can stage a btree
+ * rebuild without leaving it accessible via primary metadata.  The ops struct
+ * is dynamically allocated and must be freed when the cursor is deleted.
+ */
+#define XFS_BTREE_STAGING		(1<<5)
 
 
 #define	XFS_BTREE_NOERROR	0
@@ -512,4 +524,24 @@ xfs_btree_islastblock(
 	return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
 }
 
+/* Fake root for an AG-rooted btree. */
+struct xbtree_afakeroot {
+	/* AG block number of the new btree root. */
+	xfs_agblock_t		af_root;
+
+	/* Height of the new btree. */
+	unsigned int		af_levels;
+
+	/* Number of blocks used by the btree. */
+	unsigned int		af_blocks;
+};
+
+/* Cursor interactions with with fake roots for AG-rooted btrees. */
+void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
+		struct xbtree_afakeroot *afake,
+		struct xfs_btree_ops **new_ops);
+void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
+		struct xfs_buf *agbp,
+		const struct xfs_btree_ops *ops);
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index e242988f57fb..57ff9f583b5f 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3594,6 +3594,34 @@ TRACE_EVENT(xfs_check_new_dalign,
 		  __entry->calc_rootino)
 )
 
+TRACE_EVENT(xfs_btree_commit_afakeroot,
+	TP_PROTO(struct xfs_btree_cur *cur),
+	TP_ARGS(cur),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_btnum_t, btnum)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(unsigned int, levels)
+		__field(unsigned int, blocks)
+	),
+	TP_fast_assign(
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->btnum = cur->bc_btnum;
+		__entry->agno = cur->bc_private.a.agno;
+		__entry->agbno = cur->bc_private.a.afake->af_root;
+		__entry->levels = cur->bc_private.a.afake->af_levels;
+		__entry->blocks = cur->bc_private.a.afake->af_blocks;
+	),
+	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __entry->agno,
+		  __entry->levels,
+		  __entry->blocks,
+		  __entry->agbno)
+)
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2019-10-29 23:30 [PATCH v2 0/4] xfs: btree bulk loading Darrick J. Wong
@ 2019-10-29 23:30 ` Darrick J. Wong
  0 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2019-10-29 23:30 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Create an in-core fake root for AG-rooted btree types so that callers
can generate a whole new btree using the upcoming btree bulk load
function without making the new tree accessible from the rest of the
filesystem.  It is up to the individual btree type to provide a function
to create a staged cursor (presumably with the appropriate callouts to
update the fakeroot) and then commit the staged root back into the
filesystem.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_btree.c |  117 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_btree.h |   42 ++++++++++++++--
 fs/xfs/xfs_trace.h        |   28 +++++++++++
 3 files changed, 182 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 4fd89c80c821..ab0d459c10ea 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -384,6 +384,8 @@ xfs_btree_del_cursor(
 	/*
 	 * Free the cursor.
 	 */
+	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
+		kmem_free((void *)cur->bc_ops);
 	kmem_zone_free(xfs_btree_cur_zone, cur);
 }
 
@@ -4936,3 +4938,118 @@ xfs_btree_has_more_records(
 	else
 		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
 }
+
+/* We don't allow staging cursors to be duplicated. */
+STATIC struct xfs_btree_cur *
+xfs_btree_fakeroot_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	ASSERT(0);
+	return NULL;
+}
+
+/* Refuse to allow regular block allocation for a staging cursor. */
+STATIC int
+xfs_btree_fakeroot_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start_bno,
+	union xfs_btree_ptr	*new_bno,
+	int			*stat)
+{
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+/* Refuse to allow block freeing for a staging cursor. */
+STATIC int
+xfs_btree_fakeroot_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+/* Initialize a pointer to the root block from the fakeroot. */
+STATIC void
+xfs_btree_fakeroot_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xbtree_afakeroot	*afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	afake = cur->bc_private.a.afake;
+	ptr->s = cpu_to_be32(afake->af_root);
+}
+
+/* Set the root block when our tree has a fakeroot. */
+STATIC void
+xfs_btree_afakeroot_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			inc)
+{
+	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+	afake->af_root = be32_to_cpu(ptr->s);
+	afake->af_levels += inc;
+}
+
+/*
+ * Initialize a AG-rooted btree cursor with the given AG btree fake root.  The
+ * btree cursor's @bc_ops will be overridden as needed to make the staging
+ * functionality work.  If @new_ops is not NULL, these new ops will be passed
+ * out to the caller for further overriding.
+ */
+void
+xfs_btree_stage_afakeroot(
+	struct xfs_btree_cur		*cur,
+	struct xbtree_afakeroot		*afake,
+	struct xfs_btree_ops		**new_ops)
+{
+	struct xfs_btree_ops		*nops;
+
+	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+
+	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+	nops->free_block = xfs_btree_fakeroot_free_block;
+	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+	nops->set_root = xfs_btree_afakeroot_set_root;
+	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+	cur->bc_private.a.afake = afake;
+	cur->bc_nlevels = afake->af_levels;
+	cur->bc_ops = nops;
+	cur->bc_flags |= XFS_BTREE_STAGING;
+
+	if (new_ops)
+		*new_ops = nops;
+}
+
+/*
+ * Transform an AG-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops.  The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_afakeroot(
+	struct xfs_btree_cur		*cur,
+	struct xfs_buf			*agbp,
+	const struct xfs_btree_ops	*ops)
+{
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	trace_xfs_btree_commit_afakeroot(cur);
+
+	kmem_free((void *)cur->bc_ops);
+	cur->bc_private.a.agbp = agbp;
+	cur->bc_ops = ops;
+	cur->bc_flags &= ~XFS_BTREE_STAGING;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index bac15668aeee..39ca3f0a3bf8 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -185,6 +185,16 @@ union xfs_btree_cur_private {
 	} refc;
 };
 
+/* Private information for a AG-rooted btree. */
+struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
+	union {
+		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
+		struct xbtree_afakeroot	*afake;	/* fake ag header root */
+	};
+	xfs_agnumber_t			agno;	/* ag number */
+	union xfs_btree_cur_private	priv;
+};
+
 /*
  * Btree cursor structure.
  * This collects all information needed by the btree code in one place.
@@ -206,11 +216,7 @@ typedef struct xfs_btree_cur
 	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
 	int		bc_statoff;	/* offset of btre stats array */
 	union {
-		struct {			/* needed for BNO, CNT, INO */
-			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
-			xfs_agnumber_t	agno;	/* ag number */
-			union xfs_btree_cur_private	priv;
-		} a;
+		struct xfs_btree_priv_ag a;
 		struct {			/* needed for BMAP */
 			struct xfs_inode *ip;	/* pointer to our inode */
 			int		allocated;	/* count of alloced */
@@ -229,6 +235,12 @@ typedef struct xfs_btree_cur
 #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
 #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
 #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
+/*
+ * The root of this btree is a fakeroot structure so that we can stage a btree
+ * rebuild without leaving it accessible via primary metadata.  The ops struct
+ * is dynamically allocated and must be freed when the cursor is deleted.
+ */
+#define XFS_BTREE_STAGING		(1<<5)
 
 
 #define	XFS_BTREE_NOERROR	0
@@ -521,4 +533,24 @@ int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
 		union xfs_btree_irec *high, bool *exists);
 bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
 
+/* Fake root for an AG-rooted btree. */
+struct xbtree_afakeroot {
+	/* AG block number of the new btree root. */
+	xfs_agblock_t		af_root;
+
+	/* Height of the new btree. */
+	unsigned int		af_levels;
+
+	/* Number of blocks used by the btree. */
+	unsigned int		af_blocks;
+};
+
+/* Cursor interactions with with fake roots for AG-rooted btrees. */
+void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
+		struct xbtree_afakeroot *afake,
+		struct xfs_btree_ops **new_ops);
+void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
+		struct xfs_buf *agbp,
+		const struct xfs_btree_ops *ops);
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index eaae275ed430..e04ed5324696 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3609,6 +3609,34 @@ DEFINE_KMEM_EVENT(kmem_alloc_large);
 DEFINE_KMEM_EVENT(kmem_realloc);
 DEFINE_KMEM_EVENT(kmem_zone_alloc);
 
+TRACE_EVENT(xfs_btree_commit_afakeroot,
+	TP_PROTO(struct xfs_btree_cur *cur),
+	TP_ARGS(cur),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_btnum_t, btnum)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(unsigned int, levels)
+		__field(unsigned int, blocks)
+	),
+	TP_fast_assign(
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->btnum = cur->bc_btnum;
+		__entry->agno = cur->bc_private.a.agno;
+		__entry->agbno = cur->bc_private.a.afake->af_root;
+		__entry->levels = cur->bc_private.a.afake->af_levels;
+		__entry->blocks = cur->bc_private.a.afake->af_blocks;
+	),
+	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __entry->agno,
+		  __entry->levels,
+		  __entry->blocks,
+		  __entry->agbno)
+)
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2019-10-16 17:36       ` Brian Foster
@ 2019-10-25 19:04         ` Darrick J. Wong
  0 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2019-10-25 19:04 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Wed, Oct 16, 2019 at 01:36:38PM -0400, Brian Foster wrote:
> On Wed, Oct 16, 2019 at 09:53:56AM -0700, Darrick J. Wong wrote:
> > On Wed, Oct 16, 2019 at 11:26:29AM -0400, Brian Foster wrote:
> > > On Wed, Oct 09, 2019 at 09:48:05AM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Create an in-core fake root for AG-rooted btree types so that callers
> > > > can generate a whole new btree using the upcoming btree bulk load
> > > > function without making the new tree accessible from the rest of the
> > > > filesystem.  It is up to the individual btree type to provide a function
> > > > to create a staged cursor (presumably with the appropriate callouts to
> > > > update the fakeroot) and then commit the staged root back into the
> > > > filesystem.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > 
> > > >  fs/xfs/libxfs/xfs_btree.c |  115 +++++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/libxfs/xfs_btree.h |   43 +++++++++++++++--
> > > >  fs/xfs/xfs_trace.h        |   28 +++++++++++
> > > >  3 files changed, 181 insertions(+), 5 deletions(-)
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > > > index 71de937f9e64..7b18f0fa5e99 100644
> > > > --- a/fs/xfs/libxfs/xfs_btree.c
> > > > +++ b/fs/xfs/libxfs/xfs_btree.c
> > > ...
> > > > @@ -4930,3 +4932,116 @@ xfs_btree_has_more_records(
> > > ...
> > > > +/* Initialize a pointer to the root block from the fakeroot. */
> > > > +STATIC void
> > > > +xfs_btree_fakeroot_init_ptr_from_cur(
> > > > +	struct xfs_btree_cur	*cur,
> > > > +	union xfs_btree_ptr	*ptr)
> > > > +{
> > > > +	struct xbtree_afakeroot	*afake;
> > > > +
> > > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > > +
> > > > +	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {

Heh, this should be XFS_BTREE_ROOT_IN_INODE, and it ought to be in the
next patch.  Will fix. :)

> > > > +		ptr->l = cpu_to_be64(0);
> > > 
> > > Why zero here? Is this just not supported?
> > 
> > <shrug> The only existing longptr btree does this too.
> > 
> > There's no root block so we might as well set the pointer to 0 to catch
> > incorrect accesses.
> > 
> > > Otherwise this seems straightforward code-wise. I think I get the
> > > general idea, but it's hard to reason further about the pieces until I
> > > see the broader context..
> > 
> > Sorry about that. :/
> > 
> 
> Not a big deal..
> 
> > You can skip to the v20 repair series to see how this all gets used in
> > the kernel, since the two sets in between are cleanups of other common
> > code.  I also have a series to refactor xfs_repair to use btree bulk
> > loading[1], if that helps.
> > 
> 
> I'll get there eventually. :P This was just an FYI that I'd probably
> have to at least make a cursory pass through the rest and come back to
> this in order to properly review.

<nod>

--D

> Brian
> 
> > --D
> > 
> > [1] https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=repair-bulk-load
> > 
> > > Brian
> > > 
> > > > +	} else {
> > > > +		afake = cur->bc_private.a.afake;
> > > > +		ptr->s = cpu_to_be32(afake->af_root);
> > > > +	}
> > > > +}
> > > > +
> > > > +/* Set the root block when our tree has a fakeroot. */
> > > > +STATIC void
> > > > +xfs_btree_afakeroot_set_root(
> > > > +	struct xfs_btree_cur	*cur,
> > > > +	union xfs_btree_ptr	*ptr,
> > > > +	int			inc)
> > > > +{
> > > > +	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
> > > > +
> > > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > > +	afake->af_root = be32_to_cpu(ptr->s);
> > > > +	afake->af_levels += inc;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Initialize a AG-rooted btree cursor with the given AG @fakeroot, and prepare
> > > > + * @ops for overriding by duplicating them into @new_ops.  The caller can
> > > > + * replace pointers in @new_ops as necessary once this call completes.  Note
> > > > + * that staging cursors can only be used for bulk loading.
> > > > + */
> > > > +void
> > > > +xfs_btree_stage_afakeroot(
> > > > +	struct xfs_btree_cur		*cur,
> > > > +	struct xbtree_afakeroot		*afake,
> > > > +	const struct xfs_btree_ops	*ops,
> > > > +	struct xfs_btree_ops		**new_ops)
> > > > +{
> > > > +	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
> > > > +	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
> > > > +
> > > > +	*new_ops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
> > > > +	memcpy(*new_ops, ops, sizeof(struct xfs_btree_ops));
> > > > +	(*new_ops)->alloc_block = xfs_btree_fakeroot_alloc_block;
> > > > +	(*new_ops)->free_block = xfs_btree_fakeroot_free_block;
> > > > +	(*new_ops)->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
> > > > +	(*new_ops)->set_root = xfs_btree_afakeroot_set_root;
> > > > +	(*new_ops)->dup_cursor = xfs_btree_fakeroot_dup_cursor;
> > > > +
> > > > +	cur->bc_private.a.afake = afake;
> > > > +	cur->bc_nlevels = afake->af_levels;
> > > > +	cur->bc_ops = *new_ops;
> > > > +	cur->bc_flags |= XFS_BTREE_STAGING;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Transform an AG-rooted staging btree cursor back into a regular btree
> > > > + * cursor.  Caller is responsible for logging changes before this.
> > > > + */
> > > > +void
> > > > +xfs_btree_commit_afakeroot(
> > > > +	struct xfs_btree_cur		*cur,
> > > > +	struct xfs_buf			*agbp,
> > > > +	const struct xfs_btree_ops	*ops)
> > > > +{
> > > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > > +
> > > > +	trace_xfs_btree_commit_afakeroot(cur);
> > > > +
> > > > +	kmem_free((void *)cur->bc_ops);
> > > > +	cur->bc_private.a.agbp = agbp;
> > > > +	cur->bc_ops = ops;
> > > > +	cur->bc_flags &= ~XFS_BTREE_STAGING;
> > > > +}
> > > > diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
> > > > index ced1e65d1483..453de8a49e96 100644
> > > > --- a/fs/xfs/libxfs/xfs_btree.h
> > > > +++ b/fs/xfs/libxfs/xfs_btree.h
> > > > @@ -185,6 +185,16 @@ union xfs_btree_cur_private {
> > > >  	} refc;
> > > >  };
> > > >  
> > > > +/* Private information for a AG-rooted btree. */
> > > > +struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
> > > > +	union {
> > > > +		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
> > > > +		struct xbtree_afakeroot	*afake;	/* fake ag header root */
> > > > +	};
> > > > +	xfs_agnumber_t			agno;	/* ag number */
> > > > +	union xfs_btree_cur_private	priv;
> > > > +};
> > > > +
> > > >  /*
> > > >   * Btree cursor structure.
> > > >   * This collects all information needed by the btree code in one place.
> > > > @@ -206,11 +216,7 @@ typedef struct xfs_btree_cur
> > > >  	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
> > > >  	int		bc_statoff;	/* offset of btre stats array */
> > > >  	union {
> > > > -		struct {			/* needed for BNO, CNT, INO */
> > > > -			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
> > > > -			xfs_agnumber_t	agno;	/* ag number */
> > > > -			union xfs_btree_cur_private	priv;
> > > > -		} a;
> > > > +		struct xfs_btree_priv_ag a;
> > > >  		struct {			/* needed for BMAP */
> > > >  			struct xfs_inode *ip;	/* pointer to our inode */
> > > >  			int		allocated;	/* count of alloced */
> > > > @@ -229,6 +235,12 @@ typedef struct xfs_btree_cur
> > > >  #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
> > > >  #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
> > > >  #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
> > > > +/*
> > > > + * The root of this btree is a fakeroot structure so that we can stage a btree
> > > > + * rebuild without leaving it accessible via primary metadata.  The ops struct
> > > > + * is dynamically allocated and must be freed when the cursor is deleted.
> > > > + */
> > > > +#define XFS_BTREE_STAGING		(1<<5)
> > > >  
> > > >  
> > > >  #define	XFS_BTREE_NOERROR	0
> > > > @@ -514,4 +526,25 @@ int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
> > > >  		union xfs_btree_irec *high, bool *exists);
> > > >  bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
> > > >  
> > > > +/* Fake root for an AG-rooted btree. */
> > > > +struct xbtree_afakeroot {
> > > > +	/* AG block number of the new btree root. */
> > > > +	xfs_agblock_t		af_root;
> > > > +
> > > > +	/* Height of the new btree. */
> > > > +	unsigned int		af_levels;
> > > > +
> > > > +	/* Number of blocks used by the btree. */
> > > > +	unsigned int		af_blocks;
> > > > +};
> > > > +
> > > > +/* Cursor interactions with with fake roots for AG-rooted btrees. */
> > > > +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
> > > > +		struct xbtree_afakeroot *afake,
> > > > +		const struct xfs_btree_ops *ops,
> > > > +		struct xfs_btree_ops **new_ops);
> > > > +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
> > > > +		struct xfs_buf *agbp,
> > > > +		const struct xfs_btree_ops *ops);
> > > > +
> > > >  #endif	/* __XFS_BTREE_H__ */
> > > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > > index eaae275ed430..e04ed5324696 100644
> > > > --- a/fs/xfs/xfs_trace.h
> > > > +++ b/fs/xfs/xfs_trace.h
> > > > @@ -3609,6 +3609,34 @@ DEFINE_KMEM_EVENT(kmem_alloc_large);
> > > >  DEFINE_KMEM_EVENT(kmem_realloc);
> > > >  DEFINE_KMEM_EVENT(kmem_zone_alloc);
> > > >  
> > > > +TRACE_EVENT(xfs_btree_commit_afakeroot,
> > > > +	TP_PROTO(struct xfs_btree_cur *cur),
> > > > +	TP_ARGS(cur),
> > > > +	TP_STRUCT__entry(
> > > > +		__field(dev_t, dev)
> > > > +		__field(xfs_btnum_t, btnum)
> > > > +		__field(xfs_agnumber_t, agno)
> > > > +		__field(xfs_agblock_t, agbno)
> > > > +		__field(unsigned int, levels)
> > > > +		__field(unsigned int, blocks)
> > > > +	),
> > > > +	TP_fast_assign(
> > > > +		__entry->dev = cur->bc_mp->m_super->s_dev;
> > > > +		__entry->btnum = cur->bc_btnum;
> > > > +		__entry->agno = cur->bc_private.a.agno;
> > > > +		__entry->agbno = cur->bc_private.a.afake->af_root;
> > > > +		__entry->levels = cur->bc_private.a.afake->af_levels;
> > > > +		__entry->blocks = cur->bc_private.a.afake->af_blocks;
> > > > +	),
> > > > +	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
> > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > > +		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
> > > > +		  __entry->agno,
> > > > +		  __entry->levels,
> > > > +		  __entry->blocks,
> > > > +		  __entry->agbno)
> > > > +)
> > > > +
> > > >  #endif /* _TRACE_XFS_H */
> > > >  
> > > >  #undef TRACE_INCLUDE_PATH
> > > > 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2019-10-16 16:53     ` Darrick J. Wong
@ 2019-10-16 17:36       ` Brian Foster
  2019-10-25 19:04         ` Darrick J. Wong
  0 siblings, 1 reply; 20+ messages in thread
From: Brian Foster @ 2019-10-16 17:36 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Wed, Oct 16, 2019 at 09:53:56AM -0700, Darrick J. Wong wrote:
> On Wed, Oct 16, 2019 at 11:26:29AM -0400, Brian Foster wrote:
> > On Wed, Oct 09, 2019 at 09:48:05AM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Create an in-core fake root for AG-rooted btree types so that callers
> > > can generate a whole new btree using the upcoming btree bulk load
> > > function without making the new tree accessible from the rest of the
> > > filesystem.  It is up to the individual btree type to provide a function
> > > to create a staged cursor (presumably with the appropriate callouts to
> > > update the fakeroot) and then commit the staged root back into the
> > > filesystem.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > 
> > >  fs/xfs/libxfs/xfs_btree.c |  115 +++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/libxfs/xfs_btree.h |   43 +++++++++++++++--
> > >  fs/xfs/xfs_trace.h        |   28 +++++++++++
> > >  3 files changed, 181 insertions(+), 5 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > > index 71de937f9e64..7b18f0fa5e99 100644
> > > --- a/fs/xfs/libxfs/xfs_btree.c
> > > +++ b/fs/xfs/libxfs/xfs_btree.c
> > ...
> > > @@ -4930,3 +4932,116 @@ xfs_btree_has_more_records(
> > ...
> > > +/* Initialize a pointer to the root block from the fakeroot. */
> > > +STATIC void
> > > +xfs_btree_fakeroot_init_ptr_from_cur(
> > > +	struct xfs_btree_cur	*cur,
> > > +	union xfs_btree_ptr	*ptr)
> > > +{
> > > +	struct xbtree_afakeroot	*afake;
> > > +
> > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > +
> > > +	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
> > > +		ptr->l = cpu_to_be64(0);
> > 
> > Why zero here? Is this just not supported?
> 
> <shrug> The only existing longptr btree does this too.
> 
> There's no root block so we might as well set the pointer to 0 to catch
> incorrect accesses.
> 
> > Otherwise this seems straightforward code-wise. I think I get the
> > general idea, but it's hard to reason further about the pieces until I
> > see the broader context..
> 
> Sorry about that. :/
> 

Not a big deal..

> You can skip to the v20 repair series to see how this all gets used in
> the kernel, since the two sets in between are cleanups of other common
> code.  I also have a series to refactor xfs_repair to use btree bulk
> loading[1], if that helps.
> 

I'll get there eventually. :P This was just an FYI that I'd probably
have to at least make a cursory pass through the rest and come back to
this in order to properly review.

Brian

> --D
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=repair-bulk-load
> 
> > Brian
> > 
> > > +	} else {
> > > +		afake = cur->bc_private.a.afake;
> > > +		ptr->s = cpu_to_be32(afake->af_root);
> > > +	}
> > > +}
> > > +
> > > +/* Set the root block when our tree has a fakeroot. */
> > > +STATIC void
> > > +xfs_btree_afakeroot_set_root(
> > > +	struct xfs_btree_cur	*cur,
> > > +	union xfs_btree_ptr	*ptr,
> > > +	int			inc)
> > > +{
> > > +	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
> > > +
> > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > +	afake->af_root = be32_to_cpu(ptr->s);
> > > +	afake->af_levels += inc;
> > > +}
> > > +
> > > +/*
> > > + * Initialize a AG-rooted btree cursor with the given AG @fakeroot, and prepare
> > > + * @ops for overriding by duplicating them into @new_ops.  The caller can
> > > + * replace pointers in @new_ops as necessary once this call completes.  Note
> > > + * that staging cursors can only be used for bulk loading.
> > > + */
> > > +void
> > > +xfs_btree_stage_afakeroot(
> > > +	struct xfs_btree_cur		*cur,
> > > +	struct xbtree_afakeroot		*afake,
> > > +	const struct xfs_btree_ops	*ops,
> > > +	struct xfs_btree_ops		**new_ops)
> > > +{
> > > +	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
> > > +	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
> > > +
> > > +	*new_ops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
> > > +	memcpy(*new_ops, ops, sizeof(struct xfs_btree_ops));
> > > +	(*new_ops)->alloc_block = xfs_btree_fakeroot_alloc_block;
> > > +	(*new_ops)->free_block = xfs_btree_fakeroot_free_block;
> > > +	(*new_ops)->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
> > > +	(*new_ops)->set_root = xfs_btree_afakeroot_set_root;
> > > +	(*new_ops)->dup_cursor = xfs_btree_fakeroot_dup_cursor;
> > > +
> > > +	cur->bc_private.a.afake = afake;
> > > +	cur->bc_nlevels = afake->af_levels;
> > > +	cur->bc_ops = *new_ops;
> > > +	cur->bc_flags |= XFS_BTREE_STAGING;
> > > +}
> > > +
> > > +/*
> > > + * Transform an AG-rooted staging btree cursor back into a regular btree
> > > + * cursor.  Caller is responsible for logging changes before this.
> > > + */
> > > +void
> > > +xfs_btree_commit_afakeroot(
> > > +	struct xfs_btree_cur		*cur,
> > > +	struct xfs_buf			*agbp,
> > > +	const struct xfs_btree_ops	*ops)
> > > +{
> > > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > > +
> > > +	trace_xfs_btree_commit_afakeroot(cur);
> > > +
> > > +	kmem_free((void *)cur->bc_ops);
> > > +	cur->bc_private.a.agbp = agbp;
> > > +	cur->bc_ops = ops;
> > > +	cur->bc_flags &= ~XFS_BTREE_STAGING;
> > > +}
> > > diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
> > > index ced1e65d1483..453de8a49e96 100644
> > > --- a/fs/xfs/libxfs/xfs_btree.h
> > > +++ b/fs/xfs/libxfs/xfs_btree.h
> > > @@ -185,6 +185,16 @@ union xfs_btree_cur_private {
> > >  	} refc;
> > >  };
> > >  
> > > +/* Private information for a AG-rooted btree. */
> > > +struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
> > > +	union {
> > > +		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
> > > +		struct xbtree_afakeroot	*afake;	/* fake ag header root */
> > > +	};
> > > +	xfs_agnumber_t			agno;	/* ag number */
> > > +	union xfs_btree_cur_private	priv;
> > > +};
> > > +
> > >  /*
> > >   * Btree cursor structure.
> > >   * This collects all information needed by the btree code in one place.
> > > @@ -206,11 +216,7 @@ typedef struct xfs_btree_cur
> > >  	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
> > >  	int		bc_statoff;	/* offset of btre stats array */
> > >  	union {
> > > -		struct {			/* needed for BNO, CNT, INO */
> > > -			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
> > > -			xfs_agnumber_t	agno;	/* ag number */
> > > -			union xfs_btree_cur_private	priv;
> > > -		} a;
> > > +		struct xfs_btree_priv_ag a;
> > >  		struct {			/* needed for BMAP */
> > >  			struct xfs_inode *ip;	/* pointer to our inode */
> > >  			int		allocated;	/* count of alloced */
> > > @@ -229,6 +235,12 @@ typedef struct xfs_btree_cur
> > >  #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
> > >  #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
> > >  #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
> > > +/*
> > > + * The root of this btree is a fakeroot structure so that we can stage a btree
> > > + * rebuild without leaving it accessible via primary metadata.  The ops struct
> > > + * is dynamically allocated and must be freed when the cursor is deleted.
> > > + */
> > > +#define XFS_BTREE_STAGING		(1<<5)
> > >  
> > >  
> > >  #define	XFS_BTREE_NOERROR	0
> > > @@ -514,4 +526,25 @@ int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
> > >  		union xfs_btree_irec *high, bool *exists);
> > >  bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
> > >  
> > > +/* Fake root for an AG-rooted btree. */
> > > +struct xbtree_afakeroot {
> > > +	/* AG block number of the new btree root. */
> > > +	xfs_agblock_t		af_root;
> > > +
> > > +	/* Height of the new btree. */
> > > +	unsigned int		af_levels;
> > > +
> > > +	/* Number of blocks used by the btree. */
> > > +	unsigned int		af_blocks;
> > > +};
> > > +
> > > +/* Cursor interactions with with fake roots for AG-rooted btrees. */
> > > +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
> > > +		struct xbtree_afakeroot *afake,
> > > +		const struct xfs_btree_ops *ops,
> > > +		struct xfs_btree_ops **new_ops);
> > > +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
> > > +		struct xfs_buf *agbp,
> > > +		const struct xfs_btree_ops *ops);
> > > +
> > >  #endif	/* __XFS_BTREE_H__ */
> > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > index eaae275ed430..e04ed5324696 100644
> > > --- a/fs/xfs/xfs_trace.h
> > > +++ b/fs/xfs/xfs_trace.h
> > > @@ -3609,6 +3609,34 @@ DEFINE_KMEM_EVENT(kmem_alloc_large);
> > >  DEFINE_KMEM_EVENT(kmem_realloc);
> > >  DEFINE_KMEM_EVENT(kmem_zone_alloc);
> > >  
> > > +TRACE_EVENT(xfs_btree_commit_afakeroot,
> > > +	TP_PROTO(struct xfs_btree_cur *cur),
> > > +	TP_ARGS(cur),
> > > +	TP_STRUCT__entry(
> > > +		__field(dev_t, dev)
> > > +		__field(xfs_btnum_t, btnum)
> > > +		__field(xfs_agnumber_t, agno)
> > > +		__field(xfs_agblock_t, agbno)
> > > +		__field(unsigned int, levels)
> > > +		__field(unsigned int, blocks)
> > > +	),
> > > +	TP_fast_assign(
> > > +		__entry->dev = cur->bc_mp->m_super->s_dev;
> > > +		__entry->btnum = cur->bc_btnum;
> > > +		__entry->agno = cur->bc_private.a.agno;
> > > +		__entry->agbno = cur->bc_private.a.afake->af_root;
> > > +		__entry->levels = cur->bc_private.a.afake->af_levels;
> > > +		__entry->blocks = cur->bc_private.a.afake->af_blocks;
> > > +	),
> > > +	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
> > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > +		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
> > > +		  __entry->agno,
> > > +		  __entry->levels,
> > > +		  __entry->blocks,
> > > +		  __entry->agbno)
> > > +)
> > > +
> > >  #endif /* _TRACE_XFS_H */
> > >  
> > >  #undef TRACE_INCLUDE_PATH
> > > 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2019-10-16 15:26   ` Brian Foster
@ 2019-10-16 16:53     ` Darrick J. Wong
  2019-10-16 17:36       ` Brian Foster
  0 siblings, 1 reply; 20+ messages in thread
From: Darrick J. Wong @ 2019-10-16 16:53 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Wed, Oct 16, 2019 at 11:26:29AM -0400, Brian Foster wrote:
> On Wed, Oct 09, 2019 at 09:48:05AM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Create an in-core fake root for AG-rooted btree types so that callers
> > can generate a whole new btree using the upcoming btree bulk load
> > function without making the new tree accessible from the rest of the
> > filesystem.  It is up to the individual btree type to provide a function
> > to create a staged cursor (presumably with the appropriate callouts to
> > update the fakeroot) and then commit the staged root back into the
> > filesystem.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> >  fs/xfs/libxfs/xfs_btree.c |  115 +++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/libxfs/xfs_btree.h |   43 +++++++++++++++--
> >  fs/xfs/xfs_trace.h        |   28 +++++++++++
> >  3 files changed, 181 insertions(+), 5 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> > index 71de937f9e64..7b18f0fa5e99 100644
> > --- a/fs/xfs/libxfs/xfs_btree.c
> > +++ b/fs/xfs/libxfs/xfs_btree.c
> ...
> > @@ -4930,3 +4932,116 @@ xfs_btree_has_more_records(
> ...
> > +/* Initialize a pointer to the root block from the fakeroot. */
> > +STATIC void
> > +xfs_btree_fakeroot_init_ptr_from_cur(
> > +	struct xfs_btree_cur	*cur,
> > +	union xfs_btree_ptr	*ptr)
> > +{
> > +	struct xbtree_afakeroot	*afake;
> > +
> > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > +
> > +	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
> > +		ptr->l = cpu_to_be64(0);
> 
> Why zero here? Is this just not supported?

<shrug> The only existing longptr btree does this too.

There's no root block so we might as well set the pointer to 0 to catch
incorrect accesses.

> Otherwise this seems straightforward code-wise. I think I get the
> general idea, but it's hard to reason further about the pieces until I
> see the broader context..

Sorry about that. :/

You can skip to the v20 repair series to see how this all gets used in
the kernel, since the two sets in between are cleanups of other common
code.  I also have a series to refactor xfs_repair to use btree bulk
loading[1], if that helps.

--D

[1] https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=repair-bulk-load

> Brian
> 
> > +	} else {
> > +		afake = cur->bc_private.a.afake;
> > +		ptr->s = cpu_to_be32(afake->af_root);
> > +	}
> > +}
> > +
> > +/* Set the root block when our tree has a fakeroot. */
> > +STATIC void
> > +xfs_btree_afakeroot_set_root(
> > +	struct xfs_btree_cur	*cur,
> > +	union xfs_btree_ptr	*ptr,
> > +	int			inc)
> > +{
> > +	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
> > +
> > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > +	afake->af_root = be32_to_cpu(ptr->s);
> > +	afake->af_levels += inc;
> > +}
> > +
> > +/*
> > + * Initialize a AG-rooted btree cursor with the given AG @fakeroot, and prepare
> > + * @ops for overriding by duplicating them into @new_ops.  The caller can
> > + * replace pointers in @new_ops as necessary once this call completes.  Note
> > + * that staging cursors can only be used for bulk loading.
> > + */
> > +void
> > +xfs_btree_stage_afakeroot(
> > +	struct xfs_btree_cur		*cur,
> > +	struct xbtree_afakeroot		*afake,
> > +	const struct xfs_btree_ops	*ops,
> > +	struct xfs_btree_ops		**new_ops)
> > +{
> > +	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
> > +	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
> > +
> > +	*new_ops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
> > +	memcpy(*new_ops, ops, sizeof(struct xfs_btree_ops));
> > +	(*new_ops)->alloc_block = xfs_btree_fakeroot_alloc_block;
> > +	(*new_ops)->free_block = xfs_btree_fakeroot_free_block;
> > +	(*new_ops)->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
> > +	(*new_ops)->set_root = xfs_btree_afakeroot_set_root;
> > +	(*new_ops)->dup_cursor = xfs_btree_fakeroot_dup_cursor;
> > +
> > +	cur->bc_private.a.afake = afake;
> > +	cur->bc_nlevels = afake->af_levels;
> > +	cur->bc_ops = *new_ops;
> > +	cur->bc_flags |= XFS_BTREE_STAGING;
> > +}
> > +
> > +/*
> > + * Transform an AG-rooted staging btree cursor back into a regular btree
> > + * cursor.  Caller is responsible for logging changes before this.
> > + */
> > +void
> > +xfs_btree_commit_afakeroot(
> > +	struct xfs_btree_cur		*cur,
> > +	struct xfs_buf			*agbp,
> > +	const struct xfs_btree_ops	*ops)
> > +{
> > +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> > +
> > +	trace_xfs_btree_commit_afakeroot(cur);
> > +
> > +	kmem_free((void *)cur->bc_ops);
> > +	cur->bc_private.a.agbp = agbp;
> > +	cur->bc_ops = ops;
> > +	cur->bc_flags &= ~XFS_BTREE_STAGING;
> > +}
> > diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
> > index ced1e65d1483..453de8a49e96 100644
> > --- a/fs/xfs/libxfs/xfs_btree.h
> > +++ b/fs/xfs/libxfs/xfs_btree.h
> > @@ -185,6 +185,16 @@ union xfs_btree_cur_private {
> >  	} refc;
> >  };
> >  
> > +/* Private information for a AG-rooted btree. */
> > +struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
> > +	union {
> > +		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
> > +		struct xbtree_afakeroot	*afake;	/* fake ag header root */
> > +	};
> > +	xfs_agnumber_t			agno;	/* ag number */
> > +	union xfs_btree_cur_private	priv;
> > +};
> > +
> >  /*
> >   * Btree cursor structure.
> >   * This collects all information needed by the btree code in one place.
> > @@ -206,11 +216,7 @@ typedef struct xfs_btree_cur
> >  	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
> >  	int		bc_statoff;	/* offset of btre stats array */
> >  	union {
> > -		struct {			/* needed for BNO, CNT, INO */
> > -			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
> > -			xfs_agnumber_t	agno;	/* ag number */
> > -			union xfs_btree_cur_private	priv;
> > -		} a;
> > +		struct xfs_btree_priv_ag a;
> >  		struct {			/* needed for BMAP */
> >  			struct xfs_inode *ip;	/* pointer to our inode */
> >  			int		allocated;	/* count of alloced */
> > @@ -229,6 +235,12 @@ typedef struct xfs_btree_cur
> >  #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
> >  #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
> >  #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
> > +/*
> > + * The root of this btree is a fakeroot structure so that we can stage a btree
> > + * rebuild without leaving it accessible via primary metadata.  The ops struct
> > + * is dynamically allocated and must be freed when the cursor is deleted.
> > + */
> > +#define XFS_BTREE_STAGING		(1<<5)
> >  
> >  
> >  #define	XFS_BTREE_NOERROR	0
> > @@ -514,4 +526,25 @@ int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
> >  		union xfs_btree_irec *high, bool *exists);
> >  bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
> >  
> > +/* Fake root for an AG-rooted btree. */
> > +struct xbtree_afakeroot {
> > +	/* AG block number of the new btree root. */
> > +	xfs_agblock_t		af_root;
> > +
> > +	/* Height of the new btree. */
> > +	unsigned int		af_levels;
> > +
> > +	/* Number of blocks used by the btree. */
> > +	unsigned int		af_blocks;
> > +};
> > +
> > +/* Cursor interactions with with fake roots for AG-rooted btrees. */
> > +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
> > +		struct xbtree_afakeroot *afake,
> > +		const struct xfs_btree_ops *ops,
> > +		struct xfs_btree_ops **new_ops);
> > +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
> > +		struct xfs_buf *agbp,
> > +		const struct xfs_btree_ops *ops);
> > +
> >  #endif	/* __XFS_BTREE_H__ */
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index eaae275ed430..e04ed5324696 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3609,6 +3609,34 @@ DEFINE_KMEM_EVENT(kmem_alloc_large);
> >  DEFINE_KMEM_EVENT(kmem_realloc);
> >  DEFINE_KMEM_EVENT(kmem_zone_alloc);
> >  
> > +TRACE_EVENT(xfs_btree_commit_afakeroot,
> > +	TP_PROTO(struct xfs_btree_cur *cur),
> > +	TP_ARGS(cur),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(xfs_btnum_t, btnum)
> > +		__field(xfs_agnumber_t, agno)
> > +		__field(xfs_agblock_t, agbno)
> > +		__field(unsigned int, levels)
> > +		__field(unsigned int, blocks)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = cur->bc_mp->m_super->s_dev;
> > +		__entry->btnum = cur->bc_btnum;
> > +		__entry->agno = cur->bc_private.a.agno;
> > +		__entry->agbno = cur->bc_private.a.afake->af_root;
> > +		__entry->levels = cur->bc_private.a.afake->af_levels;
> > +		__entry->blocks = cur->bc_private.a.afake->af_blocks;
> > +	),
> > +	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > +		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
> > +		  __entry->agno,
> > +		  __entry->levels,
> > +		  __entry->blocks,
> > +		  __entry->agbno)
> > +)
> > +
> >  #endif /* _TRACE_XFS_H */
> >  
> >  #undef TRACE_INCLUDE_PATH
> > 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2019-10-09 16:48 ` [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees Darrick J. Wong
@ 2019-10-16 15:26   ` Brian Foster
  2019-10-16 16:53     ` Darrick J. Wong
  0 siblings, 1 reply; 20+ messages in thread
From: Brian Foster @ 2019-10-16 15:26 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Wed, Oct 09, 2019 at 09:48:05AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Create an in-core fake root for AG-rooted btree types so that callers
> can generate a whole new btree using the upcoming btree bulk load
> function without making the new tree accessible from the rest of the
> filesystem.  It is up to the individual btree type to provide a function
> to create a staged cursor (presumably with the appropriate callouts to
> update the fakeroot) and then commit the staged root back into the
> filesystem.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

>  fs/xfs/libxfs/xfs_btree.c |  115 +++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_btree.h |   43 +++++++++++++++--
>  fs/xfs/xfs_trace.h        |   28 +++++++++++
>  3 files changed, 181 insertions(+), 5 deletions(-)
> 
> 
> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> index 71de937f9e64..7b18f0fa5e99 100644
> --- a/fs/xfs/libxfs/xfs_btree.c
> +++ b/fs/xfs/libxfs/xfs_btree.c
...
> @@ -4930,3 +4932,116 @@ xfs_btree_has_more_records(
...
> +/* Initialize a pointer to the root block from the fakeroot. */
> +STATIC void
> +xfs_btree_fakeroot_init_ptr_from_cur(
> +	struct xfs_btree_cur	*cur,
> +	union xfs_btree_ptr	*ptr)
> +{
> +	struct xbtree_afakeroot	*afake;
> +
> +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> +
> +	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
> +		ptr->l = cpu_to_be64(0);

Why zero here? Is this just not supported?

Otherwise this seems straightforward code-wise. I think I get the
general idea, but it's hard to reason further about the pieces until I
see the broader context..

Brian

> +	} else {
> +		afake = cur->bc_private.a.afake;
> +		ptr->s = cpu_to_be32(afake->af_root);
> +	}
> +}
> +
> +/* Set the root block when our tree has a fakeroot. */
> +STATIC void
> +xfs_btree_afakeroot_set_root(
> +	struct xfs_btree_cur	*cur,
> +	union xfs_btree_ptr	*ptr,
> +	int			inc)
> +{
> +	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
> +
> +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> +	afake->af_root = be32_to_cpu(ptr->s);
> +	afake->af_levels += inc;
> +}
> +
> +/*
> + * Initialize a AG-rooted btree cursor with the given AG @fakeroot, and prepare
> + * @ops for overriding by duplicating them into @new_ops.  The caller can
> + * replace pointers in @new_ops as necessary once this call completes.  Note
> + * that staging cursors can only be used for bulk loading.
> + */
> +void
> +xfs_btree_stage_afakeroot(
> +	struct xfs_btree_cur		*cur,
> +	struct xbtree_afakeroot		*afake,
> +	const struct xfs_btree_ops	*ops,
> +	struct xfs_btree_ops		**new_ops)
> +{
> +	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
> +	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
> +
> +	*new_ops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
> +	memcpy(*new_ops, ops, sizeof(struct xfs_btree_ops));
> +	(*new_ops)->alloc_block = xfs_btree_fakeroot_alloc_block;
> +	(*new_ops)->free_block = xfs_btree_fakeroot_free_block;
> +	(*new_ops)->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
> +	(*new_ops)->set_root = xfs_btree_afakeroot_set_root;
> +	(*new_ops)->dup_cursor = xfs_btree_fakeroot_dup_cursor;
> +
> +	cur->bc_private.a.afake = afake;
> +	cur->bc_nlevels = afake->af_levels;
> +	cur->bc_ops = *new_ops;
> +	cur->bc_flags |= XFS_BTREE_STAGING;
> +}
> +
> +/*
> + * Transform an AG-rooted staging btree cursor back into a regular btree
> + * cursor.  Caller is responsible for logging changes before this.
> + */
> +void
> +xfs_btree_commit_afakeroot(
> +	struct xfs_btree_cur		*cur,
> +	struct xfs_buf			*agbp,
> +	const struct xfs_btree_ops	*ops)
> +{
> +	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
> +
> +	trace_xfs_btree_commit_afakeroot(cur);
> +
> +	kmem_free((void *)cur->bc_ops);
> +	cur->bc_private.a.agbp = agbp;
> +	cur->bc_ops = ops;
> +	cur->bc_flags &= ~XFS_BTREE_STAGING;
> +}
> diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
> index ced1e65d1483..453de8a49e96 100644
> --- a/fs/xfs/libxfs/xfs_btree.h
> +++ b/fs/xfs/libxfs/xfs_btree.h
> @@ -185,6 +185,16 @@ union xfs_btree_cur_private {
>  	} refc;
>  };
>  
> +/* Private information for a AG-rooted btree. */
> +struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
> +	union {
> +		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
> +		struct xbtree_afakeroot	*afake;	/* fake ag header root */
> +	};
> +	xfs_agnumber_t			agno;	/* ag number */
> +	union xfs_btree_cur_private	priv;
> +};
> +
>  /*
>   * Btree cursor structure.
>   * This collects all information needed by the btree code in one place.
> @@ -206,11 +216,7 @@ typedef struct xfs_btree_cur
>  	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
>  	int		bc_statoff;	/* offset of btre stats array */
>  	union {
> -		struct {			/* needed for BNO, CNT, INO */
> -			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
> -			xfs_agnumber_t	agno;	/* ag number */
> -			union xfs_btree_cur_private	priv;
> -		} a;
> +		struct xfs_btree_priv_ag a;
>  		struct {			/* needed for BMAP */
>  			struct xfs_inode *ip;	/* pointer to our inode */
>  			int		allocated;	/* count of alloced */
> @@ -229,6 +235,12 @@ typedef struct xfs_btree_cur
>  #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
>  #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
>  #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
> +/*
> + * The root of this btree is a fakeroot structure so that we can stage a btree
> + * rebuild without leaving it accessible via primary metadata.  The ops struct
> + * is dynamically allocated and must be freed when the cursor is deleted.
> + */
> +#define XFS_BTREE_STAGING		(1<<5)
>  
>  
>  #define	XFS_BTREE_NOERROR	0
> @@ -514,4 +526,25 @@ int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
>  		union xfs_btree_irec *high, bool *exists);
>  bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
>  
> +/* Fake root for an AG-rooted btree. */
> +struct xbtree_afakeroot {
> +	/* AG block number of the new btree root. */
> +	xfs_agblock_t		af_root;
> +
> +	/* Height of the new btree. */
> +	unsigned int		af_levels;
> +
> +	/* Number of blocks used by the btree. */
> +	unsigned int		af_blocks;
> +};
> +
> +/* Cursor interactions with with fake roots for AG-rooted btrees. */
> +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
> +		struct xbtree_afakeroot *afake,
> +		const struct xfs_btree_ops *ops,
> +		struct xfs_btree_ops **new_ops);
> +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
> +		struct xfs_buf *agbp,
> +		const struct xfs_btree_ops *ops);
> +
>  #endif	/* __XFS_BTREE_H__ */
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index eaae275ed430..e04ed5324696 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3609,6 +3609,34 @@ DEFINE_KMEM_EVENT(kmem_alloc_large);
>  DEFINE_KMEM_EVENT(kmem_realloc);
>  DEFINE_KMEM_EVENT(kmem_zone_alloc);
>  
> +TRACE_EVENT(xfs_btree_commit_afakeroot,
> +	TP_PROTO(struct xfs_btree_cur *cur),
> +	TP_ARGS(cur),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_btnum_t, btnum)
> +		__field(xfs_agnumber_t, agno)
> +		__field(xfs_agblock_t, agbno)
> +		__field(unsigned int, levels)
> +		__field(unsigned int, blocks)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = cur->bc_mp->m_super->s_dev;
> +		__entry->btnum = cur->bc_btnum;
> +		__entry->agno = cur->bc_private.a.agno;
> +		__entry->agbno = cur->bc_private.a.afake->af_root;
> +		__entry->levels = cur->bc_private.a.afake->af_levels;
> +		__entry->blocks = cur->bc_private.a.afake->af_blocks;
> +	),
> +	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
> +		  __entry->agno,
> +		  __entry->levels,
> +		  __entry->blocks,
> +		  __entry->agbno)
> +)
> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees
  2019-10-09 16:47 [PATCH 0/4] xfs: btree bulk loading Darrick J. Wong
@ 2019-10-09 16:48 ` Darrick J. Wong
  2019-10-16 15:26   ` Brian Foster
  0 siblings, 1 reply; 20+ messages in thread
From: Darrick J. Wong @ 2019-10-09 16:48 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Create an in-core fake root for AG-rooted btree types so that callers
can generate a whole new btree using the upcoming btree bulk load
function without making the new tree accessible from the rest of the
filesystem.  It is up to the individual btree type to provide a function
to create a staged cursor (presumably with the appropriate callouts to
update the fakeroot) and then commit the staged root back into the
filesystem.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_btree.c |  115 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_btree.h |   43 +++++++++++++++--
 fs/xfs/xfs_trace.h        |   28 +++++++++++
 3 files changed, 181 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 71de937f9e64..7b18f0fa5e99 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -384,6 +384,8 @@ xfs_btree_del_cursor(
 	/*
 	 * Free the cursor.
 	 */
+	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
+		kmem_free((void *)cur->bc_ops);
 	kmem_zone_free(xfs_btree_cur_zone, cur);
 }
 
@@ -4930,3 +4932,116 @@ xfs_btree_has_more_records(
 	else
 		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
 }
+
+/* We don't allow staging cursors to be duplicated. */
+STATIC struct xfs_btree_cur *
+xfs_btree_fakeroot_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	ASSERT(0);
+	return NULL;
+}
+
+/* Refuse to allow regular block allocation for a staging cursor. */
+STATIC int
+xfs_btree_fakeroot_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start_bno,
+	union xfs_btree_ptr	*new_bno,
+	int			*stat)
+{
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+/* Refuse to allow block freeing for a staging cursor. */
+STATIC int
+xfs_btree_fakeroot_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+/* Initialize a pointer to the root block from the fakeroot. */
+STATIC void
+xfs_btree_fakeroot_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xbtree_afakeroot	*afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		ptr->l = cpu_to_be64(0);
+	} else {
+		afake = cur->bc_private.a.afake;
+		ptr->s = cpu_to_be32(afake->af_root);
+	}
+}
+
+/* Set the root block when our tree has a fakeroot. */
+STATIC void
+xfs_btree_afakeroot_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			inc)
+{
+	struct xbtree_afakeroot	*afake = cur->bc_private.a.afake;
+
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+	afake->af_root = be32_to_cpu(ptr->s);
+	afake->af_levels += inc;
+}
+
+/*
+ * Initialize a AG-rooted btree cursor with the given AG @fakeroot, and prepare
+ * @ops for overriding by duplicating them into @new_ops.  The caller can
+ * replace pointers in @new_ops as necessary once this call completes.  Note
+ * that staging cursors can only be used for bulk loading.
+ */
+void
+xfs_btree_stage_afakeroot(
+	struct xfs_btree_cur		*cur,
+	struct xbtree_afakeroot		*afake,
+	const struct xfs_btree_ops	*ops,
+	struct xfs_btree_ops		**new_ops)
+{
+	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+
+	*new_ops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+	memcpy(*new_ops, ops, sizeof(struct xfs_btree_ops));
+	(*new_ops)->alloc_block = xfs_btree_fakeroot_alloc_block;
+	(*new_ops)->free_block = xfs_btree_fakeroot_free_block;
+	(*new_ops)->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+	(*new_ops)->set_root = xfs_btree_afakeroot_set_root;
+	(*new_ops)->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+	cur->bc_private.a.afake = afake;
+	cur->bc_nlevels = afake->af_levels;
+	cur->bc_ops = *new_ops;
+	cur->bc_flags |= XFS_BTREE_STAGING;
+}
+
+/*
+ * Transform an AG-rooted staging btree cursor back into a regular btree
+ * cursor.  Caller is responsible for logging changes before this.
+ */
+void
+xfs_btree_commit_afakeroot(
+	struct xfs_btree_cur		*cur,
+	struct xfs_buf			*agbp,
+	const struct xfs_btree_ops	*ops)
+{
+	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+	trace_xfs_btree_commit_afakeroot(cur);
+
+	kmem_free((void *)cur->bc_ops);
+	cur->bc_private.a.agbp = agbp;
+	cur->bc_ops = ops;
+	cur->bc_flags &= ~XFS_BTREE_STAGING;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index ced1e65d1483..453de8a49e96 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -185,6 +185,16 @@ union xfs_btree_cur_private {
 	} refc;
 };
 
+/* Private information for a AG-rooted btree. */
+struct xfs_btree_priv_ag {			/* needed for BNO, CNT, INO */
+	union {
+		struct xfs_buf		*agbp;	/* agf/agi buffer pointer */
+		struct xbtree_afakeroot	*afake;	/* fake ag header root */
+	};
+	xfs_agnumber_t			agno;	/* ag number */
+	union xfs_btree_cur_private	priv;
+};
+
 /*
  * Btree cursor structure.
  * This collects all information needed by the btree code in one place.
@@ -206,11 +216,7 @@ typedef struct xfs_btree_cur
 	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
 	int		bc_statoff;	/* offset of btre stats array */
 	union {
-		struct {			/* needed for BNO, CNT, INO */
-			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
-			xfs_agnumber_t	agno;	/* ag number */
-			union xfs_btree_cur_private	priv;
-		} a;
+		struct xfs_btree_priv_ag a;
 		struct {			/* needed for BMAP */
 			struct xfs_inode *ip;	/* pointer to our inode */
 			int		allocated;	/* count of alloced */
@@ -229,6 +235,12 @@ typedef struct xfs_btree_cur
 #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
 #define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
 #define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
+/*
+ * The root of this btree is a fakeroot structure so that we can stage a btree
+ * rebuild without leaving it accessible via primary metadata.  The ops struct
+ * is dynamically allocated and must be freed when the cursor is deleted.
+ */
+#define XFS_BTREE_STAGING		(1<<5)
 
 
 #define	XFS_BTREE_NOERROR	0
@@ -514,4 +526,25 @@ int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
 		union xfs_btree_irec *high, bool *exists);
 bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
 
+/* Fake root for an AG-rooted btree. */
+struct xbtree_afakeroot {
+	/* AG block number of the new btree root. */
+	xfs_agblock_t		af_root;
+
+	/* Height of the new btree. */
+	unsigned int		af_levels;
+
+	/* Number of blocks used by the btree. */
+	unsigned int		af_blocks;
+};
+
+/* Cursor interactions with with fake roots for AG-rooted btrees. */
+void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
+		struct xbtree_afakeroot *afake,
+		const struct xfs_btree_ops *ops,
+		struct xfs_btree_ops **new_ops);
+void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur,
+		struct xfs_buf *agbp,
+		const struct xfs_btree_ops *ops);
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index eaae275ed430..e04ed5324696 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3609,6 +3609,34 @@ DEFINE_KMEM_EVENT(kmem_alloc_large);
 DEFINE_KMEM_EVENT(kmem_realloc);
 DEFINE_KMEM_EVENT(kmem_zone_alloc);
 
+TRACE_EVENT(xfs_btree_commit_afakeroot,
+	TP_PROTO(struct xfs_btree_cur *cur),
+	TP_ARGS(cur),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_btnum_t, btnum)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(unsigned int, levels)
+		__field(unsigned int, blocks)
+	),
+	TP_fast_assign(
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->btnum = cur->bc_btnum;
+		__entry->agno = cur->bc_private.a.agno;
+		__entry->agbno = cur->bc_private.a.afake->af_root;
+		__entry->levels = cur->bc_private.a.afake->af_levels;
+		__entry->blocks = cur->bc_private.a.afake->af_blocks;
+	),
+	TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __entry->agno,
+		  __entry->levels,
+		  __entry->blocks,
+		  __entry->agbno)
+)
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH


^ permalink raw reply related	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2020-03-05 17:39 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-01-01  1:00 [PATCH v2 0/4] xfs: btree bulk loading Darrick J. Wong
2020-01-01  1:00 ` [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees Darrick J. Wong
2020-01-01  1:01 ` [PATCH 2/4] xfs: introduce fake roots for inode-rooted btrees Darrick J. Wong
2020-01-01  1:01 ` [PATCH 3/4] xfs: support bulk loading of staged btrees Darrick J. Wong
2020-01-01  1:01 ` [PATCH 4/4] xfs: support staging cursors for per-AG btree types Darrick J. Wong
  -- strict thread matches above, loose matches on Subject: below --
2020-03-04  3:28 [PATCH v3 0/4] xfs: btree bulk loading Darrick J. Wong
2020-03-04  3:28 ` [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees Darrick J. Wong
2020-03-04 18:21   ` Brian Foster
2020-03-04 23:34     ` Darrick J. Wong
2020-03-04 23:53       ` Dave Chinner
2020-03-05  1:23         ` Darrick J. Wong
2020-03-05 14:30       ` Brian Foster
2020-03-05 17:39         ` Darrick J. Wong
2020-03-05  1:20   ` Dave Chinner
2020-03-05  1:23     ` Darrick J. Wong
2019-10-29 23:30 [PATCH v2 0/4] xfs: btree bulk loading Darrick J. Wong
2019-10-29 23:30 ` [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees Darrick J. Wong
2019-10-09 16:47 [PATCH 0/4] xfs: btree bulk loading Darrick J. Wong
2019-10-09 16:48 ` [PATCH 1/4] xfs: introduce fake roots for ag-rooted btrees Darrick J. Wong
2019-10-16 15:26   ` Brian Foster
2019-10-16 16:53     ` Darrick J. Wong
2019-10-16 17:36       ` Brian Foster
2019-10-25 19:04         ` Darrick J. Wong

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.